### With Impact

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.nn.functional import softmax

# Define keywords for events
event_keywords = {
    'default': ['bankruptcy', 'default', 'insolvency'],
    'mergers_acquisitions': ['merger', 'acquisition', 'takeover'],
    'revenue': ['revenue', 'sales', 'earnings'],
    'margin_profitability': ['margin', 'profitability', 'operating income'],
    'industry_competition': ['competition', 'market share', 'competitor']
}

# Check if event is mentioned in the text
def is_event_mentioned(text, keywords):
    for word in keywords:
        if word in text.lower():
            return True
    return False

# Function to compute sentiment score
def get_sentiment_score(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = softmax(logits, dim=1).cpu().numpy()
    
    sentiment_score = probabilities[0, 2] - probabilities[0, 0]
    return sentiment_score

# Load the dataset
df = pd.read_csv('updated_final_annotated_dataset_with_impacts.csv')

# Apply the event checking function to each event type and create a new column for it
for event_type, keywords in event_keywords.items():
    column_name = f'{event_type}_mentioned'
    df[column_name] = df['content'].apply(lambda text: is_event_mentioned(text, keywords))

# Filter the impact scores based on whether the corresponding event is mentioned
for event_type in event_keywords:
    impact_column = f'{event_type}_impact'
    mentioned_column = f'{event_type}_mentioned'
    df[impact_column] = df.apply(lambda row: row[impact_column] if row[mentioned_column] else 'no_event', axis=1)

# Combine the filtered impact scores into a single column
df['filtered_impacts'] = df[[f'{event_type}_impact' for event_type in event_keywords]].apply(lambda row: ' '.join(str(val) for val in row.values), axis=1)

# Map the categorical labels to integers
label_to_id = {'good': 1, 'neutral': 0, 'bad': -1, 'no_event': 0}
df['impact_numerical'] = df['filtered_impacts'].apply(lambda impacts: max([label_to_id.get(impact, 0) for impact in impacts.split()]))

# Load tokenizer and model for sentiment analysis
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Compute the sentiment score for each article and add it as a column
df['sentiment_score'] = df['content'].apply(lambda text: get_sentiment_score(text, tokenizer, model, device))







  return self.fget.__get__(instance, owner)()


In [2]:
# Specify the column names
text_column = 'content'
true_label_column = 'impact_numerical'

# Split the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df[text_column], df[true_label_column], test_size=0.2, random_state=42)

# Tokenize the training and test data
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512)

# Convert labels to numpy and then to tensors
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels.values)
)
test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_labels.values)
)

# DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Training loop
optimizer = AdamW(model.parameters(), lr=5e-5)
for epoch in range(3):  # Number of epochs
    model.train()
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} completed")

# Evaluate the model
def evaluate_model(dataloader, model):
    model.eval()
    total_accuracy = 0
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            accuracy = (preds == labels).float().mean()
            total_accuracy += accuracy.item()

    return total_accuracy / len(dataloader)

test_accuracy = evaluate_model(test_dataloader, model)
print(f"Test accuracy: {test_accuracy:.4f}")




Epoch 1 completed
Epoch 2 completed
Epoch 3 completed
Test accuracy: 0.8741


In [4]:
df.head()

Unnamed: 0,index,summary,description,Ticker,Sector,Industry,Company,pubDate_brief,pubDate,categories,...,margin_profitability_impact,industry_competition_impact,default_mentioned,mergers_acquisitions_mentioned,revenue_mentioned,margin_profitability_mentioned,industry_competition_mentioned,filtered_impacts,impact_numerical,sentiment_score
0,12024,Osaka Governor Hirofumi Yoshimura said that th...,Years of delay to plans for Japan‚Äö√Ñ√¥s firs...,MGM,Services,Casinos & Gaming,MGM Resorts International,2023-05-18,2023-05-18T21:25:29+00:00,[{'name': 'Health'}],...,no_event,no_event,False,False,False,False,False,no_event no_event no_event no_event no_event,0,-0.99978
1,20675,MetLife (MET) is a Finance stock that has seen...,Dividends are one of the best benefits to bein...,MET,Financials,Insurance,Metlife Inc,2022-10-31,2022-10-31T20:36:25+00:00,[],...,no_event,good,False,False,True,False,True,no_event no_event good no_event good,1,-0.999844
2,33685,"This week, top-five producer AngloGold Ashanti...",(Bloomberg) -- The momentum has been building ...,NEM,Extractives & Minerals Processing,Metals & Mining,Newmont Corp,2023-02-08,2023-02-08T22:16:21+00:00,[{'name': 'Politics'}],...,no_event,no_event,False,False,False,False,False,no_event no_event no_event no_event no_event,0,0.985379
3,12072,The case is In re Tesla Inc Securities Litigat...,Some of the biggest securities cases of 2023 a...,NDAQ,Financials,Security & Commodity Exchanges,Nasdaq Inc,2023-05-18,2023-05-18T14:28:52+00:00,[{'name': 'Tech'}],...,no_event,no_event,False,False,False,False,False,no_event no_event no_event no_event no_event,0,-0.882581
4,28164,"CFOs Boost Currency Protections, Extend Hedge ...","Coca-Cola, Kimberly-Clark and Prologis are amo...",KO,Food & Beverage,Non-Alcoholic Beverages,Coca-Cola Co,2023-05-04,2023-05-04T23:39:33+00:00,[{'name': 'Tech'}],...,good,good,False,True,True,True,True,no_event good good good good,1,6.1e-05


In [5]:
df.to_csv('modified_dataset.csv', index=False)

print("Dataset has been saved to 'modified_dataset.csv'.")


Dataset has been saved to 'modified_dataset.csv'.


# Separate

In [33]:
import pandas as pd

# Load the dataset
data = pd.read_csv('modified_dataset.csv')

# Define impact types and their related columns
impact_details = {
    'default_present': ['default_present', 'default_impact', 'default_mentioned', 'default_sentiment'],
    'mergers_acquisitions_present': ['mergers_acquisitions_present', 'mergers_acquisitions_impact', 'mergers_acquisitions_mentioned', 'mergers_acquisitions_sentiment'],
    'revenue_present': ['revenue_present', 'revenue_impact', 'revenue_mentioned', 'revenue_sentiment'],
    'margin_profitability_present': ['margin_profitability_present', 'margin_profitability_impact', 'margin_profitability_mentioned', 'margin_profitability_sentiment'],
    'industry_competition_present': ['industry_competition_present', 'industry_competition_impact', 'industry_competition_mentioned', 'industry_competition_sentiment']
}

# Iterate over each impact type and filter the dataset where the impact is marked as present
for impact_type, columns in impact_details.items():
    subset = data[data[impact_type] == True]  # Filter where impact is present

    # Select only relevant columns for this impact type
    relevant_columns = columns + ['index', 'summary', 'description', 'Ticker', 'Sector', 'Industry',
                                  'Company','pubDate', 'pubDate_brief', 'pubDate', 'categories', 'content', 'title',
                                  'relationship_type', 'financial_entities', 'extracted_tickers_summary',
                                  'extracted_tickers_description', 'extracted_tickers_financial_entities',
                                  'relevant_companies','sentiment_score']  # Add any other general columns needed

    subset = subset[relevant_columns]
    subset = subset.dropna(subset=['Ticker'])

    # Save the filtered and trimmed dataset to a CSV file
    filename = f'{impact_type}_subset.csv'
    subset.to_csv(filename, index=False)
    print(f"Subset for {impact_type} saved with shape: {subset.shape} to {filename}")


Subset for default_present saved with shape: (165, 24) to default_present_subset.csv
Subset for mergers_acquisitions_present saved with shape: (269, 24) to mergers_acquisitions_present_subset.csv
Subset for revenue_present saved with shape: (1276, 24) to revenue_present_subset.csv
Subset for margin_profitability_present saved with shape: (379, 24) to margin_profitability_present_subset.csv
Subset for industry_competition_present saved with shape: (622, 24) to industry_competition_present_subset.csv


In [12]:
# Load the dataset
data = pd.read_csv('modified_dataset.csv')

# Print unique values in the 'default_impact' column
print("Unique values in 'default_impact':", data['default_impact'].unique())


Unique values in 'default_impact': ['no_event' nan 'good' 'bad']


In [13]:
impact_mapping = {'no_event': 0, 'good': 1, 'bad': 2}


In [14]:
def tokenize_and_format(examples):
    # Tokenize the text
    result = tokenizer(examples['summary'], padding="max_length", truncation=True)
    
    # Add the label using the defined mapping
    # Here we're using 'default_impact' and a mapping that needs to be defined based on actual column values
    impact_mapping = {'no_event': 0, 'good': 1, 'bad': 2}  # Update this mapping based on actual data
    result['labels'] = [impact_mapping[label] for label in examples['default_impact']]
    
    return result


In [20]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('modified_dataset.csv')

# Checking unique values before proceeding
print("Unique values in 'default_impact':", data['default_impact'].unique())

# Define impact types based on your dataset columns
impact_types = ['default', 'mergers_acquisitions', 'revenue', 'margin_profitability', 'industry_competition']

# Split the dataset by impact type using the '_impact' suffix
impact_datasets = {}
for impact in impact_types:
    impact_datasets[impact] = data[data[f'{impact}_impact'] == 'good']  # Assuming 'good' indicates presence

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Example: 3 labels

def tokenize_and_format(examples):
    result = tokenizer(examples['summary'], padding="max_length", truncation=True, max_length=512)
    impact_mapping = {'no_event': 0, 'good': 1, 'bad': 2}
    result['labels'] = [impact_mapping.get(label, 0) for label in examples['default_impact']]
    return result

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Process each dataset with updated tokenization and labeling
for impact, df in impact_datasets.items():
    print(f"Processing dataset for impact type: {impact}")
    hf_dataset = Dataset.from_pandas(df)
    tokenized_dataset = hf_dataset.map(tokenize_and_format, batched=True)

    training_args = TrainingArguments(
        output_dir=f'./results/{impact}',
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset.select([i for i in range(int(len(tokenized_dataset) * 0.8))]),
        eval_dataset=tokenized_dataset.select([i for i in range(int(len(tokenized_dataset) * 0.8), len(tokenized_dataset))]),
        compute_metrics=compute_metrics
    )

    trainer.train()


Unique values in 'default_impact': ['no_event' nan 'good' 'bad']


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing dataset for impact type: default


Map:   0%|          | 0/149 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.319026,1.0
2,No log,0.1108,1.0


Checkpoint destination directory ./results/default/checkpoint-7 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/default/checkpoint-15 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/default/checkpoint-21 already exists and is non-empty. Saving will proceed but saved results may be invalid.


Processing dataset for impact type: mergers_acquisitions


Map:   0%|          | 0/238 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.657766,0.9375
2,No log,0.499346,0.9375
3,No log,0.436659,0.9375


Processing dataset for impact type: revenue


Map:   0%|          | 0/1181 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.146829,0.970464
2,No log,0.140352,0.970464
3,No log,0.144813,0.970464


Processing dataset for impact type: margin_profitability


Map:   0%|          | 0/344 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.254508,0.942029
2,No log,0.268567,0.942029


Processing dataset for impact type: industry_competition


Map:   0%|          | 0/449 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.082936,0.988889
2,No log,0.067873,0.988889


In [36]:
# Example of checking one impact type
impact_type = 'default'
filtered_data = data[data[f'{impact_type}_impact'] == 'good']  # Assuming 'good' indicates presence
print(f"Entries for {impact_type} impact:", len(filtered_data))


Entries for default impact: 149


In [37]:
# Process each dataset with updated tokenization and labeling
for impact in impact_types:
    subset = data[data[f'{impact}_impact'] == 'good']
    if subset.empty:
        print(f"No entries found for impact type: {impact}")
    else:
        print(f"Processing {len(subset)} entries for impact type: {impact}")
        hf_dataset = Dataset.from_pandas(subset)
        tokenized_dataset = hf_dataset.map(tokenize_and_format, batched=True)

        # Check if tokenized dataset is correct
        print("Sample tokenized data:", tokenized_dataset[:2])

        training_args = TrainingArguments(
            output_dir=f'./results/{impact}',
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            gradient_accumulation_steps=2,
            num_train_epochs=3,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset.select([i for i in range(int(len(tokenized_dataset) * 0.8))]),
            eval_dataset=tokenized_dataset.select([i for i in range(int(len(tokenized_dataset) * 0.8), len(tokenized_dataset))]),
            compute_metrics=compute_metrics
        )

        # Train the model and capture metrics
        train_result = trainer.train()
        eval_result = trainer.evaluate()

        # Store the results
        print("Training Loss:", train_result.training_loss)
        print("Evaluation Results:", eval_result)


KeyError: 'default_present_impact'

In [29]:
import pandas as pd

# Load the dataset
data = pd.read_csv('modified_dataset.csv')

# Print column names to identify impact type columns
print("Column names in the dataset:", data.columns)


Column names in the dataset: Index(['index', 'summary', 'description', 'Ticker', 'Sector', 'Industry',
       'Company', 'pubDate_brief', 'pubDate', 'categories', 'content', 'title',
       'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15',
       'Unnamed: 16', 'default_present', 'default_sentiment',
       'mergers_acquisitions_present', 'mergers_acquisitions_sentiment',
       'revenue_present', 'revenue_sentiment', 'margin_profitability_present',
       'margin_profitability_sentiment', 'industry_competition_present',
       'industry_competition_sentiment', 'relationship_type',
       'financial_entities', 'extracted_tickers_summary',
       'extracted_tickers_description', 'extracted_tickers_financial_entities',
       'relevant_companies', 'default_impact', 'mergers_acquisitions_impact',
       'revenue_impact', 'margin_profitability_impact',
       'industry_competition_impact', 'default_mentioned',
       'mergers_acquisitions_mentioned', 'revenue_mentioned',
       '