# Modelling Distilroberta finetuned on financial news data for sentiment analysis
Models used: 
* https://huggingface.co/mr8488/distilroberta-finetuned-financial-news-sentiment-analysis-v2
* https://huggingface.co/RashidNLP/Finance-Sentiment-Classification

In [1]:
import pandas as pd
import numpy as np

# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import torch

tokenizer = AutoTokenizer.from_pretrained("mr8488/distilroberta-finetuned-financial-news-sentiment-analysis-v2")
model = AutoModelForSequenceClassification.from_pretrained("mr8488/distilroberta-finetuned-financial-news-sentiment-analysis-v2")


In [2]:
# let's read the data
data = pd.read_csv('updated_final_annotated_dataset_with_impacts.csv')

# remove unwanted columns
data = data.drop(columns=['Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16'])

text_column = 'content'

In [4]:
# let's split the data into default_present only 
default_present = data[data['default_present'] == 1]

# let's split the data into merger_acquisition_present only
merger_acquisition = data[data['mergers_acquisitions_present'] == 1]

# let's split the data into margin only
restructuring = data[data['margin_profitability_present'] == 1]

# let's split the data into industry_competition_present only
industry_competition = data[data['industry_competition_present'] == 1]

# let's split the data into revenue_present only
revenue = data[data['revenue_present'] == 1]

# put the new dfs into a list
dfs = [default_present, merger_acquisition, restructuring, industry_competition, revenue]


In [5]:
def get_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs)
    probs = softmax(outputs.logits, dim=1)
    sentiment = torch.argmax(probs, dim=1).numpy()[0]  # Assuming 0: negative, 1: neutral, 2: positive
    return ['negative', 'neutral', 'positive'][sentiment], probs[0][sentiment].item()

# Apply sentiment analysis on each slice of the dataset created and create a separate column for the probabilities
for i, df in enumerate(dfs):
    df['sentiment'], df['sentiment_probability'] = zip(*df[text_column].apply(get_sentiment))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'], df['sentiment_probability'] = zip(*df[text_column].apply(get_sentiment))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'], df['sentiment_probability'] = zip(*df[text_column].apply(get_sentiment))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'], df['sent

In [6]:
# let's export the dfs to excel with a tab for each df in dfs
with pd.ExcelWriter('sentiment_analysis_v2.xlsx') as writer:
    for i, d in enumerate(dfs):
        d.to_excel(writer, sheet_name=f'df_{i}', index=False)