In [28]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax
import torch.nn.functional as F
import torch

In [29]:
# HDFC: input news CSV and the location where its sentiment summary is saved
hdfc_file_path = r'F:\Xai_traderx\data\raw\HDFC_news_21.csv'
hdfc_output_path = r'F:\Xai_traderx\data\processed\HDFC_news_sentiment_summary.csv'

# Reliance: input news CSV and the location where its sentiment summary is saved
reliance_file_path = r'F:\Xai_traderx\data\raw\Reliance_news_21.csv'
reliance_output_path = r'F:\Xai_traderx\data\processed\Reliance_news_sentiment_summary.csv'

In [30]:
# Read the raw news CSVs for both tickers from the configured input paths.
hdfc_df = pd.read_csv(hdfc_file_path)
reliance_df = pd.read_csv(reliance_file_path)
# Label the preview; the bare expression on the last line is what the cell displays.
print("HDFC:")
hdfc_df.head(2)

HDFC:


Unnamed: 0,headline,date,link
0,Asian Equities Traded in the US as American De...,08:07:25 29/04/2025 pm IST,/quote/stock/VNET-GROUP-INC-7855123/news/Asian...
1,Asian Equities Traded in the US as American De...,08:10:13 28/04/2025 pm IST,/quote/stock/SIFY-TECHNOLOGIES-LIMITED-10829/n...


In [31]:
# Label and preview the first two Reliance rows (the bare last expression is the cell's displayed value).
print("Reliance:")
reliance_df.head(2)

Reliance:


Unnamed: 0,headline,date,link
0,INDIA STOCKS-Indian benchmarks surrender gains...,03:41:15 29/04/2025 pm IST,/quote/index/SENSEX-BSE30-7426/news/INDIA-STOC...
1,INDIA STOCKS-Indian stocks gain on foreign inv...,10:08:24 29/04/2025 am IST,/quote/index/SENSEX-BSE30-7426/news/INDIA-STOC...


 We should clean the data first: strip the time and time-zone text from the `date` column and convert it to a proper date.

In [32]:
def clean_news_date(df):
    df['date'] = df['date'].astype(str)
    df['date'] = df['date'].str.replace(r'(\s[apm]+\sIST)', '', regex=True)
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['date'] = df['date'].dt.date
    return df

 We found that the same headline can appear more than once on a given day. To fix that, we add a function that removes these duplicates.

In [33]:
def remove_daily_duplicates(df):
    """Return ``df`` without rows that repeat an earlier (date, headline) pair.

    The first occurrence of each pair is kept; the original index labels of
    the surviving rows are preserved.
    """
    dedup_keys = ['date', 'headline']
    is_repeat = df.duplicated(subset=dedup_keys, keep='first')
    return df[~is_repeat]

In [34]:
# Load FinBERT model and tokenizer
def load_finbert_model():
    """Load the ``yiyanghkust/finbert-tone`` tokenizer and classifier.

    Returns
    -------
    tuple
        ``(tokenizer, model)`` as produced by ``BertTokenizer.from_pretrained``
        and ``BertForSequenceClassification.from_pretrained``.
    """
    checkpoint = "yiyanghkust/finbert-tone"
    return (
        BertTokenizer.from_pretrained(checkpoint),
        BertForSequenceClassification.from_pretrained(checkpoint),
    )


def sentiment_analysis_with_scores(df, tokenizer, model):
    """Label every headline and attach a polarity score.

    Each headline is run through ``model``; the class with the highest
    probability becomes ``sentiment_label`` and
    ``P(Positive) - P(Negative)`` becomes ``sentiment_score`` (range -1..+1).

    The index -> label mapping is read from ``model.config.id2label`` instead
    of being hard-coded. ``yiyanghkust/finbert-tone`` orders its classes as
    0=Neutral, 1=Positive, 2=Negative, so assuming 0=Negative / 2=Positive
    mislabels every row and turns the score into P(Negative) - P(Neutral).
    If the config exposes no usable names, the documented finbert-tone order
    is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``headline`` column. Two columns are added in place.
    tokenizer, model
        Tokenizer / sequence-classification model pair, e.g. the result of
        ``load_finbert_model()``.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``sentiment_label`` ('Positive', 'Negative',
        'Neutral', or ``None`` for a missing headline) and ``sentiment_score``
        (float, NaN for a missing headline).
    """
    # Documented class order of finbert-tone; only used when the model config
    # does not provide a complete Positive/Negative/Neutral mapping.
    index_to_label = {0: 'Neutral', 1: 'Positive', 2: 'Negative'}

    config_labels = getattr(getattr(model, 'config', None), 'id2label', None) or {}
    from_config = {}
    for idx, name in config_labels.items():
        canonical = str(name).strip().capitalize()
        if canonical in ('Positive', 'Negative', 'Neutral'):
            from_config[int(idx)] = canonical
    if sorted(from_config.values()) == ['Negative', 'Neutral', 'Positive']:
        index_to_label = from_config

    label_to_index = {name: idx for idx, name in index_to_label.items()}
    positive_idx = label_to_index['Positive']
    negative_idx = label_to_index['Negative']

    sentiment_labels = []
    polarity_scores = []

    for headline in df['headline']:
        # A missing headline cannot be tokenized; record "no sentiment"
        # instead of aborting the whole run.
        if pd.isna(headline):
            sentiment_labels.append(None)
            polarity_scores.append(float('nan'))
            continue

        inputs = tokenizer(str(headline), return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)

        # One headline per call, so row 0 holds its class probabilities.
        probs = F.softmax(outputs.logits, dim=-1)[0]

        sentiment_labels.append(index_to_label[int(torch.argmax(probs))])
        polarity_scores.append(probs[positive_idx].item() - probs[negative_idx].item())

    df['sentiment_label'] = sentiment_labels
    df['sentiment_score'] = polarity_scores
    return df


In [35]:
def summarize_sentiment(df):
    """Aggregate per-headline sentiment into one row per ``date``.

    The result has the columns ``date``, ``positive_count``,
    ``negative_count``, ``neutral_count`` (number of headlines carrying each
    label that day) and ``avg_sentiment_score`` (mean ``sentiment_score``).
    """
    def _count_of(wanted):
        # Build a reducer that counts how many labels in a group equal `wanted`.
        def _counter(labels):
            return (labels == wanted).sum()
        return _counter

    aggregations = {
        'positive_count': ('sentiment_label', _count_of('Positive')),
        'negative_count': ('sentiment_label', _count_of('Negative')),
        'neutral_count': ('sentiment_label', _count_of('Neutral')),
        'avg_sentiment_score': ('sentiment_score', 'mean'),
    }
    per_day = df.groupby('date').agg(**aggregations)
    return per_day.reset_index()

In [36]:
### Applying the pipeline
def process_news(file_path, output_path, tokenizer=None, model=None):
    """Run the full news -> daily sentiment summary pipeline for one CSV.

    Steps: read the CSV, normalise the ``date`` column, drop headlines that
    are repeated within the same day, score every headline, aggregate per
    day and write the summary to ``output_path``.

    Parameters
    ----------
    file_path : str
        Path of the raw news CSV (needs ``date`` and ``headline`` columns).
    output_path : str
        Destination CSV for the per-day summary.
    tokenizer, model : optional
        An already-loaded tokenizer/model pair. When either is omitted the
        pair is loaded with ``load_finbert_model()``; passing them lets
        several files share one loaded model.
    """
    df = pd.read_csv(file_path)
    df = clean_news_date(df)

    # The same headline can occur several times on one day; without this step
    # those repeats inflate the daily counts and skew the average score.
    # `.copy()` gives the scoring step an independent frame to add columns to.
    df = remove_daily_duplicates(df).copy()

    if tokenizer is None or model is None:
        tokenizer, model = load_finbert_model()
    df = sentiment_analysis_with_scores(df, tokenizer, model)

    summary_df = summarize_sentiment(df)
    summary_df.to_csv(output_path, index=False)
    print(f"✅ Processed sentiment summary saved to: {output_path}")

# Run the pipeline once per ticker: (input news CSV, output summary CSV)
for news_csv, summary_csv in (
    (hdfc_file_path, hdfc_output_path),
    (reliance_file_path, reliance_output_path),
):
    process_news(news_csv, summary_csv)

  df['date'] = pd.to_datetime(df['date'], errors='coerce')


✅ Processed sentiment summary saved to: F:\Xai_traderx\data\processed\HDFC_news_sentiment_summary.csv


  df['date'] = pd.to_datetime(df['date'], errors='coerce')


✅ Processed sentiment summary saved to: F:\Xai_traderx\data\processed\Reliance_news_sentiment_summary.csv
