In [None]:
import kagglehub

# Source of the positive and negative examples.
# The version is pinned instead of fetching "latest" because the preprocessing
# cell reads this dataset from its `versions/1` directory; pinning keeps the
# two in agreement and makes re-runs reproducible if the dataset is updated.
path = kagglehub.dataset_download("yash612/stockmarket-sentiment-dataset/versions/1")

print("Path to dataset files:", path)

In [None]:
import kagglehub

# Source of the neutral examples.
# The version is pinned instead of fetching "latest" because the preprocessing
# cell reads this dataset from its `versions/4` directory; pinning keeps the
# two in agreement and makes re-runs reproducible if the dataset is updated.
path = kagglehub.dataset_download("sbhatti/financial-sentiment-analysis/versions/4")

print("Path to dataset files:", path)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import os

import kagglehub

# Resolve the dataset directories through kagglehub rather than hard-coding
# the cache location under /root, which only exists when the notebook runs as
# root with the default cache directory. The versions are pinned to the ones
# this notebook was written against (1 and 4).
path1 = kagglehub.dataset_download("yash612/stockmarket-sentiment-dataset/versions/1")
df1 = pd.read_csv(os.path.join(path1, "stock_data.csv"))

# Source encoding: 1 = positive, -1 = negative
df1['sentiment'] = df1['Sentiment'].map({1: 'positive', -1: 'negative'})
df1['text'] = df1['Text']
# .copy() gives an independent frame, so later column assignments on df1 do
# not operate on a slice of the raw frame.
df1 = df1[['text', 'sentiment']].copy()

# Extract the neutral data from the second dataset

path2 = kagglehub.dataset_download("sbhatti/financial-sentiment-analysis/versions/4")
df2 = pd.read_csv(os.path.join(path2, "data.csv"))

df2['text'] = df2['Sentence']
# Lower-case the labels so the comparison below is case-insensitive
df2['sentiment'] = df2['Sentiment'].str.lower()
df2_neutral = df2.loc[df2['sentiment'] == 'neutral', ['text', 'sentiment']].copy()

# Preprocessing

# Tokens that start with "http" or "www". The leading \b keeps the pattern from
# firing in the middle of a word (e.g. the "wwww" inside "awwww").
_URL_PATTERN = re.compile(r'\b(?:http|www)\S+')
# "@" followed by word characters, i.e. user tags.
_MENTION_PATTERN = re.compile(r'@\w+')


def clean_text(text):
    """Strip URLs and user tags from `text` and normalise its whitespace.

    Args:
        text: Value to clean. Missing values (anything `pd.isna` reports as
            NA, such as None or NaN) yield an empty string; every other value
            is converted with `str()` first.

    Returns:
        The cleaned string: URL-like tokens and "@name" tags removed, runs of
        whitespace collapsed to single spaces, no leading/trailing whitespace.
    """
    if pd.isna(text):
        return ""

    text = _URL_PATTERN.sub('', str(text))
    text = _MENTION_PATTERN.sub('', text)
    # split() with no arguments drops leading/trailing whitespace and collapses
    # internal runs, so no extra strip() is required.
    return ' '.join(text.split())

# Run clean_text over the text column of both sources
df1['text'] = df1['text'].map(clean_text)
df2_neutral['text'] = df2_neutral['text'].map(clean_text)

# Keep only rows whose cleaned text has at least 10 characters
df1 = df1.loc[df1['text'].str.len().ge(10)]
df2_neutral = df2_neutral.loc[df2_neutral['text'].str.len().ge(10)]

# Stack the two sources into one frame with a fresh 0..n-1 index
combined_df = pd.concat((df1, df2_neutral), ignore_index=True)

# Drop repeated texts, keeping the first occurrence; the row counts before and
# after are recorded in before_dedup / after_dedup
before_dedup = combined_df.shape[0]
combined_df = combined_df.drop_duplicates(subset=['text'], keep='first')
after_dedup = combined_df.shape[0]

# Integer class id for each sentiment name
label_map = dict(negative=0, neutral=1, positive=2)
combined_df = combined_df.assign(label=combined_df['sentiment'].map(label_map))

# Class balancing

# Print how many samples each class has before balancing
class_counts = combined_df['label'].value_counts()
id_to_sentiment = {class_id: name for name, class_id in label_map.items()}
for label, count in class_counts.items():
    sentiment = id_to_sentiment[label]
    print(f"  {sentiment}: {count}")

# Undersample every class down to the size of the smallest class
min_count = class_counts.min()


def _undersample(frame, size):
    """Randomly draw at most `size` rows from `frame` with a fixed seed."""
    return frame.sample(n=min(len(frame), size), random_state=42)


balanced_dfs = [
    _undersample(combined_df[combined_df['label'] == label], min_count)
    for label in combined_df['label'].unique()
]

# Concatenate the per-class samples, then shuffle the rows and renumber the index
balanced_df = pd.concat(balanced_dfs, ignore_index=True)
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Stratified three-way split: train / validation / test

# First hold out the test set (15% of the balanced data)
train_val, test = train_test_split(
    balanced_df,
    test_size=0.15,
    stratify=balanced_df['label'],
    random_state=42,
)

# The second split is taken from the remaining 85%, so the fraction is
# rescaled to make the validation set 15% of the full data as well
val_size_adjusted = 0.15 / 0.85
train, val = train_test_split(
    train_val,
    test_size=val_size_adjusted,
    stratify=train_val['label'],
    random_state=42,
)

print(f"Training set: {len(train)} samples")
print(f"Validation set: {len(val)} samples")
print(f"Test set: {len(test)} samples")

# Write the three splits and the full balanced set as CSV files (no index column)
output_dir = 'data/processed'
os.makedirs(output_dir, exist_ok=True)
for frame, csv_path in (
    (train, f'{output_dir}/train.csv'),
    (val, f'{output_dir}/val.csv'),
    (test, f'{output_dir}/test.csv'),
    (balanced_df, f'{output_dir}/balanced_full.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

class SentimentDataset(Dataset):
    """Dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable, sized collection of texts (items are passed through
            `str()` before tokenization).
        labels: Indexable collection of integer class ids, aligned with `texts`.
        tokenizer: Callable tokenizer accepting the Hugging Face keyword
            arguments used in `__getitem__` and returning a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If `texts` and `labels` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at position `idx`.

        Returns:
            dict with 'input_ids' and 'attention_mask' (the tokenizer output
            with its leading batch dimension removed) and 'label' (a scalar
            long tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in
        # transformers in favour of `__call__`, which takes the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a batch of one; drop that batch dimension
        # so the DataLoader can stack samples itself.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long),
        }

# Seed torch so the shuffling of the training loader and dropout are
# repeatable across runs (same seed value as the data split uses).
torch.manual_seed(42)

# load the data
train_df = pd.read_csv('data/processed/train.csv')
val_df = pd.read_csv('data/processed/val.csv')
test_df = pd.read_csv('data/processed/test.csv')

# Model initialization
model_name = 'ProsusAI/finbert'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Class-id order used by the processed CSVs: negative=0, neutral=1, positive=2.
# It is written into the model config explicitly because the pretrained
# checkpoint carries its own id2label mapping, which is not guaranteed to use
# this order; without this, save_pretrained() would record the checkpoint's
# label names instead of the ones this notebook trains with.
id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}
label2id = {name: class_id for class_id, name in id2label.items()}
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

batch_size = 16

train_dataset = SentimentDataset(train_df['text'].values, train_df['label'].values, tokenizer)
val_dataset = SentimentDataset(val_df['text'].values, val_df['label'].values, tokenizer)
test_dataset = SentimentDataset(test_df['text'].values, test_df['label'].values, tokenizer)

# Only the training loader is shuffled; validation/test keep file order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# training
epochs = 3
lr = 2e-5

optimizer = AdamW(model.parameters(), lr=lr)
# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def train_epoch(model, loader, optimizer, scheduler, device):
    """Train `model` for one pass over `loader`.

    Each batch is a dict with 'input_ids', 'attention_mask' and 'label'
    tensors. Gradients are clipped to a norm of 1.0, and both the optimizer
    and the scheduler are stepped after every batch.

    Returns:
        (avg_loss, accuracy): the mean of the per-batch losses and the
        accuracy of the predictions made during the pass.
    """
    model.train()
    running_loss = 0
    all_preds = []
    all_targets = []

    batches = tqdm(loader, desc='Training')
    for batch in batches:
        optimizer.zero_grad()

        labels = batch['label'].to(device)
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=labels,
        )
        loss = outputs.loss

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        all_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
        all_targets.extend(labels.cpu().numpy())

        # Show the current batch loss next to the progress bar
        batches.set_postfix({'loss': f'{batch_loss:.4f}'})

    return running_loss / len(loader), accuracy_score(all_targets, all_preds)

def evaluate(model, loader, device):
    """Score `model` on every batch of `loader` without updating it.

    Each batch is a dict with 'input_ids', 'attention_mask' and 'label'
    tensors.

    Returns:
        (avg_loss, accuracy, predictions, true_labels, confidences) where
        `avg_loss` is the per-sample mean loss, `predictions` are the argmax
        class ids, `true_labels` the reference ids and `confidences` the
        softmax probability of each predicted class.

    Raises:
        ValueError: If `loader` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    n_samples = 0
    predictions, true_labels, confidences = [], [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            # Weight each batch loss by its size so a smaller final batch does
            # not count as much as a full one.
            # NOTE(review): assumes outputs.loss is the mean over the batch
            # (the default reduction for this model head) - confirm if the
            # loss configuration changes.
            n_batch = labels.size(0)
            loss_sum += outputs.loss.item() * n_batch
            n_samples += n_batch

            probs = torch.softmax(outputs.logits, dim=1)
            preds = torch.argmax(probs, dim=1)
            conf = torch.max(probs, dim=1)[0]

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
            confidences.extend(conf.cpu().numpy())

    if n_samples == 0:
        raise ValueError("evaluate() received a loader with no samples")

    avg_loss = loss_sum / n_samples
    accuracy = accuracy_score(true_labels, predictions)
    return avg_loss, accuracy, predictions, true_labels, confidences

print("Training starting")

# Highest validation accuracy seen so far, and the per-epoch metric history
best_val_acc = 0
history = {metric: [] for metric in ('train_loss', 'train_acc', 'val_loss', 'val_acc')}

for epoch in range(epochs):
    # One optimisation pass over the training data
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, device)
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)

    # Score on the validation data; only loss and accuracy are needed here
    val_loss, val_acc = evaluate(model, val_loader, device)[:2]
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    print(f"\nTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    # Checkpoint the weights whenever validation accuracy improves
    improved = val_acc > best_val_acc
    if improved:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'finbert_best_model.pt')
        print(f"accuracy: {val_acc:.4f}")

print("TRAINING COMPLETE")

# Test evaluation

# Reload the best checkpoint saved during training. map_location places the
# tensors on the active device, so a checkpoint written on a GPU can still be
# loaded when the notebook later runs on CPU.
model.load_state_dict(torch.load('finbert_best_model.pt', map_location=device))
test_loss, test_acc, preds, labels, confs = evaluate(model, test_loader, device)

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Average Confidence: {np.mean(confs):.4f}")

# Classification report
label_names = ['negative', 'neutral', 'positive']
# Pass the class ids explicitly so the report and the matrix always cover all
# classes in this order, even if one never appears in `labels` or `preds`
# (otherwise target_names / tick labels would no longer line up).
label_ids = list(range(len(label_names)))
print(f"\n{classification_report(labels, preds, labels=label_ids, target_names=label_names)}")

# Confusion matrix
cm = confusion_matrix(labels, preds, labels=label_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
           xticklabels=label_names, yticklabels=label_names)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Per-epoch training history: loss on the left axis, accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# (axis, train key, train legend, val key, val legend, y label, title)
panels = (
    (ax1, 'train_loss', 'Train Loss', 'val_loss', 'Val Loss',
     'Loss', 'Training and Validation Loss'),
    (ax2, 'train_acc', 'Train Accuracy', 'val_acc', 'Val Accuracy',
     'Accuracy', 'Training and Validation Accuracy'),
)
for axis, train_key, train_label, val_key, val_label, y_label, title in panels:
    axis.plot(history[train_key], label=train_label, marker='o')
    axis.plot(history[val_key], label=val_label, marker='o')
    axis.set_xlabel('Epoch')
    axis.set_ylabel(y_label)
    axis.set_title(title)
    axis.legend()
    axis.grid(True)

plt.tight_layout()
plt.show()

# Write the model and the tokenizer into the same directory
export_dir = 'finbert_sentiment_model'
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

# test predictions

def predict_sentiment(text, model, tokenizer, device, max_length=128,
                      class_names=('negative', 'neutral', 'positive')):
    """Classify a single text and report the model's confidence.

    Args:
        text: The text to classify.
        model: Callable model taking `input_ids` and `attention_mask` and
            returning an object with a `logits` attribute of shape
            (1, number of classes).
        tokenizer: Callable tokenizer returning 'input_ids' and
            'attention_mask' tensors.
        device: Device the encoded inputs are moved to.
        max_length: Length the text is padded/truncated to (default 128).
        class_names: Class name for each class id, in id order. The default
            is the negative/neutral/positive order used for the labels in
            this notebook.

    Returns:
        (sentiment, confidence, probs): the predicted class name, its
        probability, and the full probability vector as a NumPy array.
    """
    model.eval()
    # Call the tokenizer directly: `encode_plus` is deprecated in transformers
    # in favour of `__call__`, which takes the same arguments.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # [0]: the batch holds exactly one text
        probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()[0]

    pred = int(np.argmax(probs))
    confidence = probs[pred]
    return class_names[pred], confidence, probs

# A few hand-written sentences to try the model on
examples = [
    "Apple stock soars after beating earnings expectations!",
    "The company reported disappointing quarterly results today.",
    "Stock prices remained steady with no major changes.",
]

for text in examples:
    sentiment, confidence, probs = predict_sentiment(text, model, tokenizer, device)
    # One print call; sep="\n" puts each part on its own line
    print(
        f"\nText: {text}",
        f"Prediction: {sentiment.upper()} ({confidence:.2%} confidence)",
        f"Probabilities: Neg={probs[0]:.2%}, Neu={probs[1]:.2%}, Pos={probs[2]:.2%}",
        sep="\n",
    )
