In [None]:
!pip install --upgrade transformers datasets scikit-learn nltk

import inspect

import nltk
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DataCollatorWithPadding, DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments

# NLTK resources needed by the preprocessing step.
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load the `punkt_tab` tables instead of
# the pickled `punkt` models. The install line upgrades NLTK, so fetch both;
# without `punkt_tab` tokenization fails with a LookupError on current NLTK.
nltk.download('punkt_tab')
nltk.download('stopwords')  # English stop-word list
nltk.download('wordnet')    # data used by WordNetLemmatizer


In [None]:
# Read the headline/sentiment table, decoding the file as ISO-8859-1 (Latin-1).
df = pd.read_csv('financial headlines.csv', encoding='ISO-8859-1')

# Print a header followed by the first rows as a quick look at the parsed data.
print("Sample Data:", df.head(), sep="\n")

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

# Module-level helpers shared by preprocess_text, built once rather than per call.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}  # a set, for fast membership tests

def preprocess_text(text):
    """Normalize one headline for the bag-of-words baseline.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, discard tokens
    present in ``stop_words``, lemmatize the remaining tokens, and join them
    back together with single spaces.

    Args:
        text: The raw headline string.

    Returns:
        The cleaned headline as a single space-separated string.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_table)

    kept = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue  # stop words are dropped before lemmatization
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Clean every headline once; the baseline model works on this normalized text.
df['cleaned_headline'] = df['Headline'].apply(preprocess_text)
y = df['Sentiment']

# Split the *text* before vectorizing, so the TF-IDF vocabulary and IDF weights
# are learned from the training rows only. Fitting the vectorizer on all rows
# leaks test-set statistics into the features and biases the reported scores.
texts_train, texts_test, y_train, y_test = train_test_split(
    df['cleaned_headline'], y, test_size=0.2, random_state=42
)

tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(texts_train)  # fit on training text only
X_test = tfidf.transform(texts_test)        # reuse the training vocabulary
# Full-corpus matrix, kept under its original name for any cell that still uses it.
X = tfidf.transform(df['cleaned_headline'])

# Linear baseline; max_iter is raised to 1000 for the solver.
baseline_model = LogisticRegression(max_iter=1000)
baseline_model.fit(X_train, y_train)

y_pred = baseline_model.predict(X_test)
print("Baseline Model Performance:")
print(classification_report(y_test, y_pred))


In [None]:
# Map the sentiment classes to integer ids for the classification head.
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['Sentiment'])

# Rename to the `text` / `label` column names used by the tokenization and
# training code that follows.
dataset = Dataset.from_pandas(
    df[['Headline', 'label_encoded']].rename(columns={'Headline': 'text', 'label_encoded': 'label'})
)
# Seed the split so the train/eval partition (and therefore the metrics) is
# reproducible across reruns; 42 matches the random_state of the baseline split.
train_test = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test['train']
eval_dataset = train_test['test']

# Record the class names in the model config so a saved checkpoint carries its
# own id <-> label mapping.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(le.classes_),
    id2label=id2label,
    label2id=label2id,
)

def tokenize(batch):
    """Encode the ``text`` entries of a batch with the module-level tokenizer.

    Args:
        batch: Mapping with a ``text`` entry holding the headlines to encode.

    Returns:
        The tokenizer's output for those headlines, requested with
        ``padding=True`` and ``truncation=True``.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

# Run the tokenizer over both splits in batched mode.
train_dataset = train_dataset.map(function=tokenize, batched=True)
eval_dataset = eval_dataset.map(function=tokenize, batched=True)

# Return only the model inputs and the label, formatted as torch tensors.
for _split in (train_dataset, eval_dataset):
    _split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was later dropped. The install step upgrades
# transformers, so pick whichever keyword the installed version accepts
# instead of failing with a TypeError.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',            # checkpoints are written here
    save_strategy="epoch",             # same cadence as evaluation (set below)
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,       # reload the best checkpoint when training ends
    **{_eval_strategy_kwarg: "epoch"},  # evaluate once per epoch
)

def compute_metrics(pred):
    """Compute the evaluation metrics reported by the Trainer.

    Args:
        pred: Object exposing ``label_ids`` (the true class ids) and
            ``predictions`` (per-class scores); the argmax over the last
            axis of ``predictions`` is used as the predicted class.

    Returns:
        dict with ``accuracy`` and the weighted-average ``f1``,
        ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # Result order is (precision, recall, f1, support); support is unused.
    prf = precision_recall_fscore_support(y_true, y_hat, average='weighted')

    metrics = {'accuracy': accuracy_score(y_true, y_hat)}
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics

# Pad every mini-batch to its own longest sequence at collation time. Without a
# padding collator the Trainer stacks the stored tensors as-is, and rows that
# were tokenized in different `map` batches (each padded to a different length)
# cannot be stacked once the training set spans more than one `map` batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune the model; evaluation and checkpointing follow `training_args`.
trainer.train()


In [None]:
# Write the fine-tuned model and its tokenizer to the same directory.
saved_dir = "./results"
trainer.save_model(saved_dir)
tokenizer.save_pretrained(saved_dir)

# Load both back from disk so inference below runs on the saved artifacts,
# then switch the model to evaluation mode.
tokenizer = DistilBertTokenizerFast.from_pretrained(saved_dir)
model = DistilBertForSequenceClassification.from_pretrained(saved_dir)
model.eval()

def predict_sentiment(headlines):
    """Predict a sentiment label for each headline.

    Args:
        headlines: A single headline string, or an iterable of headline
            strings.

    Returns:
        list: One entry of ``le.classes_`` per headline, in input order.
        An empty input yields an empty list.
    """
    # A bare string is one headline, not a sequence of characters.
    if isinstance(headlines, str):
        headlines = [headlines]
    else:
        headlines = list(headlines)

    # The tokenizer cannot encode an empty batch, so answer it directly.
    if not headlines:
        return []

    inputs = tokenizer(headlines, padding=True, truncation=True, return_tensors="pt")
    # Keep the inputs on the same device as the model (matters if it was moved to a GPU).
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=-1)

    # Translate predicted class ids back to the original sentiment labels.
    return [le.classes_[p] for p in predictions.tolist()]

# Example: classify a few unseen headlines and print (headline, label) pairs.
new_headlines = [
    "Stocks rally after strong earnings report",
    "Market tumbles amid economic uncertainty",
    "Tech shares remain steady in volatile market",
]
example_labels = predict_sentiment(new_headlines)
print([(headline, label) for headline, label in zip(new_headlines, example_labels)])
