In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [2]:
import kagglehub
import os

path = kagglehub.dataset_download(
    "ankurzing/sentiment-analysis-for-financial-news"
)

csv_path = os.path.join(path, "all-data.csv")

df = pd.read_csv(
    csv_path,
    encoding="latin-1",
    sep=",",
    header=None,
    names=["label", "text"]
)

df["label"] = (
    df["label"]
    .astype(str)
    .str.strip()
    .str.lower()
)

label_map = {
    "positive": 0,
    "negative": 1,
    "neutral": 2
}

df["label"] = df["label"].map(label_map)

df["label"].value_counts()




label
2    2879
0    1363
1     604
Name: count, dtype: int64

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
    stop_words="english"
)

X = tfidf.fit_transform(df.text)
y = df.label


In [4]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42
)

print("Train:", X_train.shape)
print("Validation:", X_val.shape)
print("Test:", X_test.shape)


Train: (3392, 5000)
Validation: (727, 5000)
Test: (727, 5000)


In [5]:
class ModelTrainer:
    def __init__(self):
        self.models = {}
        self.results = {}
        self.best_model = None
    
    def _calculate_metrics(self, y_true, y_pred):
        return {
            "accuracy": accuracy_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred, average="weighted", zero_division=0),
            "recall": recall_score(y_true, y_pred, average="weighted", zero_division=0),
            "f1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
            "confusion_matrix": confusion_matrix(y_true, y_pred)
        }
    
    def train_logistic_regression(self, X_train, y_train, X_val, y_val):
        param_grid = {"C": [0.1, 1.0, 10.0]}
        lr = LogisticRegression(max_iter=1000, random_state=42)
        
        grid = GridSearchCV(
            lr,
            param_grid,
            cv=5,
            scoring="f1_weighted"
        )
        
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_val)
        
        metrics = self._calculate_metrics(y_val, y_pred)
        
        self.models["Logistic Regression"] = best_model
        self.results["Logistic Regression"] = metrics
        
        return best_model, metrics
    
    def train_naive_bayes(self, X_train, y_train, X_val, y_val):
        nb = MultinomialNB()
        nb.fit(X_train, y_train)
        y_pred = nb.predict(X_val)
        
        metrics = self._calculate_metrics(y_val, y_pred)
        
        self.models["Naive Bayes"] = nb
        self.results["Naive Bayes"] = metrics
        
        return nb, metrics
    
    def train_knn(self, X_train, y_train, X_val, y_val):
        param_grid = {"n_neighbors": [3, 5, 7]}
        knn = KNeighborsClassifier(metric="cosine")
        
        grid = GridSearchCV(
            knn,
            param_grid,
            cv=5,
            scoring="f1_weighted"
        )
        
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_val)
        
        metrics = self._calculate_metrics(y_val, y_pred)
        
        self.models["KNN"] = best_model
        self.results["KNN"] = metrics
        
        return best_model, metrics
    
    def compare_models(self):
        return pd.DataFrame(self.results).T


In [6]:
trainer = ModelTrainer()

lr_model, lr_metrics = trainer.train_logistic_regression(
    X_train, y_train, X_val, y_val
)

lr_metrics


{'accuracy': 0.7372764786795049,
 'precision': 0.7293117983760039,
 'recall': 0.7372764786795049,
 'f1': 0.7269407925747443,
 'confusion_matrix': array([[108,  12,  85],
        [ 15,  46,  29],
        [ 45,   5, 382]])}

In [7]:
nb_model, nb_metrics = trainer.train_naive_bayes(
    X_train, y_train, X_val, y_val
)

nb_metrics


{'accuracy': 0.7070151306740028,
 'precision': 0.708726224920552,
 'recall': 0.7070151306740028,
 'f1': 0.6688299082055436,
 'confusion_matrix': array([[ 81,   4, 120],
        [ 21,  21,  48],
        [ 18,   2, 412]])}

In [8]:
knn_model, knn_metrics = trainer.train_knn(
    X_train, y_train, X_val, y_val
)

knn_metrics


{'accuracy': 0.6781292984869326,
 'precision': 0.669058117549208,
 'recall': 0.6781292984869326,
 'f1': 0.6711776487529646,
 'confusion_matrix': array([[102,  12,  91],
        [ 22,  40,  28],
        [ 66,  15, 351]])}

In [9]:
comparison_df = trainer.compare_models()
comparison_df


Unnamed: 0,accuracy,precision,recall,f1,confusion_matrix
Logistic Regression,0.737276,0.729312,0.737276,0.726941,"[[108, 12, 85], [15, 46, 29], [45, 5, 382]]"
Naive Bayes,0.707015,0.708726,0.707015,0.66883,"[[81, 4, 120], [21, 21, 48], [18, 2, 412]]"
KNN,0.678129,0.669058,0.678129,0.671178,"[[102, 12, 91], [22, 40, 28], [66, 15, 351]]"


In [10]:
best_model_name = comparison_df["f1"].idxmax()
best_model = trainer.models[best_model_name]

print("Best Model:", best_model_name)


Best Model: Logistic Regression


In [12]:
import joblib

joblib.dump(tfidf, "week2_tfidf_vectorizer.joblib")
joblib.dump(lr_model, "week2_lr_model.joblib")
joblib.dump(nb_model, "week2_nb_model.joblib")
joblib.dump(knn_model, "week2_knn_model.joblib")

print("Week 2 models saved successfully.")


Week 2 models saved successfully.
