# 04 – Supervised Sentiment Classification

This notebook trains classical machine learning classifiers on the multilingual policy dataset
to predict sentiment labels (positive / negative / neutral) from the precomputed TF-IDF features.

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the artifact paths used in this
# notebook live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


In [None]:
import pickle, pathlib

# Location on the mounted Drive of the pickle holding the preprocessing outputs.
artifacts_root = pathlib.Path("/content/drive/MyDrive/My_NLP_Learning/Public_Response_Analysis")
artifacts_path = artifacts_root / "artifacts/preprocessing_outputs.pkl"

# Fail fast with an actionable message when the pickle has not been produced yet.
if not artifacts_path.exists():
    raise FileNotFoundError(
        "Artifacts not found. Please run 01_data_loading_and_preprocessing.ipynb first "
        "and execute the 'Save preprocessing artifacts' cell."
    )

# NOTE: unpickling can execute arbitrary code, so only load a file you produced yourself.
with artifacts_path.open("rb") as pkl_file:
    artifacts = pickle.load(pkl_file)

# Pull out the three objects the rest of the notebook works with.
df, tfidf_vectorizer, tfidf_matrix = (
    artifacts["df"],
    artifacts["tfidf_vectorizer"],
    artifacts["tfidf_matrix"],
)
print("Loaded preprocessing artifacts and TF-IDF features.")


## Prepare train/test splits

We use the TF-IDF matrix as features and the `sentiment_label` column as the target.

In [None]:
from sklearn.model_selection import train_test_split

# Features are the loaded TF-IDF matrix; the target is the sentiment label column.
X, y = tfidf_matrix, df["sentiment_label"]

# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train size: {X_train.shape} Test size: {X_test.shape}")
# Class distribution of the training portion.
print(y_train.value_counts())


## Train baseline classifiers

We train and evaluate Naïve Bayes, Linear SVM, and Decision Tree classifiers.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Candidate classifiers keyed by display name; the fitted instances stay in this
# dict so they can be looked up by name after training.
models = {
    "MultinomialNB": MultinomialNB(),
    # Seeded like the decision tree: LinearSVC's solver can shuffle samples, so an
    # unseeded instance may give slightly different results from run to run.
    "LinearSVC": LinearSVC(random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
}

# Maps model name -> accuracy on the held-out test split.
results = {}

for name, clf in models.items():
    print(f"\nTraining {name}...")
    # Fit on the training split only; every metric below is computed on the test split.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")
    print("Classification report:")
    print(classification_report(y_test, y_pred))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))
    results[name] = acc

print("\nSummary of model accuracies:", results)


## Language-wise performance (optional)

Evaluate how well the best model (the one with the highest test-set accuracy above) performs for each language.

In [None]:
import numpy as np

# Select the classifier with the highest test-set accuracy recorded in `results`.
best_model_name = max(results, key=results.get)
print("Best model:", best_model_name)
best_clf = models[best_model_name]

# Predictions for every row, stored position-aligned next to the original columns.
y_pred_all = best_clf.predict(X)

df_eval = df.copy()
df_eval["y_pred"] = y_pred_all

# Report only on held-out rows: scoring all rows would include the training rows
# the model was fitted on and inflate every per-language metric.
# y_test keeps the pandas index labels of its rows, so it identifies the test rows
# of df; this lookup is only exact when df's index is unique.
is_held_out = df_eval.index.isin(y_test.index)
assert is_held_out.sum() == len(y_test), (
    "Cannot map test rows back to df: df index is not unique. "
    "Reset the index before creating the train/test split."
)
df_eval_test = df_eval[is_held_out]

for lang, group in df_eval_test.groupby("language"):
    print(f"\nLanguage: {lang}")
    # zero_division=0 reports 0.0 (without warnings) for classes that are never
    # predicted within a small per-language group.
    print(classification_report(group["sentiment_label"], group["y_pred"], zero_division=0))
