
# Sentiment Analysis Experiments (basic → advanced → cross-domain)
Structured experiments to classify sentiment (`positive/neutral/negative`) with progressively richer models, explainability, and domain generalization tests.


In [None]:

import pandas as pd
from pathlib import Path

# Workbook with the raw feedback rows; the path is relative to the working directory.
DATA_PATH = Path('data_feedback.xlsx')

# Read the workbook into a DataFrame, then print a preview of its first rows.
df = pd.read_excel(DATA_PATH)
head_rows = df.head()
print(head_rows)


In [None]:

# Basic dataset audit and safeguards: size, schema, class balance, comment lengths.
print('Rows:', len(df))
print('Columns:', df.columns.tolist())
# The blank line before each heading is written as a '\n' escape; a raw line
# break inside a single-quoted literal is a SyntaxError.
print('\nLabel distribution:')
print(df['sentiment'].value_counts())
print('\nComment length stats:')
print(df['comments'].str.len().describe())


In [None]:

# Train/validation split with stratification
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation, stratified on the sentiment label,
# with a fixed seed so the partition is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df['comments'],
    df['sentiment'],
    test_size=0.3,
    stratify=df['sentiment'],
    random_state=42,
)



## 1. Classic baseline: word TF–IDF + Logistic Regression
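Transparent benchmark; quick to train and easy to explain. The logistic-regression pipeline also exposes class probabilities (`predict_proba`), which the routing step in section 5 uses; no separate calibration is applied here.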


In [None]:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Word-level baseline: unigram + bigram TF-IDF features feeding a
# class-balanced logistic regression.
word_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(min_df=1, ngram_range=(1, 2))),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=200)),
])

# Pipeline.fit returns the fitted pipeline, so fitting and predicting chain.
y_pred = word_lr.fit(X_train, y_train).predict(X_val)

# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_val, y_pred))

# Confusion matrix normalised over the true labels (rows).
ConfusionMatrixDisplay.from_predictions(y_val, y_pred, cmap='Blues', normalize='true')
plt.title('Word TF–IDF baseline')
plt.show()



### Explainability for baseline (SHAP)
Identify influential n-grams driving predictions to support responsible deployment.


In [None]:

import shap
import numpy as np

# Build a linear SHAP explainer around the fitted classifier step, passing the
# TF-IDF matrix of the training comments as the background data.
explainer = shap.LinearExplainer(word_lr.named_steps['clf'], word_lr.named_steps['tfidf'].transform(X_train))
# Vectorise the validation comments with the already-fitted TF-IDF step (transform only, no refit).
val_tfidf = word_lr.named_steps['tfidf'].transform(X_val)
# Attribution values for every validation row.
shap_values = explainer(val_tfidf)
# Visualize a single example
sample_idx = 0
# NOTE(review): this assumes the sliced explanation exposes `.toarray()` and that
# `shap.plots.text` accepts the n-gram array returned by `inverse_transform` as
# its `data` -- TODO confirm against the installed shap version. With three
# sentiment classes the values may also carry a per-class axis; verify the shape.
shap.plots.text(shap.Explanation(values=shap_values[sample_idx].toarray()[0],
                                 data=word_lr.named_steps['tfidf'].inverse_transform(val_tfidf[sample_idx])[0]))



## 2. Character n-grams for robustness
Helps with spelling mistakes and stylized text. Useful for noisy student comments.


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer as CharTfidf

# Character-level variant: TF-IDF over 3- to 5-character n-grams feeding the
# same class-balanced logistic regression as the word baseline.
char_lr = Pipeline(steps=[
    ('tfidf', CharTfidf(min_df=1, ngram_range=(3, 5), analyzer='char')),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=200)),
])

# Pipeline.fit returns the fitted pipeline, so fitting and predicting chain.
char_pred = char_lr.fit(X_train, y_train).predict(X_val)

# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_val, char_pred))

# Confusion matrix normalised over the true labels (rows).
ConfusionMatrixDisplay.from_predictions(y_val, char_pred, cmap='Purples', normalize='true')
plt.title('Character TF–IDF baseline')
plt.show()



## 3. Cross-domain and generalization probes
Train on teacher feedback and test on course feedback (and vice versa) to check domain leakage and robustness.


In [None]:

from sklearn.metrics import accuracy_score

from sklearn.base import clone

# Boolean row masks selecting each feedback domain (case-insensitive match on
# the 'teacher/course' column; missing values match neither domain).
domain_lower = df['teacher/course'].str.lower()
teacher_mask = domain_lower.eq('teacher')
course_mask = domain_lower.eq('course')

def cross_domain_score(train_mask, test_mask):
    """Train the word baseline on one domain and score it on another.

    Args:
        train_mask: Boolean mask over ``df`` rows used for fitting.
        test_mask: Boolean mask over ``df`` rows used for evaluation.

    Returns:
        Accuracy of the freshly trained model on the ``test_mask`` rows.
    """
    # Fit an unfitted copy: refitting `word_lr` itself would overwrite the
    # baseline trained on X_train, which later cells reuse for predict_proba.
    model = clone(word_lr)
    model.fit(df.loc[train_mask, 'comments'], df.loc[train_mask, 'sentiment'])
    preds = model.predict(df.loc[test_mask, 'comments'])
    return accuracy_score(df.loc[test_mask, 'sentiment'], preds)

teacher_to_course = cross_domain_score(teacher_mask, course_mask)
course_to_teacher = cross_domain_score(course_mask, teacher_mask)
print({'train_teacher_test_course': teacher_to_course, 'train_course_test_teacher': course_to_teacher})



### Data augmentation (noise injection) for domain robustness
A simple word-dropout augmenter (randomly removes words; no synonym replacement is implemented) to reduce overfitting to phrasing.


In [None]:

import random

def word_dropout(text, p=0.1, rng=None):
    """Randomly drop whitespace-separated tokens from ``text``.

    Args:
        text: Input string. It is tokenised with ``str.split()``, so runs of
            whitespace collapse to single spaces in the result.
        p: Drop threshold. One uniform draw is made per token, and the token
            is kept only when its draw is strictly greater than ``p``.
        rng: Optional object exposing ``random()`` (for example
            ``random.Random(seed)``) for reproducible noise. When omitted,
            the module-level ``random`` generator is used.

    Returns:
        The surviving tokens joined by single spaces, or ``text`` unchanged
        when no token survived (so a comment is never emptied by dropout).
    """
    source = random if rng is None else rng
    survivors = [tok for tok in text.split() if source.random() > p]
    return ' '.join(survivors) if survivors else text

# Seed Python's global RNG so the dropout pattern, and therefore the reported
# metrics, are reproducible across runs.
random.seed(1)

# Split BEFORE injecting noise so the validation comments stay untouched;
# perturbing them as well would measure performance on corrupted text.
X_train_aug, X_val_aug, y_train_aug, y_val_aug = train_test_split(
    df['comments'], df['sentiment'], test_size=0.3, random_state=1, stratify=df['sentiment']
)
X_train_aug = X_train_aug.apply(lambda t: word_dropout(t, p=0.15))

# Full-frame view kept under its original name: noisy training rows, clean
# validation rows (the assignment aligns on the row index).
a_aug = df.copy()
a_aug.loc[X_train_aug.index, 'comments'] = X_train_aug

# Same architecture as the word baseline, trained on the perturbed comments.
word_lr_aug = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=1)),
    ('clf', LogisticRegression(max_iter=200, class_weight='balanced'))
])
word_lr_aug.fit(X_train_aug, y_train_aug)
print(classification_report(y_val_aug, word_lr_aug.predict(X_val_aug)))



## 4. Transformer fine-tuning (small dataset aware)
Fine-tune `distilbert-base-uncased`; keep epochs/learning rate low for small data. Note that the `Trainer` below uses its default (unweighted) loss — a class-weighted loss is still a TODO. Can be swapped with larger models if GPU is available.


In [None]:

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np

model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Single source of truth for the label encoding, used both to encode the
# targets and to name the classes in the model config.
label_map = {0:'negative',1:'neutral',2:'positive'}
label2id = {name: idx for idx, name in label_map.items()}

def _encode_labels(sentiments):
    """Map sentiment strings to integer ids, failing loudly on unknown labels.

    Args:
        sentiments: pandas Series of sentiment strings.

    Returns:
        Integer Series with the same index as ``sentiments``.

    Raises:
        ValueError: if any value is not a key of ``label2id``.
    """
    encoded = sentiments.map(label2id)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(sentiments[unmapped].astype(str)))
        raise ValueError(f'Unexpected sentiment labels: {unknown}')
    return encoded.astype(int)

train_ds = Dataset.from_pandas(pd.DataFrame({'text': X_train, 'label': _encode_labels(y_train)}))
val_ds = Dataset.from_pandas(pd.DataFrame({'text': X_val, 'label': _encode_labels(y_val)}))

def tokenize(batch):
    """Tokenise a batch of comments, padding/truncating each to 128 tokens."""
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

# Passing id2label/label2id stores readable class names in the model config
# instead of the generic LABEL_0/1/2 placeholders.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label_map), id2label=label_map, label2id=label2id
)

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before upgrading.
args = TrainingArguments(
    output_dir='sentiment-model',
    evaluation_strategy='epoch',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    learning_rate=2e-5,
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        eval_pred: ``(logits, labels)`` pair supplied by the Trainer.

    Returns:
        Dict with keys ``accuracy``, ``macro_f1``, ``precision``, ``recall``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    from sklearn.metrics import precision_recall_fscore_support, accuracy_score
    # zero_division=0 scores a never-predicted class as 0 without a warning,
    # which is likely in early epochs on a small dataset.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'macro_f1': f1, 'precision': precision, 'recall': recall}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# trainer.train()  # Uncomment to fine-tune when resources are available



### Zero-shot / few-shot LLM probe
Uses an instruction/zero-shot classifier to benchmark label-space alignment without training.


In [None]:

from transformers import pipeline

# Zero-shot NLI classifier used as a training-free sentiment probe.
zs_classifier = pipeline(task='zero-shot-classification', model='facebook/bart-large-mnli')

# Candidate label names; each one is substituted into the hypothesis template.
labels = ['positive', 'neutral', 'negative']

# Probe the first comment of the dataset and print the raw classifier output.
text_example = df['comments'].iloc[0]
zs_output = zs_classifier(
    text_example,
    hypothesis_template='This review is {label}.',
    candidate_labels=labels,
)
print(zs_output)



## 5. Agentic routing: ensemble of baseline + LLM
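Combine the baseline's class probabilities with the zero-shot LLM: comments whose top baseline probability falls below a threshold are routed to the LLM. (The baseline probabilities are used as-is; no calibration step is applied.)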


In [None]:

from sklearn.preprocessing import normalize

# Class probabilities from the word baseline for every validation comment.
word_lr_prob = word_lr.predict_proba(X_val)
# Minimum top-class probability for the baseline's answer to be trusted.
threshold = 0.55

ensemble_preds = []
for comment, class_probs in zip(X_val, word_lr_prob):
    confident = class_probs.max() >= threshold
    if confident:
        # Trust the baseline: take its most probable class.
        routed_label = word_lr.classes_[class_probs.argmax()]
    else:
        # Low confidence: defer to the zero-shot classifier's top-ranked label.
        fallback = zs_classifier(comment, candidate_labels=labels, hypothesis_template='This review is {label}.')
        routed_label = fallback['labels'][0]
    ensemble_preds.append(routed_label)

print('Ensembled predictions for first 5 examples:')
for comment, routed_label in zip(X_val[:5], ensemble_preds[:5]):
    print(routed_label, '->', comment)



## 6. Error analysis and ethics checklist
Creates a table of misclassifications and tags likely causes (length, domain, class confusion) to guide mitigation.


In [None]:

# Validation comments with true labels and the word baseline's predictions.
# X_val / y_val keep the row index of `df`, so the frame stays aligned with it.
val_df = pd.DataFrame({'text': X_val, 'true': y_val, 'pred': y_pred})
errors = val_df[val_df['true'] != val_df['pred']].copy()
errors['length'] = errors['text'].str.len()
# Take the domain from the dataset's own 'teacher/course' column (aligned via
# the shared row index) rather than guessing it from words in the comment.
# Anything that is not teacher/course, including missing values, becomes 'unknown'.
error_domain = df.loc[errors.index, 'teacher/course'].str.lower()
errors['domain'] = error_domain.where(error_domain.isin(['teacher', 'course']), 'unknown')
print(errors[['text','true','pred','length','domain']])
