# Baseline Model Training: Twitter Financial Sentiment

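This notebook trains a baseline TF-IDF + Logistic Regression model on the Twitter Financial News Sentiment dataset (Zeroshot, 2023), evaluates it on a stratified 20% hold-out split, and inspects the highest-weighted features per class. The fitted model and all figures are written to `results/`.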

**Model**: TF-IDF (1-2 grams) + Multinomial Logistic Regression  
**Dataset**: Twitter Financial News Sentiment (Zeroshot, 2023)  
**Focus**: Interpretability via feature weights + baseline performance


In [None]:
import os
import sys

# Local modules live in <parent of the working directory>/src; add that folder
# to the import path before the project imports below.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import train_test_split

from dataset_loader import load_dataset
from preprocess import preprocess_batch
from model import build_model, get_all_top_features

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply a seaborn-like style sheet to every figure created below.
# NOTE(review): the 'seaborn-v0_8' name presumably requires matplotlib >= 3.6 — confirm
# against the pinned environment.
plt.style.use('seaborn-v0_8')


## Load and Prepare Data


In [None]:
# --- 1. Load the raw dataset ------------------------------------------------
# data_path is resolved relative to the current working directory.
dataset_name = 'twitter_financial'
data_path = 'data/twitter_financial_train.csv'  # Update with your path

df = load_dataset(dataset_name, data_path)
print(f"Loaded {len(df)} samples")
print(f"Label distribution:\n{df['label'].value_counts()}")

# --- 2. Clean the text ------------------------------------------------------
# Rows whose cleaned text has length zero carry nothing for the vectorizer,
# so they are dropped before splitting.
print("\nPreprocessing text...")
df['cleaned_text'] = preprocess_batch(df['text'])
has_text = df['cleaned_text'].str.len() > 0
df = df[has_text]
print(f"After preprocessing: {len(df)} samples")

# --- 3. Stratified 80/20 train/test split -----------------------------------
# stratify=y keeps the label proportions the same in both splits;
# random_state pins the shuffle so the split is reproducible.
X = df['cleaned_text'].values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTrain: {len(X_train)} samples")
print(f"Test: {len(X_test)} samples")
for split_name, split_labels in (("Train", y_train), ("Test", y_test)):
    print(f"\n{split_name} label distribution:")
    print(pd.Series(split_labels).value_counts())


## Build and Train Model


In [None]:
# Build the model via the project's build_model helper (src/model.py).
# max_features and ngram_range are forwarded to it unchanged; (1, 2) matches the
# "1-2 grams" setup described in the notebook introduction.
print("Building model...")
model = build_model(max_features=10000, ngram_range=(1, 2))
print("✓ Model built")

# Fit on the training split only; the test split is reserved for evaluation.
print("\nTraining model...")
model.fit(X_train, y_train)
print("✓ Training completed!")

# Persist the fitted model with joblib (imported in the notebook's import cell).
# The output directory is derived from model_path so the two cannot drift apart
# when the path is edited.
model_path = 'results/model.joblib'
model_dir = os.path.dirname(model_path)
if model_dir:  # a bare filename has no directory component to create
    os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, model_path)
print(f"✓ Model saved to {model_path}")


## Evaluate Model


In [None]:
# Predict hard labels and class probabilities for the held-out test split.
print("Making predictions...")
y_pred = model.predict(X_test)
# y_proba is not consumed in this cell; it is kept available for follow-up analysis.
y_proba = model.predict_proba(X_test)

# Headline metrics (accuracy_score / f1_score come from the import cell).
# Macro-F1 is the unweighted mean of the per-class F1 scores.
accuracy = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')

print(f"\nModel Performance:")
print("=" * 60)
print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Macro F1-Score: {f1_macro:.4f}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print("=" * 60)
print(classification_report(y_test, y_pred))

# Confusion matrix.
# Pass the classifier's own class order as `labels` so the matrix rows/columns
# are guaranteed to line up with the tick labels drawn on the heatmap, instead
# of relying on both happening to be sorted the same way.
class_labels = model.named_steps['classifier'].classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels,
            ax=ax)
ax.set_title('Confusion Matrix', fontweight='bold', fontsize=14)
ax.set_ylabel('True Label', fontsize=12)
ax.set_xlabel('Predicted Label', fontsize=12)
fig.tight_layout()

# Create the output folder here as well so this cell does not depend on an
# earlier cell's side effect.
os.makedirs('results', exist_ok=True)
fig.savefig('results/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()
print("✓ Confusion matrix saved to results/confusion_matrix.png")


## Top Features


In [None]:
# Get top features for each class (interpretability)
# 20 features per class are requested, but only the first N_DISPLAY are printed
# and plotted to keep the output readable.
N_DISPLAY = 15

print("\n" + "=" * 60)
print("Top Features by Class (Interpretability Analysis)")
print("=" * 60)
top_features = get_all_top_features(model, top_n=20)
# Fail with a clear message rather than an opaque matplotlib error further down.
assert len(top_features) > 0, "get_all_top_features returned no classes"

# The loops below treat top_features as a mapping from class name to a sequence
# of (feature, weight) pairs.
for class_name, features in top_features.items():
    # str() keeps this working when class labels are not strings
    # (e.g. integer-encoded sentiment labels).
    print(f"\n{str(class_name).upper()}:")
    print("-" * 60)
    for feature, weight in features[:N_DISPLAY]:
        print(f"  {feature:25s} {weight:8.4f}")

# Visualize top features: one horizontal bar chart per class, side by side.
n_classes = len(top_features)
# squeeze=False always returns a 2-D array of Axes, so the single-class case
# needs no special handling; flatten it to index panels by position.
fig, axes = plt.subplots(1, n_classes, figsize=(6 * n_classes, 6), squeeze=False)
axes = axes.ravel()

for ax, (class_name, features) in zip(axes, top_features.items()):
    shown = features[:N_DISPLAY]
    feature_names = [name for name, _ in shown]
    weights = [value for _, value in shown]
    # Green for positive weights, red for zero/negative weights.
    colors = ['#2ecc71' if w > 0 else '#e74c3c' for w in weights]

    positions = range(len(feature_names))
    ax.barh(positions, weights, color=colors)
    ax.set_yticks(positions)
    ax.set_yticklabels(feature_names)
    ax.set_xlabel('Weight', fontsize=12)
    ax.set_title(f'Top Features: {class_name}', fontweight='bold', fontsize=14)
    ax.axvline(0, color='black', linewidth=0.8)
    ax.grid(axis='x', alpha=0.3)
    ax.invert_yaxis()  # first-listed feature at the top of the chart

fig.tight_layout()
# Create the output folder here as well so this cell does not depend on an
# earlier cell's side effect.
os.makedirs('results', exist_ok=True)
fig.savefig('results/top_features.png', dpi=300, bbox_inches='tight')
plt.show()
print("✓ Top features visualization saved to results/top_features.png")
