# Train Baseline Model: Modern Financial Social-Media Datasets

This notebook trains a baseline TF-IDF + Logistic Regression model on one of the modern (post-2020) Twitter financial-sentiment datasets listed below; the dataset is selected in the configuration cell of Section 1.

**Datasets Supported:**
- Twitter Financial News Sentiment (Zeroshot, 2023)
- Financial Tweets Sentiment (TimKoornstra, 2023)
- TweetFinSent (JP Morgan, 2022)


In [None]:
# Setup
import sys
import os

# Resolve the project root from the current working directory.
#
# If the kernel was started inside `notebooks/`, the root is its parent and we
# chdir there so the relative `data/...` and `results/...` paths used further
# down resolve. Otherwise the working directory is taken to be the root
# itself. This keeps the cell idempotent: after the chdir, a re-run lands in
# the else branch and yields the same PROJECT_ROOT instead of climbing two
# directories above it.
# NOTE(review): assumes the notebook is launched from either `<root>/notebooks`
# or `<root>` — confirm if other launch directories must be supported.
cwd = os.getcwd()
if os.path.basename(cwd) == 'notebooks':
    PROJECT_ROOT = os.path.dirname(cwd)
    os.chdir(PROJECT_ROOT)
else:
    PROJECT_ROOT = cwd

# Make the project's `src/` modules importable (inserted once, at the front).
src_path = os.path.join(PROJECT_ROOT, 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

from dataset_loader import load_dataset
from preprocess import preprocess_batch
from model import build_model, get_all_top_features

%matplotlib inline
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✓ Setup complete")
print(f"Project root: {PROJECT_ROOT}")


## 1. Load and Preprocess Data

**Choose one of the three modern datasets:**
- `twitter_financial`: Twitter Financial News Sentiment (Zeroshot, 2023)
- `financial_tweets_2023`: Financial Tweets Sentiment (TimKoornstra, 2023)
- `tweetfinsent`: TweetFinSent (JP Morgan, 2022)


In [None]:
# --- Configuration: the values a reader is expected to tune ---
DATA_PATH = 'data/twitter_financial_train.csv'  # Update this (relative path to the dataset CSV)
DATASET_NAME = 'twitter_financial'  # 'twitter_financial', 'financial_tweets_2023', or 'tweetfinsent'
TEST_SIZE = 0.2       # forwarded to train_test_split(test_size=...)
RANDOM_STATE = 42     # forwarded to train_test_split(random_state=...)
MAX_FEATURES = 10000  # forwarded to build_model(max_features=...)

# --- Load the selected dataset ---
print("Loading dataset...")
df = load_dataset(DATASET_NAME, DATA_PATH)
print(f"✓ Loaded {len(df)} samples")

# --- Clean the text, then keep only rows whose cleaned text is non-empty ---
print("Preprocessing text...")
df['cleaned_text'] = preprocess_batch(df['text'])
has_text = df['cleaned_text'].str.len() > 0
df = df.loc[has_text]
print(f"✓ After preprocessing: {len(df)} samples")

# --- Class balance and a preview of the first rows ---
label_counts = df['label'].value_counts()
print(f"\nLabel distribution:")
print(label_counts)
df.head()


## 2. Split Data


In [None]:
# Inputs are the cleaned texts; targets are the labels.
X, y = df['cleaned_text'].values, df['label'].values

# Hold-out split, stratified on the labels, driven by the configuration constants.
split_options = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Per-split label counts, computed up front and reported below.
train_label_counts = pd.Series(y_train).value_counts()
test_label_counts = pd.Series(y_test).value_counts()

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"\nTraining label distribution:")
print(train_label_counts)
print(f"\nTest label distribution:")
print(test_label_counts)


## 3. Build and Train Model


In [None]:
# Output locations for the persisted model.
results_dir = 'results'
model_path = 'results/model.joblib'

# Construct the model with the configured vocabulary cap and uni+bigram range.
print("Building model...")
model = build_model(max_features=MAX_FEATURES, ngram_range=(1, 2))
print("✓ Model built")

# Fit on the training split only.
print("Training model...")
model.fit(X_train, y_train)
print("✓ Model trained")

# Persist the fitted model, creating the results directory if it is missing.
os.makedirs(results_dir, exist_ok=True)
joblib.dump(model, model_path)
print(f"✓ Model saved to {model_path}")


## 4. Evaluate Model


In [None]:
# Class labels in the fitted classifier's own order. The visualization cell
# labels the heatmap ticks with this same attribute, so the confusion matrix
# below is built against it explicitly.
class_labels = model.named_steps['classifier'].classes_

# Make predictions
y_pred = model.predict(X_test)
# Class probabilities; not used by the metrics in this cell, kept available
# for further inspection.
y_proba = model.predict_proba(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')

print("Model Performance:")
print("=" * 60)
print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"F1-Score (macro): {f1_macro:.4f}")

# Classification report
print("\nClassification Report:")
print("=" * 60)
print(classification_report(y_test, y_pred))

# Confusion matrix.
# `labels=` pins both the shape and the row/column order to the classifier's
# classes. Without it, the matrix is built from the labels that happen to
# occur in y_test/y_pred, so a class missing from both would shrink the matrix
# and misalign it with the tick labels used when plotting.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print("\nConfusion Matrix:")
print(cm)


## 5. Visualize Results


In [None]:
# Two-panel figure: confusion matrix (left), top features of one class (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Confusion matrix heatmap, ticks labelled with the classifier's classes
class_labels = model.named_steps['classifier'].classes_
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0],
            xticklabels=class_labels,
            yticklabels=class_labels)
axes[0].set_title('Confusion Matrix', fontweight='bold')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('True')

# Top features for the first class returned by get_all_top_features
top_features = get_all_top_features(model, top_n=15)
if len(top_features) > 0:
    class_name = next(iter(top_features))
    features = top_features[class_name][:10]
    feature_names = [name for name, _ in features]
    weights = [weight for _, weight in features]
    # Green for positive weights, red otherwise
    colors = ['#2ecc71' if w > 0 else '#e74c3c' for w in weights]

    positions = range(len(feature_names))
    axes[1].barh(positions, weights, color=colors)
    axes[1].set_yticks(positions)
    axes[1].set_yticklabels(feature_names)
    axes[1].set_xlabel('Weight')
    axes[1].set_title(f'Top Features: {class_name}', fontweight='bold')
    axes[1].axvline(0, color='black', linewidth=0.8)
    axes[1].grid(axis='x', alpha=0.3)
    # barh draws position 0 at the bottom; invert so the first-listed feature
    # is at the top, matching the per-class plots in the Top Features section.
    axes[1].invert_yaxis()
else:
    # Nothing to plot: hide the panel instead of leaving an empty frame.
    axes[1].set_visible(False)

plt.tight_layout()
plt.savefig('results/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualization saved to results/confusion_matrix.png")


## 6. Top Features Analysis

This section lists and plots the features (words and bigrams) with the largest model weights for each sentiment class.


In [None]:
# Number of features per class that are printed and plotted
N_SHOWN = 15

# Get top features for all classes (mapping: class name -> [(feature, weight), ...])
top_features = get_all_top_features(model, top_n=20)

print("Top Features by Class:")
print("=" * 60)

for class_name, features in top_features.items():
    # str() so class labels that are not strings (e.g. integer-encoded
    # classes) do not fail on .upper()
    print(f"\n{str(class_name).upper()}:")
    print("-" * 60)
    for feature, weight in features[:N_SHOWN]:
        print(f"  {feature:25s} {weight:8.4f}")

# Visualization: one horizontal bar chart per class.
# Guarded because plt.subplots rejects zero columns when no features come back.
if top_features:
    n_classes = len(top_features)
    # squeeze=False always returns a 2-D array of axes, so the single-class
    # case needs no special handling.
    fig, axes = plt.subplots(1, n_classes, figsize=(6 * n_classes, 6), squeeze=False)

    for ax, (class_name, features) in zip(axes[0], top_features.items()):
        shown = features[:N_SHOWN]
        feature_names = [name for name, _ in shown]
        weights = [weight for _, weight in shown]
        # Green for positive weights, red otherwise
        colors = ['#2ecc71' if w > 0 else '#e74c3c' for w in weights]

        positions = range(len(feature_names))
        ax.barh(positions, weights, color=colors)
        ax.set_yticks(positions)
        ax.set_yticklabels(feature_names)
        ax.set_xlabel('Weight')
        ax.set_title(f'Top Features: {class_name}', fontweight='bold')
        ax.axvline(0, color='black', linewidth=0.8)
        ax.grid(axis='x', alpha=0.3)
        # First-listed feature at the top
        ax.invert_yaxis()

    plt.tight_layout()
    plt.savefig('results/top_features.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\n✓ Visualization saved to results/top_features.png")
else:
    print("\nNo top features returned; skipping visualization")


## 7. Summary

### Model Performance Summary

- **Dataset**: [Fill in]
- **Accuracy**: [Fill in]
- **F1-Score (macro)**: [Fill in]
- **Model saved to**: `results/model.joblib`

### Next Steps

1. Run label quality analysis using `notebooks/03_label_quality_modern.ipynb`
2. Compare performance across different modern datasets
3. Analyze misclassifications and ambiguous cases
