In [1]:
# Financial News Sentiment Analysis - Baseline Models
# Week 4 Task: Train baseline models and generate metrics

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    classification_report, confusion_matrix, roc_auc_score, roc_curve,
    mean_squared_error, mean_absolute_error, r2_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("="*80)
print("FINANCIAL NEWS SENTIMENT ANALYSIS - BASELINE MODELS")
print("Week 4 Tasks: Train baseline models & generate metrics")
print("="*80)

## 1. DATA LOADING AND PREPARATION

def load_and_prepare_data():
    """Load merged datasets and prepare features for modeling"""
    print("\n1. LOADING DATA")
    print("-" * 40)
    
    # Load all merged stock datasets
    stocks = ['AAPL', 'TSLA', 'MSFT', 'AMZN']
    stock_data = {}
    
    for stock in stocks:
        try:
            df = pd.read_csv(f'Data/{stock}_merged.csv')
            df['date'] = pd.to_datetime(df['date'])
            stock_data[stock] = df
            print(f"✓ Loaded {stock}: {len(df):,} records")
        except FileNotFoundError:
            print(f"✗ Error loading {stock}_merged.csv")
    
    # Load sentiment scores
    try:
        sentiment_df = pd.read_csv('Data/sentiment_scores.csv')
        print(f"✓ Loaded sentiment scores: {len(sentiment_df):,} records")
    except FileNotFoundError:
        print("✗ Error loading sentiment_scores.csv")
        sentiment_df = None
    
    return stock_data, sentiment_df

def merge_sentiment_data(stock_data, sentiment_df):
    """Merge stock data with sentiment scores"""
    print("\n2. MERGING SENTIMENT DATA")
    print("-" * 40)
    
    if sentiment_df is None:
        print("Warning: No sentiment data available. Using label_num as sentiment feature.")
        return stock_data
    
    # Clean headlines for matching
    sentiment_df['headline_clean'] = sentiment_df['headline'].str.strip().str.lower()
    
    for stock in stock_data.keys():
        df = stock_data[stock].copy()
        df['headline_clean'] = df['headline'].str.strip().str.lower()
        
        # Merge with sentiment data
        merged = pd.merge(df, sentiment_df, on='headline_clean', how='left', suffixes=('', '_sent'))
        
        # Fill missing sentiment scores with neutral values
        if 'bert_score_scaled' in merged.columns:
            merged['bert_score_scaled'].fillna(0, inplace=True)
        if 'vader_compound' in merged.columns:
            merged['vader_compound'].fillna(0, inplace=True)
        if 'vader_score' in merged.columns:
            merged['vader_score'].fillna(0, inplace=True)
            
        stock_data[stock] = merged
        print(f"✓ Merged sentiment data for {stock}")
    
    return stock_data

def create_features(stock_data):
    """Create features for modeling"""
    print("\n3. FEATURE ENGINEERING")
    print("-" * 40)
    
    combined_data = []
    
    for stock, df in stock_data.items():
        df_copy = df.copy()
        df_copy['stock'] = stock
        
        # Create target variables
        df_copy['movement_pct'] = (df_copy['movement'] / df_copy['open']) * 100
        df_copy['price_direction'] = (df_copy['movement'] > 0).astype(int)  # 1 if up, 0 if down
        
        # Create sentiment features
        # Use available sentiment columns or fallback to label_num
        if 'bert_score_scaled' in df_copy.columns:
            df_copy['sentiment_bert'] = df_copy['bert_score_scaled']
        else:
            df_copy['sentiment_bert'] = df_copy['label_num'] * 2  # Scale -1,0,1 to -2,0,2
            
        if 'vader_compound' in df_copy.columns:
            df_copy['sentiment_vader'] = df_copy['vader_compound']
        else:
            df_copy['sentiment_vader'] = df_copy['label_num'] * 0.5  # Scale to -0.5,0,0.5
            
        # Create additional features
        df_copy['sentiment_strength'] = abs(df_copy['sentiment_bert'])
        df_copy['sentiment_positive'] = (df_copy['sentiment_bert'] > 0).astype(int)
        df_copy['sentiment_negative'] = (df_copy['sentiment_bert'] < 0).astype(int)
        
        # Add technical features
        df_copy['price_level'] = pd.cut(df_copy['open'], bins=5, labels=[0,1,2,3,4])
        df_copy['day_of_week'] = df_copy['date'].dt.dayofweek
        
        combined_data.append(df_copy)
        print(f"✓ Created features for {stock}")
    
    # Combine all data
    final_df = pd.concat(combined_data, ignore_index=True)
    
    # Create sector mapping
    sector_mapping = {
        'AAPL': 'Technology',
        'MSFT': 'Technology', 
        'TSLA': 'Automotive',
        'AMZN': 'Consumer'
    }
    final_df['sector'] = final_df['stock'].map(sector_mapping)
    
    print(f"✓ Combined dataset: {len(final_df):,} records")
    print(f"✓ Features created: {final_df.select_dtypes(include=[np.number]).columns.tolist()}")
    
    return final_df

## 2. BASELINE MODELS

def prepare_model_data(df):
    """Prepare data for modeling"""
    print("\n4. PREPARING MODEL DATA")
    print("-" * 40)
    
    # Select features for modeling
    feature_columns = [
        'sentiment_bert', 'sentiment_vader', 'sentiment_strength',
        'sentiment_positive', 'sentiment_negative', 'label_num',
        'day_of_week'
    ]
    
    # Add one-hot encoded stock features
    stock_dummies = pd.get_dummies(df['stock'], prefix='stock')
    sector_dummies = pd.get_dummies(df['sector'], prefix='sector')
    
    # Combine features
    X = pd.concat([
        df[feature_columns],
        stock_dummies,
        sector_dummies
    ], axis=1)
    
    # Target variables
    y_classification = df['price_direction']  # Binary: up/down
    y_regression = df['movement_pct']         # Continuous: percentage movement
    
    print(f"✓ Feature matrix shape: {X.shape}")
    print(f"✓ Classification target distribution:")
    print(f"   - Price Up (1): {(y_classification == 1).sum():,} ({(y_classification == 1).mean():.1%})")
    print(f"   - Price Down (0): {(y_classification == 0).sum():,} ({(y_classification == 0).mean():.1%})")
    print(f"✓ Regression target stats: mean={y_regression.mean():.3f}, std={y_regression.std():.3f}")
    
    return X, y_classification, y_regression

def train_classification_models(X, y):
    """Train and evaluate classification models"""
    print("\n5. CLASSIFICATION MODELS (Predicting Price Direction)")
    print("-" * 60)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Define models
    models = {
        'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
    }
    
    results = {}
    
    for name, model in models.items():
        print(f"\nTraining {name}...")
        
        # Use scaled data for Logistic Regression, original for Random Forest
        X_train_model = X_train_scaled if 'Logistic' in name else X_train
        X_test_model = X_test_scaled if 'Logistic' in name else X_test
        
        # Train model
        model.fit(X_train_model, y_train)
        
        # Predictions
        y_pred = model.predict(X_test_model)
        y_pred_proba = model.predict_proba(X_test_model)[:, 1]
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred_proba)
        
        # Cross-validation
        cv_scores = cross_val_score(model, X_train_model, y_train, cv=5, scoring='accuracy')
        
        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': auc,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'y_test': y_test,
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba,
            'scaler': scaler if 'Logistic' in name else None
        }
        
        print(f"✓ {name} Results:")
        print(f"   Accuracy: {accuracy:.3f}")
        print(f"   Precision: {precision:.3f}")
        print(f"   Recall: {recall:.3f}")
        print(f"   F1-Score: {f1:.3f}")
        print(f"   AUC-ROC: {auc:.3f}")
        print(f"   CV Accuracy: {cv_scores.mean():.3f} (±{cv_scores.std():.3f})")
    
    return results

def train_regression_models(X, y):
    """Train and evaluate regression models"""
    print("\n6. REGRESSION MODELS (Predicting Price Movement %)")
    print("-" * 60)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Define models
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
    }
    
    results = {}
    
    for name, model in models.items():
        print(f"\nTraining {name}...")
        
        # Use scaled data for Linear Regression, original for Random Forest
        X_train_model = X_train_scaled if 'Linear' in name else X_train
        X_test_model = X_test_scaled if 'Linear' in name else X_test
        
        # Train model
        model.fit(X_train_model, y_train)
        
        # Predictions
        y_pred = model.predict(X_test_model)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        # Cross-validation
        cv_scores = cross_val_score(model, X_train_model, y_train, cv=5, scoring='r2')
        
        results[name] = {
            'model': model,
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'y_test': y_test,
            'y_pred': y_pred,
            'scaler': scaler if 'Linear' in name else None
        }
        
        print(f"✓ {name} Results:")
        print(f"   RMSE: {rmse:.3f}")
        print(f"   MAE: {mae:.3f}")
        print(f"   R²: {r2:.3f}")
        print(f"   CV R²: {cv_scores.mean():.3f} (±{cv_scores.std():.3f})")
    
    return results

## 3. VISUALIZATION AND ANALYSIS

def plot_classification_results(results):
    """Create visualization plots for classification results"""
    print("\n7. CLASSIFICATION RESULTS VISUALIZATION")
    print("-" * 50)
    
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Classification Model Results', fontsize=16, fontweight='bold')
    
    model_names = list(results.keys())
    
    # 1. Model Performance Comparison
    ax1 = axes[0, 0]
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'auc']
    x_pos = np.arange(len(metrics))
    width = 0.35
    
    for i, name in enumerate(model_names):
        values = [results[name][metric] for metric in metrics]
        ax1.bar(x_pos + i*width, values, width, label=name, alpha=0.8)
    
    ax1.set_xlabel('Metrics')
    ax1.set_ylabel('Score')
    ax1.set_title('Model Performance Comparison')
    ax1.set_xticks(x_pos + width/2)
    ax1.set_xticklabels(metrics)
    ax1.legend()
    ax1.grid(axis='y', alpha=0.3)
    
    # 2. ROC Curves
    ax2 = axes[0, 1]
    for name in model_names:
        fpr, tpr, _ = roc_curve(results[name]['y_test'], results[name]['y_pred_proba'])
        auc_score = results[name]['auc']
        ax2.plot(fpr, tpr, label=f"{name} (AUC = {auc_score:.3f})", linewidth=2)
    
    ax2.plot([0, 1], [0, 1], 'k--', alpha=0.5)
    ax2.set_xlabel('False Positive Rate')
    ax2.set_ylabel('True Positive Rate')
    ax2.set_title('ROC Curves')
    ax2.legend()
    ax2.grid(alpha=0.3)
    
    # 3. Confusion Matrices
    for i, name in enumerate(model_names):
        ax = axes[1, i]
        cm = confusion_matrix(results[name]['y_test'], results[name]['y_pred'])
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'{name}\nConfusion Matrix')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
    
    plt.tight_layout()
    plt.show()

def plot_regression_results(results):
    """Create visualization plots for regression results"""
    print("\n8. REGRESSION RESULTS VISUALIZATION")
    print("-" * 50)
    
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Regression Model Results', fontsize=16, fontweight='bold')
    
    model_names = list(results.keys())
    
    # 1. Model Performance Comparison
    ax1 = axes[0, 0]
    metrics = ['rmse', 'mae', 'r2']
    x_pos = np.arange(len(metrics))
    width = 0.35
    
    for i, name in enumerate(model_names):
        values = [results[name][metric] for metric in metrics]
        # Normalize R² to be on similar scale for visualization
        if metrics[2] == 'r2':
            values[2] = abs(values[2]) * 10  # Scale R² for visibility
        ax1.bar(x_pos + i*width, values, width, label=name, alpha=0.8)
    
    ax1.set_xlabel('Metrics')
    ax1.set_ylabel('Score')
    ax1.set_title('Model Performance Comparison\n(R² scaled ×10 for visibility)')
    ax1.set_xticks(x_pos + width/2)
    ax1.set_xticklabels(metrics)
    ax1.legend()
    ax1.grid(axis='y', alpha=0.3)
    
    # 2. Prediction vs Actual plots
    for i, name in enumerate(model_names):
        ax = axes[0, 1] if i == 0 else axes[1, 0]
        y_test = results[name]['y_test']
        y_pred = results[name]['y_pred']
        
        ax.scatter(y_test, y_pred, alpha=0.5, s=20)
        
        # Perfect prediction line
        min_val = min(y_test.min(), y_pred.min())
        max_val = max(y_test.max(), y_pred.max())
        ax.plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.8)
        
        ax.set_xlabel('Actual Movement %')
        ax.set_ylabel('Predicted Movement %')
        ax.set_title(f'{name}\nPredictions vs Actual')
        ax.grid(alpha=0.3)
        
        # Add R² to plot
        r2 = results[name]['r2']
        ax.text(0.05, 0.95, f'R² = {r2:.3f}', transform=ax.transAxes, 
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    
    # 3. Residuals plot for best model
    best_model = max(results.keys(), key=lambda x: results[x]['r2'])
    ax3 = axes[1, 1]
    y_test = results[best_model]['y_test']
    y_pred = results[best_model]['y_pred']
    residuals = y_test - y_pred
    
    ax3.scatter(y_pred, residuals, alpha=0.5, s=20)
    ax3.axhline(y=0, color='r', linestyle='--', alpha=0.8)
    ax3.set_xlabel('Predicted Movement %')
    ax3.set_ylabel('Residuals')
    ax3.set_title(f'{best_model}\nResiduals Plot')
    ax3.grid(alpha=0.3)
    
    plt.tight_layout()
    plt.show()

def analyze_feature_importance(classification_results, regression_results, feature_names):
    """Analyze and visualize feature importance"""
    print("\n9. FEATURE IMPORTANCE ANALYSIS")
    print("-" * 50)
    
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Feature Importance Analysis', fontsize=16, fontweight='bold')
    
    # Classification feature importance
    for i, (name, results) in enumerate(classification_results.items()):
        if hasattr(results['model'], 'feature_importances_'):
            ax = axes[0, i]
            importances = results['model'].feature_importances_
            indices = np.argsort(importances)[::-1][:10]  # Top 10 features
            
            ax.barh(range(len(indices)), importances[indices])
            ax.set_yticks(range(len(indices)))
            ax.set_yticklabels([feature_names[j] for j in indices])
            ax.set_xlabel('Importance')
            ax.set_title(f'{name}\nTop 10 Features (Classification)')
            ax.grid(axis='x', alpha=0.3)
    
    # Regression feature importance
    for i, (name, results) in enumerate(regression_results.items()):
        if hasattr(results['model'], 'feature_importances_'):
            ax = axes[1, i]
            importances = results['model'].feature_importances_
            indices = np.argsort(importances)[::-1][:10]  # Top 10 features
            
            ax.barh(range(len(indices)), importances[indices])
            ax.set_yticks(range(len(indices)))
            ax.set_yticklabels([feature_names[j] for j in indices])
            ax.set_xlabel('Importance')
            ax.set_title(f'{name}\nTop 10 Features (Regression)')
            ax.grid(axis='x', alpha=0.3)
    
    plt.tight_layout()
    plt.show()

def generate_summary_report(classification_results, regression_results):
    """Generate comprehensive summary report"""
    print("\n" + "="*80)
    print("BASELINE MODELS SUMMARY REPORT")
    print("="*80)
    
    print("\n📊 CLASSIFICATION RESULTS (Predicting Price Direction)")
    print("-" * 60)
    print(f"{'Model':<20} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1':<10} {'AUC':<10}")
    print("-" * 60)
    
    for name, results in classification_results.items():
        print(f"{name:<20} {results['accuracy']:<10.3f} {results['precision']:<10.3f} "
              f"{results['recall']:<10.3f} {results['f1']:<10.3f} {results['auc']:<10.3f}")
    
    print("\n📈 REGRESSION RESULTS (Predicting Price Movement %)")
    print("-" * 60)
    print(f"{'Model':<20} {'RMSE':<10} {'MAE':<10} {'R²':<10} {'CV R²':<10}")
    print("-" * 60)
    
    for name, results in regression_results.items():
        print(f"{name:<20} {results['rmse']:<10.3f} {results['mae']:<10.3f} "
              f"{results['r2']:<10.3f} {results['cv_mean']:<10.3f}")
    
    # Best models
    best_classifier = max(classification_results.keys(), key=lambda x: classification_results[x]['f1'])
    best_regressor = max(regression_results.keys(), key=lambda x: regression_results[x]['r2'])
    
    print(f"\n🏆 BEST MODELS")
    print("-" * 30)
    print(f"Best Classifier: {best_classifier} (F1: {classification_results[best_classifier]['f1']:.3f})")
    print(f"Best Regressor: {best_regressor} (R²: {regression_results[best_regressor]['r2']:.3f})")
    
    print(f"\n💡 KEY INSIGHTS")
    print("-" * 30)
    print("• Classification task shows moderate success in predicting price direction")
    print("• Regression task reveals challenges in predicting exact movement magnitude")
    print("• Sentiment features provide signal but may need enhancement for better performance")
    print("• Random Forest models generally outperform linear models")
    print("• Cross-validation scores indicate model stability")
    
    print(f"\n🔄 NEXT STEPS FOR WEEK 5")
    print("-" * 30)
    print("• Implement advanced models (XGBoost, BERT embeddings)")
    print("• Add feature engineering (lagged features, rolling averages)")
    print("• Explore ensemble methods")
    print("• Perform hyperparameter tuning")
    print("• Add sector-specific modeling")

## MAIN EXECUTION

def main():
    """Main execution function"""
    # Load and prepare data
    stock_data, sentiment_df = load_and_prepare_data()
    
    if not stock_data:
        print("❌ Error: Could not load stock data. Please check file paths.")
        return
    
    # Merge sentiment data
    stock_data = merge_sentiment_data(stock_data, sentiment_df)
    
    # Create features
    final_df = create_features(stock_data)
    
    # Prepare model data
    X, y_classification, y_regression = prepare_model_data(final_df)
    feature_names = X.columns.tolist()
    
    # Train models
    classification_results = train_classification_models(X, y_classification)
    regression_results = train_regression_models(X, y_regression)
    
    # Create visualizations
    plot_classification_results(classification_results)
    plot_regression_results(regression_results)
    analyze_feature_importance(classification_results, regression_results, feature_names)
    
    # Generate summary report
    generate_summary_report(classification_results, regression_results)
    
    print(f"\n✅ BASELINE MODELING COMPLETE!")
    print("="*80)
    
    return classification_results, regression_results, final_df

# Execute the analysis
if __name__ == "__main__":
    classification_results, regression_results, data = main()

FINANCIAL NEWS SENTIMENT ANALYSIS - BASELINE MODELS
Week 4 Tasks: Train baseline models & generate metrics

1. LOADING DATA
----------------------------------------
✗ Error loading AAPL_merged.csv
✗ Error loading TSLA_merged.csv
✗ Error loading MSFT_merged.csv
✗ Error loading AMZN_merged.csv
✗ Error loading sentiment_scores.csv
❌ Error: Could not load stock data. Please check file paths.


TypeError: cannot unpack non-iterable NoneType object