
<div style="display: flex; align-items: center; justify-content: center; padding: 20px 0; text-align: center; height: 100vh; max-width: 1500px; margin: auto;">
    <img src="https://media.licdn.com/dms/image/v2/D4D3DAQFGx0XnuUvugA/image-scale_191_1128/image-scale_191_1128/0/1662458005755/nova_ims_information_management_school_cover?e=2147483647&v=beta&t=J3Q4LlZi36_4UAFhj2019QdtfXLn0kQwaX25jgaBhOQ" 
         alt="Logo" 
         style="width: 100%; max-width: 1500px; height: auto; max-height: 200px; object-fit: cover; object-position: center; border: 5px solid #A0C020; border-radius: 5px;">
</div>

<div style="text-align: center; padding: 20px 0;">
    <h5>This analysis is conducted as part of the <b><i>Text Mining</i></b> course, a component of the Master's program in Data Science and Advanced Analytics at the <b><u>Nova Information Management School</u></b>.</h5>
</div>
<!-- This notebook template was created by Catarina Gonçalves Nunes, 20230083 -->
<div style="text-align: center; color: #A0C020;">
    <h1><b>Predicting market behavior from tweets</b></h1>
</div>

<h3 style="text-align: center;"><u><b>Group 31</b></u></h3>

| Student Name   | Student ID |
| ---            | ---        |
| David          |            |
| Elcano         |            |
| Jorge Cordeiro | 20240594   |
| Rui            |            |

# Importing libraries

In [3]:
# General-purpose libraries plus the project helpers from load_data.py and visualization.py
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from load_data import *
from visualization import *

In [4]:
# Model-specific imports and functions are defined in the cells below

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
import time


# LOGISTIC REGRESSION - replace this section when using other models

In [6]:

def train_simple_logreg(preprocessing_version, feature_type, hyperparams=None):
    """Train and evaluate a LogisticRegression for one preprocessing/feature combination.

    The function loads the data via ``load_data``, fits the model on the
    training split, predicts on the train/validation/test splits, computes
    accuracy and macro-F1 on train and validation, prints them, and writes a
    confusion matrix and ROC curves for the validation split to
    ``model_results/<preprocessing_version>/<feature_type>/logreg``.

    Parameters
    ----------
    preprocessing_version : str
        Name of the preprocessing variant; forwarded to ``load_data`` and used
        in the output path and plot titles.
    feature_type : str
        Name of the feature representation; forwarded to ``load_data`` and used
        in the output path and plot titles.
    hyperparams : dict, optional
        Keyword arguments for ``LogisticRegression`` that override the defaults
        below. When omitted, the model is built exactly as before.

    Returns
    -------
    dict
        Keys: ``'model'`` (the fitted estimator), ``'train_accuracy'``,
        ``'train_f1'``, ``'val_accuracy'``, ``'val_f1'``, ``'overfitting'``
        (train F1 minus validation F1), ``'predictions'`` (dict with
        ``'train'``, ``'val'``, ``'test'``), ``'probabilities'`` (dict with
        ``'val'``) and ``'params'`` (the hyperparameters actually used).
    """
    # Single source of truth for the estimator settings: the same dict builds
    # the model and is reported back, so the two can never drift apart.
    params = {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 1000, 'class_weight': 'balanced'}
    if hyperparams:
        params.update(hyperparams)

    print(f"Training LogisticRegression with {preprocessing_version}/{feature_type} features")

    # Load data; load_data is expected to return a dict with these keys
    data = load_data(preprocessing_version, feature_type)
    X_train, y_train = data['X_train'], data['y_train']
    X_val, y_val = data['X_val'], data['y_val']
    X_test = data['X_test']

    # n_jobs / random_state are run-time settings; they stay out of `params`
    # so the reported dict keeps the same keys as before (unless overridden).
    logreg = LogisticRegression(**{'n_jobs': -1, 'random_state': 42, **params})
    logreg.fit(X_train, y_train)

    # Make predictions on every split (no test labels are read here)
    y_train_pred = logreg.predict(X_train)
    y_val_pred = logreg.predict(X_val)
    y_test_pred = logreg.predict(X_test)

    # Class probabilities on the validation split, used for the ROC curves
    y_val_prob = logreg.predict_proba(X_val)

    # Calculate metrics (macro-averaged F1 weights every class equally)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred, average='macro')
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred, average='macro')

    # Overfitting is measured as the train/validation gap in macro-F1
    overfitting = train_f1 - val_f1

    # Print results
    print(f"Train Accuracy: {train_accuracy:.4f}, Train F1: {train_f1:.4f}")
    print(f"Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")
    print(f"Overfitting: {overfitting:.4f}")

    # Create output directory for the plots.
    # NOTE(review): plots are written under "logreg", while the evaluation loop
    # saves results under the model name "logreg_simple" - confirm that the two
    # directories are meant to differ.
    model_dir = os.path.join("model_results", preprocessing_version, feature_type, "logreg")
    os.makedirs(model_dir, exist_ok=True)

    # Use visualization functions
    plot_confusion_matrix(
        y_val,
        y_val_pred,
        title=f'Confusion Matrix - LogReg ({preprocessing_version}, {feature_type})',
        output_path=os.path.join(model_dir, 'confusion_matrix.png'),
    )

    # NOTE(review): class_names presumably follows the label order 0, 1, 2 - verify against the dataset
    plot_roc_curves(
        y_val,
        y_val_prob,
        class_names=["Bearish", "Bullish", "Neutral"],
        title=f'ROC Curves - LogReg ({preprocessing_version}, {feature_type})',
        output_path=os.path.join(model_dir, 'roc_curves.png'),
    )

    # Return results dictionary
    return {
        'model': logreg,
        'train_accuracy': train_accuracy,
        'train_f1': train_f1,
        'val_accuracy': val_accuracy,
        'val_f1': val_f1,
        'overfitting': overfitting,
        'predictions': {'train': y_train_pred, 'val': y_val_pred, 'test': y_test_pred},
        'probabilities': {'val': y_val_prob},
        'params': dict(params),
    }



# Evaluation of all combinations - when reusing this template for another model, adapt the names of the exported CSV files in the two cells below

In [16]:
# Get all combinations
all_combinations = get_all_combinations()
total_combinations = len(all_combinations)

print(f"Testing LogisticRegression on all {total_combinations} combinations...")

# Store all results
all_results = []

# Combinations that raised an error, kept so that failures are not lost in the
# middle of the training log and can be reported once the loop is done
failed_combinations = []

# Process each combination
for i, (preprocessing_version, feature_type) in enumerate(all_combinations, start=1):
    print(f"\n[{i}/{total_combinations}] Processing {preprocessing_version} with {feature_type}")

    # Best-effort run: one failing combination must not abort the others
    try:
        # Train model
        result = train_simple_logreg(preprocessing_version, feature_type)

        # Save results
        save_results(result, "logreg_simple", preprocessing_version, feature_type)

        # Create log entry (one row of the summary table)
        log_entry = {
            'Preprocessing': preprocessing_version,
            'Feature Type': feature_type,
            'Train Accuracy': result['train_accuracy'],
            'Train F1': result['train_f1'],
            'Val Accuracy': result['val_accuracy'],
            'Val F1': result['val_f1'],
            'Overfitting': result['overfitting']
        }

        # Add to results list
        all_results.append(log_entry)

        # Update log file after each model so partial progress survives an interruption
        log_results(all_results, log_file="logreg_all_combinations_log.csv")

    except Exception as e:
        print(f"Error with {preprocessing_version}/{feature_type}: {e}")
        failed_combinations.append((preprocessing_version, feature_type, repr(e)))

# Make failures visible at the end instead of leaving them buried in the log
if failed_combinations:
    print(f"\n{len(failed_combinations)} of {total_combinations} combinations failed:")
    for failed_version, failed_feature, failure in failed_combinations:
        print(f"  - {failed_version}/{failed_feature}: {failure}")


Testing LogisticRegression on all 15 combinations...

[1/15] Processing regexp_snowball with tfidf
Training LogisticRegression with regexp_snowball/tfidf features
Loading data for regexp_snowball with tfidf features
Loaded processed text data - Train: 8111, Val: 1432, Test: 2388
Feature info: {'tfidf_shape': '(8111, 5000)', 'word2vec_shape': '(8111, 100)', 'mini_sbert_shape': '(8111, 384)', 'train_samples': '8111', 'val_samples': '1432', 'test_samples': '2388', 'class_weights': '{0: 2.2052746057640022, 1: 1.6546307629538963, 2: 0.5148860534501365}'}
Loaded tfidf features - Train: (8111, 5000), Val: (1432, 5000), Test: (2388, 5000)
Train Accuracy: 0.8947, Train F1: 0.8706
Val Accuracy: 0.7884, Val F1: 0.7293
Overfitting: 0.1413
Results saved to: model_results\regexp_snowball\tfidf\logreg_simple

[2/15] Processing regexp_snowball with word2vec
Training LogisticRegression with regexp_snowball/word2vec features
Loading data for regexp_snowball with word2vec features
Loaded processed text d

In [36]:

# Create a DataFrame with all results (one row per successfully trained combination)
results_df = pd.DataFrame(all_results)

if results_df.empty:
    # Nothing was collected (e.g. every combination failed): say so explicitly
    # instead of silently skipping the table, the CSV and the plots
    print("\nNo results to summarise - no combination finished successfully.")
else:
    # Sort by overfitting gap (Train F1 - Val F1), smallest gap first
    results_df = results_df.sort_values('Overfitting', ascending=True)
    results_df.to_csv('logreg_all_combinations_results.csv', index=False)

    # Display all results
    print("\nAll combinations:")
    print(results_df)

    # Create visualizations comparing results
    plot_model_comparison(
        results_df,
        metric='Val F1',
        group_by='Preprocessing',
        hue='Feature Type',
        title='Validation F1 Score by Preprocessing and Feature Type',
        output_path='logreg_all_comparison.png'
    )

    plot_overfitting_analysis(
        results_df,
        group_by='Feature Type',
        title='Overfitting by Feature Type',
        output_path='logreg_all_overfitting.png'
    )

    # Find the best model according to validation F1
    # NOTE(review): min_columns is presumably the set of columns to report - confirm in the helper module
    find_best_model(
        results_df,
        metric='Val F1',
        min_columns=['Preprocessing', 'Feature Type', 'Val F1', 'Train F1', 'Overfitting']
    )

print("\nAll combinations tested!")


All combinations:
           Preprocessing Feature Type  Train Accuracy  Train F1  Val Accuracy  \
10  whitespace_lancaster     word2vec        0.605967  0.502069      0.599162   
7           tweet_porter     word2vec        0.618173  0.511181      0.599860   
13            word_lemma     word2vec        0.606954  0.498093      0.587291   
4             tweet_base     word2vec        0.585871  0.483259      0.559358   
1        regexp_snowball     word2vec        0.610652  0.506941      0.585894   
8           tweet_porter   mini_sbert        0.695475  0.649829      0.640363   
2        regexp_snowball   mini_sbert        0.701147  0.655663      0.642458   
11  whitespace_lancaster   mini_sbert        0.684503  0.639602      0.623603   
14            word_lemma   mini_sbert        0.726174  0.681396      0.650140   
5             tweet_base   mini_sbert        0.730736  0.687321      0.648743   
0        regexp_snowball        tfidf        0.894711  0.870618      0.788408   
12       