# Data Setup

In [1]:
import pandas as pd
import nltk
import importlib

from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from utils import data_cleaning as dc
from utils import stock_pricing as sp

importlib.reload(dc)
importlib.reload(sp)

# Prepare the NLTK resources
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()

# Define data paths
raw_data_path = './data/scraping/tweets_all.csv'
processed_data_path = raw_data_path.replace('.csv', '_preprocessed.csv')

# Define desired database parameters. Set to -1 to load all data.
database_size = -1

# Optionally force data to be regenerated
force_data_regeneration = True

# Name of the column holding the raw tweet text
tweet_col_name = 'rawContent'

# Try the cached preprocessed file first. `df` stays None whenever the cache
# cannot be used, which triggers regeneration further down.
df = None
if force_data_regeneration:
    print('Forcing data regeneration.')
else:
    try:
        # Only the read itself is guarded, so unrelated errors are not
        # silently turned into a regeneration.
        df = pd.read_csv(processed_data_path)
    except (FileNotFoundError, ValueError):
        # Cache is missing or could not be read: fall through to regeneration
        df = None
    else:
        if database_size != -1 and len(df) > database_size:
            # Cache holds more rows than requested: keep a random subset
            df = df.sample(n=database_size)
        elif database_size != -1 and len(df) < database_size:
            # Cache holds fewer rows than requested: rebuild from the raw data
            print('Preprocessed file is not the expected size. Reloading data.')
            df = None

    if df is not None:
        print('Preprocessed file found and loaded.')

if df is None:
    # Load dataset with stock data
    df = sp.preprocess_nasdaq_df(raw_data_path, database_size)

    # Compute the TextBlob sentiment once per tweet and reuse it for both
    # columns instead of analysing every tweet twice.
    sentiments = [TextBlob(tweet).sentiment for tweet in df[tweet_col_name]]
    df['tweet_polarity'] = [sentiment.polarity for sentiment in sentiments]
    df['tweet_subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

    # Apply preprocessing to the raw tweet column
    df['preprocessed_tweet'] = df[tweet_col_name].apply(lambda tweet: dc.preprocess_tweet(tweet, lemmatizer))

    # Save the preprocessed data so later runs can load it from the cache
    df.to_csv(processed_data_path, index=False)
    print('File preprocessing completed and saved.')

# Display the preprocessed dataframe
pd.set_option('display.max_colwidth', None)
print(f"Dataframe shape: {df.shape}")
display(df.head(5))

[nltk_data] Downloading package wordnet to /Users/seby/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/seby/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Forcing data regeneration.
Loaded 11909 rows from the CSV file.
Dropped 0 rows with missing timestamps.
Finished adding price data. Dropped 2051 rows.
File preprocessing completed and saved.
Dataframe shape: (9858, 21)


Unnamed: 0,id,url,user,verifiedUser,quotedUser,verifiedQuotedUser,timestamp,rawContent,quotedContent,retweetCount,...,symbols,month,day,year,Price Day Before Tweet,Price Day of Tweet,Price Day After Tweet,tweet_polarity,tweet_subjectivity,preprocessed_tweet
0,1572619354998710272,https://twitter.com/scottsinvesting/status/1572619354998710272,scottsinvesting,False,,,2022-09-21 16:10:46+00:00,"Wide moat defense names like $GD, $LMT &amp; $RTX are trading at fwd P/E's between 14-16. I'd suggest that's fair in any environment, but likely good value today considering escalating geopolitical concerns existing around the world today. IMO\nFull Disclosure: L/T positions in all 3.",,0,...,GD,9,21,2022,220.330978,219.455826,219.350052,0.33,0.69,wide moat defense name like $GD $LMT amp $RTX trading fwd suggest fair environment likely good value today considering escalating geopolitical concern existing around world today imo full disclosure position 3
1,1640050675953582081,https://twitter.com/jamesmadison501/status/1640050675953582081,jamesmadison501,False,,,2023-03-26 17:58:46+00:00,"@Hansolo37059705 @TheStudyofWar ISW ain' t Steve Irwin, though he's a bad example of an expert. ISW is headed by Kimberly Kagan, part of the aggessively interventionist Kagan family, including Victoria Nuland. It's board is the same;Bill Kristol, Petraeus,Jack Keane. It's funded by big defense contractors.",,0,...,GD,3,26,2023,217.292557,217.292557,218.780045,-0.35,0.383333,@Hansolo37059705 @TheStudyofWar isw steve irwin though bad example expert isw headed kimberly kagan part aggessively interventionist kagan family including victoria nuland board bill kristol petraeus jack keane funded big defense contractor
2,1700147252583387475,https://twitter.com/paulcerro/status/1700147252583387475,paulcerro,True,,,2023-09-08 14:01:06+00:00,Had a great time talking with @MonetiveWealth last night about the defense industry and why we're so bullish on it.\n\nGreat commentary from speakers and value-additive breakdowns.\n\nWill continue hosting more thematic Twitter spaces more often so people can continue getting more variety aside from everyday macro/trading spaces.\n\nNext topic is GLP-1s with @LillybeLilly and @Mahek_MD which we hope to do next week.\n\nOthers in the mix are\n- Wealth creation/destruction in the US since GFC with @realDillonEvans \n- SMB investing\n- Fitness in a post-COVID world\n- Cannabis\n\nAnd much much more. Be sure to follow to get the notifications when they go up,,2,...,GD,9,8,2023,215.167374,214.360565,212.786346,0.325,0.421296,great time talking @MonetiveWealth last night defense industry bullish great commentary speaker breakdown continue hosting thematic twitter space often people continue getting variety aside everyday space next topic @LillybeLilly @Mahek_MD hope next week others mix wealth u since gfc @realDillonEvans smb investing fitness world cannabis much much sure follow get notification go
3,1704407517076259027,https://twitter.com/HejToJeMara/status/1704407517076259027,HejToJeMara,False,,,2023-09-20 08:09:53+00:00,"Od exitu v srpnu z chujovin typu $nio, $sofi a $coin jsem udělal menší změny:\n\n$el: nová pozice, 1/2 co plánuji … tzn že mě čeká pain na 80-100, kde doberu zbytek👌\n\nPřikoupení $gd (215), $lmt (420), cvs (65), $cni (110), $asml (590)\n\n$jd jsem rotoval do kweb\n\nCash 25% (bondy)",,1,...,GD,9,20,2023,219.260361,221.91687,218.306,0.0,0.0,od exitu v srpnu z chujovin typu $nio $sofi $coin jsem udělal menší změny $el nová pozice co plánuji … tzn že mě čeká pain na kde doberu zbytek👌 přikoupení $gd 215 $lmt 420 cv 65 $cni 110 $asml 590 $jd jsem rotoval kweb cash 25 bondy
4,1704974149939061007,https://twitter.com/abcampbell/status/1704974149939061007,abcampbell,True,,,2023-09-21 21:41:28+00:00,"The ""Arsenal of Democracy"" is only possible when you make 50% of the world's steel...\n\nhttps://t.co/NuPDne23k1 https://t.co/KgR0ztBZbr",,46,...,GD,9,21,2023,221.91687,218.306,214.822998,0.0,1.0,arsenal democracy possible make 50 world steel https://t.co/NuPDne23k1 https://t.co/KgR0ztBZbr


## Testing/Training Data

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
importlib.reload(dc)

# Column roles: which dataframe columns feed the model and which one is predicted
target_column = 'Price Day After Tweet'
text_column = 'preprocessed_tweet'
categorical_columns = ['user', 'symbols']
numerical_columns = [
    'tweet_polarity',
    'tweet_subjectivity',
    'Price Day Before Tweet',
    'Price Day of Tweet',
]

# Feature transformers: a standard scaler, a one-hot encoder that ignores
# unseen categories, and a TF-IDF vectorizer over 1- to 3-grams
scaler = StandardScaler()
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=5,
    max_features=1000,
    ngram_range=(1, 3),
)

# Build the feature matrix and target from the whole dataframe.
# NOTE(review): the transformers are handed over together with the full `df`
# before the train/test split; if the trailing True means "fit", test rows
# influence the fitted vocabulary/encoding/scaling -- confirm against
# dc.prepare_features.
X, y = dc.prepare_features(
    df,
    text_column,
    categorical_columns,
    numerical_columns,
    target_column,
    tfidf_vectorizer,
    onehot_encoder,
    scaler,
    True,
)

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [3]:
from sklearn.linear_model import LinearRegression
from utils import model_util as mu

# When set, this flag is forwarded to mu.load_model for every model below
force_retrain = True

# Obtain the Ridge, Lasso and Random Forest models through mu.load_model,
# one (pickle path, model label) pair per model, in this order
ridge_model, lasso_model, random_forest_model = (
    mu.load_model(X_train, y_train, pickle_path, model_label, force_retrain)
    for pickle_path, model_label in (
        ('./models/ridge_model.pkl', 'RIDGE'),
        ('./models/lasso_model.pkl', 'LASSO'),
        ('./models/random_forest_model.pkl', 'RANDOM FOREST'),
    )
)

# Plain linear regression is always fitted fresh on the training split
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, y_train)

Training RIDGE model...
Fitting 5 folds for each of 13 candidates, totalling 65 fits
[CV] END ........................................alpha=1e-06; total time=   0.0s
[CV] END ........................................alpha=1e-05; total time=   0.0s
[CV] END ........................................alpha=1e-05; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END ........................................alpha=1e-06; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END ........................................alpha=1e-06; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END ........................................alpha=1e-06; total time=   0.0s
[CV] END ........................................alpha=1e-06; total time=   0.0s
[CV] END .......................................alpha=0.0001; total time=   0.0s
[CV] END ...............

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ........................................alpha=1e-05; total time=  38.9s
[CV] END ........................................alpha=1e-05; total time=  39.0s


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ........................................alpha=1e-05; total time=  39.2s
[CV] END ........................................alpha=1e-06; total time=  39.3s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ........................................alpha=1e-06; total time=  39.4s


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ........................................alpha=1e-06; total time=  39.6s
[CV] END ........................................alpha=1e-06; total time=  39.7s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ........................................alpha=1e-06; total time=  40.2s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ........................................alpha=0.001; total time=  33.0s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END .......................................alpha=0.0001; total time=  37.0s


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END .......................................alpha=0.0001; total time=  37.8s
[CV] END .......................................alpha=0.0001; total time=  37.7s
[CV] END .......................................alpha=0.0001; total time=  37.5s


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ........................................alpha=1e-05; total time=  38.7s
[CV] END .......................................alpha=0.0001; total time=  38.1s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ........................................alpha=1e-05; total time=  38.9s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ........................................alpha=0.001; total time=  33.6s


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END .........................................alpha=0.01; total time=  32.6s
[CV] END .........................................alpha=0.01; total time=  32.1s
[CV] END .........................................alpha=0.01; total time=  31.8s
[CV] END .........................................alpha=0.01; total time=  32.3s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ........................................alpha=0.001; total time=  34.9s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ........................................alpha=0.001; total time=  34.4s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ........................................alpha=0.001; total time=  34.8s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END .........................................alpha=0.01; total time=  33.0s
[CV] END ..........................................alpha=0.1; total time=  32.0s
[CV] END ..........................................alpha=0.1; total time=  32.0s
[CV] END ..........................................alpha=0.1; total time=  32.1s
[CV] END ..........................................alpha=0.1; total time=  32.4s
[CV] END ..........................................alpha=1.0; total time=  31.6s
[CV] END ..........................................alpha=0.1; total time=  32.2s
[CV] END ..........................................alpha=1.0; total time=  32.3s
[CV] END .........................................alpha=10.0; total time=  28.6s
[CV] END ........................................alpha=100.0; total time=   0.0s
[CV] END ........................................alpha=100.0; total time=   0.0s
[CV] END ........................................alpha=100.0; total time=   0.0s
[CV] END ...................

## Model Evaluation

In [None]:
importlib.reload(mu)

# Models under comparison and their display names (same order in both lists)
models = [linear_reg_model, random_forest_model, ridge_model, lasso_model]
model_names = ['Linear Regression', 'Random Forest', 'Ridge Regression', 'Lasso Regression']

# Score every model on the held-out split, then regroup the per-model
# (mae, mse, rmse, r2) results into one list per metric
per_model_metrics = [mu.evaluate_model(fitted, X_test, y_test) for fitted in models]
mae_scores, mse_scores, rmse_scores, r2_scores = (
    list(metric_column) for metric_column in zip(*per_model_metrics)
)

In [None]:
importlib.reload(mu)

# One entry per chart: (title, scores, colour, metric label), drawn in order
metric_charts = [
    ('Mean Absolute Error (MAE) Comparison', mae_scores, 'skyblue', 'MAE (USD)'),
    ('Mean Squared Error (MSE) Comparison', mse_scores, 'salmon', 'MSE (USD^2)'),
    ('Root Mean Squared Error (RMSE) Comparison', rmse_scores, 'lightgreen', 'RMSE (USD)'),
    ('R-Squared (R2) Comparison', r2_scores, 'violet', 'R2'),
]

# Plot each metric across all models
for chart_title, chart_scores, chart_color, metric_label in metric_charts:
    mu.plot_metric(chart_title, model_names, chart_scores, chart_color, 'Regression Model', metric_label)

## User/Ticker Evaluation

In [None]:
importlib.reload(mu)

# Per-ticker analytics, ordered by ascending 'Min MAE'.
# NOTE(review): the trailing 10 is passed through to mu.run_analytics as-is;
# its meaning (presumably a minimum sample count) should be confirmed there.
ticker_predictions = mu.run_analytics(
    df, 'symbols', 'Ticker',
    text_column, categorical_columns, numerical_columns,
    target_column,
    tfidf_vectorizer, onehot_encoder, scaler,
    models, model_names,
    10,
).sort_values(by='Min MAE', ascending=True)
display(ticker_predictions.head())

# Per-author analytics, ordered the same way (trailing argument is 5 here)
user_predictions = mu.run_analytics(
    df, 'user', 'Tweet Author',
    text_column, categorical_columns, numerical_columns,
    target_column,
    tfidf_vectorizer, onehot_encoder, scaler,
    models, model_names,
    5,
).sort_values(by='Min MAE', ascending=True)
display(user_predictions.head())