# NLP Preprocessing

In [3]:
import numpy as np
import pandas as pd
import nltk
import data_cleaning as dc
import stock_pricing as sp
import importlib

from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Re-import the local helper modules so edits made to them are picked up
for helper_module in (dc, sp):
    importlib.reload(helper_module)

# Fetch the NLTK resources, then build the lemmatizer used by the cells below
for resource_name in ('wordnet', 'stopwords'):
    nltk.download(resource_name)
lemmatizer = WordNetLemmatizer()

# Location of the cached, already-preprocessed dataset
PREPROCESSED_CSV = './stockerbot-export-preprocessed.csv'

# If the preprocessed file exists, load it; otherwise build it and cache it.
# Only the read sits inside the `try`, so a FileNotFoundError raised anywhere
# else is not mistaken for "cache missing".
try:
    df = pd.read_csv(PREPROCESSED_CSV)
except FileNotFoundError:
    # Load dataset with stock data
    df = sp.preprocess_nasdaq_df()

    # Run TextBlob once per tweet and reuse the result for both columns
    # (building a new TextBlob per column would analyse every tweet twice).
    sentiments = [TextBlob(tweet).sentiment for tweet in df['text']]
    df['tweet_polarity'] = [sentiment.polarity for sentiment in sentiments]
    df['tweet_subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

    # Apply preprocessing to the 'text' column
    df['preprocessed_tweet'] = df['text'].apply(lambda tweet: dc.preprocess_tweet(tweet, lemmatizer))

    # Save the preprocessed data.
    # NOTE: non-scalar cell values are written as their string representation,
    # so after a reload from CSV 'preprocessed_tweet' holds strings.
    df.to_csv(PREPROCESSED_CSV, index=False)
    print('File preprocessing completed and saved.')
else:
    print('Preprocessed file found and loaded.')

# Display the preprocessed dataframe without truncating long text columns
pd.set_option('display.max_colwidth', None)
display(df.head(20))

[nltk_data] Downloading package wordnet to /Users/seby/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/seby/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
FB-GOOGL-GOOG: No data found, symbol may be delisted
HRS: No data found, symbol may be delisted
INTC-USB: No data found, symbol may be delisted
AMZN-GPS: No data found, symbol may be delisted
TICKER: No data found, symbol may be delisted
CBS: No data found, symbol may be delisted
TWTR: No data found, symbol may be delisted
FB: No data found, symbol may be delisted
INFO: No data found, symbol may be delisted
JEC: No data found, symbol may be delisted
FNSR: No data found, symbol may be delisted
RE: No data found, symbol may be delisted
TMK: No data found, symbol may be delisted
NUAN: No data found, symbol may be delisted
TRQ: No data found, symbol may be delisted
DCIX: No data found, symbol may be delisted
FBHS: No data found, symbol may be delis

File preprocessing completed and saved.


Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified,month,day,year,Price Day Before Tweet,Price Day of Tweet,Price Day After Tweet,tweet_polarity,tweet_subjectivity,preprocessed_tweet
0,1019696670777503700,VIDEO: “I was in my office. I was minding my own business...” –David Solomon tells $GS interns how he learned he wa… https://t.co/QClAITywXV,2018-07-18 21:33:26,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777503745,True,7,18,2018,202.197388,202.389954,200.980804,0.6,1.0,"[video, office, minding, business, –david, solomon, tell, $GS, intern, learned, wa…, https://t.co/QClAITywXV]"
1,1019709091038548000,The price of lumber $LB_F is down 22% since hitting its YTD highs. The Macy's $M turnaround is still happening.… https://t.co/XnKsV4De39,2018-07-18 22:22:47,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038547968,True,7,18,2018,29.25416,29.751337,30.745676,-0.155556,0.288889,"[price, lumber, $LB_F, 22, since, hitting, ytd, high, macy, $M, turnaround, still, https://t.co/XnKsV4De39]"
2,1019711413798035500,Who says the American Dream is dead? https://t.co/CRgx19x7sA,2018-07-18 22:32:01,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True,7,18,2018,46.698978,47.057476,45.964901,-0.1,0.2,"[say, american, dream, dead, https://t.co/CRgx19x7sA]"
3,1019716662587740200,Barry Silbert is extremely optimistic on bitcoin -- but predicts that 99% of new crypto entrants are “going to zero… https://t.co/mGMVo2cZgY,2018-07-18 22:52:52,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587740160,True,7,18,2018,,47.057476,45.964901,0.005682,0.727273,"[barry, silbert, extremely, optimistic, bitcoin, predicts, 99, new, crypto, entrant, going, zero…, https://t.co/mGMVo2cZgY]"
4,1019718460287389700,How satellites avoid attacks and space junk while circling the Earth https://t.co/aHzIV3Lqp5 #paid @Oracle https://t.co/kacpqZWiDJ,2018-07-18 23:00:01,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True,7,18,2018,44.699894,44.462223,44.2794,0.0,0.0,"[satellite, avoid, attack, space, junk, circling, earth, https://t.co/aHzIV3Lqp5, paid, @Oracle, https://t.co/kacpqZWiDJ]"
5,1019719465095790600,.@RealMoney's David Butler's favorite FANG stock isn't #RealMoneySOD Alphabet but Facebook https://t.co/MczAPSFjOi,2018-07-18 23:04:00,jimcramer,FB-GOOGL-GOOG,Facebook*Alphabet*Alphabet,http://bit.ly/2NrYxje,True,7,18,2018,,44.462223,44.2794,0.5,1.0,"[david, butler, favorite, fang, stock, realmoneysod, alphabet, facebook, https://t.co/MczAPSFjOi]"
6,1019720209786114000,Don’t miss my convo with one of my favorite thinkers @SamHarrisOrg! https://t.co/uuPVxIobCh,2018-07-18 23:06:58,ianbremmer,HRS,Harris,https://twitter.com/samharrisorg/status/1019719376348434433,True,7,18,2018,,44.462223,44.2794,0.625,1.0,"[miss, convo, one, favorite, thinker, @SamHarrisOrg, https://t.co/uuPVxIobCh]"
7,1019720659738480600,U.S. intelligence documents on Nelson Mandela made public https://t.co/XTnEfo1rO6 https://t.co/V8DXkWDQ6R,2018-07-18 23:08:45,Reuters,INTC-USB,Intel*U.S.,https://reut.rs/2O0ypNf,True,7,18,2018,,44.462223,44.2794,0.0,0.066667,"[intelligence, document, nelson, mandela, made, public, https://t.co/XTnEfo1rO6, https://t.co/V8DXkWDQ6R]"
8,1019720723441635300,Senate wants emergency alerts to go out through Netflix Spotify etc. https://t.co/23yy3whBlc by @grg,2018-07-18 23:09:00,TechCrunch,NFLX,Netflix,https://tcrn.ch/2L8DsgT,True,7,18,2018,379.480011,375.130005,364.230011,0.2,0.1,"[senate, want, emergency, alert, go, netflix, spotify, etc, https://t.co/23yy3whBlc, @grg]"
9,1019721145396887600,Hedge fund manager Marc Larsy says bitcoin $40K is possible https://t.co/54uPe0OWqT,2018-07-18 23:10:41,MarketWatch,BTC,Bitcoin,https://on.mktw.net/2Ntr7k9,True,7,18,2018,,47.057476,45.964901,0.0,1.0,"[hedge, fund, manager, marc, larsy, say, bitcoin, $40K, possible, https://t.co/54uPe0OWqT]"


In [None]:
# NOTE(review): `pre` is not imported or defined in any cell visible here —
# confirm which module/object exposes `result_df` before running this cell.
NASDAQ_price = pre.result_df
# Preview the first rows of the price table
NASDAQ_price.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# TF-IDF vectorization for the 'preprocessed_tweet' column, capped at 1000 terms.
# astype('U') stringifies every entry before it reaches the vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_features = tfidf_vectorizer.fit_transform(df['preprocessed_tweet'].astype('U'))

# One-hot encoding for categorical variables.
# Each column gets its own encoder: re-fitting a single encoder on 'symbols'
# would discard the categories learned for 'source', leaving no fitted
# encoder able to transform 'source' values later.
source_encoder = OneHotEncoder()
onehot_features_source = source_encoder.fit_transform(df[['source']])
symbols_encoder = OneHotEncoder()
onehot_features_symbols = symbols_encoder.fit_transform(df[['symbols']])
# Backward-compatible alias: previously `onehot_encoder` ended up fitted on 'symbols'.
onehot_encoder = symbols_encoder

# Scaling numerical features (TextBlob polarity / subjectivity)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['tweet_polarity', 'tweet_subjectivity']])

## Testing/Training Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.sparse import hstack

# Name of the regression target as it appears in the preprocessed dataframe
TARGET_COLUMN = 'Price Day After Tweet'

# Combine all features into a single matrix (CSR so rows can be selected by mask)
X = hstack(
    [tfidf_features, onehot_features_source, onehot_features_symbols, scaled_features],
    format='csr',
)

# The target variable; rows without a price cannot be used for fitting or scoring
has_target = df[TARGET_COLUMN].notna().to_numpy()
X = X[has_target]
y = df[TARGET_COLUMN].to_numpy()[has_target]

# Split the data into training and testing sets.
# NOTE(review): the vectorizer/encoders/scaler were fitted on the full dataset
# before this split, so test rows influence the features — consider fitting
# them on the training split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression model
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, y_train)

# Train random forest regression model
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train, y_train)

# Train ridge regression model
# Can potentially adjust alpha, maybe try different values
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Train lasso regression model
# Can potentially adjust alpha, maybe try different values
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train, y_train)


def _evaluate(model):
    """Return (predictions, MAE, MSE, RMSE) for a fitted `model` on the test split."""
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    # RMSE is derived from MSE instead of passing `squared=False`, which is
    # deprecated and no longer accepted by recent scikit-learn releases.
    return predictions, mae, mse, mse ** 0.5


# Evaluate linear regression model
linear_reg_pred, linear_reg_mae, linear_reg_mse, linear_reg_rmse = _evaluate(linear_reg_model)

# Evaluate random forest regression model
random_forest_pred, random_forest_mae, random_forest_mse, random_forest_rmse = _evaluate(random_forest_model)

# Evaluate ridge regression model
ridge_pred, ridge_mae, ridge_mse, ridge_rmse = _evaluate(ridge_model)

# Evaluate lasso regression model
lasso_pred, lasso_mae, lasso_mse, lasso_rmse = _evaluate(lasso_model)

In [None]:
import matplotlib.pyplot as plt

# Model labels (x-axis) and the per-model scores for each metric, in matching order
models = ['Linear Regression', 'Random Forest', 'Ridge Regression', 'Lasso Regression']
mae_scores = [linear_reg_mae, random_forest_mae, ridge_mae, lasso_mae]
mse_scores = [linear_reg_mse, random_forest_mse, ridge_mse, lasso_mse]
rmse_scores = [linear_reg_rmse, random_forest_rmse, ridge_rmse, lasso_rmse]


def _show_metric_bars(scores, bar_color, chart_title, metric_label):
    """Draw and show one bar chart comparing `scores` across `models`."""
    plt.figure(figsize=(10, 5))
    plt.bar(models, scores, color=bar_color)
    plt.title(chart_title)
    plt.xlabel('Regression Model')
    plt.ylabel(metric_label)
    plt.xticks(rotation=45)
    plt.show()


# One figure per metric: MAE, then MSE, then RMSE
_show_metric_bars(mae_scores, 'skyblue', 'Mean Absolute Error (MAE) Comparison', 'MAE')
_show_metric_bars(mse_scores, 'salmon', 'Mean Squared Error (MSE) Comparison', 'MSE')
_show_metric_bars(rmse_scores, 'lightgreen', 'Root Mean Squared Error (RMSE) Comparison', 'RMSE')