# NLP Preprocessing

In [3]:
import numpy as np
import pandas as pd
import nltk
import data_cleaning as dc
import preprocess_NASDAQ_data as pre

In [4]:
# Build the price DataFrame with the project helper, then preview its
# first five rows (five is the default for DataFrame.head).
NASDAQ_price = pre.create_df()
NASDAQ_price.head()

453


FB-GOOGL-GOOG: No data found, symbol may be delisted
HRS: No data found, symbol may be delisted
INTC-USB: No data found, symbol may be delisted
AMZN-GPS: No data found, symbol may be delisted
TICKER: No data found, symbol may be delisted
CBS: No data found, symbol may be delisted
TWTR: No data found, symbol may be delisted
FB: No data found, symbol may be delisted
INFO: No data found, symbol may be delisted
JEC: No data found, symbol may be delisted
FNSR: No data found, symbol may be delisted
RE: No data found, symbol may be delisted
TMK: No data found, symbol may be delisted
NUAN: No data found, symbol may be delisted
TRQ: No data found, symbol may be delisted
DCIX: No data found, symbol may be delisted
FBHS: No data found, symbol may be delisted
PAH: No price data found, symbol may be delisted (period=10y)
WYN: No price data found, symbol may be delisted (period=10y)
LUK: No price data found, symbol may be delisted (period=10y)
FMSA: No price data found, symbol may be delisted (perio

Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified,month,day,year,Price Day Before Tweet,Price Day of Tweet,Price Day After Tweet
0,1019696670777503700,VIDEO: “I was in my office. I was minding my o...,2018-07-18 21:33:26,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True,7,18,2018,202.197418,202.389969,200.98082
1,1019709091038548000,The price of lumber $LB_F is down 22% since hi...,2018-07-18 22:22:47,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True,7,18,2018,29.254164,29.751329,30.745674
2,1019711413798035500,Who says the American Dream is dead? https://t...,2018-07-18 22:32:01,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True,7,18,2018,46.698963,47.057468,45.964905
3,1019716662587740200,Barry Silbert is extremely optimistic on bitco...,2018-07-18 22:52:52,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587...,True,7,18,2018,,47.057468,45.964905
4,1019718460287389700,How satellites avoid attacks and space junk wh...,2018-07-18 23:00:01,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True,7,18,2018,44.699886,44.462238,44.279404


In [None]:
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Load the tweet export; malformed CSV rows are skipped rather than raising.
df = pd.read_csv('./stockerbot-export-preprocessed.csv', on_bad_lines='skip')

# Prepare the NLTK resources
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()

# Tracks if we did any preprocessing (only consumed by the commented-out
# save step at the bottom of this cell).
preprocessed = False

# TextBlob only accepts strings, but a missing tweet is read back from the
# CSV as float NaN. Normalise once so every text step below sees a str;
# missing tweets become empty strings.
tweet_text = df['text'].fillna('').astype(str)

# Add the TextBlob sentiment columns that do not exist yet. Each tweet is
# parsed a single time and both scores are read from the same result,
# instead of building a second TextBlob per tweet for subjectivity.
needs_polarity = 'tweet_polarity' not in df.columns
needs_subjectivity = 'tweet_subjectivity' not in df.columns
if needs_polarity or needs_subjectivity:
    if needs_polarity:
        print('Calculating sentiment column...')
    if needs_subjectivity:
        print('Calculating subjectivity column...')
    sentiments = [TextBlob(tweet).sentiment for tweet in tweet_text]
    if needs_polarity:
        df['tweet_polarity'] = [sentiment.polarity for sentiment in sentiments]
    if needs_subjectivity:
        df['tweet_subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]
    preprocessed = True

# Apply preprocessing to the 'text' column if it doesn't exist
if 'preprocessed_tweet' not in df.columns:
    print('Preprocessing text column...')
    df['preprocessed_tweet'] = tweet_text.apply(lambda tweet: dc.preprocess_tweet(tweet, lemmatizer))
    preprocessed = True

# Join the stock price data onto the tweets by tweet id.
# NOTE(review): pre.result_df is read as a module attribute; presumably it is
# populated by an earlier pre.create_df() call -- confirm, otherwise this
# cell fails on a fresh kernel.
NASDAQ_price = pre.result_df
# NOTE(review): any column other than 'id' that exists in both frames comes
# out of the merge with _x/_y suffixes -- check whether that is intended.
merged_df = pd.merge(NASDAQ_price, df, on='id', how='inner')
# NOTE(review): the row index is written as an extra unnamed column here;
# consider index=False if no reader of this file relies on it.
merged_df.to_csv('./merged-stock-data.csv')

display(merged_df.head(20))

# Save the preprocessed data
#if preprocessed:
    #print('Saving preprocessed data...')
    #df.to_csv('./stockerbot-export-preprocessed.csv', index=False)

# Display the preprocessed text
#pd.set_option('display.max_colwidth', None)
#display(df.head(20))

In [None]:
# Rebind NASDAQ_price to the frame stored on the preprocessing module and
# preview its first five rows.
NASDAQ_price = pre.result_df
NASDAQ_price.head(5)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# TF-IDF vectorization for the 'preprocessed_tweet' column.
# Missing values are mapped to empty documents before the Unicode cast:
# a bare astype('U') turns NaN into the literal string "nan", which the
# vectorizer would then count as a real word.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tweet_documents = df['preprocessed_tweet'].fillna('').astype('U')
tfidf_features = tfidf_vectorizer.fit_transform(tweet_documents)

# One-hot encoding for categorical variables.
# Each column gets its own encoder so both fitted vocabularies survive;
# refitting a single shared encoder on 'symbols' discarded the 'source'
# categories. handle_unknown='ignore' does not change these matrices; it only
# lets a later transform() accept categories that were not seen here.
source_encoder = OneHotEncoder(handle_unknown='ignore')
symbols_encoder = OneHotEncoder(handle_unknown='ignore')
onehot_features_source = source_encoder.fit_transform(df[['source']])
onehot_features_symbols = symbols_encoder.fit_transform(df[['symbols']])
# Backward-compatible alias: the original shared name ended up fitted on 'symbols'.
onehot_encoder = symbols_encoder

# Scaling numerical features (zero mean, unit variance per column)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['tweet_polarity', 'tweet_subjectivity']])

## Testing/Training Data

In [None]:
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


def _regression_errors(y_true, y_pred):
    """Return ``(mae, mse, rmse)`` for one set of predictions.

    RMSE is computed as the square root of MSE instead of calling
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword is
    deprecated in recent scikit-learn releases, so this form works across
    versions.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    return mae, mse, mse ** 0.5


# Combine all features into a single matrix. hstack was previously used
# without being imported; CSR format is requested so rows can be selected
# with a boolean mask below.
X = hstack(
    [tfidf_features, onehot_features_source, onehot_features_symbols, scaled_features],
    format='csr',
)

# The target variable
# NOTE(review): assumes df has a 'price_day_after' column; the price table
# previewed earlier names it 'Price Day After Tweet' -- confirm which frame
# and column are intended.
y = df['price_day_after'].values

# Rows without a target cannot be used for training or scoring, and the
# estimators raise on NaN targets, so drop those rows from X and y together.
has_target = pd.notna(y)
X, y = X[has_target], y[has_target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression model
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, y_train)

# Train random forest regression model
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train, y_train)

# Train ridge regression model
# Can potentially adjust alpha, maybe try different values
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Train lasso regression model
# Can potentially adjust alpha, maybe try different values
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train, y_train)

# Evaluate every model on the held-out split. The per-model variable names
# are kept because the plotting cell reads them.
linear_reg_pred = linear_reg_model.predict(X_test)
linear_reg_mae, linear_reg_mse, linear_reg_rmse = _regression_errors(y_test, linear_reg_pred)

random_forest_pred = random_forest_model.predict(X_test)
random_forest_mae, random_forest_mse, random_forest_rmse = _regression_errors(y_test, random_forest_pred)

ridge_pred = ridge_model.predict(X_test)
ridge_mae, ridge_mse, ridge_rmse = _regression_errors(y_test, ridge_pred)

lasso_pred = lasso_model.predict(X_test)
lasso_mae, lasso_mse, lasso_rmse = _regression_errors(y_test, lasso_pred)

In [None]:
import matplotlib.pyplot as plt

# Model labels, in the same order as the score lists below
models = ['Linear Regression', 'Random Forest', 'Ridge Regression', 'Lasso Regression']
mae_scores = [linear_reg_mae, random_forest_mae, ridge_mae, lasso_mae]
mse_scores = [linear_reg_mse, random_forest_mse, ridge_mse, lasso_mse]
rmse_scores = [linear_reg_rmse, random_forest_rmse, ridge_rmse, lasso_rmse]

# One bar chart per metric: (scores, bar colour, figure title, y-axis label)
metric_panels = [
    (mae_scores, 'skyblue', 'Mean Absolute Error (MAE) Comparison', 'MAE'),
    (mse_scores, 'salmon', 'Mean Squared Error (MSE) Comparison', 'MSE'),
    (rmse_scores, 'lightgreen', 'Root Mean Squared Error (RMSE) Comparison', 'RMSE'),
]

# Draw the figures in MAE, MSE, RMSE order, each in its own 10x5 figure
for panel_scores, bar_color, panel_title, metric_label in metric_panels:
    plt.figure(figsize=(10, 5))
    plt.bar(models, panel_scores, color=bar_color)
    plt.title(panel_title)
    plt.xlabel('Regression Model')
    plt.ylabel(metric_label)
    plt.xticks(rotation=45)
    plt.show()