# NLP Preprocessing

In [None]:
import numpy as np
import pandas as pd
import nltk
import data_cleaning as dc
import preprocess_NASDAQ_data as pre

In [None]:
# Bind the price table exposed by preprocess_NASDAQ_data and preview its first five rows.
NASDAQ_price = pre.result_df
NASDAQ_price.head(n=5)

In [None]:
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Load the tweet dataset; malformed CSV rows are skipped instead of aborting the read.
df = pd.read_csv('./stockerbot-export-preprocessed.csv', on_bad_lines='skip')

# Prepare the NLTK resources
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()

# Tracks if we did any preprocessing
preprocessed = False

# Add the TextBlob sentiment columns that are not already in the file.
# Each tweet is analysed only once, even when both columns are missing,
# instead of building a separate TextBlob per column.
needs_polarity = 'tweet_polarity' not in df.columns
needs_subjectivity = 'tweet_subjectivity' not in df.columns
if needs_polarity:
    print('Calculating sentiment column...')
if needs_subjectivity:
    print('Calculating subjectivity column...')
if needs_polarity or needs_subjectivity:
    tweet_sentiments = [TextBlob(tweet).sentiment for tweet in df['text']]
    if needs_polarity:
        df['tweet_polarity'] = [sentiment.polarity for sentiment in tweet_sentiments]
    if needs_subjectivity:
        df['tweet_subjectivity'] = [sentiment.subjectivity for sentiment in tweet_sentiments]
    preprocessed = True

# Apply preprocessing to the 'text' column if it doesn't exist
if 'preprocessed_tweet' not in df.columns:
    print('Preprocessing text column...')
    df['preprocessed_tweet'] = df['text'].apply(lambda tweet: dc.preprocess_tweet(tweet, lemmatizer))
    preprocessed = True

# Join the NASDAQ price table with the tweets on their shared 'id' column.
# The inner join keeps only ids that appear in both frames.
NASDAQ_price = pre.result_df
merged_df = pd.merge(NASDAQ_price, df, on='id', how='inner')
merged_df.to_csv('./merged-stock-data.csv')

display(merged_df.head(20))

# Save the preprocessed data (currently disabled, so `preprocessed` is only informational)
#if preprocessed:
    #print('Saving preprocessed data...')
    #df.to_csv('./stockerbot-export-preprocessed.csv', index=False)

# Display the preprocessed text
#pd.set_option('display.max_colwidth', None)
#display(df.head(20))

In [None]:
# Re-bind the price table from preprocess_NASDAQ_data and show its leading rows
# (five, which is what head() returns when called without an argument).
NASDAQ_price = pre.result_df
NASDAQ_price.head(n=5)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# TF-IDF vectorization for the 'preprocessed_tweet' column, capped at 1000 terms.
# Missing tweets are replaced with '' before the Unicode cast: casting NaN
# directly produces the literal string 'nan', which would be counted as a token.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_features = tfidf_vectorizer.fit_transform(
    df['preprocessed_tweet'].fillna('').astype('U')
)

# One-hot encoding for categorical variables.
# Each column gets its own encoder so both fitted category sets stay available
# (refitting a single encoder would discard the categories learned for 'source').
source_encoder = OneHotEncoder()
onehot_features_source = source_encoder.fit_transform(df[['source']])
symbols_encoder = OneHotEncoder()
onehot_features_symbols = symbols_encoder.fit_transform(df[['symbols']])
# Kept for backward compatibility: this name previously ended up fitted on 'symbols'.
onehot_encoder = symbols_encoder

# Scaling numerical features (zero mean, unit variance per column)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['tweet_polarity', 'tweet_subjectivity']])

## Testing/Training Data

In [None]:
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split

# Stack the TF-IDF, one-hot and scaled numeric blocks column-wise into one matrix
feature_blocks = [
    tfidf_features,
    onehot_features_source,
    onehot_features_symbols,
    scaled_features,
]
X = hstack(feature_blocks)

# Placeholder target: one random integer in [0, 1000] per row
df['price_day_after'] = np.random.randint(low=0, high=1001, size=len(df))

# Target vector as a plain array
y = df['price_day_after'].values

# Hold out 20% of the rows for testing; the split itself is seeded
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

# Define the range of k values to test
k_values = range(1, 11)

# Metrics to collect. The target has more than two classes, so precision and
# recall must be macro-averaged; the plain 'precision'/'recall' scorers assume
# a binary target and fail on multiclass labels. The dict keys keep the result
# names 'test_accuracy', 'test_precision' and 'test_recall'.
knn_scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision_macro',
    'recall': 'recall_macro',
}

# Initialize lists to store the mean metric for each k (same order as k_values)
accuracy_scores = []
error_scores = []
precision_scores = []
recall_scores = []

# Perform cross-validation for each value of k
for k in k_values:
    # Create a kNN classifier with the current value of k
    knn = KNeighborsClassifier(n_neighbors=k)

    # Perform 5-fold cross-validation and get the per-fold scores
    scores = cross_validate(knn, X_train, y_train.ravel(), cv=5, scoring=knn_scoring)

    # Average each metric over the folds; error is the complement of accuracy
    accuracy = np.mean(scores['test_accuracy'])
    error = 1 - accuracy
    precision = np.mean(scores['test_precision'])
    recall = np.mean(scores['test_recall'])

    accuracy_scores.append(accuracy)
    error_scores.append(error)
    precision_scores.append(precision)
    recall_scores.append(recall)

# Find the best k: the one with the highest mean accuracy (first one on ties)
best_k = k_values[np.argmax(accuracy_scores)]
print(f'Best k: {best_k}')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt


def _regression_errors(y_true, y_pred):
    """Return ``(MAE, MSE, RMSE)`` for predictions ``y_pred`` against ``y_true``.

    RMSE is computed as the square root of MSE rather than through
    ``mean_squared_error(..., squared=False)``, because the ``squared``
    argument is deprecated and absent from recent scikit-learn releases.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    return mae, mse, np.sqrt(mse)


# Split into independent variables and dependent variable.
# TODO: the names below are placeholders; replace them with the real
# merged_df columns before running this cell.
feature_columns = ['ivar1', 'ivar2', 'ivar3']
target_column = 'dvar'
X = merged_df[feature_columns]
y = merged_df[target_column]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression model
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, y_train)

# Train random forest regression model
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train, y_train)

# Train ridge regression model
# Can potentially adjust alpha, maybe try different values
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Train lasso regression model
# Can potentially adjust alpha, maybe try different values
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train, y_train)

# Evaluate every model on the held-out test set
linear_reg_pred = linear_reg_model.predict(X_test)
linear_reg_mae, linear_reg_mse, linear_reg_rmse = _regression_errors(y_test, linear_reg_pred)

random_forest_pred = random_forest_model.predict(X_test)
random_forest_mae, random_forest_mse, random_forest_rmse = _regression_errors(y_test, random_forest_pred)

ridge_pred = ridge_model.predict(X_test)
ridge_mae, ridge_mse, ridge_rmse = _regression_errors(y_test, ridge_pred)

lasso_pred = lasso_model.predict(X_test)
lasso_mae, lasso_mse, lasso_rmse = _regression_errors(y_test, lasso_pred)

# Visualize metrics and compare them

# Define labels and data for each model (all lists share the same model order)
models = ['Linear Regression', 'Random Forest', 'Ridge Regression', 'Lasso Regression']
mae_scores = [linear_reg_mae, random_forest_mae, ridge_mae, lasso_mae]
mse_scores = [linear_reg_mse, random_forest_mse, ridge_mse, lasso_mse]
rmse_scores = [linear_reg_rmse, random_forest_rmse, ridge_rmse, lasso_rmse]

# One bar chart per metric: (title, y-axis label, values, bar color)
metric_plots = [
    ('Mean Absolute Error (MAE) Comparison', 'MAE', mae_scores, 'skyblue'),
    ('Mean Squared Error (MSE) Comparison', 'MSE', mse_scores, 'salmon'),
    ('Root Mean Squared Error (RMSE) Comparison', 'RMSE', rmse_scores, 'lightgreen'),
]
for plot_title, metric_label, metric_values, bar_color in metric_plots:
    plt.figure(figsize=(10, 5))
    plt.bar(models, metric_values, color=bar_color)
    plt.title(plot_title)
    plt.xlabel('Regression Model')
    plt.ylabel(metric_label)
    plt.xticks(rotation=45)
    plt.show()