# NLP Preprocessing

In [17]:
import numpy as np
import pandas as pd
import nltk
import data_cleaning as dc

from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Load dataset
df = pd.read_csv('./stockerbot-export-preprocessed.csv', on_bad_lines='skip')

# Prepare the NLTK resources
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()

# Tracks if we did any preprocessing
preprocessed = False

# Add sentiment column with TextBlob if it doesn't exist
if 'tweet_polarity' not in df.columns:
    print('Calculating sentiment column...')
    df['tweet_polarity'] = df['text'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)
    preprocessed = True
if 'tweet_subjectivity' not in df.columns:
    print('Calculating subjectivity column...')
    df['tweet_subjectivity'] = df['text'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)
    preprocessed = True

# Apply preprocessing to the 'text' column if it doesn't exist
if 'preprocessed_tweet' not in df.columns:
    print('Preprocessing text column...')
    df['preprocessed_tweet'] = df['text'].apply(lambda tweet: dc.preprocess_tweet(tweet, lemmatizer))
    preprocessed = True
    
# Save the preprocessed data
if preprocessed:
    print('Saving preprocessed data...')
    df.to_csv('./stockerbot-export-preprocessed.csv', index=False)

# Display the preprocessed text
pd.set_option('display.max_colwidth', None)
display(df.head(20))

[nltk_data] Downloading package wordnet to /Users/seby/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/seby/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified,tweet_polarity,tweet_subjectivity,preprocessed_tweet
0,1019696670777503700,VIDEO: “I was in my office. I was minding my own business...” –David Solomon tells $GS interns how he learned he wa… https://t.co/QClAITywXV,Wed Jul 18 21:33:26 +0000 2018,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777503745,True,0.6,1.0,"['video', 'office', 'minding', 'business', '–david', 'solomon', 'tell', '$GS', 'intern', 'learned', 'wa…', 'https://t.co/QClAITywXV']"
1,1019709091038548000,The price of lumber $LB_F is down 22% since hitting its YTD highs. The Macy's $M turnaround is still happening.… https://t.co/XnKsV4De39,Wed Jul 18 22:22:47 +0000 2018,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038547968,True,-0.155556,0.288889,"['price', 'lumber', '$LB_F', '22', 'since', 'hitting', 'ytd', 'high', 'macy', '$M', 'turnaround', 'still', 'https://t.co/XnKsV4De39']"
2,1019711413798035500,Who says the American Dream is dead? https://t.co/CRgx19x7sA,Wed Jul 18 22:32:01 +0000 2018,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True,-0.1,0.2,"['say', 'american', 'dream', 'dead', 'https://t.co/CRgx19x7sA']"
3,1019716662587740200,Barry Silbert is extremely optimistic on bitcoin -- but predicts that 99% of new crypto entrants are “going to zero… https://t.co/mGMVo2cZgY,Wed Jul 18 22:52:52 +0000 2018,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587740160,True,0.005682,0.727273,"['barry', 'silbert', 'extremely', 'optimistic', 'bitcoin', 'predicts', '99', 'new', 'crypto', 'entrant', 'going', 'zero…', 'https://t.co/mGMVo2cZgY']"
4,1019718460287389700,How satellites avoid attacks and space junk while circling the Earth https://t.co/aHzIV3Lqp5 #paid @Oracle https://t.co/kacpqZWiDJ,Wed Jul 18 23:00:01 +0000 2018,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True,0.0,0.0,"['satellite', 'avoid', 'attack', 'space', 'junk', 'circling', 'earth', 'https://t.co/aHzIV3Lqp5', 'paid', '@Oracle', 'https://t.co/kacpqZWiDJ']"
5,1019719465095790600,.@RealMoney's David Butler's favorite FANG stock isn't #RealMoneySOD Alphabet but Facebook https://t.co/MczAPSFjOi,Wed Jul 18 23:04:00 +0000 2018,jimcramer,FB-GOOGL-GOOG,Facebook*Alphabet*Alphabet,http://bit.ly/2NrYxje,True,0.5,1.0,"['david', 'butler', 'favorite', 'fang', 'stock', 'realmoneysod', 'alphabet', 'facebook', 'https://t.co/MczAPSFjOi']"
6,1019720209786114000,Don’t miss my convo with one of my favorite thinkers @SamHarrisOrg! https://t.co/uuPVxIobCh,Wed Jul 18 23:06:58 +0000 2018,ianbremmer,HRS,Harris,https://twitter.com/samharrisorg/status/1019719376348434433,True,0.625,1.0,"['miss', 'convo', 'one', 'favorite', 'thinker', '@SamHarrisOrg', 'https://t.co/uuPVxIobCh']"
7,1019720659738480600,U.S. intelligence documents on Nelson Mandela made public https://t.co/XTnEfo1rO6 https://t.co/V8DXkWDQ6R,Wed Jul 18 23:08:45 +0000 2018,Reuters,INTC-USB,Intel*U.S.,https://reut.rs/2O0ypNf,True,0.0,0.066667,"['intelligence', 'document', 'nelson', 'mandela', 'made', 'public', 'https://t.co/XTnEfo1rO6', 'https://t.co/V8DXkWDQ6R']"
8,1019720723441635300,Senate wants emergency alerts to go out through Netflix Spotify etc. https://t.co/23yy3whBlc by @grg,Wed Jul 18 23:09:00 +0000 2018,TechCrunch,NFLX,Netflix,https://tcrn.ch/2L8DsgT,True,0.2,0.1,"['senate', 'want', 'emergency', 'alert', 'go', 'netflix', 'spotify', 'etc', 'https://t.co/23yy3whBlc', '@grg']"
9,1019721145396887600,Hedge fund manager Marc Larsy says bitcoin $40K is possible https://t.co/54uPe0OWqT,Wed Jul 18 23:10:41 +0000 2018,MarketWatch,BTC,Bitcoin,https://on.mktw.net/2Ntr7k9,True,0.0,1.0,"['hedge', 'fund', 'manager', 'marc', 'larsy', 'say', 'bitcoin', '$40K', 'possible', 'https://t.co/54uPe0OWqT']"


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# TF-IDF vectorization for the 'preprocessed_tweet' column
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_features = tfidf_vectorizer.fit_transform(df['preprocessed_tweet'].astype('U'))  # Convert to Unicode

# One-hot encoding for categorical variables
onehot_encoder = OneHotEncoder()
onehot_features_source = onehot_encoder.fit_transform(df[['source']])
onehot_features_symbols = onehot_encoder.fit_transform(df[['symbols']])

# Scaling numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['tweet_polarity', 'tweet_subjectivity']])

## Testing/Training Data

In [33]:
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split

# Combine all features into a single matrix
X = hstack([tfidf_features, onehot_features_source, onehot_features_symbols, scaled_features])

# Fill price_day_after with filler data
df['price_day_after'] = np.random.randint(0, 1001, size=len(df))

# The target variable
y = df['price_day_after'].values

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

# Define the range of k values to test
k_values = range(1, 11)

# Initialize dictionaries to store the metrics
accuracy_scores = []
error_scores = []
precision_scores = []
recall_scores = []

# Perform cross-validation for each value of k
for k in k_values:
    # Create a kNN classifier with the current value of k
    knn = KNeighborsClassifier(n_neighbors=k)
    
    # Perform cross-validation and get the scores
    scores = cross_validate(knn, X_train, y_train.ravel(), cv=5, scoring=['accuracy', 'precision', 'recall'])
    
    # Calculate and store the metrics
    accuracy = np.mean(scores['test_accuracy'])
    error = 1 - accuracy
    precision = np.mean(scores['test_precision'])
    recall = np.mean(scores['test_recall'])
    
    accuracy_scores.append(accuracy)
    error_scores.append(error)
    precision_scores.append(precision)
    recall_scores.append(recall)
    
# Find the best k
best_k = k_values[np.argmax(accuracy_scores)]
print(f'Best k: {best_k}')

Best k: 1


In [34]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy.sparse import hstack
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Create the models
linear_regression = LinearRegression()
logistic_regression = LogisticRegression(random_state=0, max_iter=1000)
lda = LinearDiscriminantAnalysis(shrinkage=None)
knn = KNeighborsClassifier(n_neighbors=best_k)
models = [linear_regression, logistic_regression, lda, knn]

# Create an instance of KFold
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Convert coo_matrix to csr_matrix for slicing
X_csr = X.tocsr()

# Perform k-fold validation
for train_index, test_index in kf.split(X):
    # Split the data into training and testing sets
    X_train, X_test = X_csr[train_index], X_csr[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train your model and evaluate its performance
    for model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
    
        # Print the mean squared error for each fold
        model_type = type(model).__name__
        print(f"{model_type} - Mean Squared Error: {mse}")

LinearRegression - Mean Squared Error: 104218.79264611927
LogisticRegression - Mean Squared Error: 166248.36564655934


TypeError: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.