In [1]:
# Import the necessary modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split 
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error

import numpy as np

import matplotlib.pyplot as plt
from matplotlib import style

# Set the default figure size (width, height) for every matplotlib figure
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# NOTE(review): a bare expression that is not the last line of a cell is not
# displayed by the notebook, so this line has no visible effect here.
mpl.__version__

# Apply the 'ggplot' style sheet to all subsequent plots
style.use('ggplot')

# IPython magic: render matplotlib charts inline in the notebook output
%matplotlib inline

In [2]:
# Load the labelled tweets; both files are read relative to the working directory.
train_df = pd.read_json("training_set.json")
test_df = pd.read_json("test_set.json")
# hashtag_corpus = pd.read_json("NTUSDFinCorpus/NTUSD_Fin_hashtag_v1.0.json")
# word_corpus = pd.read_json("NTUSDFinCorpus/NTUSD_Fin_word_v1.0.json")

# Derive a categorical label from the sign of the continuous sentiment score.
# np.select pairs `conditions` and `choices` by position, so both lists must be
# in the same order: zero -> neutral, negative -> bearish, positive -> bullish.
# (The labels for the negative and positive cases were previously swapped,
# which contradicted the mapping used by assignClasses in the evaluation cells.)
conditions = [
    (train_df['sentiment'] == 0),
    (train_df['sentiment'] < 0),
    (train_df['sentiment'] > 0)]
choices = ['neutral', 'bearish', 'bullish']
# `default` only applies to rows matching none of the conditions (e.g. NaN scores).
train_df['classes'] = np.select(conditions, choices, default='neutral')

# Pre-Processing

In [3]:
# Normalise the training tweets: lowercase the text, then replace user handles,
# cashtags and links with generic placeholder tokens.
#
# regex=True is passed explicitly because pandas >= 2.0 defaults to literal
# matching in Series.str.replace, which would leave every pattern below
# unmatched. Raw strings keep \w and \S from being parsed as (invalid) Python
# string escapes.
# NOTE(review): the '.' in 'www.' is unescaped and therefore matches any
# character; the pattern is kept as-is so that train and test preprocessing
# stay identical.
train_df['tweet'] = (
    train_df['tweet']
    .str.lower()
    .str.replace(r'([@][\w_-]+)', ' @mentions ', case=False, regex=True)
    .str.replace(r'([$][a-z]+)', ' @cashtag ', case=False, regex=True)
    .str.replace(r'http\S+|www.\S+', '@url', case=False, regex=True)
)

In [4]:
# Normalise the test tweets with exactly the same steps as the training tweets:
# lowercase the text, then replace user handles, cashtags and links with
# generic placeholder tokens.
#
# regex=True is passed explicitly because pandas >= 2.0 defaults to literal
# matching in Series.str.replace, which would leave every pattern below
# unmatched. Raw strings keep \w and \S from being parsed as (invalid) Python
# string escapes.
# NOTE(review): the '.' in 'www.' is unescaped and therefore matches any
# character; the pattern is kept as-is so that train and test preprocessing
# stay identical.
test_df['tweet'] = (
    test_df['tweet']
    .str.lower()
    .str.replace(r'([@][\w_-]+)', ' @mentions ', case=False, regex=True)
    .str.replace(r'([$][a-z]+)', ' @cashtag ', case=False, regex=True)
    .str.replace(r'http\S+|www.\S+', '@url', case=False, regex=True)
)

In [5]:
# Preview only the first rows instead of dumping the whole frame.
test_df.head(10)

Unnamed: 0,sentiment,snippet,target,tweet
0,0.323,ooks pretty bullish for now,$ATVI,@cashtag ooks pretty bullish for now. from a...
1,0.579,"[looks really interesting on drop, grabbed som...",$CSCO,"@cashtag looks really interesting on drop, g..."
2,0.294,covered some shorts,$TSLA,@cashtag : covered some shorts @ 246.00 for ...
3,0.028,triple top forming.,$TSLA,watching @cashtag w triple top forming.
4,-0.076,Whole Foods shareholders vote down activist in...,$WFM,whole foods shareholders vote down activist in...
5,-0.387,Still too early for an entry,$FB,@cashtag has returned to my short watchlist....
6,0.087,Today I bought more,$AXP,today i bought more @cashtag @cashtag @ca...
7,-0.464,Short Setups Looking Nice....Really Nice,$BBY,short setups looking nice....really nice | @c...
8,-0.291,"Tesla is recalling 2,700 Model X cars",$TSLA,"tesla is recalling 2,700 model x cars: @url @..."
9,0.318,What a nice reversal.,$GOOG,thank you @cashtag (google alphabet) and @c...


In [6]:
# Preview only the first rows instead of dumping the whole frame.
train_df.head(10)

Unnamed: 0,sentiment,snippet,target,tweet
0,-0.463,downgrade,$PCAR,downgrades @cashtag @cashtag @cashtag @...
1,0.678,"[looking sexy this morning, break on volume]",$AMZN,@cashtag looking sexy this morning...$600 br...
2,0.377,still long term fan!,$SBUX,@mentions stock hasn't moved much since firs...
3,0.129,$TFM will have a way to go price wise to compe...,$KR,whole foods @cashtag may feel price competit...
4,0.395,iPhone SE Could Be Doing Better Than Expected,$AAPL,apple's iphone se could be doing better than e...
5,0.458,Now up to 200 stocks making new 52 week highs,$TJX,now up to 200 stocks making new 52 week highs ...
6,0.288,There's no reason why $GOOGL can't get back to...,$GOOGL,there's no reason why @cashtag can't get bac...
7,-0.453,short,$CVEO,nice names on watch for tomorrow @cashtag lo...
8,0.372,Pivotal sees 27% upside for Alphabet,$YHOO,pivotal sees 27% upside for alphabet @url @ca...
9,0.216,the most trending,$DWTI,good morning! here's some of the most trending...


# Training the Model

In [7]:
# Bag-of-words vectorizer over word uni-, bi- and tri-grams, using
# scikit-learn's built-in English stop-word list.
# CountVectorizer is already imported in the notebook's first (imports) cell,
# so it is not re-imported here.
cv = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words='english',
)

In [8]:
# Vectorise the training tweets once. This cell previously called
# fit_transform twice on the same data, building two identical matrices.
X = cv.fit_transform(list(train_df["tweet"]))
y = train_df["sentiment"]

# `model` is kept only as a backward-compatible alias of the document-term
# matrix; despite its name it is a sparse count matrix, not an estimator.
model = X

# Hold out a third of the training tweets for validation (fixed seed).
# NOTE(review): the following cell reassigns X_train / X_test / y_train /
# y_test from the full train and test files, so this split is not what the
# models below are evaluated on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [9]:
# Final design matrices: the vocabulary is learned from the training tweets
# only and then reused unchanged to encode the test tweets.
X_train = cv.fit_transform(train_df["tweet"])
X_test = cv.transform(test_df["tweet"])

# Regression targets are the continuous sentiment scores.
y_train, y_test = train_df["sentiment"], test_df["sentiment"]

In [10]:
# Linear regression on the n-gram counts; predicts the continuous sentiment
# score. The variable keeps its original name `log_model` for compatibility,
# but it is a LinearRegression, not a logistic model.
from sklearn.linear_model import LinearRegression
log_model = LinearRegression(fit_intercept=True)
log_model = log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)

# Random forest. The target is a continuous score, so a regressor is required:
# RandomForestClassifier raises "Unknown label type: continuous" on such a
# target, and the MSE printed below only makes sense for numeric predictions.
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

print(clf.feature_importances_)
forest_prediction = clf.predict(X_test)
print(forest_prediction)
print(mean_squared_error(y_test, forest_prediction))

# Classification view of the forest's output. This cell previously called
# `nb.predict(X_test)`, but no `nb` model is defined anywhere in the notebook,
# so it failed on a fresh kernel; confusion_matrix / classification_report also
# cannot be computed against the continuous y_test. Both sides are therefore
# reduced to sentiment classes by the sign of the score.
from sklearn.metrics import confusion_matrix, classification_report

label_names = ['bullish', 'bearish']
y_test_values = np.asarray(y_test)
y_test_labels = np.select(
    [y_test_values > 0, y_test_values < 0], label_names, default='neutral'
)
predictions = np.select(
    [forest_prediction > 0, forest_prediction < 0], label_names, default='neutral'
)

print(confusion_matrix(y_test_labels, predictions))
print('\n')
print(classification_report(y_test_labels, predictions))

# Evaluation

In [11]:
def assignClasses(data):
    """Map each numeric score in `data` to a sentiment class name.

    Parameters
    ----------
    data : iterable of numbers
        Scores to classify; consumed once, in order.

    Returns
    -------
    list of str
        One label per score: "bullish" for a score greater than zero,
        "bearish" for a score less than zero, and "neutral" for anything
        else (zero, or a value that is neither above nor below zero).
    """
    def label_for(score):
        # Guard clauses in the same order as the sign checks: positive first.
        if score > 0:
            return "bullish"
        if score < 0:
            return "bearish"
        return "neutral"

    return [label_for(score) for score in data]

In [12]:
# Turn the true and the predicted sentiment scores into class labels
# (true values first, then the linear-regression predictions).
n_y_test, n_y_pred = map(assignClasses, (y_test, y_pred))

In [13]:
# Sanity check: number of labelled test examples that will be scored.
len(n_y_test)

634

In [14]:
# mean_squared_error and f1_score come from the notebook's first (imports) cell.
from sklearn.metrics import classification_report

# Regression quality: error between true and predicted sentiment scores.
mse = mean_squared_error(y_test, y_pred)
print("MSE: ", mse)
print('\n')

# Classification quality on the sign-derived labels, macro- then micro-averaged.
for caption, averaging in (("F1 Macro Avg: ", 'macro'), ("F1 Micro Avg: ", 'micro')):
    print(caption, f1_score(n_y_test, n_y_pred, average=averaging))


MSE:  0.093756820038


F1 Macro Avg:  0.51849022167
F1 Micro Avg:  0.796529968454


  'precision', 'predicted', average, warn_for)
