In [1]:
# Import the necessary modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split 
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error

import numpy as np

import matplotlib.pyplot as plt
from matplotlib import style

# Adjusting the size of matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
mpl.__version__

# Adjusting the style of matplotlib
style.use('ggplot')

# Magic to tell python notebook we want matplotlib charts included
%matplotlib inline

In [2]:
train_df = pd.read_json("training_set.json")
test_df = pd.read_json("test_set.json")
hashtag_corpus = pd.read_json("NTUSDFinCorpus/NTUSD_Fin_hashtag_v1.0.json")
word_corpus = pd.read_json("NTUSDFinCorpus/NTUSD_Fin_word_v1.0.json")

In [17]:
hashtag_corpus

Unnamed: 0,bear_cfidf,bear_freq,bull_cfidf,bull_freq,chi_squared,market_sentiment,token,word_vec
0,0.000000,0,66.746120,33,46.946756,1.274635,boolishaf,"[-0.10372790694236701, -0.039682704955339, 0.0..."
1,0.000000,0,63.342516,25,35.545978,1.274635,magicdojiplay,"[-0.035492535680532004, -0.035272806882858006,..."
2,0.000000,0,53.785018,12,17.046690,1.274635,moist,"[0.054784148931503004, -0.06538165360689101, 0..."
3,36.127804,6,59.257745,18,11.237069,0.859598,china,"[0.005029591266065001, -0.09715940803289401, -..."
4,0.000000,0,56.851343,15,21.312796,1.274635,senhance,"[-0.054273013025522, -0.09080003947019501, -0...."
5,0.000000,0,51.191971,10,14.203605,1.274635,savage,"[-0.067075610160827, -0.079591922461986, -0.05..."
6,33.945773,5,60.604310,20,15.441303,0.952707,fakenews,"[0.028488140553236, 0.010947021655738002, 0.07..."
7,15.453946,1,65.458982,30,39.379523,1.227330,dontbeasheep,"[-0.03775043413043, -0.000720742158591, 0.0678..."
8,40.806634,9,58.513487,17,6.213774,0.661658,market,"[0.056465990841388, 0.06531291455030401, 0.018..."
9,0.000000,0,57.713857,16,22.735225,1.274635,sippinknows,"[0.057264972478151, -0.11316230148077001, 0.10..."


In [3]:
conditions = [
    (train_df['sentiment'] == 0) , 
    (train_df['sentiment'] <  0) ,
    (train_df['sentiment'] >  0)]
choices = ['neutral', 'bullish', 'bearish']
train_df['classes'] = np.select(conditions, choices, default='neutral')

In [4]:
#checking for missing values
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1396 entries, 0 to 1395
Data columns (total 5 columns):
sentiment    1396 non-null float64
snippet      1396 non-null object
target       1396 non-null object
tweet        1396 non-null object
classes      1396 non-null object
dtypes: float64(1), object(4)
memory usage: 54.6+ KB
None


In [5]:
train_df

Unnamed: 0,sentiment,snippet,target,tweet,classes
0,-0.463,downgrade,$PCAR,downgrades $SON $ARI $GG $FLTX $WMC $MFA $IVR ...,bullish
1,0.678,"[looking sexy this morning, break on volume]",$AMZN,$AMZN looking sexy this morning...$600 break o...,bearish
2,0.377,still long term fan!,$SBUX,@GerberKawasaki stock hasn't moved much since ...,bearish
3,0.129,$TFM will have a way to go price wise to compe...,$KR,Whole foods $WFM may feel price competition bu...,bearish
4,0.395,iPhone SE Could Be Doing Better Than Expected,$AAPL,Apple's iPhone SE Could Be Doing Better Than E...,bearish
5,0.458,Now up to 200 stocks making new 52 week highs,$TJX,Now up to 200 stocks making new 52 week highs ...,bearish
6,0.288,There's no reason why $GOOGL can't get back to...,$GOOGL,There's no reason why $GOOGL can't get back to...,bearish
7,-0.453,short,$CVEO,Nice names on watch for tomorrow $TUES long $8...,bullish
8,0.372,Pivotal sees 27% upside for Alphabet,$YHOO,Pivotal sees 27% upside for Alphabet https://t...,bearish
9,0.216,the most trending,$DWTI,Good Morning! Here's some of the most trending...,bearish


# Pre-Processing

In [None]:
train_df['tweet'] = train_df['tweet'].str.lower()
train_df['tweet'] = train_df['tweet'].str.replace('([@][\w_-]+)', ' @mentions ', case=False)
train_df['tweet'] = train_df['tweet'].str.replace('([$][a-z]+)', ' @cashtag ', case=False)
train_df['tweet'] = train_df['tweet'].str.replace('http\S+|www.\S+', '@url', case=False)

In [6]:
## Considering NLTK Tweeter Tokenizer
from nltk.tokenize import TweetTokenizer
tweettokenizer = TweetTokenizer()


# the problem about using tweeter tokenize is that it splits the cashtag and not the hashtag, so we might not be able to use this
#train_df['tweet_tokens'] = df.apply(lambda row: tweettokenizer.tokenize(row['tweet'].lower()), axis=1)

In [7]:
train_df

Unnamed: 0,sentiment,snippet,target,tweet,classes
0,-0.463,downgrade,$PCAR,downgrades $SON $ARI $GG $FLTX $WMC $MFA $IVR ...,bullish
1,0.678,"[looking sexy this morning, break on volume]",$AMZN,$AMZN looking sexy this morning...$600 break o...,bearish
2,0.377,still long term fan!,$SBUX,@GerberKawasaki stock hasn't moved much since ...,bearish
3,0.129,$TFM will have a way to go price wise to compe...,$KR,Whole foods $WFM may feel price competition bu...,bearish
4,0.395,iPhone SE Could Be Doing Better Than Expected,$AAPL,Apple's iPhone SE Could Be Doing Better Than E...,bearish
5,0.458,Now up to 200 stocks making new 52 week highs,$TJX,Now up to 200 stocks making new 52 week highs ...,bearish
6,0.288,There's no reason why $GOOGL can't get back to...,$GOOGL,There's no reason why $GOOGL can't get back to...,bearish
7,-0.453,short,$CVEO,Nice names on watch for tomorrow $TUES long $8...,bullish
8,0.372,Pivotal sees 27% upside for Alphabet,$YHOO,Pivotal sees 27% upside for Alphabet https://t...,bearish
9,0.216,the most trending,$DWTI,Good Morning! Here's some of the most trending...,bearish


# Training the Model

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X = list(train_df["tweet"])
y = train_df["sentiment"]

X = cv.fit_transform(X)

In [9]:
# nothing (1396, 4608)
# smth (1396, 3249)
print(X.shape)

(1396, 4608)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.33, random_state=42 )

In [11]:
#from sklearn.naive_bayes import MultinomialNB
#nb = MultinomialNB()
#nb.fit(X_train, y_train)

In [12]:
#linear regression model
from sklearn.linear_model import LinearRegression
log_model = LinearRegression()
log_model = log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)

In [13]:
# support vector regression

In [14]:
#Neural network models (supervised)



#random forest
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

print(clf.feature_importances_)
forest_prediction = clf.predict(X_test)
print(forest_prediction)
print(mean_squared_error(y_test, forest_prediction))

predictions = nb.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test, predictions))

In [15]:
y_pred

array([ -5.87302458e-01,   7.30000013e-02,   8.65339229e-01,
        -2.46433207e-01,  -2.81947619e-01,   4.51999980e-01,
        -4.00499997e-01,   1.23441694e-01,  -4.42999981e-01,
         1.06928689e-01,  -1.35999975e-01,  -2.73868925e-02,
         4.47082817e-01,   5.06180880e-02,   4.81861774e-01,
        -2.40425830e-01,   9.68995979e-02,   3.52000024e-01,
         1.23078658e-01,   4.42541768e-01,   5.11953393e-01,
         3.65000020e-01,   2.77629527e-01,   1.17588824e-01,
         7.30000013e-02,  -2.20907678e-01,   1.42760343e-01,
         3.99666665e-01,   6.14285181e-01,   6.63006849e-01,
         2.83200005e-01,   1.85681039e-01,  -4.43251096e-01,
        -2.06166008e-02,   3.23277038e-01,  -2.56868866e-01,
         5.99566842e-01,   1.15578648e-01,   1.65186584e-01,
         2.42085955e-01,   1.59381572e-01,   3.06365966e-01,
         4.61000013e-01,  -2.91664380e-01,   1.65000026e-01,
         9.89387538e-02,  -2.41148518e-02,  -1.86038725e-01,
        -4.08359292e-01,

# Evaluation

In [16]:
#from sklearn.metrics import mean_squared_error
#from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

print(mean_squared_error(y_test, y_pred))
print('\n')
#f1_score(y_test, y_pred, average='macro')
#f1_score(y_test, y_pred, average='micro')
#f1_score(y_test, y_pred, average='weighted') 


0.114464208529


