### JSON train data

In [58]:
import html
import html.parser as htmlparser
import json
import re
import warnings # Ignores any warning

import emoji
import nltk
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

In [3]:
# Fetch the NLTK stop-word corpus used later by the stop-word removal cells.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shreya1996/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords

In [3]:
# Load the raw training JSON; the following loop reads its `records` column.
jdata = pd.read_json("train_data-(2).json")

In [5]:
# Pull the four fields of interest out of every record into parallel lists.
x = [rec['stocktwit_tweet'] for rec in jdata.records]            # tweet text
y = [rec['sentiment_score'] for rec in jdata.records]            # label
z = [pd.to_datetime(rec['timestamp']) for rec in jdata.records]  # parsed timestamp
a = [rec['ticker'] for rec in jdata.records]                     # stock symbol

In [6]:
# Assemble the parallel lists into a single frame
# (column order: Tweet, Sentiment_score, Date, Stock).
frame_columns = {
    'Tweet': x,
    'Sentiment_score': y,
    'Date': z,
    'Stock': a,
}
json_data = pd.DataFrame(frame_columns)

In [9]:
# Persist the raw frame. The row index is written as an unnamed first column:
# the original argument was the *string* "False", which is truthy, so pandas
# always emitted the index. `index=True` states that behaviour explicitly and
# keeps the file layout unchanged for the reload step that drops that column.
json_data.to_csv('jdata.csv', index=True)

In [29]:
# Reload the saved CSV into the working frame used by all training-data cells.
jf = pd.read_csv('jdata.csv')

In [30]:
# Discard the index column introduced by the CSV round-trip (read_csv names a
# header-less first column "Unnamed: 0"). Dropping by name with
# errors='ignore' instead of by position with inplace=True makes the cell safe
# to re-run: a second execution no longer deletes the 'Tweet' column.
jf = jf.drop(columns=['Unnamed: 0'], errors='ignore')

In [31]:
# Preview the first rows of the reloaded training frame.
jf.head()

Unnamed: 0,Tweet,Sentiment_score,Date,Stock
0,$AMD going up but hesitating however chart is ...,3,2018-09-19 18:38:28+00:00,$AMD
1,@inforlong @MariaGascon Despite\nChina trade w...,3,2018-10-09 03:51:06+00:00,$CAT
2,$AVGO WTF?,2,2018-07-12 13:35:32+00:00,$AVGO
3,$PH\n New Insider Filing On: \n MULLER KLAUS P...,2,2018-07-19 03:32:50+00:00,$PH
4,$FB if it bounces tommorrow do the right thing...,3,2018-08-23 19:07:54+00:00,$FB


In [32]:
# Split each timestamp string at the space into a calendar-date part and a
# clock-time part, then retire the combined 'Date' column.
date = jf['Date']
pieces = [stamp.split(" ") for stamp in date]
a, b = zip(*pieces)
jf['date'], jf['time'] = a, b
jf.drop(columns=['Date'], inplace=True)

In [33]:
# Derive calendar features from the date string. The column is parsed once
# and the result reused, instead of re-parsing every row for each feature.
parsed_dates = pd.to_datetime(jf['date'])
jf['Date'] = parsed_dates.dt.day
jf['Month'] = parsed_dates.dt.month
jf['Year'] = parsed_dates.dt.year
jf['Weekday'] = parsed_dates.dt.weekday

##### Basic cleaning

In [34]:
# Decode HTML character entities in the tweet text.
# HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9, so
# the original call fails on current interpreters; html.unescape() is the
# supported replacement and needs no parser instance.
jf['Tweet'] = jf['Tweet'].apply(html.unescape)

In [35]:
# Delete URLs from the tweet text and normalise whitespace.
# The original joined the *string* returned by re.sub, which inserts a space
# between every single character ("abc" becomes "a b c") and breaks all later
# token-based steps. Splitting into words first, as the other cleaning cells
# do, joins whole tokens instead.
jf['Tweet'] = jf['Tweet'].apply(lambda x: ' '.join(re.sub(r"http\S+", "", x).split()))

In [36]:
# Turn every newline into a single space so each tweet sits on one line.
jf['Tweet'] = jf['Tweet'].str.replace(pat="\\n", repl=" ", regex=True)

In [37]:
# Collapse a doubled dollar sign in the ticker into a single one (literal match).
jf['Stock'] = jf['Stock'].str.replace(pat='$$', repl='$', regex=False)

In [38]:
# Strip the remaining dollar sign so only the bare ticker symbol is kept.
jf['Stock'] = jf['Stock'].str.replace(pat='$', repl='', regex=False)

In [39]:
# Normalise ticker symbols to upper case.
jf['Stock']=jf['Stock'].str.upper()

In [40]:
# Lower-case the tweet text so later token matching is case-insensitive.
jf['Tweet']=jf['Tweet'].str.lower()

In [41]:
jf.head()  # preview the frame after the basic cleaning steps (tweets now lower-case)

Unnamed: 0,Tweet,Sentiment_score,Stock,date,time,Date,Month,Year,Weekday
0,$amd going up but hesitating however chart is ...,3,AMD,2018-09-19,18:38:28+00:00,19,9,2018,2
1,@inforlong @mariagascon despite china trade wa...,3,CAT,2018-10-09,03:51:06+00:00,9,10,2018,1
2,$avgo wtf?,2,AVGO,2018-07-12,13:35:32+00:00,12,7,2018,3
3,$ph new insider filing on: muller klaus pet...,2,PH,2018-07-19,03:32:50+00:00,19,7,2018,3
4,$fb if it bounces tommorrow do the right thing...,3,FB,2018-08-23,19:07:54+00:00,23,8,2018,3


##### Basic Pre-processing on json train data

In [42]:
def _strip_mentions(text):
    """Blank out @handles (letters and digits only) and collapse whitespace."""
    without_handles = re.sub("(@[A-Za-z0-9]+)", " ", text)
    return ' '.join(without_handles.split())


# Remove user names from every tweet.
jf['Tweet'] = jf['Tweet'].apply(_strip_mentions)
jf['Tweet'].head()

0    $amd going up but hesitating however chart is ...
1        despite china trade war $cat held very well 👍
2                                           $avgo wtf?
3    $ph new insider filing on: muller klaus peter ...
4    $fb if it bounces tommorrow do the right thing...
Name: Tweet, dtype: object

In [43]:
def _strip_digits(text):
    """Replace each digit with a space, then squeeze whitespace runs."""
    without_digits = re.sub("[0-9]", " ", text)
    return " ".join(without_digits.split())


# Remove numerics from every tweet.
jf['Tweet'] = jf['Tweet'].apply(_strip_digits)
jf['Tweet'].head()

0    $amd going up but hesitating however chart is ...
1        despite china trade war $cat held very well 👍
2                                           $avgo wtf?
3    $ph new insider filing on: muller klaus peter ...
4    $fb if it bounces tommorrow do the right thing...
Name: Tweet, dtype: object

In [44]:
# Spell emoji out as words. demojize yields tokens such as ":thumbs_up:", so
# the ':' delimiters and the '_' separators are replaced by spaces afterwards.
jf['Tweet'] = (
    jf['Tweet']
    .apply(emoji.demojize)
    .str.replace(":", " ", regex=False)
    .str.replace("_", " ", regex=False)
)
jf['Tweet'].head()

0    $amd going up but hesitating however chart is ...
1    despite china trade war $cat held very well  t...
2                                           $avgo wtf?
3    $ph new insider filing on  muller klaus peter ...
4    $fb if it bounces tommorrow do the right thing...
Name: Tweet, dtype: object

In [45]:
# Replace every non-letter character with a space.
# regex=True is required: the pattern is a character class, and since
# pandas 2.0 `Series.str.replace` treats the pattern as a literal string by
# default, which would silently leave all punctuation in place.
jf['Tweet'] = jf['Tweet'].str.replace("[^a-zA-Z]", " ", regex=True)
jf['Tweet'].head()

0     amd going up but hesitating however chart is ...
1    despite china trade war  cat held very well  t...
2                                            avgo wtf 
3     ph new insider filing on  muller klaus peter ...
4     fb if it bounces tommorrow do the right thing...
Name: Tweet, dtype: object

In [46]:
# Squash any character repeated three or more times in a row down to two
# (e.g. "soooo" becomes "soo"); existing doubles are left as they are.
repeated_run = re.compile(r'(.)\1{1,}')
jf['Tweet'] = jf['Tweet'].apply(lambda text: repeated_run.sub(r'\1\1', text))

In [47]:
# Drop English stop words. The NLTK word list is converted to a set so each
# membership test is a hash lookup instead of a scan of the whole list for
# every token of every tweet.
stop = set(stopwords.words('english'))
jf['Tweet'] = jf['Tweet'].apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in stop)
)
jf['Tweet'].head()

0    amd going hesitating however chart stable goin...
1         despite china trade war cat held well thumbs
2                                             avgo wtf
3    ph new insider filing muller klaus peter trans...
4                fb bounces tommorrow right thing gtfo
Name: Tweet, dtype: object

In [48]:
from nltk.stem import PorterStemmer

st = PorterStemmer()


def _stem_tokens(text):
    """Porter-stem each whitespace-separated token and re-join with spaces."""
    return " ".join(map(st.stem, text.split()))


# Stem every word of every tweet.
jf['Tweet'] = jf['Tweet'].apply(_stem_tokens)

In [49]:
# How many tweets became empty strings after cleaning?
jf['Tweet'].eq('').sum()

503

In [50]:
# Keep only the tweets that still contain text after cleaning.
has_text = jf['Tweet'] != ''
jf = jf[has_text]

In [51]:
# Preview the stemmed tweets.
jf['Tweet'].head()

0             amd go hesit howev chart stabl go upward
1           despit china trade war cat held well thumb
2                                             avgo wtf
3    ph new insid file muller klau peter transact code
4                  fb bounc tommorrow right thing gtfo
Name: Tweet, dtype: object

In [34]:
# Preview the fully cleaned training frame.
jf.head()

Unnamed: 0,Tweet,Sentiment_score,Stock,date,time,Date,Month,Year,Weekday
0,amd go hesit howev chart stabl go upward,3,AMD,2018-09-19,18:38:28+00:00,19,9,2018,2
1,despit china trade war cat held well thumb,3,CAT,2018-10-09,03:51:06+00:00,9,10,2018,1
2,avgo wtf,2,AVGO,2018-07-12,13:35:32+00:00,12,7,2018,3
3,ph new insid file muller klau peter transact code,2,PH,2018-07-19,03:32:50+00:00,19,7,2018,3
4,fb bounc tommorrow right thing gtfo,3,FB,2018-08-23,19:07:54+00:00,23,8,2018,3


In [35]:
# Drop rows whose tweet is an empty string (same filter as applied right after stemming).
jf =jf[jf['Tweet']!='']

In [36]:
# Sanity check: no empty tweets should remain after the filter.
jf['Tweet'].eq('').sum()

0

In [102]:
jf.to_csv("Cleaned_Json_TrainData.csv",index=False) # saved to a CSV (without the index) so later sessions can reuse it directly

In [37]:
# Reload the cleaned training data from disk.
# NOTE(review): the recorded preview of `fin` shows extra columns (word_count,
# char_count, numerics) and unstemmed tweets that no visible cell produces —
# presumably the file on disk came from an older run; confirm by re-running the
# save cell before this one.
fin = pd.read_csv("Cleaned_Json_TrainData.csv")

In [38]:
# Preview the reloaded cleaned training data.
fin.head()

Unnamed: 0,Tweet,Sentiment_score,Stock,date,time,Date,Month,Year,Weekday,word_count,char_count,numerics
0,amd going up but hesitating however chart is ...,3,AMD,2018-09-19,18:38:28+00:00,19,9,2018,2,13,74,0
1,despite china trade war cat held very well t...,3,CAT,2018-10-09,03:51:06+00:00,9,10,2018,1,11,69,0
2,avgo wtf,2,AVGO,2018-07-12,13:35:32+00:00,12,7,2018,3,2,10,0
3,ph new insider filing on muller klaus peter ...,2,PH,2018-07-19,03:32:50+00:00,19,7,2018,3,16,69,0
4,fb if it bounces tommorrow do the right thing...,3,FB,2018-08-23,19:07:54+00:00,23,8,2018,3,11,55,0


In [4]:
# (rows, columns) of the reloaded training data.
fin.shape

(1039131, 12)

###### Validation Prediction

In [39]:
# Features (cleaned tweet text) and target (sentiment label) for the validation experiment.
# NOTE(review): read_csv turns cells such as "null"/"nan"/"na" into NaN by default;
# if any cleaned tweet consists only of such a token the vectorizer will reject it — confirm.
x = fin['Tweet']
y = fin['Sentiment_score']

In [40]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
xTrain, xVal, yTrain, yVal = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [41]:
# Confirm the sizes of the training and validation partitions.
xTrain.shape, xVal.shape, yTrain.shape, yVal.shape 

((727391,), (311740,), (727391,), (311740,))

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary and IDF weights on the training tweets only, then reuse
# the fitted vectorizer to encode the validation tweets.
tfidf = TfidfVectorizer()
jf_vect_train = tfidf.fit_transform(xTrain)
jf_vect_val = tfidf.transform(xVal)

In [43]:
from scipy import sparse
from scipy.sparse import csc_matrix,csr_matrix

# Hold both TF-IDF matrices in CSR layout for the estimators that follow.
xTrain_vect_train, xTrain_vect_val = (
    csr_matrix(matrix) for matrix in (jf_vect_train, jf_vect_val)
)

##### Logistic Regression

In [51]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression

# Fit a newton-cg logistic regression on the training TF-IDF matrix ...
log_reg = LogisticRegression(solver='newton-cg').fit(xTrain_vect_train, yTrain)
# ... and label the validation tweets with it.
test_pred = log_reg.predict(xTrain_vect_val)

In [52]:
# Validation accuracy of the logistic-regression predictions.
print(accuracy_score(yVal,test_pred))

0.6506736382883171


In [54]:
from sklearn.metrics import f1_score
# Macro-averaged F1: every sentiment class contributes equally regardless of its frequency.
print(f1_score(yVal,test_pred, average='macro'))

0.5800019900625732


##### Random Forest

In [44]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the validation scores are reproducible between runs
# (same seed value as the train/validation split).
rf = RandomForestClassifier(random_state=0)
rf_fit = rf.fit(xTrain_vect_train, yTrain)

# The scoring cells read `X_counr_csr_val_pred`, which the original notebook
# never assigned; compute the validation predictions here so those cells run
# on a fresh kernel.
X_counr_csr_val_pred = rf_fit.predict(xTrain_vect_val)


In [48]:
from sklearn.metrics import accuracy_score
# Validation accuracy of the predictions stored in `X_counr_csr_val_pred`.
# NOTE(review): that variable is assumed to hold the random-forest predictions
# for `xTrain_vect_val` — confirm against the cell that assigns it.
print(accuracy_score(yVal,X_counr_csr_val_pred))

0.6382017065503304


In [55]:
from sklearn.metrics import f1_score
# Macro-averaged F1 of the predictions stored in `X_counr_csr_val_pred`
# (assumed to be the random-forest validation predictions — confirm).
print(f1_score(yVal,X_counr_csr_val_pred,average='macro'))

0.5706081622319079


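Logistic regression scored higher than the random forest on the validation split (accuracy 0.651 vs 0.638, macro F1 0.580 vs 0.571), so it is the model used for the final test-set predictions.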

### JSON test data

In [None]:
# Load the raw test JSON (unlabelled tweets to be scored).
jtest = pd.read_json("TestData/test_data.json")

In [None]:
# Collect tweet text, timestamp and ticker from the *test* records.
# The original loop iterated `jdata.records` (the training data), so the frame
# saved as the test set silently contained training tweets.
x = []
z = []
a = []
for record in jtest.records:
    x.append(record['stocktwit_tweet'])
    z.append(pd.to_datetime(record['timestamp']))
    a.append(record['ticker'])

In [None]:
# Assemble the test lists into one frame (column order: Tweet, Date, Stock).
test_columns = dict(Tweet=x, Date=z, Stock=a)
json_data = pd.DataFrame(test_columns)

In [None]:
# Persist the raw test frame. The original passed the *string* "False", which
# is truthy, so the row index was always written as an unnamed first column;
# `index=True` states that explicitly and leaves the file layout unchanged for
# the reload step that drops that column.
json_data.to_csv('jtest.csv', index=True)

In [5]:
# Reload the saved test CSV into the working frame used by all test-data cells.
jf_test = pd.read_csv('jtest.csv')

In [6]:
# Discard the index column introduced by the CSV round-trip (read_csv names a
# header-less first column "Unnamed: 0"). Dropping by name with
# errors='ignore' instead of by position with inplace=True makes the cell safe
# to re-run: a second execution no longer deletes the 'Tweet' column.
jf_test = jf_test.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Preview the first rows of the reloaded test frame.
jf_test.head()

Unnamed: 0,Tweet,Date,Stock
0,$CELG nothing to be exited about,2018-10-25 14:26:16+00:00,$CELG
1,$AMD yall exhaust your buyer on first green ca...,2018-07-13 13:50:39+00:00,$AMD
2,$AMD day traders day.,2018-09-25 19:10:54+00:00,$AMD
3,$CBS https://tenor.com/wLB8.gif,2018-07-27 22:45:48+00:00,$CBS
4,$MU weak price action so far today. Don’t be a...,2018-07-31 14:59:06+00:00,$MU


In [8]:
# Break each timestamp string at the space into a date part and a time part,
# then remove the combined 'Date' column.
date = jf_test['Date']
split_stamps = (stamp.split(" ") for stamp in date)
a, b = zip(*split_stamps)
jf_test['date'] = a
jf_test['time'] = b
jf_test.drop(columns=['Date'], inplace=True)

In [9]:
# Derive calendar features from the date string. The column is parsed once
# and the result reused, instead of re-parsing every row for each feature.
parsed_test_dates = pd.to_datetime(jf_test['date'])
jf_test['Date'] = parsed_test_dates.dt.day
jf_test['Month'] = parsed_test_dates.dt.month
jf_test['Year'] = parsed_test_dates.dt.year
jf_test['Weekday'] = parsed_test_dates.dt.weekday

#### Basic cleaning

In [10]:
# Decode HTML character entities in the tweet text.
# HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9, so
# the original call fails on current interpreters; html.unescape() is the
# supported replacement and needs no parser instance.
jf_test['Tweet'] = jf_test['Tweet'].apply(html.unescape)

In [12]:
# Delete URLs from the tweet text and normalise whitespace.
# The original joined the *string* returned by re.sub, which inserts a space
# between every single character ("abc" becomes "a b c") and breaks all later
# token-based steps. Splitting into words first, as the other cleaning cells
# do, joins whole tokens instead.
jf_test['Tweet'] = jf_test['Tweet'].apply(lambda x: ' '.join(re.sub(r"http\S+", "", x).split()))

In [13]:
# Turn every newline into a single space so each tweet sits on one line.
tweets_with_newlines = jf_test['Tweet']
jf_test['Tweet'] = tweets_with_newlines.str.replace("\\n", " ", regex=True)

In [14]:
# Collapse a doubled dollar sign in the ticker into a single one (literal match).
raw_tickers = jf_test['Stock']
jf_test['Stock'] = raw_tickers.str.replace('$$', '$', regex=False)

In [15]:
# Strip the remaining dollar sign so only the bare ticker symbol is kept.
prefixed_tickers = jf_test['Stock']
jf_test['Stock'] = prefixed_tickers.str.replace('$', '', regex=False)

In [16]:
# Normalise ticker symbols to upper case.
jf_test['Stock']=jf_test['Stock'].str.upper()

In [17]:
# Lower-case the tweet text so later token matching is case-insensitive.
jf_test['Tweet']=jf_test['Tweet'].str.lower()

In [18]:
jf_test.head()  # preview the test frame after the basic cleaning steps (tweets now lower-case)

Unnamed: 0,Tweet,Stock,date,time,Date,Month,Year,Weekday
0,$celg nothing to be exited about,CELG,2018-10-25,14:26:16+00:00,25,10,2018,3
1,$amd yall exhaust your buyer on first green ca...,AMD,2018-07-13,13:50:39+00:00,13,7,2018,4
2,$amd day traders day.,AMD,2018-09-25,19:10:54+00:00,25,9,2018,1
3,$cbs,CBS,2018-07-27,22:45:48+00:00,27,7,2018,4
4,$mu weak price action so far today. don’t be a...,MU,2018-07-31,14:59:06+00:00,31,7,2018,1


#### Basic Preprocessing

In [19]:
# Strip @mentions (letters and digits only), then collapse the whitespace
# left behind by the removal.
mention_free = jf_test['Tweet'].map(lambda tweet: re.sub("(@[A-Za-z0-9]+)", " ", tweet))
jf_test['Tweet'] = mention_free.map(lambda tweet: ' '.join(tweet.split()))
jf_test['Tweet'].head()

0                     $celg nothing to be exited about
1    $amd yall exhaust your buyer on first green ca...
2                                $amd day traders day.
3                                                 $cbs
4    $mu weak price action so far today. don’t be a...
Name: Tweet, dtype: object

In [20]:
# Replace each digit with a space, then squeeze the resulting whitespace runs.
digit_free = jf_test['Tweet'].map(lambda tweet: re.sub("[0-9]", " ", tweet))
jf_test['Tweet'] = digit_free.map(lambda tweet: " ".join(tweet.split()))
jf_test['Tweet'].head()

0                     $celg nothing to be exited about
1    $amd yall exhaust your buyer on first green ca...
2                                $amd day traders day.
3                                                 $cbs
4    $mu weak price action so far today. don’t be a...
Name: Tweet, dtype: object

In [21]:
# Spell emoji out as words. demojize yields tokens such as ":thumbs_up:", so
# the ':' delimiters and the '_' separators are replaced by spaces afterwards.
demojized = jf_test['Tweet'].map(emoji.demojize)
for symbol in (":", "_"):
    demojized = demojized.str.replace(symbol, " ", regex=False)
jf_test['Tweet'] = demojized
jf_test['Tweet'].head()

0                     $celg nothing to be exited about
1    $amd yall exhaust your buyer on first green ca...
2                                $amd day traders day.
3                                                 $cbs
4    $mu weak price action so far today. don’t be a...
Name: Tweet, dtype: object

In [22]:
# Replace every non-letter character with a space.
# regex=True is required: the pattern is a character class, and since
# pandas 2.0 `Series.str.replace` treats the pattern as a literal string by
# default, which would silently leave all punctuation in place.
jf_test['Tweet'] = jf_test['Tweet'].str.replace("[^a-zA-Z]", " ", regex=True)
jf_test['Tweet'].head()

0                      celg nothing to be exited about
1     amd yall exhaust your buyer on first green ca...
2                                 amd day traders day 
3                                                  cbs
4     mu weak price action so far today  don t be a...
Name: Tweet, dtype: object

In [23]:
def _cap_repeats(text):
    """Limit any run of one repeated character to two occurrences."""
    return re.sub(r'(.)\1{1,}', r'\1\1', text)


# Shorten exaggerated spellings (runs of three or more identical characters).
jf_test['Tweet'] = jf_test['Tweet'].apply(_cap_repeats)

In [24]:
# Drop English stop words. The NLTK word list is converted to a set so each
# membership test is a hash lookup instead of a scan of the whole list for
# every token of every tweet.
stop = set(stopwords.words('english'))
jf_test['Tweet'] = jf_test['Tweet'].apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in stop)
)
jf_test['Tweet'].head()

0                                  celg nothing exited
1       amd yall exhaust buyer first green candle byee
2                                  amd day traders day
3                                                  cbs
4    mu weak price action far today afraid go short...
Name: Tweet, dtype: object

In [25]:
from nltk.stem import PorterStemmer

# Porter-stem every whitespace-separated token of every test tweet.
st = PorterStemmer()
jf_test['Tweet'] = jf_test['Tweet'].map(
    lambda tweet: " ".join(st.stem(token) for token in tweet.split())
)

In [26]:
# How many test tweets became empty strings after cleaning?
jf_test['Tweet'].eq('').sum()

177

In [27]:
# Keep only the test tweets that still contain text after cleaning
# (the surviving rows keep their original index labels).
test_has_text = jf_test['Tweet'] != ''
jf_test = jf_test[test_has_text]

In [28]:
# Save the cleaned test frame (without the index) for reuse.
jf_test.to_csv("Cleaned_Json_TestData.csv",index=False)

### Final Predictions

In [52]:
# Inputs for the final model: fit on the whole cleaned training frame (`jf`),
# predict on the cleaned test frame (`jf_test`).
x_train = jf["Tweet"]
y_train = jf['Sentiment_score']
x_test  = jf_test["Tweet"]

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Re-fit the vectorizer on all training tweets, then encode the test tweets
# with the same vocabulary and IDF weights.
tfidf = TfidfVectorizer()
jf_vect_train = tfidf.fit_transform(x_train)
jf_vect_test = tfidf.transform(x_test)

In [54]:
from scipy import sparse
from scipy.sparse import csc_matrix,csr_matrix

# Hold the final train and test TF-IDF matrices in CSR layout.
xTrain_vect_train, xTrain_vect_test = (
    csr_matrix(matrix) for matrix in (jf_vect_train, jf_vect_test)
)

In [55]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression

# Train the chosen model on all labelled tweets ...
log_reg = LogisticRegression(solver='newton-cg').fit(xTrain_vect_train, y_train)
# ... and predict a sentiment label for every cleaned test tweet.
test_pred = log_reg.predict(xTrain_vect_test)

In [56]:
# Wrap the predicted labels in a one-column frame (it gets a default 0..n-1 index).
score_log = pd.DataFrame({"Sentiment_score":test_pred}) 

In [57]:
# One prediction per cleaned test tweet is expected here.
score_log.shape

(264845, 1)

In [None]:
# Attach the predictions to the cleaned test rows.
# `jf_test` keeps its original row labels, with gaps where empty tweets were
# filtered out, while `score_log` is numbered 0..n-1. concat(axis=1) aligns on
# labels, so without resetting both indexes the scores land on the wrong rows
# and NaN-padded extra rows appear. After the reset the frames are joined
# strictly by position, which matches the order the predictions were made in.
final_log = pd.concat(
    [jf_test.reset_index(drop=True), score_log.reset_index(drop=True)],
    axis=1,
)

In [None]:
# Preview the test tweets together with their predicted sentiment.
final_log.head()

In [None]:
# Write the scored test set to disk (without the index).
final_log.to_csv('Cleaned_Json_TestData_WithLogScore.csv',index=False)