In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/train_data-1573118738755.json
/kaggle/input/train_factors-1573207730757.csv
/kaggle/input/test_factors.csv
/kaggle/input/test_data.json


**Installing Emoji package**

In [3]:
!pip install emoji



**Installing TextBlob**

In [2]:
!pip install TextBlob



**Importing the required libraries**

In [4]:
#General libraries
import pandas as pd
import numpy as np
from pandas import read_csv
import statistics
from datetime import *

#Libraries for JSON
import json
from pandas.io.json import json_normalize

#Libraries for text data
import html.parser as htmlparser
from textblob import *
import emoji
import re
import string
from itertools import groupby
from collections import Counter
import nltk
from nltk.corpus import stopwords
import spacy
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Libraries for Model Building
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

#To ignore the warnings
from warnings import filterwarnings
filterwarnings('ignore')
%matplotlib inline

#Evaluation libraries
from sklearn.metrics import accuracy_score,f1_score, classification_report

**Function to read file**

In [5]:
def readfile(filename,filetype,header):
  '''
  Utility to read a file based on the type of the file.
  It takes 3 arguments the name of the file, the type of the file ['csv'/'json' etc.,] and header record name or index
  In case of csv, pass the header index, in case of JSON pass the root node/key name
  '''
  if(filetype=='csv'):
    data = read_csv(filename,header=header)
  elif(filetype=='json'):
    with open(filename,encoding='utf-8',newline='\n') as jsonfile:
      d = json.load(jsonfile)
      data = json_normalize(d[header])
  return data

**Function for statistical analysis of data**

In [6]:
def describe_data(dataset):
  '''
  Utility which will give the descriptive statistics about the given dataset.
  It takes only one argument which is the dataset.
  It provides the information about the dimension of the dataset, column names, datatypes of columns, 
  no. of unique values in each column, summary of the dataset & top 5 rows from the dataset
  '''
  print("Dimensions \n")
  print(dataset.shape)
  print('\n\n')
  print("Column names\n")
  print(dataset.columns)
  print('\n\n')
  print("Data Types\n")
  print(dataset.dtypes)
  print('\n\n')
  print("Unique values in each level\n")
  for i in dataset.columns:
    print("{} - {}".format(i,len(dataset[i].unique())))
  print('\n\n')
  print("Summary \n")
  print(dataset.describe())
  print('\n\n')
  print("Top 5 rows\n")
  print(dataset.head())
  print('\n\n')

**Function to find NAs in the dataset**

In [7]:
def findNAs(dataset):
  '''
  Function to find the count of missing values
  '''
  print(dataset.isnull().sum())

**Function to convert columns to appropriate datatypes**

In [8]:
def datatype_transformer(col_list, coltype, dataset):
  '''
  Utility to convert the columns to its appropriate datatypes.
  It take 3 arguments - the list of names of the columns to be converted, the target type to which it has to be converted 
  and the dataset where the columns reside
  It converts the datatypes of the columns specified & return the dataset
  '''
  #print(coltype)
  if coltype=='date' or coltype=='timestamp':
    for i in col_list:
      #print(i)
      dataset[i] = pd.to_datetime(dataset[i],dayfirst=True)
  else:
    for i in col_list:
      dataset[i] = dataset[i].astype(coltype)
  return dataset

**Function for Train:Validation split**

In [9]:
#Train:Test Split

def train_test(dataset,target_col,train_percentage):
    y = dataset[target_col]
    x = dataset.drop(target_col, axis=1)
    X_train, X_test, Y_train, Y_test = train_test_split(x, y, train_size=train_percentage, random_state=123)
    return X_train, X_test, Y_train, Y_test

**Reading the train & test JSON data**

In [10]:
train_json_data = readfile('/kaggle/input/train_data-1573118738755.json','json','records')

In [11]:
test_json_data = readfile('/kaggle/input/test_data.json','json','records')

**Data Analysis**

Duplicating the original text extracted before proceeeding with preprocessing steps

In [12]:
import copy
#print(type(data['text']))
original_train_data = copy.deepcopy(train_json_data)
original_test_data = copy.deepcopy(test_json_data)
#print(data.keys())
#print(original_data.keys())

In [13]:
pd.options.display.max_colwidth = 500

In [14]:
train_json_data.head()

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,$AMD going up but hesitating however chart is very stable and going upward,3,2018-09-19 18:38:28+00:00,$AMD
1,@inforlong @MariaGascon Despite\nChina trade war $CAT held very well 👍,3,2018-10-09 03:51:06+00:00,$CAT
2,$AVGO WTF?,2,2018-07-12 13:35:32+00:00,$AVGO
3,$PH\n New Insider Filing On: \n MULLER KLAUS PETER\nTransaction Code: \n http://www.filingscanner.com/Alerts/PH.php,2,2018-07-19 03:32:50+00:00,$PH
4,$FB if it bounces tommorrow do the right thing and GTFO,3,2018-08-23 19:07:54+00:00,$FB


In [15]:
train_json_data.sort_values(by='ticker')

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
382208,@Learnstocks101 You did attach $$AAPL chart instead of $TSLA.,2,2018-08-02 20:32:16+00:00,$$AAPL
892852,$$AAPL 205 puts,0,2018-08-06 08:57:43+00:00,$$AAPL
14803,$$AAPL Some sort of expected resistance at 205,1,2018-08-02 14:35:42+00:00,$$AAPL
352716,@macro_economics $$AAPL 50%+AMZN 50% My Portfolio🤑🤑,2,2018-08-02 21:59:03+00:00,$$AAPL
759936,@davidcastelli WAY 2 GO. I’M a $$AAPL\n🍎FAN. GO TRILLION!!,2,2018-08-02 17:12:59+00:00,$$AAPL
...,...,...,...,...
877208,$zts EXP:8/24/2018|MaxPain:88.5|HighPutOI:89.0(60)|HighCallOI:93.0(178) http://www.opricot.com/ticker/zts/optiongraphs,2,2018-08-24 19:01:08+00:00,$zts
681188,$zts EXP:8/31/2018|MaxPain:87.0|HighPutOI:90.0(31)|HighCallOI:93.0(97) http://www.opricot.com/ticker/zts/optiongraphs,2,2018-08-27 20:01:27+00:00,$zts
970761,$zts monster stock. Big $ made sheeeeetin. Errr. Siting Only 2 down months both less then a buck in last year n half,3,2018-08-02 17:27:59+00:00,$zts
282484,$zts EXP:8/31/2018|MaxPain:87.5|HighPutOI:89.0(41)|HighCallOI:93.0(97) http://www.opricot.com/ticker/zts/optiongraphs,2,2018-08-28 21:01:08+00:00,$zts


In [16]:
describe_data(train_json_data)

Dimensions 

(1039131, 4)



Column names

Index(['stocktwit_tweet', 'sentiment_score', 'timestamp', 'ticker'], dtype='object')



Data Types

stocktwit_tweet    object
sentiment_score     int64
timestamp          object
ticker             object
dtype: object



Unique values in each level

stocktwit_tweet - 959608
sentiment_score - 5
timestamp - 905458
ticker - 2181



Summary 

       sentiment_score
count     1.039131e+06
mean      2.203079e+00
std       1.105371e+00
min       0.000000e+00
25%       2.000000e+00
50%       2.000000e+00
75%       3.000000e+00
max       4.000000e+00



Top 5 rows

                                                                                                       stocktwit_tweet  \
0                                           $AMD going up but hesitating however chart is very stable and going upward   
1                                               @inforlong @MariaGascon Despite\nChina trade war $CAT held very well 👍   
2                           

In [17]:
findNAs(train_json_data)

stocktwit_tweet    0
sentiment_score    0
timestamp          0
ticker             0
dtype: int64


**Data Preprocessing**

In [18]:
train_json_data = datatype_transformer(['timestamp'],'timestamp',train_json_data)
train_json_data = datatype_transformer(['ticker'],'category',train_json_data)

In [19]:
train_json_data.dtypes

stocktwit_tweet                 object
sentiment_score                  int64
timestamp          datetime64[ns, UTC]
ticker                        category
dtype: object

In [20]:
train_json_data.sort_values(by='ticker')

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
382208,@Learnstocks101 You did attach $$AAPL chart instead of $TSLA.,2,2018-08-02 20:32:16+00:00,$$AAPL
892852,$$AAPL 205 puts,0,2018-08-06 08:57:43+00:00,$$AAPL
14803,$$AAPL Some sort of expected resistance at 205,1,2018-08-02 14:35:42+00:00,$$AAPL
352716,@macro_economics $$AAPL 50%+AMZN 50% My Portfolio🤑🤑,2,2018-08-02 21:59:03+00:00,$$AAPL
759936,@davidcastelli WAY 2 GO. I’M a $$AAPL\n🍎FAN. GO TRILLION!!,2,2018-08-02 17:12:59+00:00,$$AAPL
...,...,...,...,...
877208,$zts EXP:8/24/2018|MaxPain:88.5|HighPutOI:89.0(60)|HighCallOI:93.0(178) http://www.opricot.com/ticker/zts/optiongraphs,2,2018-08-24 19:01:08+00:00,$zts
681188,$zts EXP:8/31/2018|MaxPain:87.0|HighPutOI:90.0(31)|HighCallOI:93.0(97) http://www.opricot.com/ticker/zts/optiongraphs,2,2018-08-27 20:01:27+00:00,$zts
970761,$zts monster stock. Big $ made sheeeeetin. Errr. Siting Only 2 down months both less then a buck in last year n half,3,2018-08-02 17:27:59+00:00,$zts
282484,$zts EXP:8/31/2018|MaxPain:87.5|HighPutOI:89.0(41)|HighCallOI:93.0(97) http://www.opricot.com/ticker/zts/optiongraphs,2,2018-08-28 21:01:08+00:00,$zts


In [21]:
len(train_json_data.ticker.unique())

2181

**Removing $ from tickers**

In [24]:
train_json_data['ticker'] = train_json_data['ticker'].str.replace(r'^\$', '')

In [25]:
len(train_json_data.ticker.unique())

2166

In [26]:
train_json_data.sort_values(by='ticker')

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
604174,$A max pain is 67.5 for expiry 2018-11-16 Source: http://sweep.ly/maxpain.html,2,2018-10-22 12:11:08+00:00,A
720774,AGT Food and Ingredients announces Quarterly dividend of $0.15. Payable on 10/11/2018. 3.35% Yield. https://www.marketbeat.com/d/91905 $A,2,2018-09-21 20:59:21+00:00,A
178549,#Update(52) $A Oct-19 67.5 Puts Up +179%. since alerted on: Oct 04. Peaked 1105% \n\n https://www.sleekoptions.com/st.aspx?ald=4FD9484,2,2018-10-17 17:42:52+00:00,A
909461,$A #Alert - Open Jul-20 65 Calls at $0.25 per contract. \nStock at: 63.11,3,2018-07-09 15:16:25+00:00,A
302064,#Update(8) $A Oct-19 67.5 Puts Up +79%. since alerted on: Oct 04. \n\n https://www.sleekoptions.com/st.aspx?ald=4FD9484C-3FA5-4A13-861,1,2018-10-09 14:33:19+00:00,A
...,...,...,...,...
970761,$zts monster stock. Big $ made sheeeeetin. Errr. Siting Only 2 down months both less then a buck in last year n half,3,2018-08-02 17:27:59+00:00,zts
282484,$zts EXP:8/31/2018|MaxPain:87.5|HighPutOI:89.0(41)|HighCallOI:93.0(97) http://www.opricot.com/ticker/zts/optiongraphs,2,2018-08-28 21:01:08+00:00,zts
580471,$zts EXP:8/24/2018|MaxPain:88.5|HighPutOI:89.0(60)|HighCallOI:93.0(178) http://www.opricot.com/ticker/zts/optiongraphs,2,2018-08-24 16:00:57+00:00,zts
258182,$zts EXP:8/24/2018|MaxPain:88.5|HighPutOI:89.0(60)|HighCallOI:93.0(178) http://www.opricot.com/ticker/zts/optiongraphs,2,2018-08-24 18:01:20+00:00,zts


**Converting the ticker names to UPPER case**

In [27]:
train_json_data['ticker'] = train_json_data['ticker'].str.upper()

In [28]:
len(train_json_data.ticker.unique())

1530

In [29]:
train_json_data.sort_values(by=['ticker','timestamp'])

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
520339,$A Market Positioning is Very Underweight. CrowdThnk&#39;s Algo forecasts 61% chance Moving Higher: https://crowdthnk.com/stocks/A,2,2018-07-03 21:40:47+00:00,A
780362,short interest ratio of $A is 1.53 at 2018-06-15 and short % to float is 1.50% http://sunshineavenue.com/stock/A/ via @sunshineave,0,2018-07-04 07:15:37+00:00,A
684345,Here’s what 15 Estimize analysts believe $A will report for Q3 2018 revenue [Reporting 08/14 AMC]\nhttp://www.estimize.com/intro/a?chart=historical&amp;metric_name=revenue&amp;utm_content=A&amp;utm_medium=revenue_update&amp;utm_source=stocktwits,2,2018-07-04 15:12:41+00:00,A
914879,#AgilentTechnologies has a high payout ratio of 78.06% #A $A \nhttps://wallmine.com/nyse/a?utm_source=stocktwits,2,2018-07-05 16:31:34+00:00,A
75734,2018-07-05 Short sale volume (not short interest) for $A is 40%. http://shortvolumes.com/?t=A via @shortvolumes,1,2018-07-06 02:01:36+00:00,A
...,...,...,...,...
998623,"Some names reporting earnings this upcoming week: $BOX, $BBY, $TIF, $CRM, $DKS, $EXPR, $DY, $LULU, $AMBA, $ULTA, $ANF, $DLTR, $ZUO #Earnings",2,2018-08-26 00:35:00+00:00,ZUO
173793,"20dh vvs stocks scan $CRON, $CGC, $CNAT, $TLRY, $EIGI, $AMD, $ZUO",2,2018-08-27 13:35:22+00:00,ZUO
18783,"20dh in leading stocks scan $CGC, $CRON, $AMD, $ICL, $EIGI, $CNAT, $TLRY, $ZUO",2,2018-08-27 13:35:24+00:00,ZUO
734312,"@drexlca $zuo if wondering why initial drop, must missed the run from $23 in d last month.Same action as $Crm. Profit takes on &quot;Sell d News)",3,2018-08-30 20:27:34+00:00,ZUO


**User defined functions for Text Preprocessing**

In [30]:
def handle_URLs(tweet):
  '''
  Utility function to remove links/URL's from the tweets using regular expressions
  '''
  return(' '.join(re.sub("((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))|(http\S+)", "URL",tweet).split()))   

def remove_handles(tweet):
  '''
  Utility function to remove twitter handles
  '''
  return(' '.join(re.sub("(@[A-Za-z0-9]+)", "",tweet).split()))   

def demojize_emoji(tweet):
  '''
  Utility function to convert emoji's to text
  '''
  return emoji.demojize(tweet,delimiters=(' ',' '))

def remove_tickernames(tweet):
  return(' '.join(re.sub("(^\$[A-Za-z]+ )|(^\$\$[A-Za-z]+ )", "",tweet).split()))   

def remove_hashtag(tweet):
  '''
  Utility to remove the hash symbol in hashtag
  '''
  tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
  return tweet

def remove_consecutive_duplicate_words(tweet):
    #return ' '.join(set(tweet.split()))
    tweet_split = [k for k,v in groupby(tweet.split())]
    tweet = ' '.join(tweet_split)
    return tweet

# puncts = list(string.punctuation)
# puncts.pop(3) '$'
# puncts.pop(4) '%'
# puncts.pop(21) '@'
def remove_punctuation(tweet):
  nopunc = [char for char in tweet if char not in string.punctuation]
  nopunc = ''.join(nopunc)
  return nopunc

**Function to remove stop words**

In [31]:
def remove_stop_words(tweet):
  stop_words = stopwords.words('english')
  stopwords_dict = Counter(stop_words)
  tweet = ' '.join([word for word in tweet.split() if word not in stopwords_dict])
  return tweet

**Function to expand Contractions**

In [32]:
contractions = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

def expand_contractions(text):
    for word in text.split():
        if word.lower() in contractions:
            text = text.replace(word, contractions[word.lower()])
    return text

**Function to get the tweet sentiment from TextBlob**

In [33]:
#Function to get the tweet sentiment polarity
def get_tweet_sentiment_tb(tweet): 
  # create TextBlob object of passed tweet text 
  analysis = TextBlob(tweet)
  # set sentiment 
  polarity = analysis.sentiment.polarity
  #return polarity
  if polarity < 0 : 
      return(-1)
  elif polarity==0: 
      return (0)
  else: 
      return (1)

In [34]:
# def get_tweet_sentiment_vs(tweet): 
#   analyser = SentimentIntensityAnalyzer()
#   # set sentiment 
#   polarity_dict = analyser.polarity_scores(tweet)
#   comp_polarity = polarity_dict['compound']
#   #return comp_polarity
#   if polarity < 0 : 
#     return(-1)
#   elif polarity==0: 
#     return (0)
#   else: 
#     return (1)

In [35]:
train_json_data.shape

(1039131, 4)

**Unescape function - to convert Non-readable HTML characters**

In [36]:
def unescape_fn(tweet):
  '''
  Utility function to convert the non-readable character notations to readable characters.
  Eg: It'll convert &#39; to "'" (apostrophe) & characters like \n,\t etc., will be removed
  It takes the tweet text as an argument & returns the modified tweet text after removing/modifying such characters
  '''
  parser = htmlparser.HTMLParser()
  tweet_new = parser.unescape(tweet)
  return tweet_new

In [37]:
train_json_data['stocktwit_tweet'] = [unescape_fn(tweet) for tweet in train_json_data['stocktwit_tweet']]
train_json_data

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,$AMD going up but hesitating however chart is very stable and going upward,3,2018-09-19 18:38:28+00:00,AMD
1,@inforlong @MariaGascon Despite\nChina trade war $CAT held very well 👍,3,2018-10-09 03:51:06+00:00,CAT
2,$AVGO WTF?,2,2018-07-12 13:35:32+00:00,AVGO
3,$PH\n New Insider Filing On: \n MULLER KLAUS PETER\nTransaction Code: \n http://www.filingscanner.com/Alerts/PH.php,2,2018-07-19 03:32:50+00:00,PH
4,$FB if it bounces tommorrow do the right thing and GTFO,3,2018-08-23 19:07:54+00:00,FB
...,...,...,...,...
1039126,$CLF https://twitter.com/hrdrckminer/status/1029129019088764928?s=12,2,2018-08-13 22:15:04+00:00,CLF
1039127,$AMD 🔥🔥🔥,3,2018-09-04 15:25:59+00:00,AMD
1039128,$FB damn should of shorted 196 after hours to late now,0,2018-07-25 21:29:03+00:00,FB
1039129,$AMD back to the 31's,0,2018-09-28 16:19:55+00:00,AMD


**Converting to Lower case**

In [38]:
train_json_data['stocktwit_tweet'] = [tweet.strip().lower() for tweet in train_json_data['stocktwit_tweet']]
train_json_data

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,$amd going up but hesitating however chart is very stable and going upward,3,2018-09-19 18:38:28+00:00,AMD
1,@inforlong @mariagascon despite\nchina trade war $cat held very well 👍,3,2018-10-09 03:51:06+00:00,CAT
2,$avgo wtf?,2,2018-07-12 13:35:32+00:00,AVGO
3,$ph\n new insider filing on: \n muller klaus peter\ntransaction code: \n http://www.filingscanner.com/alerts/ph.php,2,2018-07-19 03:32:50+00:00,PH
4,$fb if it bounces tommorrow do the right thing and gtfo,3,2018-08-23 19:07:54+00:00,FB
...,...,...,...,...
1039126,$clf https://twitter.com/hrdrckminer/status/1029129019088764928?s=12,2,2018-08-13 22:15:04+00:00,CLF
1039127,$amd 🔥🔥🔥,3,2018-09-04 15:25:59+00:00,AMD
1039128,$fb damn should of shorted 196 after hours to late now,0,2018-07-25 21:29:03+00:00,FB
1039129,$amd back to the 31's,0,2018-09-28 16:19:55+00:00,AMD


**Handling URL**

In [39]:
train_json_data['stocktwit_tweet'] = [' '.join(re.sub("((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))|(http\S+)", "URL",tweet).split()) for tweet in train_json_data['stocktwit_tweet']]
train_json_data

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,$amd going up but hesitating however chart is very stable and going upward,3,2018-09-19 18:38:28+00:00,AMD
1,@inforlong @mariagascon despite china trade war $cat held very well 👍,3,2018-10-09 03:51:06+00:00,CAT
2,$avgo wtf?,2,2018-07-12 13:35:32+00:00,AVGO
3,$ph new insider filing on: muller klaus peter transaction code: URL,2,2018-07-19 03:32:50+00:00,PH
4,$fb if it bounces tommorrow do the right thing and gtfo,3,2018-08-23 19:07:54+00:00,FB
...,...,...,...,...
1039126,$clf URL,2,2018-08-13 22:15:04+00:00,CLF
1039127,$amd 🔥🔥🔥,3,2018-09-04 15:25:59+00:00,AMD
1039128,$fb damn should of shorted 196 after hours to late now,0,2018-07-25 21:29:03+00:00,FB
1039129,$amd back to the 31's,0,2018-09-28 16:19:55+00:00,AMD


**Removing Twitter handles**

In [40]:
train_json_data['stocktwit_tweet'] = [' '.join(re.sub("(@[A-Za-z0-9]+)", "",tweet).split()) for tweet in train_json_data['stocktwit_tweet']]
train_json_data

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,$amd going up but hesitating however chart is very stable and going upward,3,2018-09-19 18:38:28+00:00,AMD
1,despite china trade war $cat held very well 👍,3,2018-10-09 03:51:06+00:00,CAT
2,$avgo wtf?,2,2018-07-12 13:35:32+00:00,AVGO
3,$ph new insider filing on: muller klaus peter transaction code: URL,2,2018-07-19 03:32:50+00:00,PH
4,$fb if it bounces tommorrow do the right thing and gtfo,3,2018-08-23 19:07:54+00:00,FB
...,...,...,...,...
1039126,$clf URL,2,2018-08-13 22:15:04+00:00,CLF
1039127,$amd 🔥🔥🔥,3,2018-09-04 15:25:59+00:00,AMD
1039128,$fb damn should of shorted 196 after hours to late now,0,2018-07-25 21:29:03+00:00,FB
1039129,$amd back to the 31's,0,2018-09-28 16:19:55+00:00,AMD


**Removing Ticker Names**

In [41]:
train_json_data['stocktwit_tweet'] = [remove_tickernames(tweet) for tweet in train_json_data['stocktwit_tweet']]
train_json_data

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,going up but hesitating however chart is very stable and going upward,3,2018-09-19 18:38:28+00:00,AMD
1,despite china trade war $cat held very well 👍,3,2018-10-09 03:51:06+00:00,CAT
2,wtf?,2,2018-07-12 13:35:32+00:00,AVGO
3,new insider filing on: muller klaus peter transaction code: URL,2,2018-07-19 03:32:50+00:00,PH
4,if it bounces tommorrow do the right thing and gtfo,3,2018-08-23 19:07:54+00:00,FB
...,...,...,...,...
1039126,URL,2,2018-08-13 22:15:04+00:00,CLF
1039127,🔥🔥🔥,3,2018-09-04 15:25:59+00:00,AMD
1039128,damn should of shorted 196 after hours to late now,0,2018-07-25 21:29:03+00:00,FB
1039129,back to the 31's,0,2018-09-28 16:19:55+00:00,AMD


**Demojize Emojis**

In [42]:
train_json_data['stocktwit_tweet'] = [emoji.demojize(tweet,delimiters=(' ',' ')) for tweet in train_json_data['stocktwit_tweet']]
train_json_data

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,going up but hesitating however chart is very stable and going upward,3,2018-09-19 18:38:28+00:00,AMD
1,despite china trade war $cat held very well thumbs_up,3,2018-10-09 03:51:06+00:00,CAT
2,wtf?,2,2018-07-12 13:35:32+00:00,AVGO
3,new insider filing on: muller klaus peter transaction code: URL,2,2018-07-19 03:32:50+00:00,PH
4,if it bounces tommorrow do the right thing and gtfo,3,2018-08-23 19:07:54+00:00,FB
...,...,...,...,...
1039126,URL,2,2018-08-13 22:15:04+00:00,CLF
1039127,fire fire fire,3,2018-09-04 15:25:59+00:00,AMD
1039128,damn should of shorted 196 after hours to late now,0,2018-07-25 21:29:03+00:00,FB
1039129,back to the 31's,0,2018-09-28 16:19:55+00:00,AMD


**Removing Hashtags**

In [43]:
train_json_data['stocktwit_tweet'] = [re.sub(r'#([^\s]+)', r'\1', tweet) for tweet in train_json_data['stocktwit_tweet']]
train_json_data

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,going up but hesitating however chart is very stable and going upward,3,2018-09-19 18:38:28+00:00,AMD
1,despite china trade war $cat held very well thumbs_up,3,2018-10-09 03:51:06+00:00,CAT
2,wtf?,2,2018-07-12 13:35:32+00:00,AVGO
3,new insider filing on: muller klaus peter transaction code: URL,2,2018-07-19 03:32:50+00:00,PH
4,if it bounces tommorrow do the right thing and gtfo,3,2018-08-23 19:07:54+00:00,FB
...,...,...,...,...
1039126,URL,2,2018-08-13 22:15:04+00:00,CLF
1039127,fire fire fire,3,2018-09-04 15:25:59+00:00,AMD
1039128,damn should of shorted 196 after hours to late now,0,2018-07-25 21:29:03+00:00,FB
1039129,back to the 31's,0,2018-09-28 16:19:55+00:00,AMD


**Expand Contractions**

In [44]:
train_json_data['stocktwit_tweet'] = [expand_contractions(tweet) for tweet in train_json_data['stocktwit_tweet']]
train_json_data

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,going up but hesitating however chart is very stable and going upward,3,2018-09-19 18:38:28+00:00,AMD
1,despite china trade war $cat held very well thumbs_up,3,2018-10-09 03:51:06+00:00,CAT
2,wtf?,2,2018-07-12 13:35:32+00:00,AVGO
3,new insider filing on: muller klaus peter transaction code: URL,2,2018-07-19 03:32:50+00:00,PH
4,if it bounces tommorrow do the right thing and gtfo,3,2018-08-23 19:07:54+00:00,FB
...,...,...,...,...
1039126,URL,2,2018-08-13 22:15:04+00:00,CLF
1039127,fire fire fire,3,2018-09-04 15:25:59+00:00,AMD
1039128,damn should of shorted 196 after hours to late now,0,2018-07-25 21:29:03+00:00,FB
1039129,back to the 31's,0,2018-09-28 16:19:55+00:00,AMD


**Remove accented Characters**

In [45]:
import unicodedata
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    #https://docs.python.org/2/library/unicodedata.html
    return text

In [46]:
train_json_data['stocktwit_tweet'] = [remove_accented_chars(tweet) for tweet in train_json_data['stocktwit_tweet']]
train_json_data

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,going up but hesitating however chart is very stable and going upward,3,2018-09-19 18:38:28+00:00,AMD
1,despite china trade war $cat held very well thumbs_up,3,2018-10-09 03:51:06+00:00,CAT
2,wtf?,2,2018-07-12 13:35:32+00:00,AVGO
3,new insider filing on: muller klaus peter transaction code: URL,2,2018-07-19 03:32:50+00:00,PH
4,if it bounces tommorrow do the right thing and gtfo,3,2018-08-23 19:07:54+00:00,FB
...,...,...,...,...
1039126,URL,2,2018-08-13 22:15:04+00:00,CLF
1039127,fire fire fire,3,2018-09-04 15:25:59+00:00,AMD
1039128,damn should of shorted 196 after hours to late now,0,2018-07-25 21:29:03+00:00,FB
1039129,back to the 31's,0,2018-09-28 16:19:55+00:00,AMD


**Scrub words**

In [47]:
def scrub_words(text):
    #Replace \xao characters in text
    text = re.sub('\xa0', ' ', text)
    
    #Replace non ascii / not words and digits
    text = re.sub("(\\W|\\d)",' ',text)
    
    #Replace new line characters and following text untill space
    text = re.sub('\n(\w*?)[\s]', '', text)
    
    #Remove html markup
    text = re.sub("<.*?>", ' ', text)
    
    #Remove extra spaces from the text
    text = re.sub("\s+", ' ', text)
    return text

In [48]:
train_json_data['stocktwit_tweet'] = [scrub_words(tweet) for tweet in train_json_data['stocktwit_tweet']]
train_json_data

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,going up but hesitating however chart is very stable and going upward,3,2018-09-19 18:38:28+00:00,AMD
1,despite china trade war cat held very well thumbs_up,3,2018-10-09 03:51:06+00:00,CAT
2,wtf,2,2018-07-12 13:35:32+00:00,AVGO
3,new insider filing on muller klaus peter transaction code URL,2,2018-07-19 03:32:50+00:00,PH
4,if it bounces tommorrow do the right thing and gtfo,3,2018-08-23 19:07:54+00:00,FB
...,...,...,...,...
1039126,URL,2,2018-08-13 22:15:04+00:00,CLF
1039127,fire fire fire,3,2018-09-04 15:25:59+00:00,AMD
1039128,damn should of shorted after hours to late now,0,2018-07-25 21:29:03+00:00,FB
1039129,back to the s,0,2018-09-28 16:19:55+00:00,AMD


**Remove Stop words**

In [49]:
train_json_data['stocktwit_tweet'] = [remove_stop_words(tweet) for tweet in train_json_data['stocktwit_tweet']]
train_json_data

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,going hesitating however chart stable going upward,3,2018-09-19 18:38:28+00:00,AMD
1,despite china trade war cat held well thumbs_up,3,2018-10-09 03:51:06+00:00,CAT
2,wtf,2,2018-07-12 13:35:32+00:00,AVGO
3,new insider filing muller klaus peter transaction code URL,2,2018-07-19 03:32:50+00:00,PH
4,bounces tommorrow right thing gtfo,3,2018-08-23 19:07:54+00:00,FB
...,...,...,...,...
1039126,URL,2,2018-08-13 22:15:04+00:00,CLF
1039127,fire fire fire,3,2018-09-04 15:25:59+00:00,AMD
1039128,damn shorted hours late,0,2018-07-25 21:29:03+00:00,FB
1039129,back,0,2018-09-28 16:19:55+00:00,AMD


**Tokenization & Lemmatization**

In [51]:
# Here I define a tokenizer and stemmer which returns the set of stems (excluding stop words) in the text that it is passed

def tokenize_and_lemmatize(doc):
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token.text):
            filtered_tokens.append(token)
    lemma = [t.lemma_ for t in filtered_tokens]
    return lemma


**Converting to Dictionary**

In [52]:
data_dict = train_json_data.reset_index().to_dict(orient='list')

In [53]:
# import spacy
# nlp = spacy.load("en_core_web_sm")
# vocab_lemmatized = []
# data_dict['clean_tweet_lemmatized'] = []

# for idx,text in enumerate(data_dict['stocktwit_tweet']):
#   doc = nlp(text)
#   #print("processing {idx} document")
#   words_lemmatized = tokenize_and_lemmatize(doc)
#   vocab_lemmatized.extend(words_lemmatized)
#   data_dict['clean_tweet_lemmatized'].append(words_lemmatized)

In [54]:
# data_dict['clean_tweet_lemmatized'] = [' '.join(text) for text in data_dict['clean_tweet_lemmatized']]

In [55]:
# data_dict['clean_tweet_lemmatized'][:10]

In [56]:
data_dict.keys()

dict_keys(['index', 'stocktwit_tweet', 'sentiment_score', 'timestamp', 'ticker'])

**Converting the sentiment score to 3 categories --> Negative(-1), Neutral(0) & Positive(+1)**

In [57]:
def convert_sent_score(sent_score):
  if sent_score==0 or sent_score==1:
    return -1
  elif sent_score==2:
    return 0
  elif sent_score==3 or sent_score==4:
    return 1

In [58]:
data_dict['sentiment'] = []
data_dict['sentiment'] = [convert_sent_score(i) for i in data_dict['sentiment_score']]

**Defining X & Y for building a Sentimental Analysis model**

In [59]:
x=data_dict['stocktwit_tweet']
y=data_dict['sentiment']

**Train:Validation Split**

In [60]:
X_train, X_val, Y_train, Y_val = train_test_split(x, y, train_size=0.8, random_state=123)

**Converting the words to Matrices to pass on to the model**

In [61]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(stop_words='english',lowercase=True,strip_accents='unicode', decode_error='ignore')

x_train_cv = cv.fit(X_train)
x_train_trans_cv = cv.transform(X_train)
x_val_trans_cv = cv.transform(X_val)

**Building the base Random Forest Classifier to classify the sentiment of each tweet**

In [63]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train_trans_cv,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

**Prediction of sentiments for both train & validation data using the model built**

In [64]:
y_train_pred = rf_model.predict(x_train_trans_cv)

In [65]:
y_val_pred = rf_model.predict(x_val_trans_cv)

In [66]:
print("\nClassification Report on Train Data\n",classification_report(Y_train,y_train_pred,digits=2))
print("\nClassification Report on Test Data\n",classification_report(Y_val,y_val_pred,digits=2))


Classification Report on Train Data
               precision    recall  f1-score   support

          -1       0.94      0.91      0.92    170267
           0       0.93      0.96      0.94    371798
           1       0.94      0.91      0.93    289239

    accuracy                           0.93    831304
   macro avg       0.94      0.93      0.93    831304
weighted avg       0.93      0.93      0.93    831304


Classification Report on Test Data
               precision    recall  f1-score   support

          -1       0.66      0.58      0.61     42526
           0       0.73      0.77      0.75     92604
           1       0.68      0.68      0.68     72697

    accuracy                           0.70    207827
   macro avg       0.69      0.68      0.68    207827
weighted avg       0.70      0.70      0.70    207827



*The above output shows me that it is overfitting. Since there are lots of records in the data, I'm not able to do Hyper parameter tuning to see whether it improves.*

**Textblob Sentiment Analysis**

In [67]:
modified_df = pd.DataFrame.from_dict(data_dict)

In [69]:
TB_pred_sentiment = [get_tweet_sentiment_tb(tweet) for tweet in modified_df['stocktwit_tweet']]

In [70]:
print("\nClassification Report on Train Data\n",classification_report(modified_df['sentiment'],TB_pred_sentiment,digits=2))


Classification Report on Train Data
               precision    recall  f1-score   support

          -1       0.25      0.17      0.21    212793
           0       0.47      0.59      0.52    464402
           1       0.40      0.34      0.36    361936

    accuracy                           0.42   1039131
   macro avg       0.37      0.37      0.36   1039131
weighted avg       0.40      0.42      0.40   1039131



**Test_Data analysis**

In [71]:
test_json_data.head()

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,$CELG nothing to be exited about,2018-10-25 14:26:16+00:00,$CELG
1,"$AMD yall exhaust your buyer on first green candle,,,, byeeeeee",2018-07-13 13:50:39+00:00,$AMD
2,$AMD day traders day.,2018-09-25 19:10:54+00:00,$AMD
3,$CBS https://tenor.com/wLB8.gif,2018-07-27 22:45:48+00:00,$CBS
4,"$MU weak price action so far today. Don’t be afraid to go short, it’s gonna break support and free fall soon",2018-07-31 14:59:06+00:00,$MU


In [72]:
test_json_data.shape

(265022, 3)

In [73]:
describe_data(test_json_data)

Dimensions 

(265022, 3)



Column names

Index(['stocktwit_tweet', 'timestamp', 'ticker'], dtype='object')



Data Types

stocktwit_tweet    object
timestamp          object
ticker             object
dtype: object



Unique values in each level

stocktwit_tweet - 252890
timestamp - 254753
ticker - 1444



Summary 

       stocktwit_tweet                  timestamp  ticker
count           265022                     265022  265022
unique          252890                     254753    1444
top               $AMD  2018-10-29 14:00:03+00:00    $AMD
freq               622                         18   42140



Top 5 rows

                                                                                                stocktwit_tweet  \
0                                                                              $CELG nothing to be exited about   
1                                               $AMD yall exhaust your buyer on first green candle,,,, byeeeeee   
2                               

In [74]:
findNAs(test_json_data)

stocktwit_tweet    0
timestamp          0
ticker             0
dtype: int64


**Test_Data Preprocessing**

In [75]:
test_json_data = datatype_transformer(['timestamp'],'timestamp',test_json_data)

In [76]:
test_json_data = datatype_transformer(['ticker'],'category',test_json_data)

In [77]:
test_json_data.dtypes

stocktwit_tweet                 object
timestamp          datetime64[ns, UTC]
ticker                        category
dtype: object

In [78]:
test_json_data.sort_values(by='ticker')

Unnamed: 0,stocktwit_tweet,timestamp,ticker
109686,"$$DIS already closed. +50%. Love this company, but can make money playing both sides.",2018-07-11 15:05:29+00:00,$$DIS
215786,"@JRquiz A great example of the dry, intelligent American wit! Well done son!!! $$NKE",2018-09-04 17:22:36+00:00,$$NKE
154344,$$aapl 230 today hmmm,2018-08-31 13:35:04+00:00,$$aapl
131056,$$amzn Yes that sucks they don&#39;t wanna break 17760 area- always some sellers or short at that area- 1775 more difficult too,2018-10-25 17:47:18+00:00,$$amzn
172993,"@AbeS1 @BobDole_Yahoo @mowill @coinage @bvisse the expenses must by spiking, the outsourced Cust Svc at TaskU is charging them big $$s",2018-08-11 22:03:38+00:00,$$s
...,...,...,...
1724,$yum EXP:10/5/2018|MaxPain:86.5|HighPutOI:89.5(260)|HighCallOI:91.0(2118) http://www.opricot.com/ticker/yum/optiongraphs,2018-10-05 17:21:07+00:00,$yum
196697,$zbh EXP:7/20/2018|MaxPain:110.0|HighPutOI:110.0(848)|HighCallOI:120.0(2251) http://www.opricot.com/ticker/zbh/optiongraphs,2018-07-11 14:00:15+00:00,$zbh
134321,$zbh EXP:7/20/2018|MaxPain:110.0|HighPutOI:110.0(849)|HighCallOI:120.0(2251) http://www.opricot.com/ticker/zbh/optiongraphs,2018-07-19 21:09:28+00:00,$zbh
2901,$zion EXP:7/20/2018|MaxPain:52.5|HighPutOI:52.5(2626)|HighCallOI:60.0(5967) http://www.opricot.com/ticker/zion/optiongraphs,2018-07-13 18:20:19+00:00,$zion


In [79]:
len(test_json_data.ticker.unique())

1444

In [80]:
test_json_data['ticker'] = test_json_data['ticker'].str.replace(r'^\$', '')

In [81]:
len(test_json_data.ticker.unique())

1444

In [82]:
test_json_data.sort_values(by='ticker')

Unnamed: 0,stocktwit_tweet,timestamp,ticker
109686,"$$DIS already closed. +50%. Love this company, but can make money playing both sides.",2018-07-11 15:05:29+00:00,$DIS
215786,"@JRquiz A great example of the dry, intelligent American wit! Well done son!!! $$NKE",2018-09-04 17:22:36+00:00,$NKE
154344,$$aapl 230 today hmmm,2018-08-31 13:35:04+00:00,$aapl
131056,$$amzn Yes that sucks they don&#39;t wanna break 17760 area- always some sellers or short at that area- 1775 more difficult too,2018-10-25 17:47:18+00:00,$amzn
172993,"@AbeS1 @BobDole_Yahoo @mowill @coinage @bvisse the expenses must by spiking, the outsourced Cust Svc at TaskU is charging them big $$s",2018-08-11 22:03:38+00:00,$s
...,...,...,...
1724,$yum EXP:10/5/2018|MaxPain:86.5|HighPutOI:89.5(260)|HighCallOI:91.0(2118) http://www.opricot.com/ticker/yum/optiongraphs,2018-10-05 17:21:07+00:00,yum
196697,$zbh EXP:7/20/2018|MaxPain:110.0|HighPutOI:110.0(848)|HighCallOI:120.0(2251) http://www.opricot.com/ticker/zbh/optiongraphs,2018-07-11 14:00:15+00:00,zbh
134321,$zbh EXP:7/20/2018|MaxPain:110.0|HighPutOI:110.0(849)|HighCallOI:120.0(2251) http://www.opricot.com/ticker/zbh/optiongraphs,2018-07-19 21:09:28+00:00,zbh
2901,$zion EXP:7/20/2018|MaxPain:52.5|HighPutOI:52.5(2626)|HighCallOI:60.0(5967) http://www.opricot.com/ticker/zion/optiongraphs,2018-07-13 18:20:19+00:00,zion


In [83]:
test_json_data['ticker'] = test_json_data['ticker'].str.upper()

In [84]:
len(test_json_data.ticker.unique())

973

In [85]:
test_json_data.sort_values(by=['ticker','timestamp'])

Unnamed: 0,stocktwit_tweet,timestamp,ticker
154344,$$aapl 230 today hmmm,2018-08-31 13:35:04+00:00,$AAPL
131056,$$amzn Yes that sucks they don&#39;t wanna break 17760 area- always some sellers or short at that area- 1775 more difficult too,2018-10-25 17:47:18+00:00,$AMZN
109686,"$$DIS already closed. +50%. Love this company, but can make money playing both sides.",2018-07-11 15:05:29+00:00,$DIS
215786,"@JRquiz A great example of the dry, intelligent American wit! Well done son!!! $$NKE",2018-09-04 17:22:36+00:00,$NKE
172993,"@AbeS1 @BobDole_Yahoo @mowill @coinage @bvisse the expenses must by spiking, the outsourced Cust Svc at TaskU is charging them big $$s",2018-08-11 22:03:38+00:00,$S
...,...,...,...
120454,Short volume percent for $ZTS was 27.26% on 10-26-18 and 20 day rank was ~30th percentile https://volumebot.com/?s=ZTS,2018-10-29 10:47:55+00:00,ZTS
178043,"Wall St is expecting 18.77% YoY EPS growth for $ZTS in Q3, down from 45.28% in Q2 [Reporting 11/01 BMO]\nhttp://www.estimize.com/intro/zts?chart=historical&amp;metric_name=eps&amp;utm_content=ZTS&amp;utm_medium=eps_growth&amp;utm_source=stocktwits",2018-10-29 12:12:29+00:00,ZTS
79161,$ZTS Earnings November 01 BMO. 02-Nov-18 Straddle Implies ±4.8% Move vs Prev. Move +6.3% http://tinyurl.com/ya8cavl3,2018-10-31 11:32:07+00:00,ZTS
153118,"MA10-MA50 Bull Crossover Stocks: $ARQL, $ESRX, $FNSR, $TTI, $VEEV, $WMS, $ZBRA, $ZUMZ",2018-08-14 00:38:33+00:00,ZUMZ


In [86]:
test_json_data['stocktwit_tweet'] = [unescape_fn(tweet) for tweet in test_json_data['stocktwit_tweet']]
test_json_data

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,$CELG nothing to be exited about,2018-10-25 14:26:16+00:00,CELG
1,"$AMD yall exhaust your buyer on first green candle,,,, byeeeeee",2018-07-13 13:50:39+00:00,AMD
2,$AMD day traders day.,2018-09-25 19:10:54+00:00,AMD
3,$CBS https://tenor.com/wLB8.gif,2018-07-27 22:45:48+00:00,CBS
4,"$MU weak price action so far today. Don’t be afraid to go short, it’s gonna break support and free fall soon",2018-07-31 14:59:06+00:00,MU
...,...,...,...
265017,$CVS recent bad PR made this an easy pick to load up on. Thanks to two idiot employees I'll be making some $$ on a great company,2018-07-20 15:54:50+00:00,CVS
265018,Here’s what 16 Estimize analysts believe $S will report for Q2 2018 EPS [Reporting 10/31 BMO]\nhttp://www.estimize.com/intro/s?chart=historical&metric_name=eps&utm_content=S&utm_medium=eps_update&utm_source=stocktwits,2018-10-24 22:56:18+00:00,S
265019,"$AMD Could be at 18 or high 17s. If not, will still get to 20 later this year. Either way, margin callzzz for the shorties!",2018-07-25 21:48:25+00:00,AMD
265020,$TSN files form 10-Q https://fintel.io/filings/us/tsn,2018-08-06 11:43:24+00:00,TSN


In [87]:
test_json_data['stocktwit_tweet'] = [tweet.strip().lower() for tweet in test_json_data['stocktwit_tweet']]
test_json_data

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,$celg nothing to be exited about,2018-10-25 14:26:16+00:00,CELG
1,"$amd yall exhaust your buyer on first green candle,,,, byeeeeee",2018-07-13 13:50:39+00:00,AMD
2,$amd day traders day.,2018-09-25 19:10:54+00:00,AMD
3,$cbs https://tenor.com/wlb8.gif,2018-07-27 22:45:48+00:00,CBS
4,"$mu weak price action so far today. don’t be afraid to go short, it’s gonna break support and free fall soon",2018-07-31 14:59:06+00:00,MU
...,...,...,...
265017,$cvs recent bad pr made this an easy pick to load up on. thanks to two idiot employees i'll be making some $$ on a great company,2018-07-20 15:54:50+00:00,CVS
265018,here’s what 16 estimize analysts believe $s will report for q2 2018 eps [reporting 10/31 bmo]\nhttp://www.estimize.com/intro/s?chart=historical&metric_name=eps&utm_content=s&utm_medium=eps_update&utm_source=stocktwits,2018-10-24 22:56:18+00:00,S
265019,"$amd could be at 18 or high 17s. if not, will still get to 20 later this year. either way, margin callzzz for the shorties!",2018-07-25 21:48:25+00:00,AMD
265020,$tsn files form 10-q https://fintel.io/filings/us/tsn,2018-08-06 11:43:24+00:00,TSN


In [88]:
test_json_data['stocktwit_tweet'] = [' '.join(re.sub("((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))|(http\S+)", "URL",tweet).split()) for tweet in test_json_data['stocktwit_tweet']]
test_json_data

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,$celg nothing to be exited about,2018-10-25 14:26:16+00:00,CELG
1,"$amd yall exhaust your buyer on first green candle,,,, byeeeeee",2018-07-13 13:50:39+00:00,AMD
2,$amd day traders day.,2018-09-25 19:10:54+00:00,AMD
3,$cbs URL,2018-07-27 22:45:48+00:00,CBS
4,"$mu weak price action so far today. don’t be afraid to go short, it’s gonna break support and free fall soon",2018-07-31 14:59:06+00:00,MU
...,...,...,...
265017,$cvs recent bad pr made this an easy pick to load up on. thanks to two idiot employees i'll be making some $$ on a great company,2018-07-20 15:54:50+00:00,CVS
265018,here’s what 16 estimize analysts believe $s will report for q2 2018 eps [reporting 10/31 bmo] URL,2018-10-24 22:56:18+00:00,S
265019,"$amd could be at 18 or high 17s. if not, will still get to 20 later this year. either way, margin callzzz for the shorties!",2018-07-25 21:48:25+00:00,AMD
265020,$tsn files form 10-q URL,2018-08-06 11:43:24+00:00,TSN


In [89]:
test_json_data['stocktwit_tweet'] = [' '.join(re.sub("(@[A-Za-z0-9]+)", "",tweet).split()) for tweet in test_json_data['stocktwit_tweet']]
test_json_data

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,$celg nothing to be exited about,2018-10-25 14:26:16+00:00,CELG
1,"$amd yall exhaust your buyer on first green candle,,,, byeeeeee",2018-07-13 13:50:39+00:00,AMD
2,$amd day traders day.,2018-09-25 19:10:54+00:00,AMD
3,$cbs URL,2018-07-27 22:45:48+00:00,CBS
4,"$mu weak price action so far today. don’t be afraid to go short, it’s gonna break support and free fall soon",2018-07-31 14:59:06+00:00,MU
...,...,...,...
265017,$cvs recent bad pr made this an easy pick to load up on. thanks to two idiot employees i'll be making some $$ on a great company,2018-07-20 15:54:50+00:00,CVS
265018,here’s what 16 estimize analysts believe $s will report for q2 2018 eps [reporting 10/31 bmo] URL,2018-10-24 22:56:18+00:00,S
265019,"$amd could be at 18 or high 17s. if not, will still get to 20 later this year. either way, margin callzzz for the shorties!",2018-07-25 21:48:25+00:00,AMD
265020,$tsn files form 10-q URL,2018-08-06 11:43:24+00:00,TSN


In [90]:
test_json_data['stocktwit_tweet'] = [remove_tickernames(tweet) for tweet in test_json_data['stocktwit_tweet']]
test_json_data

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,nothing to be exited about,2018-10-25 14:26:16+00:00,CELG
1,"yall exhaust your buyer on first green candle,,,, byeeeeee",2018-07-13 13:50:39+00:00,AMD
2,day traders day.,2018-09-25 19:10:54+00:00,AMD
3,URL,2018-07-27 22:45:48+00:00,CBS
4,"weak price action so far today. don’t be afraid to go short, it’s gonna break support and free fall soon",2018-07-31 14:59:06+00:00,MU
...,...,...,...
265017,recent bad pr made this an easy pick to load up on. thanks to two idiot employees i'll be making some $$ on a great company,2018-07-20 15:54:50+00:00,CVS
265018,here’s what 16 estimize analysts believe $s will report for q2 2018 eps [reporting 10/31 bmo] URL,2018-10-24 22:56:18+00:00,S
265019,"could be at 18 or high 17s. if not, will still get to 20 later this year. either way, margin callzzz for the shorties!",2018-07-25 21:48:25+00:00,AMD
265020,files form 10-q URL,2018-08-06 11:43:24+00:00,TSN


In [91]:
test_json_data['stocktwit_tweet'] = [emoji.demojize(tweet,delimiters=(' ',' ')) for tweet in test_json_data['stocktwit_tweet']]
test_json_data

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,nothing to be exited about,2018-10-25 14:26:16+00:00,CELG
1,"yall exhaust your buyer on first green candle,,,, byeeeeee",2018-07-13 13:50:39+00:00,AMD
2,day traders day.,2018-09-25 19:10:54+00:00,AMD
3,URL,2018-07-27 22:45:48+00:00,CBS
4,"weak price action so far today. don’t be afraid to go short, it’s gonna break support and free fall soon",2018-07-31 14:59:06+00:00,MU
...,...,...,...
265017,recent bad pr made this an easy pick to load up on. thanks to two idiot employees i'll be making some $$ on a great company,2018-07-20 15:54:50+00:00,CVS
265018,here’s what 16 estimize analysts believe $s will report for q2 2018 eps [reporting 10/31 bmo] URL,2018-10-24 22:56:18+00:00,S
265019,"could be at 18 or high 17s. if not, will still get to 20 later this year. either way, margin callzzz for the shorties!",2018-07-25 21:48:25+00:00,AMD
265020,files form 10-q URL,2018-08-06 11:43:24+00:00,TSN


In [92]:
test_json_data['stocktwit_tweet'] = [re.sub(r'#([^\s]+)', r'\1', tweet) for tweet in test_json_data['stocktwit_tweet']]
test_json_data

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,nothing to be exited about,2018-10-25 14:26:16+00:00,CELG
1,"yall exhaust your buyer on first green candle,,,, byeeeeee",2018-07-13 13:50:39+00:00,AMD
2,day traders day.,2018-09-25 19:10:54+00:00,AMD
3,URL,2018-07-27 22:45:48+00:00,CBS
4,"weak price action so far today. don’t be afraid to go short, it’s gonna break support and free fall soon",2018-07-31 14:59:06+00:00,MU
...,...,...,...
265017,recent bad pr made this an easy pick to load up on. thanks to two idiot employees i'll be making some $$ on a great company,2018-07-20 15:54:50+00:00,CVS
265018,here’s what 16 estimize analysts believe $s will report for q2 2018 eps [reporting 10/31 bmo] URL,2018-10-24 22:56:18+00:00,S
265019,"could be at 18 or high 17s. if not, will still get to 20 later this year. either way, margin callzzz for the shorties!",2018-07-25 21:48:25+00:00,AMD
265020,files form 10-q URL,2018-08-06 11:43:24+00:00,TSN


In [93]:
test_json_data['stocktwit_tweet'] = [expand_contractions(tweet) for tweet in test_json_data['stocktwit_tweet']]
test_json_data

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,nothing to be exited about,2018-10-25 14:26:16+00:00,CELG
1,"yall exhaust your buyer on first green candle,,,, byeeeeee",2018-07-13 13:50:39+00:00,AMD
2,day traders day.,2018-09-25 19:10:54+00:00,AMD
3,URL,2018-07-27 22:45:48+00:00,CBS
4,"weak price action so far today. don’t be afraid to go short, it’s gonna break support and free fall soon",2018-07-31 14:59:06+00:00,MU
...,...,...,...
265017,recent bad pr made this an easy pick to load up on. thanks to two idiot employees i will be making some $$ on a great company,2018-07-20 15:54:50+00:00,CVS
265018,here’s what 16 estimize analysts believe $s will report for q2 2018 eps [reporting 10/31 bmo] URL,2018-10-24 22:56:18+00:00,S
265019,"could be at 18 or high 17s. if not, will still get to 20 later this year. either way, margin callzzz for the shorties!",2018-07-25 21:48:25+00:00,AMD
265020,files form 10-q URL,2018-08-06 11:43:24+00:00,TSN


In [94]:
test_json_data['stocktwit_tweet'] = [remove_accented_chars(tweet) for tweet in test_json_data['stocktwit_tweet']]
test_json_data

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,nothing to be exited about,2018-10-25 14:26:16+00:00,CELG
1,"yall exhaust your buyer on first green candle,,,, byeeeeee",2018-07-13 13:50:39+00:00,AMD
2,day traders day.,2018-09-25 19:10:54+00:00,AMD
3,URL,2018-07-27 22:45:48+00:00,CBS
4,"weak price action so far today. dont be afraid to go short, its gonna break support and free fall soon",2018-07-31 14:59:06+00:00,MU
...,...,...,...
265017,recent bad pr made this an easy pick to load up on. thanks to two idiot employees i will be making some $$ on a great company,2018-07-20 15:54:50+00:00,CVS
265018,heres what 16 estimize analysts believe $s will report for q2 2018 eps [reporting 10/31 bmo] URL,2018-10-24 22:56:18+00:00,S
265019,"could be at 18 or high 17s. if not, will still get to 20 later this year. either way, margin callzzz for the shorties!",2018-07-25 21:48:25+00:00,AMD
265020,files form 10-q URL,2018-08-06 11:43:24+00:00,TSN


In [95]:
test_json_data['stocktwit_tweet'] = [scrub_words(tweet) for tweet in test_json_data['stocktwit_tweet']]
test_json_data

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,nothing to be exited about,2018-10-25 14:26:16+00:00,CELG
1,yall exhaust your buyer on first green candle byeeeeee,2018-07-13 13:50:39+00:00,AMD
2,day traders day,2018-09-25 19:10:54+00:00,AMD
3,URL,2018-07-27 22:45:48+00:00,CBS
4,weak price action so far today dont be afraid to go short its gonna break support and free fall soon,2018-07-31 14:59:06+00:00,MU
...,...,...,...
265017,recent bad pr made this an easy pick to load up on thanks to two idiot employees i will be making some on a great company,2018-07-20 15:54:50+00:00,CVS
265018,heres what estimize analysts believe s will report for q eps reporting bmo URL,2018-10-24 22:56:18+00:00,S
265019,could be at or high s if not will still get to later this year either way margin callzzz for the shorties,2018-07-25 21:48:25+00:00,AMD
265020,files form q URL,2018-08-06 11:43:24+00:00,TSN


In [96]:
test_json_data['stocktwit_tweet'] = [remove_stop_words(tweet) for tweet in test_json_data['stocktwit_tweet']]
test_json_data

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,nothing exited,2018-10-25 14:26:16+00:00,CELG
1,yall exhaust buyer first green candle byeeeeee,2018-07-13 13:50:39+00:00,AMD
2,day traders day,2018-09-25 19:10:54+00:00,AMD
3,URL,2018-07-27 22:45:48+00:00,CBS
4,weak price action far today dont afraid go short gonna break support free fall soon,2018-07-31 14:59:06+00:00,MU
...,...,...,...
265017,recent bad pr made easy pick load thanks two idiot employees making great company,2018-07-20 15:54:50+00:00,CVS
265018,heres estimize analysts believe report q eps reporting bmo URL,2018-10-24 22:56:18+00:00,S
265019,could high still get later year either way margin callzzz shorties,2018-07-25 21:48:25+00:00,AMD
265020,files form q URL,2018-08-06 11:43:24+00:00,TSN


In [97]:
test_data_dict = test_json_data.reset_index().to_dict(orient='list')

In [98]:
x_test=test_data_dict['stocktwit_tweet']
x_test_trans_cv = cv.transform(x_test)

In [99]:
test_pred = rf_model.predict(x_test_trans_cv)

In [100]:
print(max(test_pred)," ",min(test_pred))

1   -1


**Adding the predicted sentiments to the test dataframe**

In [101]:
test_json_data['pred_sentiment'] = test_pred

**Creating the train_json dataframe to merge it to train_factors & writing it to csv**

In [102]:
train_json_data.head()

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,going hesitating however chart stable going upward,3,2018-09-19 18:38:28+00:00,AMD
1,despite china trade war cat held well thumbs_up,3,2018-10-09 03:51:06+00:00,CAT
2,wtf,2,2018-07-12 13:35:32+00:00,AVGO
3,new insider filing muller klaus peter transaction code URL,2,2018-07-19 03:32:50+00:00,PH
4,bounces tommorrow right thing gtfo,3,2018-08-23 19:07:54+00:00,FB


In [103]:
data_dict.keys()

dict_keys(['index', 'stocktwit_tweet', 'sentiment_score', 'timestamp', 'ticker', 'sentiment'])

In [104]:
train_json_modified = pd.DataFrame.from_dict(data_dict)

In [105]:
train_json_modified.shape

(1039131, 6)

In [106]:
train_json_modified.head()

Unnamed: 0,index,stocktwit_tweet,sentiment_score,timestamp,ticker,sentiment
0,0,going hesitating however chart stable going upward,3,2018-09-19 18:38:28+00:00,AMD,1
1,1,despite china trade war cat held well thumbs_up,3,2018-10-09 03:51:06+00:00,CAT,1
2,2,wtf,2,2018-07-12 13:35:32+00:00,AVGO,0
3,3,new insider filing muller klaus peter transaction code URL,2,2018-07-19 03:32:50+00:00,PH,0
4,4,bounces tommorrow right thing gtfo,3,2018-08-23 19:07:54+00:00,FB,1


In [107]:
test_json_data.head()

Unnamed: 0,stocktwit_tweet,timestamp,ticker,pred_sentiment
0,nothing exited,2018-10-25 14:26:16+00:00,CELG,0
1,yall exhaust buyer first green candle byeeeeee,2018-07-13 13:50:39+00:00,AMD,1
2,day traders day,2018-09-25 19:10:54+00:00,AMD,0
3,URL,2018-07-27 22:45:48+00:00,CBS,0
4,weak price action far today dont afraid go short gonna break support free fall soon,2018-07-31 14:59:06+00:00,MU,-1


In [108]:
train_json_tomerge = train_json_modified[['ticker','timestamp','sentiment']]

In [109]:
train_json_tomerge['ticker'] = train_json_tomerge['ticker'].astype('category')

In [110]:
train_json_tomerge['day'] = train_json_tomerge.timestamp.dt.dayofyear

In [111]:
train_json_tomerge['day_name'] = train_json_tomerge.timestamp.dt.weekday_name

In [112]:
train_json_tomerge.head()

Unnamed: 0,ticker,timestamp,sentiment,day,day_name
0,AMD,2018-09-19 18:38:28+00:00,1,262,Wednesday
1,CAT,2018-10-09 03:51:06+00:00,1,282,Tuesday
2,AVGO,2018-07-12 13:35:32+00:00,0,193,Thursday
3,PH,2018-07-19 03:32:50+00:00,0,200,Thursday
4,FB,2018-08-23 19:07:54+00:00,1,235,Thursday


In [113]:
datatype_transformer(['day','day_name'],'category',train_json_tomerge)

Unnamed: 0,ticker,timestamp,sentiment,day,day_name
0,AMD,2018-09-19 18:38:28+00:00,1,262,Wednesday
1,CAT,2018-10-09 03:51:06+00:00,1,282,Tuesday
2,AVGO,2018-07-12 13:35:32+00:00,0,193,Thursday
3,PH,2018-07-19 03:32:50+00:00,0,200,Thursday
4,FB,2018-08-23 19:07:54+00:00,1,235,Thursday
...,...,...,...,...,...
1039126,CLF,2018-08-13 22:15:04+00:00,0,225,Monday
1039127,AMD,2018-09-04 15:25:59+00:00,1,247,Tuesday
1039128,FB,2018-07-25 21:29:03+00:00,-1,206,Wednesday
1039129,AMD,2018-09-28 16:19:55+00:00,-1,271,Friday


In [114]:
train_json_tomerge.head()

Unnamed: 0,ticker,timestamp,sentiment,day,day_name
0,AMD,2018-09-19 18:38:28+00:00,1,262,Wednesday
1,CAT,2018-10-09 03:51:06+00:00,1,282,Tuesday
2,AVGO,2018-07-12 13:35:32+00:00,0,193,Thursday
3,PH,2018-07-19 03:32:50+00:00,0,200,Thursday
4,FB,2018-08-23 19:07:54+00:00,1,235,Thursday


In [115]:
train_json_tomerge.dtypes

ticker                  category
timestamp    datetime64[ns, UTC]
sentiment                  int64
day                     category
day_name                category
dtype: object

In [116]:
train_json_tomerge.to_csv('train_json_tomerge.csv',index=False)

**Creating the test_json dataframe to merge with test_factors & writing it to csv**

In [117]:
test_json_data

Unnamed: 0,stocktwit_tweet,timestamp,ticker,pred_sentiment
0,nothing exited,2018-10-25 14:26:16+00:00,CELG,0
1,yall exhaust buyer first green candle byeeeeee,2018-07-13 13:50:39+00:00,AMD,1
2,day traders day,2018-09-25 19:10:54+00:00,AMD,0
3,URL,2018-07-27 22:45:48+00:00,CBS,0
4,weak price action far today dont afraid go short gonna break support free fall soon,2018-07-31 14:59:06+00:00,MU,-1
...,...,...,...,...
265017,recent bad pr made easy pick load thanks two idiot employees making great company,2018-07-20 15:54:50+00:00,CVS,1
265018,heres estimize analysts believe report q eps reporting bmo URL,2018-10-24 22:56:18+00:00,S,0
265019,could high still get later year either way margin callzzz shorties,2018-07-25 21:48:25+00:00,AMD,1
265020,files form q URL,2018-08-06 11:43:24+00:00,TSN,0


In [118]:
test_json_tomerge = test_json_data[['ticker','timestamp','pred_sentiment']]

In [119]:
test_json_tomerge.head()

Unnamed: 0,ticker,timestamp,pred_sentiment
0,CELG,2018-10-25 14:26:16+00:00,0
1,AMD,2018-07-13 13:50:39+00:00,1
2,AMD,2018-09-25 19:10:54+00:00,0
3,CBS,2018-07-27 22:45:48+00:00,0
4,MU,2018-07-31 14:59:06+00:00,-1


In [120]:
test_json_tomerge['day'] = test_json_tomerge.timestamp.dt.dayofyear

In [121]:
test_json_tomerge['day_name'] = test_json_tomerge.timestamp.dt.weekday_name

In [122]:
test_json_tomerge.head()

Unnamed: 0,ticker,timestamp,pred_sentiment,day,day_name
0,CELG,2018-10-25 14:26:16+00:00,0,298,Thursday
1,AMD,2018-07-13 13:50:39+00:00,1,194,Friday
2,AMD,2018-09-25 19:10:54+00:00,0,268,Tuesday
3,CBS,2018-07-27 22:45:48+00:00,0,208,Friday
4,MU,2018-07-31 14:59:06+00:00,-1,212,Tuesday


In [123]:
test_json_tomerge.dtypes

ticker                         object
timestamp         datetime64[ns, UTC]
pred_sentiment                  int64
day                             int64
day_name                       object
dtype: object

In [124]:
datatype_transformer(['day','day_name','ticker'],'category',test_json_tomerge)

Unnamed: 0,ticker,timestamp,pred_sentiment,day,day_name
0,CELG,2018-10-25 14:26:16+00:00,0,298,Thursday
1,AMD,2018-07-13 13:50:39+00:00,1,194,Friday
2,AMD,2018-09-25 19:10:54+00:00,0,268,Tuesday
3,CBS,2018-07-27 22:45:48+00:00,0,208,Friday
4,MU,2018-07-31 14:59:06+00:00,-1,212,Tuesday
...,...,...,...,...,...
265017,CVS,2018-07-20 15:54:50+00:00,1,201,Friday
265018,S,2018-10-24 22:56:18+00:00,0,297,Wednesday
265019,AMD,2018-07-25 21:48:25+00:00,1,206,Wednesday
265020,TSN,2018-08-06 11:43:24+00:00,0,218,Monday


In [125]:
test_json_tomerge.dtypes

ticker                       category
timestamp         datetime64[ns, UTC]
pred_sentiment                  int64
day                          category
day_name                     category
dtype: object

In [126]:
test_json_tomerge.to_csv('test_json_tomerge.csv',index=False)