# Predicting the alpha signal using microblogging data

# Preprocessing of Test Data

## Importing Library and Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import warnings
import spacy
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
import unicodedata
nlp = spacy.load("en_core_web_sm")
import emoji
import copy
warnings.filterwarnings("ignore")

In [53]:
Test_data_factors = pd.read_csv("test_factors.csv")

In [3]:
Test_data_factors.head(5)

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7
0,270007,21/07/18,$INTC,-3.062194,1.223466,1.741714,2.279266,-1.323573,-0.274912,-4.504449
1,270008,05/10/18,$CTSH,0.816263,-2.184408,0.157975,-0.264743,-0.836282,0.046276,0.826353
2,270009,01/10/18,$CB,0.401281,0.091604,0.083411,-1.147041,-0.485223,-0.60106,1.012811
3,270010,24/10/18,$CTAS,-0.783521,1.192929,0.813831,-0.368166,-1.113656,-0.553581,-0.683803
4,270011,27/07/18,$intc,0.796507,0.455341,0.679032,0.354336,-1.799055,0.126153,0.297111


In [4]:
jason_data = pd.read_json("test_data.json")

In [5]:
jason_data.head(5)

Unnamed: 0,records
0,{'stocktwit_tweet': '$CELG nothing to be exite...
1,{'stocktwit_tweet': '$AMD yall exhaust your bu...
2,"{'stocktwit_tweet': '$AMD day traders day.', '..."
3,{'stocktwit_tweet': '$CBS https://tenor.com/wL...
4,{'stocktwit_tweet': '$MU weak price action so ...


In [6]:
# Every entry of the `records` column is a dict describing one tweet;
# turn the list of dicts into a frame with one column per key.
Test_data_twitter = pd.DataFrame(jason_data["records"].tolist())

In [7]:
Test_data_twitter.head(5)

Unnamed: 0,stocktwit_tweet,ticker,timestamp
0,$CELG nothing to be exited about,$CELG,2018-10-25 14:26:16+00:00
1,$AMD yall exhaust your buyer on first green ca...,$AMD,2018-07-13 13:50:39+00:00
2,$AMD day traders day.,$AMD,2018-09-25 19:10:54+00:00
3,$CBS https://tenor.com/wLB8.gif,$CBS,2018-07-27 22:45:48+00:00
4,$MU weak price action so far today. Don’t be a...,$MU,2018-07-31 14:59:06+00:00


In [8]:
Test_data_twitter.shape

(265022, 3)

### Converting Timestamp to Date format

In [9]:
# Split the timestamp text on whitespace and keep the first piece
# (e.g. "2018-10-25" out of "2018-10-25 14:26:16+00:00").
Test_data_twitter['Date'] = (
    Test_data_twitter['timestamp']
    .str.split(expand=True)
    .loc[:, 0]
)

In [10]:
# Parse the day strings into a datetime64 column; the string column 'Date' is kept as well.
Test_data_twitter['date'] = (
    Test_data_twitter['Date']
    .astype(str)
    .pipe(pd.to_datetime)
)

In [11]:
# The raw timestamp is no longer needed once 'Date' / 'date' exist; remove the column in place.
del Test_data_twitter['timestamp']

In [12]:
Test_data_twitter.head(5)

Unnamed: 0,stocktwit_tweet,ticker,Date,date
0,$CELG nothing to be exited about,$CELG,2018-10-25,2018-10-25
1,$AMD yall exhaust your buyer on first green ca...,$AMD,2018-07-13,2018-07-13
2,$AMD day traders day.,$AMD,2018-09-25,2018-09-25
3,$CBS https://tenor.com/wLB8.gif,$CBS,2018-07-27,2018-07-27
4,$MU weak price action so far today. Don’t be a...,$MU,2018-07-31,2018-07-31


In [13]:
Test_data_twitter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265022 entries, 0 to 265021
Data columns (total 4 columns):
stocktwit_tweet    265022 non-null object
ticker             265022 non-null object
Date               265022 non-null object
date               265022 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 8.1+ MB


### Deleting Duplicate Records

In [14]:
Test_data_twitter.duplicated(keep=False).sum()

6245

In [15]:
# Remove fully identical rows in place, keeping the first occurrence of each (pandas' default).
Test_data_twitter.drop_duplicates(inplace=True)

In [16]:
Test_data_twitter.shape

(261142, 4)

In [17]:
#data_twitter.drop('duplicate',inplace=True,axis=1)

In [18]:
# Renumber the rows 0..n-1 after the duplicate removal; the old index is discarded.
Test_data_twitter.reset_index(drop=True, inplace=True)

In [19]:
Test_data_twitter.head(5)

Unnamed: 0,stocktwit_tweet,ticker,Date,date
0,$CELG nothing to be exited about,$CELG,2018-10-25,2018-10-25
1,$AMD yall exhaust your buyer on first green ca...,$AMD,2018-07-13,2018-07-13
2,$AMD day traders day.,$AMD,2018-09-25,2018-09-25
3,$CBS https://tenor.com/wLB8.gif,$CBS,2018-07-27,2018-07-27
4,$MU weak price action so far today. Don’t be a...,$MU,2018-07-31,2018-07-31


### Keeping original copy of data before cleaning

In [20]:

# Snapshot the de-duplicated tweets so later cells can show text before and after cleaning.
print(type(Test_data_twitter['ticker']))
original_data = Test_data_twitter.copy(deep=True)
# Print both column indexes to confirm the snapshot has the same columns as the working frame.
print(Test_data_twitter.keys(), original_data.keys(), sep="\n")

<class 'pandas.core.series.Series'>
Index(['stocktwit_tweet', 'ticker', 'Date', 'date'], dtype='object')
Index(['stocktwit_tweet', 'ticker', 'Date', 'date'], dtype='object')


### Functions to Expand Contractions

In [21]:
# Lookup table: contraction -> expanded form. Lookups are done on the lower-cased match,
# so the capitalised "I..." keys are only reachable through their lower-case twins.
contractions = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

# One alternation over every known contraction, longest first so that "can't've" is
# tried before "can't". The look-arounds reject matches glued to another word character
# or apostrophe (so "it's" inside "bit's" is left alone) while still allowing adjacent
# punctuation such as "don't," or "won't.".
# The pattern is built once from `contractions`; rebuild it if the table is edited later.
_CONTRACTION_PATTERN = re.compile(
    r"(?<![\w'])(?:"
    + "|".join(re.escape(key) for key in sorted(contractions, key=len, reverse=True))
    + r")(?![\w'])",
    flags=re.IGNORECASE,
)


def expand_contractions(text):
    """Replace every known contraction in `text` with its expanded form.

    Matching is case-insensitive and the replacement is the table entry for the
    lower-cased contraction (so "It's" becomes "it is"). Only whole contractions are
    replaced; all other characters, including whitespace and punctuation, are kept.

    Parameters
    ----------
    text : str
        Text that uses the plain ASCII apostrophe (') in contractions.

    Returns
    -------
    str
        `text` with the contractions expanded.
    """
    return _CONTRACTION_PATTERN.sub(
        # Fall back to the matched text if the lower-cased form is somehow not a key.
        lambda match: contractions.get(match.group(0).lower(), match.group(0)),
        text,
    )



### Functions to Remove Accented characters

In [22]:
def remove_accented_chars(text):
    """Return `text` reduced to ASCII.

    The text is first decomposed with Unicode NFKD normalisation (so an accented letter
    becomes its base letter plus combining marks); every character that cannot be
    encoded as ASCII is then dropped.
    """
    # https://docs.python.org/3/library/unicodedata.html
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')



### Function to scrub non-word characters and digits

In [23]:
def scrub_words(text):
    """Replace non-word characters and digits with spaces, then collapse whitespace.

    Every character that is not a regex word character (``\\W``: punctuation, symbols,
    and all whitespace including non-breaking spaces and newlines) and every digit is
    replaced by a single space; runs of whitespace are then collapsed to one space.
    Underscores count as word characters and are kept. Leading/trailing whitespace is
    collapsed to a single space, not stripped.

    Earlier versions also had separate substitutions for ``\\xa0``, for
    "newline + following word", and for ``<...>`` markup. Those never changed the
    result: ``\\xa0`` is already covered by ``\\W``, and the other two ran after the
    ``\\W`` replacement had removed every newline and ``<``. They are therefore omitted;
    the output is unchanged.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    # Raw string: "\W" / "\d" / "\s" are regex escapes, not Python string escapes.
    text = re.sub(r"\W|\d", " ", text)

    # Collapse the runs of spaces produced above into single spaces.
    return re.sub(r"\s+", " ", text)

### Functions to clean Twitter related Data

In [24]:
#!pip install emoji

In [25]:
def cleaning_twitter_data(text):
    """Normalise one tweet into lower-case plain text.

    Steps, in order: drop URLs, convert emoji to text, turn underscores into spaces,
    strip and lower-case, drop @mentions and $cashtags, expand contractions, reduce to
    ASCII, and finally scrub non-word characters and digits.

    Parameters
    ----------
    text : str
        Tweet text (HTML is expected to have been stripped beforehand).

    Returns
    -------
    str
        The cleaned text; it may start or end with a single space.
    """
    # Remove http/https addresses and "www." addresses. The dot is escaped so that
    # ordinary text containing "www" + any character (e.g. "awwww") is not deleted.
    text = re.sub(r'http\S*|www\.\S*', '', text)
    text = emoji.demojize(text)  # Convert emoji into text
    text = re.sub("_", ' ', text)  # Replace "_" with a space
    text = text.strip().lower()  # Trim and convert to lower case
    text = re.sub(r"@[\w]*", "", text)  # Remove @mentions
    text = re.sub(r"\$[\w]*", "", text)  # Remove $cashtags (the ticker lives in its own column)

    # NOTE(review): intended to remove hashtags, but "[#+]?\B" only drops a '#'/'+' that is
    # NOT directly followed by a word character; in "#tag" the '#' survives this step and
    # is turned into a space by scrub_words below. Left unchanged to keep the output stable.
    text = re.sub(r"[#+]?\B", "", text)

    # Normalise the typographic apostrophe first so the contraction table can match.
    text = expand_contractions(re.sub('’', "'", text))
    text = remove_accented_chars(text)  # Drop accents / non-ASCII characters
    text = scrub_words(text)  # Replace non-word characters and digits, collapse spaces
    return text

def strip_html_tags(text):
    """Return the text content of `text` as extracted by BeautifulSoup's "html.parser"."""
    return BeautifulSoup(text, "html.parser").get_text()

In [26]:
# Strip HTML first, then run the tweet-specific normalisation on the remaining text.
Test_data_twitter['stocktwit_tweet'] = (
    Test_data_twitter['stocktwit_tweet']
    .apply(strip_html_tags)
    .apply(cleaning_twitter_data)
)

In [27]:
## Using TextBlob to spell correct.

#import textblob
#from textblob import TextBlob

#data_twitter['stocktwit_tweet'] = [TextBlob(text).correct() for text in data_twitter['stocktwit_tweet']]

In [28]:
print("Original data: \n",original_data['stocktwit_tweet'][1])
print("\n\n**************************************************************************\n\n")
print("Clean data: \n",Test_data_twitter['stocktwit_tweet'][1])

Original data: 
 $AMD yall exhaust your buyer on first green candle,,,, byeeeeee


**************************************************************************


Clean data: 
  yall exhaust your buyer on first green candle byeeeeee


### Remove dollar sign from ticker column

In [29]:
# "$" is a regex metacharacter (end-of-string anchor). Whether pandas treats the pattern
# as a regex by default depends on the pandas version, so request a literal replacement
# explicitly, then upper-case so tickers such as "$intc" and "$INTC" agree.
Test_data_twitter['ticker'] = (
    Test_data_twitter['ticker']
    .str.replace("$", "", regex=False)
    .str.upper()
)

In [30]:
Test_data_twitter.to_csv("first.csv")

In [31]:
#data_twitter=pd.read_csv("first.csv")

### Function to remove stop words and lemmatize the words

In [32]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 

# `stopwords` is spaCy's own STOP_WORDS set (not a copy), so the edits below modify
# that shared set for the rest of the kernel session.
stopwords = spacy.lang.en.stop_words.STOP_WORDS
print('Number of stop words: %d' % len(stopwords))
print('First ten stop words: %s' % list(stopwords)[:10])

# Keep the negations "no" / "not" in the text. difference_update() is used instead of
# remove() so that re-running this cell does not raise KeyError once they are gone.
stopwords.difference_update({'no', 'not'})
# Extra tokens to treat as stop words.
stopwords.update({'utm', 'source', 'stocktwits'})

# `stemmer` is only referenced by a commented-out line in stopword_remove_lemma.
stemmer = PorterStemmer()
tokenizer = ToktokTokenizer()
lemmatizer = WordNetLemmatizer()
def stopword_remove_lemma(text):
    """Tokenize `text`, drop stop words, lemmatize what is left and re-join with spaces.

    Uses the module-level `tokenizer`, `stopwords` and `lemmatizer`.
    """
    stripped_tokens = (raw_token.strip() for raw_token in tokenizer.tokenize(text))
    lemmas = [
        lemmatizer.lemmatize(candidate)
        for candidate in stripped_tokens
        if candidate not in stopwords
    ]
    return ' '.join(lemmas)

Number of stop words: 326
First ten stop words: ['less', 'no', 'becoming', 'before', 'then', 'neither', 'did', 'whereas', 'may', 'amongst']


In [33]:
# Remove stop words and lemmatize every cleaned tweet.
Test_data_twitter['stocktwit_tweet'] = (
    Test_data_twitter['stocktwit_tweet'].apply(stopword_remove_lemma)
)


In [34]:
print("Original data: \n",original_data['stocktwit_tweet'][1])
print("\n\n**************************************************************************\n\n")
print("Clean data: \n",Test_data_twitter['stocktwit_tweet'][1])

Original data: 
 $AMD yall exhaust your buyer on first green candle,,,, byeeeeee


**************************************************************************


Clean data: 
 yall exhaust buyer green candle byeeeeee


In [35]:
Test_data_twitter.to_csv('after_clean_data.csv')

### Check for Null values

In [36]:
Test_data_twitter.isnull().sum()

stocktwit_tweet    0
ticker             0
Date               0
date               0
dtype: int64

In [37]:
Test_data_twitter[Test_data_twitter['stocktwit_tweet']==""].count()

stocktwit_tweet    7447
ticker             7447
Date               7447
date               7447
dtype: int64

In [38]:
# Keep only tweets that still contain text after cleaning (the index is not renumbered).
Test_data_twitter = Test_data_twitter.loc[Test_data_twitter['stocktwit_tweet'].ne("")]

In [39]:
Test_data_twitter['stocktwit_tweet'].isnull().sum()

0

In [40]:
Test_data_twitter.shape

(253695, 4)

In [41]:
Test_data_twitter.to_csv("Test_data_twitter_pre_data.csv")

# Preprocessing of Factors data

In [42]:
Test_data_factors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11575 entries, 0 to 11574
Data columns (total 10 columns):
Id        11575 non-null int64
date      11575 non-null object
ticker    11575 non-null object
SF1       11575 non-null float64
SF2       11575 non-null float64
SF3       11575 non-null float64
SF4       11575 non-null float64
SF5       11575 non-null float64
SF6       11575 non-null float64
SF7       11575 non-null float64
dtypes: float64(7), int64(1), object(2)
memory usage: 904.4+ KB


In [54]:
# Factor dates are day/month/two-digit-year strings such as "21/07/18".
Test_data_factors['date'] = (
    Test_data_factors['date']
    .astype(str)
    .pipe(pd.to_datetime, format="%d/%m/%y")
)

In [55]:
Test_data_factors.isnull().sum()

Id        0
date      0
ticker    0
SF1       0
SF2       0
SF3       0
SF4       0
SF5       0
SF6       0
SF7       0
dtype: int64

In [56]:
Test_data_factors.duplicated(keep=False).sum()

0

### Remove dollar sign from ticker

In [57]:
# "$" is a regex metacharacter (end-of-string anchor). Whether pandas treats the pattern
# as a regex by default depends on the pandas version, so request a literal replacement
# explicitly, then upper-case so tickers such as "$intc" and "$INTC" agree.
Test_data_factors['ticker'] = (
    Test_data_factors['ticker']
    .str.replace("$", "", regex=False)
    .str.upper()
)

In [58]:
Test_data_factors.head()

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7
0,270007,2018-07-21,INTC,-3.062194,1.223466,1.741714,2.279266,-1.323573,-0.274912,-4.504449
1,270008,2018-10-05,CTSH,0.816263,-2.184408,0.157975,-0.264743,-0.836282,0.046276,0.826353
2,270009,2018-10-01,CB,0.401281,0.091604,0.083411,-1.147041,-0.485223,-0.60106,1.012811
3,270010,2018-10-24,CTAS,-0.783521,1.192929,0.813831,-0.368166,-1.113656,-0.553581,-0.683803
4,270011,2018-07-27,INTC,0.796507,0.455341,0.679032,0.354336,-1.799055,0.126153,0.297111


In [59]:
Test_data_twitter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253695 entries, 0 to 261141
Data columns (total 4 columns):
stocktwit_tweet    253695 non-null object
ticker             253695 non-null object
Date               253695 non-null object
date               253695 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 9.7+ MB


In [60]:
Test_data_twitter.to_csv("Test_data_twitter.csv")

In [61]:
Test_data_factors.to_csv("Test_data_factors.csv")