In [32]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from dateutil.parser import parse
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import chi2

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sohnnick/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


***Frame Databases and Obtain Metrics***

In [33]:
# Load the NKE message dump (JSON content stored in a .txt file) into `nke_json`.
with open('NKE-explore.txt') as f:
    nke_json = json.load(f)

In [34]:
def convert_json2df(json_data):
    """Turn a list of message records into a date-sorted DataFrame.

    Parameters
    ----------
    json_data : iterable of dict
        Each record must provide the keys 'created_at', 'sentiment' and 'body'.
        'sentiment' is either None or a dict with a 'class' entry.

    Returns
    -------
    pandas.DataFrame
        Columns 'date', 'sentiment', 'body' with a fresh 0..n-1 index, ordered by
        date. Messages sharing a date keep their input order.
    """
    records = {'date': [], 'sentiment': [], 'body': []}
    for message in json_data:
        sentiment = message['sentiment']
        # unlabeled messages are stored as the literal string 'None'
        records['sentiment'].append(sentiment['class'] if sentiment is not None else 'None')
        # only the first 16 characters ("Tue, 03 Dec 2013") are parsed, so the
        # time of day is dropped and every message collapses onto its date
        records['date'].append(datetime.strptime(message['created_at'][:16], '%a, %d %b %Y'))
        records['body'].append(message['body'])

    # mergesort is stable: the default sort does not guarantee the relative order
    # of rows that share the same date
    return (pd.DataFrame(data=records)
            .sort_values(by='date', kind='mergesort')
            .reset_index(drop=True))

In [35]:
#build the message dataframe (date, sentiment, body) from the loaded NKE JSON
text_nke = convert_json2df(nke_json)

In [36]:
#preview the first 20 rows of the message dataframe
text_nke.head(20)

Unnamed: 0,date,sentiment,body
0,2013-12-03,,A lot of stocks now trading green: $GS $WFC $...
1,2013-12-03,,"Nike's EVP & CFO just cashed-in 33,000 options..."
2,2013-12-04,,China dragged down a lot of giant companies la...
3,2013-12-04,bullish,$NKE Buying the dip.
4,2013-12-04,,"Drake Signs with Jordan Brand, Kanye with adid..."
5,2013-12-04,,$NKE NIKE Redefines Basketball Footwear with t...
6,2013-12-04,,$NKE gives you a chance to design shoes for Ti...
7,2013-12-05,,On http://stks.co/f00Qo ( http://stks.co/f00Qp...
8,2013-12-05,,On The Yield Game ( http://stks.co/dx9q ) Now ...
9,2013-12-05,,$JCP from my profit going to get some $NKE clo...


In [37]:
def obtain_metrics(df):
    """Aggregate message sentiment and volume per date.

    Expects `df` to carry a 'date' and a 'sentiment' column. For every distinct
    date (ascending) the result holds:

    - 'polarity'  : (bullish count - bearish count) / number of messages
    - 'st'        : mean of the polarity of this date and the two preceding dates
                    (None for the first two dates)
    - 'msgvolume' : number of messages
    - 'mv1t'      : relative volume change versus the preceding date (None for the first)
    - 'mv10t'     : volume divided by the mean volume of the 10 preceding dates
                    (None for the first ten)

    "Preceding" means the previous date present in `df`, not the previous calendar day.
    """
    date_list = sorted(set(df['date']))
    by_date = df.groupby('date')

    polarity_list, volume_list = [], []
    vchange1, vchange10, polarity_movaverage = [], [], []

    for idx, day in enumerate(date_list):
        labels = list(by_date.get_group(day)['sentiment'])
        n_msgs = len(labels)

        polarity_list.append((labels.count('bullish') - labels.count('bearish')) / n_msgs)
        volume_list.append(n_msgs)

        # 1-step relative change in message volume
        if idx > 0:
            previous = volume_list[idx - 1]
            vchange1.append((volume_list[idx] - previous) / previous)
        else:
            vchange1.append(None)

        # volume relative to the trailing 10-date average (current date excluded)
        if idx >= 10:
            vchange10.append(n_msgs / (sum(volume_list[idx - 10:idx]) / 10))
        else:
            vchange10.append(None)

        # 3-date moving average of polarity
        if idx < 2:
            polarity_movaverage.append(None)
        else:
            window_total = polarity_list[idx] + polarity_list[idx - 1] + polarity_list[idx - 2]
            polarity_movaverage.append(window_total / 3)

    #create a dataframe with the results
    return pd.DataFrame(data={
        'date': date_list,
        'polarity': polarity_list,
        'st': polarity_movaverage,
        'msgvolume': volume_list,
        'mv1t': vchange1,
        'mv10t': vchange10,
    })

In [38]:
#compute the per-date sentiment/volume metrics for the NKE messages and save them to CSV
df_volume = obtain_metrics(text_nke)
# NOTE(review): not the last expression in the cell, so this preview is never displayed
df_volume.head(20)
df_volume.to_csv("df_volumemetrics.csv")

In [39]:
#load the NKE price history and order it by the 'Date' column
# NOTE(review): 'Date' is not parsed as datetime here, so the ordering is a plain
# string sort - correct only for ISO-style yyyy-mm-dd values; confirm the file format
df_price = pd.read_csv('NKE-explore.csv')
# reset the index after sorting: the forward-return columns are later attached with
# pd.concat(..., axis=1), which aligns rows on index labels rather than on position,
# so a stale pre-sort index would pair returns with the wrong dates
df_price = df_price.sort_values(by='Date').reset_index(drop=True)

In [40]:
def prediction_target(df_price):
    """Compute forward 3-row and 5-row returns of the closing price.

    Reads the 'CLOSE' column of `df_price` in its current row order. For row i the
    h-step return is (close[i+h] - close[i]) / close[i]; rows that have no row
    h steps ahead get None.

    Returns a dict with the keys 'rt3' and 'rt5', each a list with one entry per row.
    """
    #only interested in the closing price
    closes = list(df_price['CLOSE'])
    n_rows = len(df_price)

    def forward_return(pos, horizon):
        # no observation `horizon` rows ahead -> target is undefined
        if pos + horizon >= n_rows:
            return None
        return (closes[pos + horizon] - closes[pos]) / closes[pos]

    return {
        'rt3': [forward_return(pos, 3) for pos in range(n_rows)],
        'rt5': [forward_return(pos, 5) for pos in range(n_rows)],
    }

In [41]:
#compute the forward 3-row and 5-row returns and frame them as columns rt3 / rt5
df_prediction = prediction_target(df_price)
df_prediction = pd.DataFrame(data=df_prediction)

In [42]:
#attach the forward-return columns to the price table side by side
#pd.concat with axis=1 aligns rows on index labels, not on position
df_metricsandvalue = pd.concat([df_price, df_prediction], axis=1)
df_metricsandvalue

Unnamed: 0,Date,OPEN,HIGH,LOW,VOLUME,CLOSE,rt3,rt5
0,2013-08-26,30.810515,30.968887,30.676139,5472400,30.676139,-0.003582,0.009133
1,2013-08-27,30.359395,30.589756,30.133837,4870800,30.143435,0.003485,0.040384
2,2013-08-28,30.100242,30.570559,30.066649,6044000,30.412186,0.017891,0.035149
3,2013-08-29,30.330330,30.749243,30.277363,4600000,30.566269,0.025992,0.023630
4,2013-08-30,30.677016,30.715537,30.142539,5851000,30.248472,0.040751,0.041070
...,...,...,...,...,...,...,...,...
823,2016-11-30,50.338380,50.627339,49.889998,8198400,49.889998,0.039286,
824,2016-12-01,50.110001,51.250000,50.060001,7610100,50.650002,-0.001580,
825,2016-12-02,50.259998,50.680000,50.110001,6730200,50.459999,,
826,2016-12-05,50.799999,52.270000,50.700001,11995300,51.849998,,


In [43]:
#persist the price table with forward returns (the index is written as the first CSV column)
df_metricsandvalue.to_csv("df_stockmovement.csv")

***Pre Processing***

In [44]:
#build `remove`: every token that must never become a word feature
#1) the values of the 'Symbol' column of the constituents file (ticker symbols)
list_of_symbols = pd.read_csv('constituents_csv.csv')
remove = list(list_of_symbols['Symbol'])
#2) single punctuation/operator characters, one list entry per character
temp = list('{}()[].,:;+-*/&|<>=~@#$?%!&*')
#3) hand-picked tokens: tokenizer fragments ('s, n't, ``), URL pieces and
#   tracking parameters, plus the company name itself
remove1 = ['http', '\'s', '``', '\'\'', '...', '--', '..', 'puc=yahoo', 'cm_ven=YAHOO',
          'yptr=yahoo', '//dividendvaluebuilder.com/nike-nke-dividend-stock-analysis/',
          'utm_medium=eps_update', '//marketrealist.com/', 'n\'t', 'utm_source=stocktwits',
          '//www.estimize.com/intro/nke', 'utm_content=NKE', 'chart=historical', '\'',
          '\'m', 'utm_medium=reporting_this_week_consensus', '//simplywall.st/NYSE',
          'utm_medium=stocktwits', '//link.scoutfin.com/8gyk/SHiJ2vhB2t', 'nke', 'Nike', 'I',
          '//bit.ly/TTSNKE', 'chart=scatter-plot', 'past-future-earnings', 'anchor=past-future-earnings']
#the combined list is read as a global by word_dictionary
remove = remove + temp + remove1

In [45]:
def list_of_words(text):
    """Tokenize every message body and return all tokens as one flat list.

    Parameters
    ----------
    text : mapping or DataFrame with a 'body' entry holding the message strings.

    Returns
    -------
    list of str
        Tokens of all messages, in message order.
    """
    wordlist = []
    for message in text['body']:
        # extend in place; `wordlist = wordlist + tokens` re-copies the whole
        # accumulated list for every message
        wordlist.extend(word_tokenize(message))
    return wordlist

In [46]:
#tokenize every message body into one flat list of tokens
wordlist = list_of_words(text_nke)

In [47]:
#total number of tokens across all messages
len(wordlist)

565316

In [48]:
#create function to check if string is a date
def is_date(string, fuzzy=False):
    """Return True when dateutil can parse `string` as a date, else False.

    Parameters
    ----------
    string : str
        Candidate token.
    fuzzy : bool, default False
        Forwarded to dateutil's `parse`.
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError):
        # ValueError: not recognisable as a date.
        # OverflowError: dateutil raises it for numbers too large to be a date
        # component (e.g. very long digit strings); that is not a date either.
        return False
    return True

In [49]:
#create dictionary and filter the keys
def word_dictionary(wordlist):
    """Count token frequencies and keep only the tokens usable as features.

    A token is dropped when any of the following holds:
    it occurs fewer than 25 times, it is an NLTK stopword, it is listed in the
    module-level `remove` list, it is purely numeric, or `is_date` accepts it.

    Prints the vocabulary size before and after filtering.

    Parameters
    ----------
    wordlist : iterable of str

    Returns
    -------
    dict
        token -> count for the surviving tokens, in first-occurrence order.
    """
    #get counts for each unique word
    word_dict = {}
    for word in wordlist:
        word_dict[word] = word_dict.get(word, 0) + 1
    print('total unique words:', len(word_dict))

    # Build the lookup sets once: calling stopwords.words() inside the loop
    # rebuilt the stopword list for every candidate word.
    # NOTE(review): stopwords.words() is called without a language, which
    # presumably returns the stopwords of every language NLTK ships - confirm
    # that this (rather than 'english') is intended.
    stop_words = set(stopwords.words())
    removal_set = set(remove)

    def is_unwanted(word, count):
        # cheap checks first; is_date (a full date parse) runs last
        return (count < 25
                or word in stop_words
                or word in removal_set
                or word.isnumeric()
                or is_date(word))

    #filter the words
    word_dict = {word: count for word, count in word_dict.items()
                 if not is_unwanted(word, count)}
    print('total unique words after filtering:', len(word_dict))
    return word_dict

In [50]:
#count token frequencies and drop rare words, stopwords, listed symbols/tickers, dates and numbers
word_dict = word_dictionary(wordlist)

total unique words: 37828
total unique words after filtering: 1482


In [51]:
#check which words occur most frequently and discover any data that needs to be filtered
#sorted(word_dict.items(), key=lambda x:x[1], reverse=True)

In [52]:
#the surviving vocabulary, in dictionary order, becomes the feature list
word_features = list(word_dict.keys())

Restructure the dataframe so that all posts from a given day form one document, then compute the TF-IDF weights.

In [53]:
#inspect the first message body
text_nke['body'][0]

'A lot of stocks now trading green:  $GS $WFC $JPM $PG $KO $TRV $VZ $XOM $CVX $NKE $PM $CLX $MCD'

In [54]:
def tf(wordlist, word_features):
    """Count how often each feature word occurs in `wordlist`.

    The values are raw occurrence counts; no smoothing or length
    normalisation is applied.

    Parameters
    ----------
    wordlist : iterable of str
        Tokens of one document.
    word_features : iterable of str
        Vocabulary to count; expected to contain each word once.

    Returns
    -------
    dict
        feature -> count, with one key per feature (0 when absent), in the
        order of `word_features`.
    """
    data = dict.fromkeys(word_features, 0)
    # single pass over the tokens instead of rescanning the whole token list
    # once per feature
    for word in wordlist:
        if word in data:
            data[word] += 1
    return data

In [55]:
def get_tf_dataframe(text, word_features):
    """Build the per-date term-count table.

    All message bodies sharing a date are tokenized and pooled into one
    document; `tf` then counts the feature words in that document.

    Parameters
    ----------
    text : DataFrame with 'date' and 'body' columns.
    word_features : list of str

    Returns
    -------
    DataFrame
        One row per distinct date (ascending): a 'date' column followed by one
        count column per feature word.
    """
    dates = sorted(set(text['date']))

    counts_by_date = []
    for day in dates:
        day_tokens = []
        for message in text.loc[text['date'] == day, 'body']:
            day_tokens.extend(word_tokenize(message))
        counts_by_date.append(tf(day_tokens, word_features))

    df_tf = pd.DataFrame(data=counts_by_date)
    # put the date column in front of the count columns
    date_df = pd.DataFrame({'date': dates})
    return pd.concat([date_df, df_tf], axis=1)

In [56]:
#per-date term counts: one row per date, one column per feature word
df_tf = get_tf_dataframe(text_nke, word_features)
df_tf

Unnamed: 0,date,lot,stocks,trading,green,EVP,CFO,cashed-in,options,China,...,Brean,DWTI,lights,7x,5x,E2,https,NKE/,ApparelAndAccessories,0.51
0,2013-12-03,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2013-12-04,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2013-12-05,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2013-12-06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2013-12-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1077,2016-12-01,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,2,0,0,0
1078,2016-12-02,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,3,0,0,0
1079,2016-12-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1080,2016-12-04,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
def get_idf(text, word_features):
    """Compute a smoothed inverse document frequency for every feature word.

    A "document" is the pool of all message bodies sharing one date. For each
    word the result is log(N / (1 + d)), where N is the number of distinct
    dates and d the number of dates whose posts contain the word. Because the
    count starts at 1, a word present on every date gets a slightly negative value.

    Parameters
    ----------
    text : DataFrame with 'date' and 'body' columns.
    word_features : list of str

    Returns
    -------
    dict
        feature -> idf value, in the order of `word_features`.
    """
    dates = sorted(set(text['date']))

    # start every count at 1 so the later division can never hit zero
    doc_counts = dict.fromkeys(word_features, 1)

    for day in dates:
        # a set makes each membership test O(1); scanning a token list for
        # every feature word made this loop features x tokens per date
        day_tokens = set()
        for message in text.loc[text['date'] == day, 'body']:
            day_tokens.update(word_tokenize(message))
        for word in word_features:
            if word in day_tokens:
                doc_counts[word] += 1

    n_docs = len(dates)
    return {word: np.log(n_docs / count) for word, count in doc_counts.items()}

In [58]:
#obtain idf values for features (each date's posts count as one document)
dict_idf = get_idf(text_nke, word_features)

In [59]:
#sanity check: 'earnings' is the most frequent word by absolute count (not necessarily by
#number of documents), so it presumably also occurs in many of the daily documents and its
#log(number of documents / number of documents containing 'earnings') should be relatively low
dict_idf['earnings']

1.062310661991895

In [60]:
#scratch check: confirm that pd.concat(axis=1) puts a date column in front of a
#frame built from a list of dicts (the construction used in get_tf_dataframe)
temp = [{'a': 3, 'b': -1, 'c': 5}, {'a': -1, 'b': 1, 'c': 4}]
dates = {'date': [1000, 2000]}
dates = pd.DataFrame(dates)
frog = pd.DataFrame(temp)
df_temp = pd.concat([dates, frog], axis=1)
print(df_temp)

   date  a  b  c
0  1000  3 -1  5
1  2000 -1  1  4


In [61]:
def get_tfidf(df_tf, dict_idf):
    """Scale every term-count column by its idf weight.

    Parameters
    ----------
    df_tf : DataFrame
        Term counts; must contain one column per key of `dict_idf`. Other
        columns (e.g. 'date') are passed through unchanged.
    dict_idf : dict
        word -> idf weight.

    Returns
    -------
    DataFrame
        A new frame with the weighted columns. `df_tf` itself is left untouched,
        so calling this twice on the same input gives the same result.
    """
    # work on a copy: assigning into df_tf directly overwrote the caller's
    # counts and made a second call multiply by the idf again
    tfidf = df_tf.copy()
    for key, weight in dict_idf.items():
        tfidf[key] = tfidf[key] * weight
    return tfidf

In [62]:
#obtain the final TFIDF dataframe and save it to CSV
df_tfidf = get_tfidf(df_tf, dict_idf)
# NOTE(review): not the last expression in the cell, so this preview is never displayed
df_tfidf.head(5)
df_tfidf.to_csv("df_tfidf.csv")