In [80]:
import numpy as np
import pandas as pd
import json
from glob import glob
from pandas.io.json import json_normalize

In [81]:



def total_bear_bull_tag(filenames):
    
    stocks = {}
    for f in filenames:
        #print(f)
        df = pd.read_json(f)
        df.fillna(value = pd.np.nan , inplace=True)
        data = pd.DataFrame([df.body, df.sentiment]).transpose()
        data['sentiment'] = data['sentiment'].apply(lambda x: {} if pd.isna(x) else x)
        sentiment_data = json_normalize(data=data['sentiment'], 
                                meta=['class', 'name'])
        new_data = pd.concat([data, sentiment_data], axis=1)
        new_data = new_data.drop(columns=['sentiment', 'name'])
        
        #print(new_data['class'].value_counts())
        tags = new_data['class'].value_counts().keys().tolist()
        values  = new_data['class'].value_counts().tolist()
        
        bear_bull_values = {tags[i]: values[i] for i in range(len(tags))}
        stocks[f] = bear_bull_values
    return stocks 

In [None]:
filenames = glob("../twits/*.json")
total_bear_bull_tag(filenames)


#### This tag_polarity_breakdown function take a file and the value n (the number of days for the rolling average being calculated) and returns a dataframe with these columns:

1. Date of the Tweets created
2. The numbers of each of the three message tags (None, bearish, bullish) on that date. 
3. Message volume for the day
4. The polarity of the stock for each day, which is the difference in the number of bullish and bearish
    tags divided by the total message volume for that day
5. s_t: which is a rolling average where t indicates the number of days for the rolling average
6.  m1v_t: which is the difference between today's message volume and yesterday's message volume
    divided by yesterday's message volume
7. mv10_t: which is today's message volume divided by the average message volume in the previous 10 days



In [123]:


def tag_polarity_breakdown(file, t): 
    
    #n is the value of the number of days for rolling average
    data = pd.read_json(file)
    data.fillna(value = pd.np.nan , inplace=True)
    data['sentiment'] = data['sentiment'].apply(lambda x: {} if pd.isna(x) else x)
    bear_bull_tags = json_normalize(data=data['sentiment'], 
                                    meta=['class', 'name'])
    date_data = data['created_at']


    new_data = pd.concat([data['body'], bear_bull_tags, date_data], axis=1)
    new_data  = new_data.drop(columns = 'name')
    new_data['created_at'] = pd.to_datetime(new_data['created_at']).apply(lambda x: x.date())
    new_data['class'] = new_data['class'].replace(np.nan, 'None', regex=True)


    total_daily_messages = new_data['created_at'].value_counts().rename_axis('unique dates').reset_index(name='message_volume')

    new_data['count'] = 1
    daily_tag_breakdown = new_data.pivot_table(
        index=['created_at'], dropna = False, columns='class', values='count',
        fill_value=0, aggfunc=np.sum
    )

    total_daily_messages = total_daily_messages.sort_values(by='unique dates').reset_index(drop= True)
    daily_tag_breakdown = daily_tag_breakdown.reset_index()
    daily_tag_breakdown = pd.concat([daily_tag_breakdown, total_daily_messages['message_volume']], axis=1)
              #remove categories
    daily_tag_breakdown

    '''Polarity: Calculating the difference of the number of bullish and bearish tags and 
    divide it by the total number of messages tagged for each stock in each day, 
    '''
    daily_tag_breakdown['polarity'] = daily_tag_breakdown.apply(lambda row: (row.bullish - row.bearish)/(row.message_volume), axis=1)
    daily_tag_breakdown['s_t'] = daily_tag_breakdown['polarity'].rolling(t).mean()
    daily_tag_breakdown['mv1_t'] = daily_tag_breakdown['message_volume'].diff(periods = 1).div(daily_tag_breakdown['message_volume'].shift(1))
    daily_tag_breakdown['mv10_t'] = daily_tag_breakdown['message_volume'].div(daily_tag_breakdown['message_volume'].rolling(10).mean())
    
    #m1_t = message volume difference between current day and day before divided by message volume of day before
    return daily_tag_breakdown

In [124]:
file_set_path = "../twits/FB.json"
n = 3
tag_polarity_breakdown(file_set_path, n)

Unnamed: 0,created_at,None,bearish,bullish,message_volume,polarity,s_t,m1_t,m10_t
0,2015-07-11,36,1,15,52,0.269231,,,
1,2015-07-12,34,2,18,54,0.296296,,0.038462,
2,2015-07-13,183,13,112,308,0.321429,0.295652,4.703704,
3,2015-07-14,177,35,111,323,0.235294,0.284340,0.048701,
4,2015-07-15,188,14,70,272,0.205882,0.254202,-0.157895,
...,...,...,...,...,...,...,...,...,...
496,2016-11-18,367,73,220,660,0.222727,0.157602,0.380753,1.159522
497,2016-11-19,104,12,44,160,0.200000,0.204368,-0.757576,0.300019
498,2016-11-20,106,6,43,155,0.238710,0.220479,-0.031250,0.347846
499,2016-11-21,444,48,398,890,0.393258,0.277323,4.741935,1.868570


In [92]:
print(daily_tag_breakdown['polarity'].value_counts())

 0.000000    387
 0.500000     46
 0.250000     45
 0.166667     36
 0.333333     36
            ... 
 0.409091      1
-0.238095      1
 0.033333      1
-0.189781      1
-0.150000      1
Name: polarity, Length: 158, dtype: int64


Unnamed: 0,body,class,created_at,count
0,$AXP analysts on Estimize are expecting -18.44...,,2016-11-30,1
1,$AXP price to book ratio is now at 2009 levels...,bullish,2016-11-30,1
2,Try using shareholder yield instead of dividen...,,2016-11-30,1
3,hmm is the $AXP bandwagon done with?!,,2016-11-30,1
4,$AXP reached 50% at ( $73.16 ) is it time to T...,,2016-11-30,1
...,...,...,...,...
11425,$AXP Other Financial Highlights for 3QFY13. ht...,,2013-12-02,1
11426,$AXP Small Business SaturdayÂ® Gives Small Bus...,,2013-12-02,1
11427,Bullish Ideas $AAPL $GOOG $DIS $DECK $PAY $XLF...,,2013-12-02,1
11428,10 Numbers That Show Why Small Business Saturd...,,2013-12-02,1
