In [1]:
import pandas as pd
import numpy as np

In [3]:
# CSV with one row per news article, including its relevance and sentiment scores.
fname = 'dataset/nasdaq/overnight_sentiments.csv'

# Use the first CSV column as the row index.
data = pd.read_csv(fname, index_col=0)

# Preview the first three rows.
data.head(3)

Unnamed: 0,datetime,stockcode,source,headline,article,urls,dt,IsMarketOpen,TradeDate,summary,_relevance,_sentiment
0,"February 07, 2019, 09:11:00 PM EDT",AMZN,RTTNews,Amazon CEO Jeff Bezos Accuses National Enquire...,\n\n\nShutterstock photo\n\n@media screen and ...,https://www.nasdaq.com/article/amazon-ceo-jeff...,2019-02-07 21:11:00-05:00,False,2019-02-08,Shutterstock photo@media screen and (Amazon CE...,11,0.717833
2,"February 07, 2019, 06:55:00 PM EDT",AMZN,Reuters,Amazon's Bezos says National Enquirer owner tr...,\n\n\nReuters\n\n@media screen and (max-device...,https://www.nasdaq.com/article/amazons-bezos-s...,2019-02-07 18:55:00-05:00,False,2019-02-08,"Jeff Bezos, chief executive of Amazon.com Inc,...",5,0.34276
3,"February 07, 2019, 06:26:00 PM EDT",AMZN,Reuters,Amazon's Bezos says National Enquirer tried to...,\n\n\nReuters\n\n@media screen and (max-device...,https://www.nasdaq.com/article/amazons-bezos-s...,2019-02-07 18:26:00-05:00,False,2019-02-08,"Jeff Bezos, chief executive of Amazon.com Inc,...",5,0.45036


In [47]:
def GetAvgSentiment( l_sentiments, l_relevances, threshold = 1, method = 'simple'):
    '''
    Collapse a group of sentiment scores into a single average score.

    Parameters
    ----------
    l_sentiments : sequence of numbers (list, numpy array or pandas Series)
        Sentiment score of each item.
    l_relevances : sequence of numbers, same length as l_sentiments
        Relevance score of each item. Scores and relevances are paired by
        position, so a pandas Series may carry any index.
    threshold : number, default 1
        Only used by method = 'simple': items whose relevance is below this
        value are left out of the average.
    method : {'simple', 'weighted'}, default 'simple'
        1) "simple": equally weighted average of the sentiments whose
           relevance >= "threshold"
        2) "weighted": average of all sentiments weighted by relevance
           ("threshold" is not applied)

    Returns
    -------
    float
        The average score. np.nan is returned when no item passes the
        threshold ('simple') or when the relevances sum to zero ('weighted').
    None
        When "method" is not recognised (a message is printed).

    Raises
    ------
    ValueError
        If l_sentiments and l_relevances do not have the same length.
    '''

    if method not in ('simple', 'weighted'):
        print(f'Method = {method} is not available')
        return None

    # Work on plain arrays: indexing a pandas Series with an integer is a
    # label lookup, which fails (or mispairs values) unless the index is 0..n-1.
    sentiments = np.asarray(l_sentiments, dtype = float)
    relevances = np.asarray(l_relevances, dtype = float)

    if sentiments.shape != relevances.shape:
        raise ValueError(
            'l_sentiments and l_relevances must have the same length '
            f'(got {sentiments.shape} and {relevances.shape})'
        )

    if method == 'simple':
        # --- Method 1 ---
        # simple average sentiment with threshold cut off
        kept = sentiments[relevances >= threshold]

        # Averaging an empty selection would emit a RuntimeWarning;
        # report "no score" explicitly instead.
        if kept.size == 0:
            return np.nan

        return kept.mean()

    # --- Method 2 ---
    # weighted average sentiment w.r.t. relevance
    total_relevance = relevances.sum()

    # No usable weights (also covers empty input).
    if total_relevance == 0:
        return np.nan

    return np.dot(sentiments, relevances) / total_relevance

## Generate Daily Sentiments

In [55]:
# Tickers in order of first appearance.
l_tickers = data['stockcode'].unique()

# Collected rows of [stock, trade date, average sentiment].
l_to_df = []

# groupby splits the frame once per level instead of re-scanning it with a
# boolean mask for every ticker and every date. sort = False keeps groups in
# order of first appearance, i.e. the same order .unique() would give.
# Rows with a missing stockcode or TradeDate are left out, because groupby
# drops NaN keys by default.
for stock, idf in data.groupby('stockcode', sort = False):
    print(f'Calculating Daily Sentiments for {stock}...')

    for idate, inews in idf.groupby('TradeDate', sort = False):
        # Plain arrays keep scores and relevances paired by position,
        # whatever row labels the group carries.
        ss = GetAvgSentiment(
            inews['_sentiment'].to_numpy(),
            inews['_relevance'].to_numpy(),
            threshold = 1, method = 'simple'
        )
        l_to_df.append(
            [stock, idate, ss]
        )

Calculating Daily Sentiments for AMZN...
Calculating Daily Sentiments for GOOGL...
Calculating Daily Sentiments for FB...
Calculating Daily Sentiments for NFLX...


### Quick Look at the data

In [56]:
# Assemble the collected [stock, trade date, score] rows into a table.
df_csv = pd.DataFrame(
    l_to_df,
    columns=['stockcode', 'trade_date', 'sentiment_score'],
)

# Summary statistics as a quick sanity check of the scores.
df_csv.describe()

Unnamed: 0,sentiment_score
count,326.0
mean,0.435579
std,0.128291
min,-0.04448
25%,0.366718
50%,0.432567
75%,0.515777
max,0.79458


## Output to CSV

In [57]:
# Destination file for the aggregated daily scores.
output_fname = 'dataset/nasdaq/daily_sentiment.csv'

# Write the table to disk; the row index is written as the first column
# (to_csv default).
df_csv.to_csv(path_or_buf=output_fname)

## Testing

In [49]:
# Articles for GOOGL assigned to trade date 2019-02-11.
# Both conditions are combined into one mask: chaining data[mask_a][mask_b]
# applies a full-length mask to an already filtered frame, which makes pandas
# warn that the boolean key "will be reindexed to match DataFrame index".
test = data.loc[
    (data['stockcode'] == 'GOOGL')
    & (data['TradeDate'] == '2019-02-11')
]
test

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,datetime,stockcode,source,headline,article,urls,dt,IsMarketOpen,TradeDate,summary,_relevance,_sentiment
6025,"February 09, 2019, 07:30:00 AM EDT",GOOGL,Motley Fool,How Are S&P 500 Stocks Chosen?,\n Though the S&P 500 (SNPINDEX: ^GSPC) m...,https://www.nasdaq.com/article/how-are-sp-500-...,2019-02-09 07:30:00-05:00,False,2019-02-11,companies must get the approval of the index...,0,0.32728
6028,"February 08, 2019, 06:13:00 PM EDT",GOOGL,Reuters,"EU countries agree on copyright reforms, deal ...",\n\n\nReuters\n\n@media screen and (max-device...,https://www.nasdaq.com/article/eu-countries-ag...,2019-02-08 18:13:00-05:00,False,2019-02-11,The European Union's effort to rewrite two-dec...,3,0.39138


In [48]:
# Give the selected rows a fresh 0..n-1 index (the old labels move to an
# "index" column) before handing the score columns over.
test_ = test.reset_index()

# Simple average of the sentiments whose relevance is at least 1.
GetAvgSentiment(
    l_sentiments=test_['_sentiment'],
    l_relevances=test_['_relevance'],
    threshold=1,
)

0.43714

In [33]:
# Daily sentiment scores computed for GOOGL only.
df_csv.loc[df_csv['stockcode'] == 'GOOGL']

Unnamed: 0,stockcode,trade_date,sentiment_score
61,GOOGL,2019-02-08,0.346187
62,GOOGL,2019-02-07,
63,GOOGL,2019-02-06,0.340511
64,GOOGL,2019-02-05,0.432891
65,GOOGL,2019-02-04,0.547460
66,GOOGL,2019-02-01,0.421980
67,GOOGL,2019-01-31,0.584245
68,GOOGL,2019-01-30,0.445540
69,GOOGL,2019-01-29,0.481467
70,GOOGL,2019-01-28,0.465520
