In [1]:
import arrow
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from dateutil import parser

import util

In [2]:
# Load the S&P price history from a local CSV; get_sp_price() later looks
# rows up in this frame by its `Date` column.
sp_df = pd.read_csv("./sp_data.csv")

In [3]:
sp_df.head()

Unnamed: 0,Date,Close/Last,Volume,Open,High,Low
0,02/26/2020,3110.2,306,3098.3,3101.5,3098.3
1,02/25/2020,3132.6,119,3221.3,3255.0,3135.2
2,02/24/2020,3226.3,210,3305.8,3305.8,3224.0
3,02/21/2020,3339.3,101,3357.6,3362.2,3354.4
4,02/20/2020,3369.2,5,3394.9,3394.9,3381.5


In [4]:
# Load the raw tweet export from a local Excel workbook.
# NOTE(review): some previewed `id_str` values end in "000", which looks like
# numeric rounding of the tweet IDs somewhere upstream — confirm whether exact
# IDs are required before relying on this column as a key.
tw_df = pd.read_excel("./tweets.xlsx")

In [5]:
tw_df.head()

Unnamed: 0,source,id_str,text,created_at,retweet_count,in_reply_to_user_id_str,favorite_count,is_retweet,date
0,Twitter for iPhone,815271067749059968,RT @realDonaldTrump: Happy Birthday @DonaldJTr...,Sat Dec 31 18:59:04 +0000 2016,9529,,0,1.0,31 Dec 2016
1,Twitter for iPhone,815270850916208000,Happy Birthday @DonaldJTrumpJr!\r\nhttps://t.c...,Sat Dec 31 18:58:12 +0000 2016,9529,,55601,0.0,31 Dec 2016
2,Twitter for Android,815185071317676032,"Happy New Year to all, including to my many en...",Sat Dec 31 13:17:21 +0000 2016,141853,,350860,0.0,31 Dec 2016
3,Twitter for Android,814958820980039040,Russians are playing @CNN and @NBCNews for suc...,Fri Dec 30 22:18:18 +0000 2016,23213,,84254,0.0,30 Dec 2016
4,Twitter for iPhone,814920722208295936,"Join @AmerIcan32, founded by Hall of Fame lege...",Fri Dec 30 19:46:55 +0000 2016,7366,,25336,0.0,30 Dec 2016


In [6]:
# One shared VADER analyzer, built once and reused for every call below.
analyzer = SentimentIntensityAnalyzer()


def sentiment_analyze(text, flag):
    """Return a single VADER polarity score for ``text``.

    Parameters
    ----------
    text
        Text to score; passed unchanged to ``analyzer.polarity_scores``.
    flag : str
        Key of the score to pull out of the ``polarity_scores`` result. This
        notebook uses "compound", "neg", "neu" and "pos".

    Returns
    -------
    The value stored under ``flag`` in the score mapping.
    """
    return analyzer.polarity_scores(text)[flag]


In [22]:
def get_sp_price(date):
    """Look up the S&P value recorded for the US/Eastern calendar day of a timestamp.

    Parameters
    ----------
    date : str
        Timestamp string that ``dateutil.parser.parse`` can read. This
        notebook passes the tweets' ``created_at`` values.

    Returns
    -------
    The value in positional column 3 of the first ``sp_df`` row whose ``Date``
    equals the timestamp's US/Eastern date formatted as ``MM/DD/YYYY``, or
    ``0`` when no row matches. Downstream code filters on ``price != 0``, so
    ``0`` acts as the "no price for that day" sentinel.
    """
    parsed = parser.parse(date)
    # Convert to US/Eastern before taking the calendar date, so a late-evening
    # UTC timestamp maps onto the Eastern day used in sp_df['Date'].
    eastern_day = arrow.get(parsed.isoformat()).to('US/Eastern').format('MM/DD/YYYY')
    matches = sp_df.loc[sp_df['Date'] == eastern_day]
    if matches.empty:
        return 0
    # Purely positional (row 0, column 3). The previous `matches.iloc[0][3]`
    # indexed a label-indexed row Series with an integer, which depends on the
    # deprecated positional fallback of Series.__getitem__.
    # NOTE(review): column 3 appears to be "Open" judging by the sp_df preview —
    # confirm, and consider selecting the column by label instead.
    return matches.iloc[0, 3]

In [8]:
# Columns of the processed tweet frame, in their final order.
COLUMN_NAMES = [
    "id",
    "text",
    "favorite_count",
    "is_retweet",
    "retweet_count",
    "sentiment_compound",
    "sentiment_neg",
    "sentiment_neu",
    "sentiment_pos",
    "source",
    "hour",
    "day",
    "week",
    "month",
    "year",
    "price",
]

In [9]:
# Start from an empty frame that already carries the target columns, so the
# column order of the result follows COLUMN_NAMES rather than assignment order.
df = pd.DataFrame(columns=COLUMN_NAMES)

In [23]:
# Tweet metadata copied straight from the raw export.
df['id'] = tw_df['id_str']
df['source'] = tw_df['source']
df['text'] = tw_df['text'].apply(util.encodeText)
df['favorite_count'] = tw_df['favorite_count']
df['retweet_count'] = tw_df['retweet_count']
df['is_retweet'] = tw_df['is_retweet']

# Calendar features derived from `created_at` by the util helpers.
df['hour'] = tw_df['created_at'].apply(util.convertUTCtoHourOfDay)
df['day'] = tw_df['created_at'].apply(util.convertUTCtoDay)
df['week'] = tw_df['created_at'].apply(util.convertUTCtoWeekNumber)
df['month'] = tw_df['created_at'].apply(util.convertUTCtoMonth)
df['year'] = tw_df['created_at'].apply(util.convertUTCtoYear)

# Score every tweet with VADER exactly once and fan the four scores out into
# their own columns. Previously each column triggered a separate
# polarity_scores() pass over the full text column (four passes in total).
sentiment_scores = pd.DataFrame(
    tw_df['text'].apply(analyzer.polarity_scores).tolist(),
    index=tw_df.index,
    # Naming the columns keeps the lookups below valid for an empty tweet frame.
    columns=["compound", "neg", "neu", "pos"],
)
for flag in ("compound", "neg", "neu", "pos"):
    df["sentiment_" + flag] = sentiment_scores[flag]

# S&P value for the tweet's US/Eastern day; 0 when sp_df has no row for it.
df['price'] = tw_df['created_at'].apply(get_sp_price)

In [24]:
df.head()

Unnamed: 0,id,text,favorite_count,is_retweet,retweet_count,sentiment_compound,sentiment_neg,sentiment_neu,sentiment_pos,source,hour,day,week,month,year,price
0,815271067749059968,b'RT @realDonaldTrump: Happy Birthday @DonaldJ...,0,1.0,9529,0.6114,0.0,0.556,0.444,Twitter for iPhone,13,31,52,12,2016,0.0
1,815270850916208000,b'Happy Birthday @DonaldJTrumpJr!\r\nhttps://t...,55601,0.0,9529,0.6114,0.0,0.429,0.571,Twitter for iPhone,13,31,52,12,2016,0.0
2,815185071317676032,"b""Happy New Year to all, including to my many ...",350860,0.0,141853,-0.4911,0.288,0.524,0.188,Twitter for Android,8,31,52,12,2016,0.0
3,814958820980039040,"b""Russians are playing @CNN and @NBCNews for s...",84254,0.0,23213,0.2695,0.116,0.691,0.192,Twitter for Android,17,30,52,12,2016,2246.4
4,814920722208295936,"b'Join @AmerIcan32, founded by Hall of Fame le...",25336,0.0,7366,0.6249,0.0,0.718,0.282,Twitter for iPhone,14,30,52,12,2016,2246.4


In [25]:
# Drop tweets with no matching S&P row (get_sp_price returns 0 for those).
processed_df = df.loc[df['price'].ne(0)]

In [26]:
processed_df.head()

Unnamed: 0,id,text,favorite_count,is_retweet,retweet_count,sentiment_compound,sentiment_neg,sentiment_neu,sentiment_pos,source,hour,day,week,month,year,price
3,814958820980039040,"b""Russians are playing @CNN and @NBCNews for s...",84254,0.0,23213,0.2695,0.116,0.691,0.192,Twitter for Android,17,30,52,12,2016,2246.4
4,814920722208295936,"b'Join @AmerIcan32, founded by Hall of Fame le...",25336,0.0,7366,0.6249,0.0,0.718,0.282,Twitter for iPhone,14,30,52,12,2016,2246.4
5,814919370711460992,b'Great move on delay (by V. Putin) - I always...,97669,0.0,34415,0.7257,0.106,0.553,0.341,Twitter for Android,14,30,52,12,2016,2246.4
6,814484710025993984,b'My Administration will follow two simple rul...,45609,0.0,11330,0.0,0.0,1.0,0.0,Twitter for iPhone,9,29,52,12,2016,2246.0
7,814231064847728000,"b""'Economists say Trump delivered hope' https:...",51857,0.0,13919,0.4404,0.0,0.633,0.367,Twitter for iPhone,17,28,52,12,2016,2262.1


In [16]:
# Reload a processed tweet/price table from CSV, rebinding the name `df`.
# NOTE(review): no visible cell writes processed_tweet_price.csv, and this
# cell's execution count (16) predates the cells that build `processed_df` —
# confirm where this file comes from so the notebook survives Restart & Run All.
df = pd.read_csv("./processed_tweet_price.csv")

In [17]:
# Write the current `df` to a Parquet file at a path relative to the working directory.
df.to_parquet('processed_data.parquet')