In [1]:
import pandas as pd
import re
import nltk
# nltk.download('vader_lexicon')
import warnings

# SettingWithCopyWarning is exposed from pandas.errors in pandas >= 1.5; the
# old pandas.core.common location no longer provides it there, and pandas
# releases that default to copy-on-write drop the warning altogether.
# Try each location so this cell runs on any of those versions.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None  # warning class does not exist in this pandas

# Silence the warning only when this pandas version actually defines it.
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:
# Read the data: one BAC CSV per year, 2010 through 2016, bound to the
# per-year names that the concatenation step expects.
(data2010, data2011, data2012, data2013,
 data2014, data2015, data2016) = [
    pd.read_csv('../textData/BAC_%d.csv' % year) for year in range(2010, 2017)
]

In [3]:
# Combine all .csv files into one frame with a fresh 0..n-1 index
tweets_data_all = pd.concat(
    [data2010, data2011, data2012, data2013, data2014, data2015, data2016],
    ignore_index=True,
)

# Keep the rows with hashtag "#BAC".
# na=False makes rows whose text is missing count as "no match"; without it
# the mask contains NaN and boolean indexing raises a ValueError.
has_bac_hashtag = tweets_data_all['Embedded_text'].str.contains('#BAC', na=False)

# Reset the index. The old row labels are kept in an 'index' column (same
# columns as before); assigning the result instead of using inplace=True
# avoids mutating a filtered frame in place.
tweets_data_with_hashtag = tweets_data_all[has_bac_hashtag].reset_index()

# Clean the data
# Remove useless info
def clean_text(text):
    """Normalise a raw tweet into lowercase, letters-only, single-spaced text.

    Steps, in order:
      1. drop @mentions, #hashtags and the ``$`` ticker prefix;
      2. lowercase, then drop URLs (``http://``, ``https://`` or ``www`` prefixes);
      3. drop any token containing a literal ``.com``;
      4. drop every character that is not an ASCII letter or whitespace;
      5. trim and collapse runs of whitespace to single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. ``None`` or a NaN from a
        missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    # The digit range must use an ASCII hyphen. With an en dash (U+2013) the
    # class covers every code point from "0" to U+2013, so punctuation such as
    # ":" or "=" glued to a mention would drag the following word along.
    text = re.sub(r"(@[A-Za-z0-9_]+)|(#[A-Za-z0-9_]+)|\$", "", text)
    text = re.sub(r"(http\://|https\://|www)\S+", "", text.lower())
    # The dot is escaped so only real ".com" tokens are removed; an unescaped
    # "." also matched words like "welcome" or "the company".
    text = re.sub(r"\S*\.com\S*", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # split()/join trims both ends and collapses inner whitespace in one pass.
    return " ".join(text.split())

# Clean every tweet in a single pass. Series.map does not depend on the row
# labels, unlike a positional loop over .loc[i], and avoids one slow
# label-based write per row.
tweets_data_with_hashtag['Embedded_text'] = (
    tweets_data_with_hashtag['Embedded_text'].map(clean_text)
)


# Change the Timestamp to the same format with the Stock_Price data
tweets_data_with_hashtag['date'] = pd.to_datetime(
    tweets_data_with_hashtag['Timestamp'], format='%Y/%m/%d'
).dt.date

# Keep useful columns. The explicit copy makes this an independent frame, so
# the columns added to it in later cells are written to a real object rather
# than to a view of tweets_data_with_hashtag.
tweets_data_clean = tweets_data_with_hashtag[['Embedded_text', 'Likes', 'date']].copy()


# Generate sentiment score: VADER's "compound" value for each cleaned tweet
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# A single analyzer instance is reused for all tweets.
sid = SentimentIntensityAnalyzer()

texts = tweets_data_clean['Embedded_text'].tolist()

# str() guards against non-string entries before they reach the analyzer.
output_score = [sid.polarity_scores(str(text))["compound"] for text in texts]

tweets_data_clean.loc[:, 'Sentiment_Score'] = output_score

# Show the frame with the raw (not yet like-weighted) sentiment score
tweets_data_clean

Unnamed: 0,Embedded_text,Likes,date,Sentiment_Score
0,rmcali rmcali no i dropped it last night dont ...,,2010-01-02,0.1531
1,unity uniformity,,2010-01-03,0.0000
2,vancouver loves their canucks illustration thi...,,2010-01-03,0.5719
3,i dont know what is i probably dont want to,,2010-01-04,-0.0572
4,what rock did you crawl out from under now is ...,,2010-01-05,0.4404
...,...,...,...,...
35742,meet on extending telangana session held today,,2016-12-30,0.0000
35743,city of pacifica is recruiting citizens for be...,,2016-12-30,0.4404
35744,bac credomatic edif grane,,2016-12-31,0.0000
35745,why did reducing laws from to have an impact o...,,2016-12-31,0.0000


In [4]:
#============================================================#
# Processing for likes
#============================================================#

# Fill NA likes with 1.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute: the chained in-place form mutates a temporary Series and leaves
# the frame unchanged when pandas copy-on-write is active.
tweets_data_clean['Likes'] = tweets_data_clean['Likes'].fillna(1)

# convert all likes to a number
def clean_number(likes):
    """Convert a like count in any of several textual forms to a digit string.

    Handles plain integers (``15``), floats that represent whole counts
    (``1.0`` -> ``"1"``), thousands separators (``"1,234"`` -> ``"1234"``) and
    abbreviated counts with a K/M suffix (``"1.2K"`` -> ``"1200"``).
    Stripping every non-digit character, as a naive cleaner would, turns
    ``1.0`` into ``"10"`` and ``"1.2K"`` into ``"12"``.

    NOTE(review): a "." is treated as a decimal point and "," as a thousands
    separator -- confirm this matches the locale of the scraped data.

    Parameters
    ----------
    likes : object
        Raw like count (str, int or float).

    Returns
    -------
    str
        Digits only, suitable for ``int()``. Inputs that are not a recognisable
        number fall back to "keep only the digits", which yields ``""`` when
        the value has no digits at all.
    """
    text = str(likes).strip().replace(",", "")
    match = re.fullmatch(r"([0-9]+(?:\.[0-9]+)?)\s*([KkMm]?)", text)
    if match is None:
        # Unrecognised shape: fall back to plain digit extraction.
        return re.sub(r"[^0-9]", "", text)

    number, suffix = match.groups()
    multiplier = {"": 1, "k": 1000, "m": 1000000}[suffix.lower()]
    # round() absorbs binary floating-point error such as 1.2 * 1000.
    return str(int(round(float(number) * multiplier)))

# Normalise every like count and store the column as integers.
# A plain column assignment replaces the column (and its dtype); writing the
# converted values through .loc[:, 'Likes'] sets them in place on recent
# pandas, which can leave the column with its old object dtype.
tweets_data_clean['Likes'] = tweets_data_clean['Likes'].apply(clean_number).astype(int)

tweets_data_clean.tail(20)

Unnamed: 0,Embedded_text,Likes,date,Sentiment_Score
35727,personally i cant wait for to explain to peasa...,1,2016-12-26,0.8225
35728,top stories of wells fargo under pressure it w...,1,2016-12-27,0.1531
35729,advances in wearable monitoring devices could ...,1,2016-12-27,-0.0772
35730,largest buys of rdquarter by of daviddreman corp,10,2016-12-27,0.0
35731,citigroup our long trade moves higher bac kbe,1,2016-12-28,0.0
35732,we have tickets to visit for a chance to,1,2016-12-28,0.25
35733,kingsman end year party with diana dendy and o...,1,2016-12-28,0.4019
35734,that is a big ass cat,1,2016-12-29,-0.5423
35735,good news for students our association wishes ...,10,2016-12-29,0.8074
35736,not long not until we pick a for our with,1,2016-12-30,0.0


In [5]:
# tweets_data_clean_sum = tweets_data_clean[['date', 'Sentiment_Score_total']]
# tweets_data_clean_sum = tweets_data_clean_sum.groupby('date').sum()
# tweets_data_clean_sum.tail(20)

In [6]:
# tweets_data_clean_sum.to_csv('BAC_sentiment_sum.csv')

In [7]:
# Start every sentiment bucket at zero; a later cell fills in the likes.
n_rows = len(tweets_data_clean)
for bucket in ('Negative', 'Neutral', 'Positive'):
    tweets_data_clean[bucket] = [0] * n_rows

In [8]:
tweets_data_clean

Unnamed: 0,Embedded_text,Likes,date,Sentiment_Score,Negative,Neutral,Positive
0,rmcali rmcali no i dropped it last night dont ...,1,2010-01-02,0.1531,0,0,0
1,unity uniformity,1,2010-01-03,0.0000,0,0,0
2,vancouver loves their canucks illustration thi...,1,2010-01-03,0.5719,0,0,0
3,i dont know what is i probably dont want to,1,2010-01-04,-0.0572,0,0,0
4,what rock did you crawl out from under now is ...,1,2010-01-05,0.4404,0,0,0
...,...,...,...,...,...,...,...
35742,meet on extending telangana session held today,1,2016-12-30,0.0000,0,0,0
35743,city of pacifica is recruiting citizens for be...,1,2016-12-30,0.4404,0,0,0
35744,bac credomatic edif grane,1,2016-12-31,0.0000,0,0,0
35745,why did reducing laws from to have an impact o...,1,2016-12-31,0.0000,0,0,0


In [12]:
# Put each tweet's like count into exactly one bucket, chosen by the sign of
# its sentiment score: < 0 -> Negative, > 0 -> Positive, anything else
# (zero or a missing score) -> Neutral. The other two buckets get 0.
#
# This is done column-wise because element-wise chained assignment
# (frame.Negative[i] = ...) writes to a temporary Series and leaves the frame
# unchanged under pandas copy-on-write; it is also slow and depends on the
# index labels being 0..n-1.
sentiment = tweets_data_clean['Sentiment_Score']
likes = tweets_data_clean['Likes'].astype(int)
is_negative = sentiment < 0
is_positive = sentiment > 0

tweets_data_clean['Negative'] = likes.where(is_negative, 0)
tweets_data_clean['Neutral'] = likes.where(~(is_negative | is_positive), 0)
tweets_data_clean['Positive'] = likes.where(is_positive, 0)

tweets_data_clean.tail(40)

Unnamed: 0,Embedded_text,Likes,date,Sentiment_Score,Negative,Neutral,Positive
35707,analysts on estimize are expecting yoy eps gro...,1,2016-12-22,0.3818,0,0,1
35708,day giant caribbean sea condylactis gigantean ...,30,2016-12-22,0.0,0,30,0
35709,had level of after arrest,1,2016-12-22,-0.34,1,0,0
35710,floyd had a high bac huh blood alcohol concent...,1,2016-12-22,0.0,0,1,0
35711,reportedly had of at time of could face jail time,1,2016-12-22,0.0,0,1,0
35712,floyd had of faces jail time for,10,2016-12-22,0.0,0,10,0
35713,inc mu chronicle,1,2016-12-23,0.0,0,1,0
35714,seminal otel,1,2016-12-23,0.0,0,1,0
35715,watchlist,1,2016-12-23,0.0,0,1,0
35716,patriots unaware of when they claimed him,10,2016-12-23,-0.2023,10,0,0


In [10]:
# Sum the likes in each sentiment bucket per calendar date
tweets_data_clean_3 = (
    tweets_data_clean[['date', 'Negative', 'Neutral', 'Positive']]
    .groupby('date')
    .sum()
)
tweets_data_clean_3.tail(20)

Unnamed: 0_level_0,Negative,Neutral,Positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-12-12,1,53,2
2016-12-13,1,32,0
2016-12-14,20,91,1
2016-12-15,0,45,1
2016-12-16,1,34,0
2016-12-17,0,32,2
2016-12-18,0,42,1
2016-12-19,2,80,2
2016-12-20,0,67,82
2016-12-21,0,16,0


In [13]:
# Write the per-date bucket totals (date is the index column) to disk
tweets_data_clean_3.to_csv(path_or_buf='twitter_vader_sum.csv')