# Load dataset

In [11]:
import pandas as pd
import re
import nltk
import numpy as np
from tqdm import tqdm
import torch
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
# nltk.download('vader_lexicon')

In [3]:
# Read the data
data2010 = pd.read_csv('../../textData/BAC_2010.csv')
data2011 = pd.read_csv('../../textData/BAC_2011.csv')
data2012 = pd.read_csv('../../textData/BAC_2012.csv')
data2013 = pd.read_csv('../../textData/BAC_2013.csv')
data2014 = pd.read_csv('../../textData/BAC_2014.csv')
data2015 = pd.read_csv('../../textData/BAC_2015.csv')
data2016 = pd.read_csv('../../textData/BAC_2016.csv')

In [4]:
print('dimension of data2010: ', data2010.shape)
print('dimension of data2011: ', data2011.shape)
print('dimension of data2012: ', data2012.shape)
print('dimension of data2013: ', data2013.shape)
print('dimension of data2014: ', data2014.shape)
print('dimension of data2015: ', data2015.shape)
print('dimension of data2016: ', data2016.shape)

dimension of data2010:  (1433, 11)
dimension of data2011:  (8145, 11)
dimension of data2012:  (11429, 11)
dimension of data2013:  (12422, 11)
dimension of data2014:  (8582, 11)
dimension of data2015:  (14151, 11)
dimension of data2016:  (5647, 11)


In [5]:
# Combine all .csv
tweets_data_all = pd.concat([data2010, data2011, data2012, data2013, data2014, data2015, data2016], 
                            ignore_index = True)

# Keep the useful columns
tweets_data = tweets_data_all[['Timestamp', 'Embedded_text', 'Likes']]

# Keep the rows with hashtag "#BAC"
tweets_data_with_hashtag = tweets_data[tweets_data['Embedded_text'].str.contains('#BAC')]

# Reset the index
tweets_data_with_hashtag.reset_index(inplace = True)
tweets_data_with_hashtag = tweets_data_with_hashtag[['Timestamp', 'Embedded_text', 'Likes']]
tweets_data_with_hashtag

Unnamed: 0,Timestamp,Embedded_text,Likes
0,2010-01-02T22:49:33.000Z,回复 \n@JLo4rmCali\n@JLo4rmCali\n no i dropped i...,
1,2010-01-03T18:37:30.000Z,Unity != uniformity #BAC,
2,2010-01-03T18:16:24.000Z,@jozefrong: Vancouver loves their Canucks - ...,
3,2010-01-04T02:32:02.000Z,回复 \n@Eazy_Bake\n@Eazy_Bake\n I don't know wha...,
4,2010-01-05T12:04:02.000Z,what rock did you crawl out from under? now is...,
...,...,...,...
35742,2016-12-30T14:12:10.000Z,#BAC Meet on Extending Telangana #Assembly ses...,
35743,2016-12-30T00:45:19.000Z,City of Pacifica is recruiting citizens for Be...,
35744,2016-12-31T18:49:52.000Z,#happynewyear #coworkers #BAC #loveU # @ BAC C...,
35745,2016-12-31T16:45:22.000Z,Why did reducing #BAC laws from .10 to .08 hav...,


In [6]:
# Clean the data
# Remove useless info

def clean_text(text):
    
    text = re.sub(r"(@[A-Za-z0–9_]+)|(#[A-Za-z0–9_]+)|\$", "", text)
    text = re.sub(r"(http\://|https\://|www)\S+", "", text.lower())
    text = re.sub(r"\S*.com\S*", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"^\s*", "", text)
    text = re.sub(r"\s*$", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

for i in range(0, len(tweets_data_with_hashtag)):
    tweets_data_with_hashtag.Embedded_text[i] = clean_text(tweets_data_with_hashtag.Embedded_text[i])
    

tweets_data_with_hashtag

Unnamed: 0,Timestamp,Embedded_text,Likes
0,2010-01-02T22:49:33.000Z,rmcali rmcali no i dropped it last night dont ...,
1,2010-01-03T18:37:30.000Z,unity uniformity,
2,2010-01-03T18:16:24.000Z,vancouver loves their canucks illustration thi...,
3,2010-01-04T02:32:02.000Z,i dont know what is i probably dont want to,
4,2010-01-05T12:04:02.000Z,what rock did you crawl out from under now is ...,
...,...,...,...
35742,2016-12-30T14:12:10.000Z,meet on extending telangana session held today,
35743,2016-12-30T00:45:19.000Z,city of pacifica is recruiting citizens for be...,
35744,2016-12-31T18:49:52.000Z,bac credomatic edif grane,
35745,2016-12-31T16:45:22.000Z,why did reducing laws from to have an impact o...,


In [7]:
# Change the Timestamp to the same format with the Stock_Price data
tweets_data_with_hashtag['date'] = pd.to_datetime(tweets_data_with_hashtag['Timestamp'], 
                                                       format='%Y/%m/%d').dt.date

# Keep useful columns
tweets_data_clean = tweets_data_with_hashtag[['Embedded_text', 'Likes', 'date']]


tweets_data_clean

Unnamed: 0,Embedded_text,Likes,date
0,rmcali rmcali no i dropped it last night dont ...,,2010-01-02
1,unity uniformity,,2010-01-03
2,vancouver loves their canucks illustration thi...,,2010-01-03
3,i dont know what is i probably dont want to,,2010-01-04
4,what rock did you crawl out from under now is ...,,2010-01-05
...,...,...,...
35742,meet on extending telangana session held today,,2016-12-30
35743,city of pacifica is recruiting citizens for be...,,2016-12-30
35744,bac credomatic edif grane,,2016-12-31
35745,why did reducing laws from to have an impact o...,,2016-12-31


In [8]:
texts = tweets_data_clean.Embedded_text
texts

0        rmcali rmcali no i dropped it last night dont ...
1                                         unity uniformity
2        vancouver loves their canucks illustration thi...
3              i dont know what is i probably dont want to
4        what rock did you crawl out from under now is ...
                               ...                        
35742       meet on extending telangana session held today
35743    city of pacifica is recruiting citizens for be...
35744                            bac credomatic edif grane
35745    why did reducing laws from to have an impact o...
35746    analysts on estimize are expecting yoy eps gro...
Name: Embedded_text, Length: 35747, dtype: object

# FinBERT

In [12]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [13]:
def SentimentAnalyzer(doc):
    pt_batch = tokenizer(doc,padding=True,truncation=True,max_length=512,return_tensors="pt")
    outputs = model(**pt_batch)
    pt_predictions = F.softmax(outputs.logits, dim=-1)
    return pt_predictions.detach().cpu().numpy()

In [21]:
sentiments = texts.apply(SentimentAnalyzer)
sentiments

0          [[0.026583808, 0.67494524, 0.298471]]
1         [[0.08095752, 0.022971062, 0.8960715]]
2        [[0.07321655, 0.031883795, 0.89489967]]
3         [[0.038052775, 0.09245713, 0.8694901]]
4          [[0.06845371, 0.029205278, 0.902341]]
                          ...                   
35742    [[0.027681932, 0.03366223, 0.93865585]]
35743     [[0.07127386, 0.013786005, 0.9149402]]
35744      [[0.05085974, 0.01638417, 0.9327561]]
35745      [[0.050801676, 0.04624728, 0.902951]]
35746    [[0.9458028, 0.022427173, 0.031770073]]
Name: Embedded_text, Length: 35747, dtype: object

In [38]:
for i in range(len(tweets_data_clean)):
    tweets_data_clean['positive'][i] = sentiments[i][0][0]
    tweets_data_clean['negative'][i] = sentiments[i][0][1]
    tweets_data_clean['neutral'][i] = sentiments[i][0][2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_data_clean['positive'][i] = sentiments[i][0][0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_data_clean['negative'][i] = sentiments[i][0][1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_data_clean['neutral'][i] = sentiments[i][0][2]


In [42]:
tweets_data_clean.to_csv('FinBERT_tweets_sentiment.csv', index=False)

In [48]:
tweets_data_clean = pd.read_csv('FinBERT_tweets_sentiment.csv')
tweets_data_clean

Unnamed: 0,Embedded_text,Likes,date,positive,negative,neutral
0,rmcali rmcali no i dropped it last night dont ...,,2010-01-02,0.026584,0.674945,0.298471
1,unity uniformity,,2010-01-03,0.080958,0.022971,0.896072
2,vancouver loves their canucks illustration thi...,,2010-01-03,0.073217,0.031884,0.894900
3,i dont know what is i probably dont want to,,2010-01-04,0.038053,0.092457,0.869490
4,what rock did you crawl out from under now is ...,,2010-01-05,0.068454,0.029205,0.902341
...,...,...,...,...,...,...
35742,meet on extending telangana session held today,,2016-12-30,0.027682,0.033662,0.938656
35743,city of pacifica is recruiting citizens for be...,,2016-12-30,0.071274,0.013786,0.914940
35744,bac credomatic edif grane,,2016-12-31,0.050860,0.016384,0.932756
35745,why did reducing laws from to have an impact o...,,2016-12-31,0.050802,0.046247,0.902951


In [49]:
tweets_data_clean.Likes.fillna(1, inplace = True)
tweets_data_clean

Unnamed: 0,Embedded_text,Likes,date,positive,negative,neutral
0,rmcali rmcali no i dropped it last night dont ...,1,2010-01-02,0.026584,0.674945,0.298471
1,unity uniformity,1,2010-01-03,0.080958,0.022971,0.896072
2,vancouver loves their canucks illustration thi...,1,2010-01-03,0.073217,0.031884,0.894900
3,i dont know what is i probably dont want to,1,2010-01-04,0.038053,0.092457,0.869490
4,what rock did you crawl out from under now is ...,1,2010-01-05,0.068454,0.029205,0.902341
...,...,...,...,...,...,...
35742,meet on extending telangana session held today,1,2016-12-30,0.027682,0.033662,0.938656
35743,city of pacifica is recruiting citizens for be...,1,2016-12-30,0.071274,0.013786,0.914940
35744,bac credomatic edif grane,1,2016-12-31,0.050860,0.016384,0.932756
35745,why did reducing laws from to have an impact o...,1,2016-12-31,0.050802,0.046247,0.902951


In [50]:
def clean_number(likes):
    likes = re.sub(r"[^0-9]", "", str(likes))
    return likes

tweets_data_clean.Likes = tweets_data_clean.Likes.apply(clean_number)
tweets_data_clean.Likes = tweets_data_clean.Likes.astype(int)

In [52]:
finbert_tweet = tweets_data_clean
finbert_tweet.to_csv('finbert_tweet.csv', index=False)