## Step 1 - Creating pandas dataframe

In [1]:
import glob
import os
import pandas as pd
import re
import numpy as np
import string
from sklearn.preprocessing import minmax_scale
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from pprint import pprint

# Fetch the lexicon that the VADER SentimentIntensityAnalyzer (used later) relies on.
nltk.download('vader_lexicon')

# Build the pattern from separate path components instead of string concatenation.
base_path_to_csv = os.path.join(os.getcwd(), 'eltweets', '*.csv')

# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps the
# row order of the combined frame (and any later keep='first' de-duplication)
# reproducible from run to run.
csv_list = sorted(glob.glob(base_path_to_csv))
if not csv_list:
    # Fail with a message that names the pattern instead of pd.concat's
    # generic "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files match {base_path_to_csv!r}")

# index_col only makes 'id' the index of each frame; it does NOT remove
# duplicated ids (de-duplication has to happen explicitly afterwards).
df_list = [pd.read_csv(csv, index_col='id') for csv in csv_list]
df = pd.concat(df_list)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9737 entries, 1406073484300591105 to 1272993752890486784
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   created_at     9737 non-null   object
 1   full_text      9737 non-null   object
 2   retweet_count  9737 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 304.3+ KB
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/prbpedro/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Step 2 - Data cleaning

In [2]:
# Keep only the first row seen for each tweet id.
df = df.reset_index().drop_duplicates(subset='id', keep='first').set_index('id')

df['full_text'] = df['full_text'].astype(str)

# Text-normalisation helpers. Raw strings keep regex escapes such as \w from
# being interpreted as (invalid) Python string escapes.
remove_rt = lambda x: re.sub(r'RT @\w+: ', ' ', x)
remove_users_ref = lambda x: re.sub(r"@[A-Za-z0-9]+", "", x)
remove_links = lambda x: re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", x)
remove_hashtags_underlines = lambda x: x.replace("#", "").replace("_", " ")
remove_pontuacao = lambda x: re.sub(r'[^\w\s]', '', x)
remove_multiplos_espacos = lambda x: re.sub(' +', ' ', x)

# Strip retweet prefixes, user mentions, links and hashtag/underscore markup,
# then lower-case so the keyword filter below is case-insensitive.
df['full_text'] = (
    df['full_text']
    .map(remove_rt)
    .map(remove_users_ref)
    .map(remove_links)
    .map(remove_hashtags_underlines)
    .str.lower()
)

# Keep only tweets that mention one of the crypto keywords. The .copy() makes
# the result an independent frame, so the column assignments below do not
# operate on a slice of the unfiltered data (avoids SettingWithCopyWarning).
df = df[df['full_text'].str.contains('bitcoin|btc|crypto|doge', regex=True)].copy()

# Literal substitutions, applied in this order ('&amp;' must come before '&').
# regex=False is explicit because the default of Series.str.replace differs
# between pandas versions.
literal_replacements = {
    '&amp;': 'and',
    '&': 'and',
    '💔': 'broke my heart',
    '🤣': 'laughing ',
    '🎶': '',
    "it’s": 'it is',
    "don’t": 'do not',
    "can’t": 'can not',
    "won’t": 'will not',
    "people’s": 'people',
    "there’s": 'there is',
}
for old_text, new_text in literal_replacements.items():
    df['full_text'] = df['full_text'].str.replace(old_text, new_text, regex=False)

# Turn literal backslash-n sequences into spaces BEFORE stripping punctuation:
# the punctuation step deletes every backslash, so running this afterwards
# (as the cell originally did) could never match anything.
df['full_text'] = df['full_text'].str.replace(r'\\n', ' ', regex=True)

# NOTE(review): VADER is documented to use punctuation and capitalisation as
# intensity cues; lower-casing and stripping punctuation here may flatten the
# scores computed later -- confirm this is intended.
df['full_text'] = df['full_text'].map(remove_pontuacao).map(remove_multiplos_espacos)

df = df.dropna()

# Truncate timestamps to midnight; a tweet's influence window ends one day later.
df['created_at'] = pd.to_datetime(df['created_at']).dt.normalize()
df['influence_end_at'] = df['created_at'] + pd.DateOffset(days=1)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73 entries, 1404132183254523905 to 1284290215561986048
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   created_at        73 non-null     datetime64[ns, UTC]
 1   full_text         73 non-null     object             
 2   retweet_count     73 non-null     int64              
 3   influence_end_at  73 non-null     datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](2), int64(1), object(1)
memory usage: 2.9+ KB


In [3]:
# Sentiment scoring: one VADER 'compound' value per cleaned tweet.
analyzer = SentimentIntensityAnalyzer()


def score_full_text(x):
    """Return the 'compound' entry of the analyzer's polarity scores for ``x``."""
    return analyzer.polarity_scores(x)['compound']


df['full_text_score'] = df['full_text'].apply(score_full_text)

In [4]:
# Duplicate every tweet whose influence window ends on a different day than it
# was created, re-dated to the end of that window. The .copy() makes new_rows
# an independent frame so the assignments below do not write into a slice of
# df (avoids SettingWithCopyWarning).
new_rows = df[df['created_at'] != df['influence_end_at']].copy()
new_rows['created_at'] = new_rows['influence_end_at']
new_rows['id'] = None

# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
# NOTE(review): new_df is not read again in this cell -- the split below works
# on df, so the re-dated rows do not take part in it. Confirm whether new_df
# was meant to be used instead.
new_df = pd.concat([df, new_rows])

# Flag every row whose day occurs more than once in df.
mask = df['created_at'].duplicated(keep=False)
duplicados = df[mask]

# Days with exactly one tweet: the tweet's score is the day's score.
f_df = (
    df.loc[~mask, ['created_at', 'full_text_score']]
    .rename(columns={'created_at': 'Date', 'full_text_score': 'Score'})
)


In [5]:



# For every day that has more than one tweet, accumulate the number of tweets
# and the running sum of their sentiment scores.
m = {}
for day in duplicados['created_at'].unique():
    totals = m.setdefault(day, {'count': 0, 'full_text_score': 0.0})
    same_day_scores = df.loc[df['created_at'] == day, 'full_text_score']
    for score in same_day_scores:
        totals['full_text_score'] += score
        totals['count'] += 1

pprint(m)

{Timestamp('2020-12-20 00:00:00+0000', tz='UTC'): {'count': 3,
                                                   'full_text_score': 0.4404},
 Timestamp('2021-02-04 00:00:00+0000', tz='UTC'): {'count': 3,
                                                   'full_text_score': -0.6369},
 Timestamp('2021-02-07 00:00:00+0000', tz='UTC'): {'count': 2,
                                                   'full_text_score': 0.5563},
 Timestamp('2021-02-10 00:00:00+0000', tz='UTC'): {'count': 2,
                                                   'full_text_score': -0.34},
 Timestamp('2021-02-11 00:00:00+0000', tz='UTC'): {'count': 2,
                                                   'full_text_score': -0.8155},
 Timestamp('2021-02-14 00:00:00+0000', tz='UTC'): {'count': 2,
                                                   'full_text_score': 0.7201},
 Timestamp('2021-02-19 00:00:00+0000', tz='UTC'): {'count': 3,
                                                   'full_text_score': 0.181099999999

In [6]:
# One row per multi-tweet day: the summed score divided by the tweet count.
f_df2 = pd.DataFrame([
    {'Date': day, 'Score': totals['full_text_score'] / totals['count']}
    for day, totals in m.items()
])

# Stack single-tweet days and averaged days, index the result by date,
# discard rows whose score is exactly zero and write the table to disk.
df = pd.concat([f_df, f_df2], ignore_index=True).set_index('Date')
df = df.loc[df['Score'] != 0]
df.to_csv("btc_em_sentimental_analysis.csv")

In [7]:
# Load the price history, keeping only the date and price columns, and index
# it by the date truncated to midnight.
btc_df = pd.read_csv(
    "Bitcoin Historical Data - Investing.com.csv",
    usecols=['Date', 'Price'],
)
btc_df['Date'] = pd.to_datetime(btc_df['Date']).dt.normalize()
btc_df = btc_df.set_index('Date')

In [8]:
# Drop timezone information from both date indexes, then outer-join the
# sentiment scores with the prices on 'Date' so no day from either side is lost.
for frame in (df, btc_df):
    frame.index = frame.index.tz_localize(None)
merged = df.merge(btc_df, how='outer', on='Date')


In [9]:
# Rows without a score after the outer merge get a score of 0.
merged['Score'] = pd.to_numeric(merged['Score'].fillna(0))

# Price is presumably read as text with ',' thousands separators. Only strip
# the commas while the column is still non-numeric: the .str accessor raises
# AttributeError on a numeric column, which would otherwise break this cell
# when it is re-run or when the CSV contains no separators.
if not pd.api.types.is_numeric_dtype(merged['Price']):
    merged['Price'] = merged['Price'].str.replace(',', '', regex=False)
merged['Price'] = pd.to_numeric(merged['Price'])

In [10]:
# Write the merged score/price table to disk.
merged.to_csv(path_or_buf='btc_value_em_tweets_sentimental_score.csv')