## Step 1 - Creating the pandas DataFrame

In [1]:
import glob
import os
import pandas as pd
import re
import numpy as np
import string
from sklearn.preprocessing import minmax_scale


# Collect every CSV export under ./eltweets (relative to the working directory).
# The path components are passed separately so os.path.join actually builds the
# pattern, and the matches are sorted because glob returns them in arbitrary
# order, which would make the concatenated row order differ between runs.
base_path_to_csv = os.path.join(os.getcwd(), 'eltweets', '*.csv')
csv_list = sorted(glob.glob(base_path_to_csv))

if not csv_list:
    # Without this guard pd.concat fails with "No objects to concatenate",
    # which says nothing about the missing input files.
    raise FileNotFoundError(f'No CSV files match {base_path_to_csv}')

# index_col only turns the 'id' column into the index; it does NOT remove
# duplicates, so an id present in several files is still repeated after concat.
df_list = [pd.read_csv(csv, index_col='id') for csv in csv_list]
df = pd.concat(df_list)

## Step 2 - Data cleaning

In [2]:
# Keep a single row per tweet id (the first occurrence wins).
df = df[~df.index.duplicated(keep='first')]

# Force every value to a Python string so the regex scrubbers below can run on it.
df['full_text'] = df['full_text'].astype(str)

# Text scrubbers. Patterns are raw strings so regex escapes such as \w are not
# interpreted as (invalid) string escapes.
# Retweet prefix, e.g. "RT @someone: ".
remove_rt = lambda x: re.sub(r'RT @\w+: ', ' ', x)
# User mentions. '_' is part of the character class because it is legal in
# Twitter handles; without it "@elon_musk" would leave "_musk" behind.
remove_users_ref = lambda x: re.sub(r"@[A-Za-z0-9_]+", "", x)
# Links and any leftover '@' token. "www" must be followed by a dot so that
# ordinary words containing "www" (e.g. "awwww") are not truncated.
remove_links = lambda x: re.sub(r"(?:@|https?://|www\.)\S+", "", x)
# Hashtags keep their word but lose the '#'; underscores become spaces.
remove_hashtags_underlines = lambda x: x.replace("#", "").replace("_", " ")

# Apply the scrubbers in the order they are defined above.
for scrub in (remove_rt, remove_users_ref, remove_links, remove_hashtags_underlines):
    df['full_text'] = df['full_text'].map(scrub)

# Lower-case first so the keyword filter below is effectively case-insensitive.
df['full_text'] = df['full_text'].str.lower()

# Keep tweets that mention bitcoin/btc, or that mention crypto without doge.
# regex=False because the keywords are plain substrings, not patterns.
lowered_text = df['full_text']
mentions_bitcoin = (
    lowered_text.str.contains("bitcoin", regex=False)
    | lowered_text.str.contains("btc", regex=False)
)
mentions_crypto_not_doge = (
    lowered_text.str.contains("crypto", regex=False)
    & ~lowered_text.str.contains("doge", regex=False)
)

# .copy() detaches the filtered rows from the original frame so that the column
# assignments that follow do not raise SettingWithCopyWarning.
df = df[mentions_bitcoin | mentions_crypto_not_doge].copy()
# Literal substitutions, applied in insertion order (dicts preserve it).
# '&amp;' has to be handled before the bare '&' it contains.
literal_replacements = {
    '&amp;': 'and',
    '&': 'and',
    '💔': 'broke my heart',
    '🤣': 'laughing ',
    '🎶': '',
    "it's": 'it is',
    "don't": 'do not',
    "can't": 'can not',
    "won't": 'will not',
    "people's": 'people',
    "there's": 'there is',
}

# Fold the typographic apostrophe into the ASCII one so a single table entry
# expands both spellings of each contraction (e.g. "don’t" and "don't").
df['full_text'] = df['full_text'].str.replace('’', "'", regex=False)

# regex=False: every key is a plain substring, and the default value of `regex`
# differs between pandas versions.
for old, new in literal_replacements.items():
    df['full_text'] = df['full_text'].str.replace(old, new, regex=False)
 
# Line breaks are handled BEFORE punctuation is stripped: the literal two-character
# marker "\n" starts with a backslash, which the punctuation pass would delete,
# leaving a stray "n" glued to the next word. Real CR/LF characters are turned
# into spaces here as well.
df['full_text'] = df['full_text'].str.replace(r'\\n|[\r\n]+', ' ', regex=True)

# Drop everything that is neither a word character nor whitespace.
remove_pontuacao = lambda x: re.sub(r'[^\w\s]', '', x)
df['full_text'] = df['full_text'].map(remove_pontuacao)

# Collapse any run of whitespace (spaces, tabs, ...) into one space and trim
# the ends, which the removals above tend to leave padded.
remove_multiplos_espacos = lambda x: re.sub(r'\s+', ' ', x).strip()
df['full_text'] = df['full_text'].map(remove_multiplos_espacos)

# Drop rows that still have a missing value in any column. Rebinding the name
# instead of using inplace=True keeps the cell safe to re-run.
df = df.dropna()

# Truncate each timestamp to midnight of its day. influence_end_at is the
# following midnight; it needs no further normalize() because adding one day
# to a midnight timestamp is still midnight.
df['created_at'] = pd.to_datetime(df['created_at']).dt.normalize()
df['influence_end_at'] = df['created_at'] + pd.DateOffset(days=1)

# Rescale the retweet counts with scikit-learn's min-max scaler (default
# settings). The column is created directly; no placeholder is needed.
df['retweet_count_n'] = minmax_scale(df['retweet_count'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32 entries, 1404132183254523905 to 1328458535340949505
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   created_at        32 non-null     datetime64[ns, UTC]
 1   full_text         32 non-null     object             
 2   retweet_count     32 non-null     int64              
 3   influence_end_at  32 non-null     datetime64[ns, UTC]
 4   retweet_count_n   32 non-null     float64            
dtypes: datetime64[ns, UTC](2), float64(1), int64(1), object(1)
memory usage: 1.5+ KB


In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Make sure the VADER lexicon used by the analyzer is available locally.
nltk.download('vader_lexicon')

analyzer = SentimentIntensityAnalyzer()


def score_full_text(x):
    """Return the 'compound' entry of VADER's polarity scores for the text ``x``."""
    return analyzer.polarity_scores(x)['compound']


# One sentiment score per cleaned tweet.
df['full_text_score'] = df['full_text'].map(score_full_text)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/prbpedro/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
