## Step 1 - Creating pandas dataframe

In [1]:
import glob
import os
import pandas as pd
import re
import numpy as np

base_path_to_csv = os.path.join(os.getcwd() + '/eltweets/*.csv')
csv_list = glob.glob(base_path_to_csv)

# index_col removes the duplicates
df_list = [pd.read_csv(csv, index_col='id') for csv in csv_list]
df = pd.concat(df_list)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6488 entries, 1404876425706938371 to 1272993752890486784
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   created_at     6488 non-null   object
 1   full_text      6488 non-null   object
 2   retweet_count  6488 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 202.8+ KB


## Step 2 - Data cleaning

In [6]:
df['full_text'] = df['full_text'].astype('unicode')
remove_rt = lambda x: re.sub('RT @\w+: ', ' ', x)
remove_users_ref = lambda x: re.sub("@[A-Za-z0-9]+","",x)
remove_links = lambda x: re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", x)
remove_hashtags_underlines = lambda x: x.replace("#", "").replace("_", " ")


df['full_text'] = df['full_text'].map(remove_rt)
df['full_text'] = df['full_text'].map(remove_users_ref)
df['full_text'] = df['full_text'].map(remove_links)
df['full_text'] = df['full_text'].map(remove_hashtags_underlines)
df['full_text'] = df['full_text'].str.strip()
df['full_text'] = df['full_text'].replace('', np.NaN)


# df['created_at'] = df['created_at'].dt.normalize()

df.dropna(inplace=True)

df['full_text'] = df['full_text'].astype('unicode')
df['created_at'] = pd.to_datetime(df['created_at'])
df['influence_end_at'] = df['created_at']  + pd.DateOffset(hours=6)

df.drop_column()
df.drop_duplicates(inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3815 entries, 1404876425706938371 to 1274025664492892160
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   created_at        3815 non-null   datetime64[ns, UTC]
 1   full_text         3815 non-null   object             
 2   retweet_count     3815 non-null   int64              
 3   influence_end_at  3815 non-null   datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](2), int64(1), object(1)
memory usage: 149.0+ KB


In [4]:
df['full_text'] = df['full_text'].str.lower()
df = df[(df['full_text'].str.contains("bitcoin") | df['full_text'].str.contains("btc") | df['full_text'].str.contains("crypto"))]

df.head()

Unnamed: 0_level_0,created_at,full_text,retweet_count,influence_end_at
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1404132183254523905,2021-06-13 17:42:54+00:00,this is inaccurate. tesla only sold ~10% of ho...,11062,2021-06-13 23:42:54+00:00
1400620080090730501,2021-06-04 01:07:04+00:00,bitcoin 💔,22896,2021-06-04 07:07:04+00:00
1396914548167233537,2021-05-24 19:42:36+00:00,spoke with north american bitcoin miners. they...,39327,2021-05-25 01:42:36+00:00
1396049547680391168,2021-05-22 10:25:24+00:00,the true battle is between fiat &amp; crypto. ...,12921,2021-05-22 16:25:24+00:00
1395472799020421120,2021-05-20 20:13:36+00:00,bitcoin hashing (aka mining) energy usage is s...,891,2021-05-21 02:13:36+00:00
