In [73]:
import numpy as np
import pandas as pd

In [74]:
first_part = pd.read_csv('Data/Covid-19 Twitter Dataset (Apr-Jun 2020).csv')
second_part = pd.read_csv('Data/Covid-19 Twitter Dataset (Apr-Jun 2021).csv')
third_part = pd.read_csv('Data/Covid-19 Twitter Dataset (Aug-Sep 2020).csv')


data = pd.concat([first_part, second_part, third_part])

In [75]:
#filtering Data
data_filtered = data.drop(columns=['id', 'created_at', 'source', 'user_mentions', 'lang', 'place', 'original_text',
                                   'favorite_count', 'retweet_count', 'original_author', 'compound', 'neg', 'neu', 'pos', 'sentiment'])
data_filtered

Unnamed: 0,hashtags,clean_tweet
0,,call leader help protect refuge covid19 provid...
1,,ogun state support cbn nirsal covid19 target c...
2,,polic offici base namahadi polic station busi ...
3,,covid19 oyo discharg two patient
4,Covid_19,condol famili surviv
...,...,...
120504,,exclus astrazeneca covid19 vaccin trial may re...
120505,,worker countri sign petit reinstat worker mass...
120506,,cultur china brillianc huax celebr th annivers...
120507,,trump call cnn bastard cover covid19 hear keil...


In [76]:
#preparing Data

def concat_tweet_hashtag(record):
    if  type(record['hashtags']) is str:
        return str(str(record['clean_tweet']) + ' ' + str(record['hashtags']))
    return str(record['clean_tweet'])

data_filtered['tweet_hashtag'] = data_filtered.apply(concat_tweet_hashtag, axis=1)
data_filtered = data_filtered.drop(columns=['clean_tweet', 'hashtags'])
df = data_filtered

In [77]:
#cleaning data
import re
import string

def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

df['tweet_hashtag'] = df['tweet_hashtag'].apply(clean_text)
df

Unnamed: 0,tweet_hashtag
0,call leader help protect refuge provid qualit...
1,ogun state support cbn nirsal target credit f...
2,polic offici base namahadi polic station busi ...
3,oyo discharg two patient
4,condol famili surviv
...,...
120504,exclus astrazeneca vaccin trial may resum soo...
120505,worker countri sign petit reinstat worker mass...
120506,cultur china brillianc huax celebr th annivers...
120507,trump call cnn bastard cover hear keilar respons


In [78]:
# Tokenizing
import nltk

tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
df['tweet_hashtag'] = df['tweet_hashtag'].apply(lambda x: tokenizer.tokenize(x))
df

Unnamed: 0,tweet_hashtag
0,"[call, leader, help, protect, refuge, provid, ..."
1,"[ogun, state, support, cbn, nirsal, target, cr..."
2,"[polic, offici, base, namahadi, polic, station..."
3,"[oyo, discharg, two, patient]"
4,"[condol, famili, surviv]"
...,...
120504,"[exclus, astrazeneca, vaccin, trial, may, resu..."
120505,"[worker, countri, sign, petit, reinstat, worke..."
120506,"[cultur, china, brillianc, huax, celebr, th, a..."
120507,"[trump, call, cnn, bastard, cover, hear, keila..."


In [80]:
# stopwords removal
from nltk.corpus import stopwords

def remove_stopwords(text):
    stop_words = list(set(stopwords.words('english')))
    words = [w for w in text if w not in stop_words]
    return words
df['tweet_hashtag'] = df.apply(remove_stopwords)
df

Unnamed: 0,tweet_hashtag
0,"[call, leader, help, protect, refuge, provid, ..."
1,"[ogun, state, support, cbn, nirsal, target, cr..."
2,"[polic, offici, base, namahadi, polic, station..."
3,"[oyo, discharg, two, patient]"
4,"[condol, famili, surviv]"
...,...
120504,"[exclus, astrazeneca, vaccin, trial, may, resu..."
120505,"[worker, countri, sign, petit, reinstat, worke..."
120506,"[cultur, china, brillianc, huax, celebr, th, a..."
120507,"[trump, call, cnn, bastard, cover, hear, keila..."


In [82]:
# combining texts

def combine_text(list_of_text):
    combined_text = ' '.join(list_of_text)
    return combined_text

df['tweet_hashtag'] = df['tweet_hashtag'].apply(lambda x: combine_text(x))
df

Unnamed: 0,tweet_hashtag
0,call leader help protect refuge provid qualiti...
1,ogun state support cbn nirsal target credit fa...
2,polic offici base namahadi polic station busi ...
3,oyo discharg two patient
4,condol famili surviv
...,...
120504,exclus astrazeneca vaccin trial may resum soon...
120505,worker countri sign petit reinstat worker mass...
120506,cultur china brillianc huax celebr th annivers...
120507,trump call cnn bastard cover hear keilar respons
