In [14]:
import pandas as pd
import re
import string

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize

import demoji
from emoji.unicode_codes import UNICODE_EMOJI

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sheenasalwan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the raw tweet export. The path is relative to the notebook's working
# directory, so the kernel must be started from the notebook's own folder.
raw_tweets_path = "../data/vaccination_all_tweets.csv"
df_tweet = pd.read_csv(raw_tweets_path)
df_tweet.tail(3)

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet
206964,1445954643419226114,VaxBLR,"Bengaluru, India",Hourly updates on FREE and PAID 18+ and 45+ va...,2021-06-21 08:44:34,26,0,0,False,2021-10-07 03:30:26,18-44 #BBMP #Bengaluru #CovidVaccine Availabil...,"['BBMP', 'Bengaluru', 'CovidVaccine', 'COVISHI...",VaxBlr,0,1,False
206965,1445954599345475592,VaxBLR,"Bengaluru, India",Hourly updates on FREE and PAID 18+ and 45+ va...,2021-06-21 08:44:34,26,0,0,False,2021-10-07 03:30:15,18-44 #URBAN #Bengaluru #CovidVaccine Availabi...,"['URBAN', 'Bengaluru', 'CovidVaccine', 'COVISH...",VaxBlr,0,0,False
206966,1445947047052333057,VaxBLR,"Bengaluru, India",Hourly updates on FREE and PAID 18+ and 45+ va...,2021-06-21 08:44:34,26,0,0,False,2021-10-07 03:00:15,45+ #URBAN #Bengaluru #CovidVaccine Availabili...,"['URBAN', 'Bengaluru', 'CovidVaccine', 'COVISH...",VaxBlr,0,0,False


In [4]:
# (rows, columns) of the freshly loaded tweet table.
df_tweet.shape

(206967, 16)

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df_tweet.describe()

Unnamed: 0,id,user_followers,user_friends,user_favourites,retweets,favorites
count,206967.0,206967.0,206967.0,206967.0,206967.0,206967.0
mean,1.402529e+18,93569.21,983.137118,11874.33,2.389091,10.334483
std,2.571008e+16,818885.2,5390.710356,38690.35,44.710131,160.316483
min,1.337728e+18,0.0,0.0,0.0,0.0,0.0
25%,1.380516e+18,59.0,25.0,57.0,0.0,0.0
50%,1.405382e+18,339.0,227.0,952.0,0.0,0.0
75%,1.422578e+18,1521.0,786.0,7130.0,0.0,2.0
max,1.449092e+18,16201170.0,582461.0,1221784.0,11288.0,25724.0


In [8]:
# lowercase and url removal

def process_tweet(text):
    """Lower-case a tweet and strip URLs from it.

    Args:
        text: The tweet as a ``str``.

    Returns:
        The lower-cased tweet with every ``http``-prefixed run of
        non-whitespace characters removed. Whitespace around a removed URL is
        left as-is, so double spaces may remain.
    """
    lowered = text.lower()
    # Lower-casing first means upper-case schemes ("HTTP://...") are caught too.
    without_urls = re.sub(r"http\S+", "", lowered)
    return without_urls

# Overwrite the "text" column with its lower-cased, URL-stripped version.
# The function is passed directly; wrapping it in a lambda adds nothing.
df_tweet["text"] = df_tweet["text"].apply(process_tweet)
df_tweet["text"]

0         same folks said daikon paste could treat a cyt...
1         while the world has been on the wrong side of ...
2         #coronavirus #sputnikv #astrazeneca #pfizerbio...
3         facts are immutable, senator, even when you're...
4         explain to me again why we need a vaccine @bor...
                                ...                        
206962    45+ #urban #bengaluru #covidvaccine availabili...
206963    pincode: 560011\nsputnik v - dose 1: 100 slots...
206964    18-44 #bbmp #bengaluru #covidvaccine availabil...
206965    18-44 #urban #bengaluru #covidvaccine availabi...
206966    45+ #urban #bengaluru #covidvaccine availabili...
Name: text, Length: 206967, dtype: object

In [11]:
# Punctuation Removal

punctuation_removal = string.punctuation

# Deletion table for ``str.translate``. It is built once here rather than on
# every call, because the function is applied to every row of the tweet table.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation_removal)


def remove_punctuation(text):
    """Delete every ASCII punctuation character from ``text``.

    Characters are removed, not replaced by a space, so the neighbours of a
    deleted character are joined together (``"18-44"`` becomes ``"1844"``).
    Only the characters in ``string.punctuation`` are affected; non-ASCII
    punctuation is left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without any character from ``string.punctuation``.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from the "text" column in place (the column is overwritten).
df_tweet["text"] = df_tweet["text"].apply(remove_punctuation)
df_tweet["text"]

0         same folks said daikon paste could treat a cyt...
1         while the world has been on the wrong side of ...
2         coronavirus sputnikv astrazeneca pfizerbiontec...
3         facts are immutable senator even when youre no...
4         explain to me again why we need a vaccine bori...
                                ...                        
206962    45 urban bengaluru covidvaccine availability f...
206963    pincode 560011\nsputnik v  dose 1 100 slots\n\...
206964    1844 bbmp bengaluru covidvaccine availability ...
206965    1844 urban bengaluru covidvaccine availability...
206966    45 urban bengaluru covidvaccine availability f...
Name: text, Length: 206967, dtype: object

In [12]:
# Tokenization
def tokenization(text):
    """Split ``text`` into word tokens on runs of non-word characters.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the non-empty pieces of ``text`` separated by one or more
        ``\\W`` characters. An empty or all-separator input yields ``[]``.
    """
    # Raw string: in a plain literal '\W' is an invalid escape sequence and
    # triggers a warning on current Python versions.
    pieces = re.split(r'\W+', text)
    # re.split emits '' when the text starts or ends with a separator (or is
    # empty); those are not real tokens, so drop them.
    return [piece for piece in pieces if piece]

# Token lists are derived from "text" as it stands at this point in the
# notebook, i.e. before the stop-word removal applied further down.
df_tweet['tokenized'] = df_tweet['text'].apply(tokenization)

In [15]:
# lemmatization
def lemmatize(text):
    """Return ``text`` with every token replaced by its WordNet lemma.

    The text is tokenized with ``word_tokenize``. Each token is first
    lemmatized as a noun; if that leaves it unchanged, it is lemmatized as a
    verb instead. The lemmas are joined back together with single spaces.

    NOTE(review): only the 'stopwords' NLTK download is visible in this
    notebook; confirm the tokenizer and WordNet data are installed.
    """
    wnl = WordNetLemmatizer()

    def _lemma_of(token):
        as_noun = wnl.lemmatize(token, "n")
        # The noun lookup changed nothing, so fall back to the verb form.
        return wnl.lemmatize(token, "v") if as_noun == token else as_noun

    return ' '.join(_lemma_of(token) for token in word_tokenize(text))

# Lemmatized copy of "text" as it stands at this point in the notebook,
# i.e. before the stop-word removal applied further down.
df_tweet['lemmatized'] = df_tweet['text'].apply(lemmatize)

In [16]:
# Stopword Removal

# A set gives constant-time membership checks during filtering.
STOPWORDS = set(stopwords.words('english'))


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    ``text`` is converted with ``str()`` and split on whitespace; every token
    that is not in ``STOPWORDS`` is kept, and the kept tokens are joined with
    single spaces. Matching is exact and case-sensitive.

    NOTE(review): in this notebook the function runs after punctuation has
    been stripped, so apostrophe-bearing stop words (e.g. "you're") cannot
    match tokens such as "youre".
    """
    kept = []
    for candidate in str(text).split():
        if candidate in STOPWORDS:
            continue
        kept.append(candidate)
    return " ".join(kept)

# Remove stop words from the "text" column in place (the column is overwritten).
df_tweet["text"] = df_tweet["text"].apply(remove_stopwords)
df_tweet["text"]

0         folks said daikon paste could treat cytokine s...
1         world wrong side history year hopefully bigges...
2         coronavirus sputnikv astrazeneca pfizerbiontec...
3         facts immutable senator even youre ethically s...
4         explain need vaccine borisjohnson matthancock ...
                                ...                        
206962    45 urban bengaluru covidvaccine availability 3...
206963    pincode 560011 sputnik v dose 1 100 slots age ...
206964    1844 bbmp bengaluru covidvaccine availability ...
206965    1844 urban bengaluru covidvaccine availability...
206966    45 urban bengaluru covidvaccine availability 0...
Name: text, Length: 206967, dtype: object

In [17]:
# emoji convert to text

def convert_emoji_to_text(tweet):
    """Replace stand-alone emoji tokens in ``tweet`` with a text description.

    The tweet is split on whitespace. A token is converted only when the whole
    token is a key of ``UNICODE_EMOJI["en"]``; emoji glued to other characters
    are left as they are. The replacement is the part of demoji's description
    before the first ":" with its words joined by underscores.

    Args:
        tweet: The tweet as a ``str``.

    Returns:
        The tokens joined with single spaces, so any run of whitespace in the
        input collapses to one space.
    """
    tokens = tweet.split()
    for i, token in enumerate(tokens):
        if token not in UNICODE_EMOJI["en"]:
            continue
        # The membership test and the description come from two different
        # libraries, so the description may be missing; use .get() to avoid a
        # KeyError and leave such a token unchanged.
        emo_desc = demoji.findall(token).get(token)
        if emo_desc is None:
            continue
        # Keep only the text before the first ":" and make it a single token.
        tokens[i] = "_".join(emo_desc.split(":")[0].split())

    return " ".join(tokens)

# Convert stand-alone emoji tokens in the "text" column to text descriptions
# (the column is overwritten).
df_tweet["text"] = df_tweet["text"].apply(convert_emoji_to_text)
df_tweet["text"]

0         folks said daikon paste could treat cytokine s...
1         world wrong side history year hopefully bigges...
2         coronavirus sputnikv astrazeneca pfizerbiontec...
3         facts immutable senator even youre ethically s...
4         explain need vaccine borisjohnson matthancock ...
                                ...                        
206962    45 urban bengaluru covidvaccine availability 3...
206963    pincode 560011 sputnik v dose 1 100 slots age ...
206964    1844 bbmp bengaluru covidvaccine availability ...
206965    1844 urban bengaluru covidvaccine availability...
206966    45 urban bengaluru covidvaccine availability 0...
Name: text, Length: 206967, dtype: object

In [20]:
# Inspect the last rows after cleaning, including the added 'tokenized' and 'lemmatized' columns.
df_tweet.tail(3)

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet,tokenized,lemmatized
206964,1445954643419226114,VaxBLR,"Bengaluru, India",Hourly updates on FREE and PAID 18+ and 45+ va...,2021-06-21 08:44:34,26,0,0,False,2021-10-07 03:30:26,1844 bbmp bengaluru covidvaccine availability ...,"['BBMP', 'Bengaluru', 'CovidVaccine', 'COVISHI...",VaxBlr,0,1,False,"[1844, bbmp, bengaluru, covidvaccine, availabi...",1844 bbmp bengaluru covidvaccine availability ...
206965,1445954599345475592,VaxBLR,"Bengaluru, India",Hourly updates on FREE and PAID 18+ and 45+ va...,2021-06-21 08:44:34,26,0,0,False,2021-10-07 03:30:15,1844 urban bengaluru covidvaccine availability...,"['URBAN', 'Bengaluru', 'CovidVaccine', 'COVISH...",VaxBlr,0,0,False,"[1844, urban, bengaluru, covidvaccine, availab...",1844 urban bengaluru covidvaccine availability...
206966,1445947047052333057,VaxBLR,"Bengaluru, India",Hourly updates on FREE and PAID 18+ and 45+ va...,2021-06-21 08:44:34,26,0,0,False,2021-10-07 03:00:15,45 urban bengaluru covidvaccine availability 0...,"['URBAN', 'Bengaluru', 'CovidVaccine', 'COVISH...",VaxBlr,0,0,False,"[45, urban, bengaluru, covidvaccine, availabil...",45 urban bengaluru covidvaccine availability f...


In [21]:
# Persist the cleaned table next to the raw export.
# to_csv is called with its defaults, so the DataFrame index is written as the
# first, unnamed column, and the list-valued 'tokenized' column is stored as text.
# NOTE(review): pass index=False if downstream readers should not see the index column.
cleaned_tweets_path = "../data/vaccination_tweets_cleaned.csv"
df_tweet.to_csv(cleaned_tweets_path)