Corona Tweets Text Preprocessing (NLTK)

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
import unicodedata
from contractions import CONTRACTION_MAP, expand_contractions

In [2]:
# load data
data = pd.read_csv('spam.csv', encoding = 'ISO-8859-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
data.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [4]:
data['v1'].unique()

array(['ham', 'spam'], dtype=object)

In [5]:
# change label name & scale to between 0-4
data['labels'] = data['v1'].map({'ham':0, 'spam':1})
data = data.rename(columns={'v2':'docs'})
data = data.iloc[:,1::4]
data.head()

Unnamed: 0,docs,labels
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
data.tail()

Unnamed: 0,docs,labels
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will Ì_ b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0
5571,Rofl. Its true to its name,0


In [7]:
print(f'Null Data: \n{data.isnull().sum()}\n')
print('Empty Cell: \n{}'.format(data[data['docs'] == ''].index))

Null Data: 
docs      0
labels    0
dtype: int64

Empty Cell: 
Int64Index([], dtype='int64')


In [8]:
data['docs'][42]

'07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow'

In [9]:
# remove html elements
no_html = [i.get_text() for i in [BeautifulSoup(i, "html.parser") for i in data['docs']]]

In [10]:
no_html[42]

'07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow'

In [11]:
# remove urls
url_pattern = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
no_url = [re.sub(url_pattern, '', i) for i in no_html]

In [12]:
no_url[42]

'07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow'

In [13]:
# remove accented characters
no_accent = [unicodedata.normalize('NFKD', i).encode('ascii', 'ignore').decode('utf-8', 'ignore') for i in no_url]

In [14]:
no_accent[42]

'07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow'

In [15]:
# LEARN THIS
# expanding contractions
expanded_contractions = [expand_contractions(i) for i in no_accent]

In [16]:
expanded_contractions[42]

'07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow'

In [17]:
# remove special characters (digits included)
sp_chr_pattern = r'[^a-zA-Z\s]'
no_sp_chr = [re.sub(sp_chr_pattern, '', i) for i in expanded_contractions]

In [18]:
no_sp_chr[42]

'  Rodger Burns  MSG  We tried to call you re your reply to our sms for a free nokia mobile  free camcorder Please call now  for delivery tomorrow'

In [19]:
# remove multiple spaces
single_space = [re.sub('\s+',' ', i) for i in no_sp_chr]

In [20]:
single_space[42]

' Rodger Burns MSG We tried to call you re your reply to our sms for a free nokia mobile free camcorder Please call now for delivery tomorrow'

In [21]:
# lowercasing
lower = [i.lower() for i in single_space]

In [22]:
lower[42]

' rodger burns msg we tried to call you re your reply to our sms for a free nokia mobile free camcorder please call now for delivery tomorrow'

In [23]:
# stopwords removal
stopword = stopwords.words('english')
without_stopwords = [' '.join(z) for z in [[y for i, y in enumerate(x.split()) if y not in stopword] for x in lower]]

In [24]:
without_stopwords[42]

'rodger burns msg tried call reply sms free nokia mobile free camcorder please call delivery tomorrow'

In [25]:
# stemming
steme = nltk.porter.PorterStemmer()
stemed = [' '.join(z) for z in [[steme.stem(y) for i, y in enumerate(x.split())]for x in without_stopwords]] 

In [26]:
stemed[42]

'rodger burn msg tri call repli sm free nokia mobil free camcord pleas call deliveri tomorrow'

In [27]:
# # not really good
# # lemmatization
# lemmatizer = WordNetLemmatizer()
# lemmatized = [' '.join(z) for z in [[lemmatizer.lemmatize(y) for i, y in enumerate(x.split())]for x in without_stopwords]]

In [28]:
# lemmatized[187]

In [29]:
data['preprocessed_docs'] = stemed
data = data.iloc[:, 1:]
data.head()

Unnamed: 0,labels,preprocessed_docs
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri wkli comp win fa cup final tkt st m...
3,0,u dun say earli hor u c alreadi say
4,0,nah think goe usf live around though


In [30]:
data.tail()

Unnamed: 0,labels,preprocessed_docs
5567,1,nd time tri contact u u pound prize claim easi...
5568,0,b go esplanad fr home
5569,0,piti mood soani suggest
5570,0,guy bitch act like would interest buy someth e...
5571,0,rofl true name


In [31]:
print(f'Null Data: \n{data.isnull().sum()}\n')
print('Empty Cell: \n{}'.format(data[data['preprocessed_docs'] == ''].index))

Null Data: 
labels               0
preprocessed_docs    0
dtype: int64

Empty Cell: 
Int64Index([959, 1611, 2805, 3374, 4573, 4822], dtype='int64')


In [32]:
data = data.drop(data.index[data[data['preprocessed_docs'] == ''].index])

In [33]:
data.to_csv('preprocessed_dataset.csv', index=False)