In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw dataset from spam.csv in the working directory.
# NOTE(review): latin1 is presumably needed because the file is not valid UTF-8 — confirm.
df = pd.read_csv("spam.csv", encoding="latin1")

In [3]:
# Preview five random rows; the fixed seed keeps the preview reproducible
# across Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
967,ham,What do u want when i come back?.a beautiful n...,,,
1753,ham,Jus came back fr lunch wif my sis only. U leh?,,,
3401,ham,'An Amazing Quote'' - \Sometimes in life its d...,,,
2580,spam,U are subscribed to the best Mobile Content Se...,,,
1253,ham,Mum say we wan to go then go... Then she can s...,,,


In [4]:
# (rows, columns) of the raw frame.
df.shape

(5572, 5)

In [5]:
# 1. Data cleaning 
# 2. EDA
# 3. Text preprocessing
# 4. Model Building 
# 5. Evaluation
# 6. Improvement

## 1. Data Cleaning

In [6]:
# Column dtypes and non-null counts, to spot columns that are mostly empty.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [7]:
# Drop the last 3 columns: they are almost entirely empty (50, 12 and 6
# non-null values out of 5572 rows). Reassigning instead of mutating in place,
# and ignoring labels that are already gone, keeps this cell safe to re-run.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [8]:
# Preview five random rows after dropping the empty columns; the fixed seed
# keeps the preview reproducible across Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,v1,v2
685,ham,Have you finished work yet? :)
1306,spam,Enjoy the jamster videosound gold club with yo...
5274,ham,"Hi. Hope ur day * good! Back from walk, table ..."
3361,ham,No messages on her phone. I'm holding it now
1094,ham,Now only i reached home. . . I am very tired n...


In [9]:
# Give the columns descriptive names: v1 holds the ham/spam label and
# v2 holds the message text.
new_column_names = {'v1': 'target', 'v2': 'text'}
df.rename(columns=new_column_names, inplace=True)

In [10]:
# Preview five random rows under the new column names; the fixed seed keeps
# the preview reproducible across Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,target,text
1095,ham,Ryder unsold.now gibbs.
3877,ham,What you need. You have a person to give na.
2564,ham,"Under the sea, there lays a rock. In the rock,..."
5084,ham,Hey happy birthday...
3732,ham,Old Orchard near univ. How about you?


In [11]:
# LabelEncoder converts the string class labels into integer codes.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [12]:
# Replace the string labels in `target` with integer codes.
df['target'] = encoder.fit_transform(df['target'])
# LabelEncoder numbers the classes in sorted order, so:
# 0 -> ham
# 1 -> spam

In [13]:
# Confirm the target column now holds integer codes.
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Count of missing entries in each column.
df.isna().sum()

target    0
text      0
dtype: int64

In [15]:
# Check for duplicate values: count rows that repeat an earlier row in every
# column (same target and same text).
df.duplicated().sum()

np.int64(403)

In [16]:
# Remove duplicate rows; by default the first occurrence of each is kept.
df = df.drop_duplicates()

In [17]:
# Sanity check: no duplicate rows should remain.
df.duplicated().sum()

np.int64(0)

In [18]:
# (rows, columns) after cleaning.
df.shape

(5169, 2)

## 2. Exploratory Data Analysis (EDA)

In [19]:
# Class distribution: number of messages per encoded label (0 = ham, 1 = spam).
df['target'].value_counts()

target
0    4516
1     653
Name: count, dtype: int64

In [20]:
# Requires matplotlib to be installed in the kernel's environment.
import matplotlib.pyplot as plt

# Pie chart of the class balance. value_counts() orders its result by
# frequency, so sort by the encoded label (0 = ham, 1 = spam) first; that way
# each wedge gets the right name no matter which class is more common.
class_counts = df['target'].value_counts().sort_index()
plt.pie(class_counts, labels=['ham', 'spam'], autopct="%0.2f")
plt.title("Class distribution (% of messages)")
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Data is imbalanced: 4516 ham vs 653 spam messages (about 12.6% spam)

In [None]:
# Install NLTK into the kernel's environment.
# NOTE(review): the version is unpinned; consider pinning it for reproducibility.
%pip install nltk
import nltk # Natural Language Toolkit
# Download the punkt tokenizer resources.
# NOTE(review): presumably these are what the word/sentence tokenizers used below need — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pralo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Pralo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Message length in characters, using the vectorised string accessor.
df['num_characters'] = df['text'].str.len()

In [None]:
# Check the new num_characters column.
df.head()

Unnamed: 0,target,text,num_characters
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


In [None]:
# Number of tokens per message, as produced by NLTK's word tokenizer.
df['num_words'] = [len(nltk.word_tokenize(message)) for message in df['text']]

In [None]:

# Number of sentences per message, as produced by NLTK's sentence tokenizer.
df['num_sentences'] = [len(nltk.sent_tokenize(message)) for message in df['text']]

In [None]:
# Check the engineered count columns.
df.head()

Unnamed: 0,target,text,num_sentences,num_words
0,0,"Go until jurong point, crazy.. Available only ...",2,24
1,0,Ok lar... Joking wif u oni...,2,8
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,2,37
3,0,U dun say so early hor... U c already then say...,1,13
4,0,"Nah I don't think he goes to usf, he lives aro...",1,15
