In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw SMS dataset from disk, decoding the file as latin1.
df = pd.read_csv(filepath_or_buffer="spam.csv", encoding="latin1")

In [3]:
# Peek at five random rows; the fixed seed makes the preview repeatable
# across kernel restarts instead of showing different rows on every run.
df.sample(5, random_state=42)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
3409,ham,Joy's father is John. Then John is the ____ of...,,,
2260,spam,SplashMobile: Choose from 1000s of gr8 tones e...,,,
1406,spam,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TOD...",,,
4376,ham,How much is torch in 9ja.,,,
4031,ham,I'm very happy for you babe ! Woo hoo party on...,,,


In [4]:
# (rows, columns) of the frame as loaded
df.shape

(5572, 5)

In [5]:
# 1. Data cleaning 
# 2. EDA
# 3. Text preprocessing
# 4. Model Building 
# 5. Evaluation
# 6. Improvement

## 1. Data Cleaning

In [6]:
# Summarize column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [7]:
# Drop the three unnamed trailing columns; df.info() above shows they are
# almost entirely null (50, 12 and 6 non-null values out of 5572 rows).
# Reassigning instead of using inplace=True, together with errors="ignore",
# lets this cell be re-run without raising KeyError once the columns are gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors="ignore")

In [8]:
# Peek at five random rows; the fixed seed makes the preview repeatable
# across kernel restarts instead of showing different rows on every run.
df.sample(5, random_state=42)

Unnamed: 0,v1,v2
4725,ham,Jason says it's cool if we pick some up from h...
1059,spam,EASTENDERS TV Quiz. What FLOWER does DOT compa...
1271,ham,If you still havent collected the dough pls le...
5517,ham,Miles and smiles r made frm same letters but d...
4991,ham,I'm hungry buy smth home...


In [9]:
# Give the columns descriptive names: v1 becomes the label column,
# v2 becomes the message text column.
df = df.rename(columns={'v1': 'target', 'v2': 'text'})

In [10]:
# Peek at five random rows; the fixed seed makes the preview repeatable
# across kernel restarts instead of showing different rows on every run.
df.sample(5, random_state=42)

Unnamed: 0,target,text
195,ham,Gud mrng dear hav a nice day
4205,ham,"Get the door, I'm here"
2556,ham,Fuck babe ... What happened to you ? How come ...
312,ham,He says he'll give me a call when his friend's...
2150,ham,"The table's occupied, I'm waiting by the tree"


In [11]:
from sklearn.preprocessing import LabelEncoder
# Encoder instance used in the next cell to turn the string labels into integer codes.
encoder = LabelEncoder()

In [12]:
# Replace the string labels with integer codes:
#   0 -> ham
#   1 -> spam
df = df.assign(target=encoder.fit_transform(df['target']))

In [13]:
# First five rows, to confirm the labels are now numeric
df.head(n=5)

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Count missing values in each column
df.isna().sum()

target    0
text      0
dtype: int64

In [15]:
# Number of rows that exactly repeat an earlier row
df.duplicated(keep="first").sum()

np.int64(403)

In [16]:
# Keep only the first occurrence of each duplicated row (original index labels are retained)
df = df[~df.duplicated(keep="first")]

In [17]:
# Re-count duplicates to confirm none remain
df.duplicated(keep="first").sum()

np.int64(0)

In [18]:
# (rows, columns) after dropping columns and duplicate rows
df.shape

(5169, 2)

## 2. EDA

In [19]:
# Messages per class (0 = ham, 1 = spam)
df.loc[:, 'target'].value_counts()

target
0    4516
1     653
Name: count, dtype: int64

In [20]:
# NOTE: matplotlib must be installed in the kernel's environment for this cell
# to run (the import fails with ModuleNotFoundError otherwise).
import matplotlib.pyplot as plt

# value_counts() orders classes by frequency, so hard-coded labels would be
# silently swapped if spam ever outnumbered ham. Sorting by the encoded label
# (0 = ham, 1 = spam) pins each wedge to its label.
class_counts = df['target'].value_counts().sort_index()
plt.pie(class_counts, labels=['ham', 'spam'], autopct="%0.2f")
plt.title("Share of ham vs. spam messages")
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Data is imbalanced: 4516 ham vs. 653 spam messages (about 87% vs. 13%)

In [24]:
# Install NLTK into the kernel's environment (version is not pinned).
%pip install nltk
import nltk # Natural Language Toolkit
# Download the 'punkt' and 'punkt_tab' data packages; presumably these are what
# the nltk.word_tokenize / nltk.sent_tokenize calls in later cells need - TODO confirm.
# NOTE(review): this cell must run before any cell that calls nltk.*.
nltk.download('punkt')
nltk.download('punkt_tab')

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pralo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Pralo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [21]:
# Length of each message in characters
df['num_characters'] = df['text'].map(len)

In [22]:
# First five rows, now including the character count
df.head(n=5)

Unnamed: 0,target,text,num_characters
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


In [25]:
# Number of tokens per message as produced by nltk.word_tokenize:
# tokenize each text first, then count the resulting tokens.
df['num_words'] = df['text'].map(nltk.word_tokenize).map(len)

In [26]:

# Number of sentences per message as split by nltk.sent_tokenize
df['num_sentences'] = df['text'].map(nltk.sent_tokenize).map(len)

In [27]:
# First five rows with all three length features
df.head(n=5)

Unnamed: 0,target,text,num_characters,num_words,num_sentences
0,0,"Go until jurong point, crazy.. Available only ...",111,24,2
1,0,Ok lar... Joking wif u oni...,29,8,2
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2
3,0,U dun say so early hor... U c already then say...,49,13,1
4,0,"Nah I don't think he goes to usf, he lives aro...",61,15,1


In [28]:
# Summary statistics of the length features over all messages
df.loc[:, ['num_characters', 'num_words', 'num_sentences']].describe()

Unnamed: 0,num_characters,num_words,num_sentences
count,5169.0,5169.0,5169.0
mean,78.977945,18.455794,1.965564
std,58.236293,13.324758,1.448541
min,2.0,1.0,1.0
25%,36.0,9.0,1.0
50%,60.0,15.0,1.0
75%,117.0,26.0,2.0
max,910.0,220.0,38.0


In [None]:
# Summary statistics of the length features for ham messages (target == 0)
df.loc[df['target'] == 0, ['num_characters', 'num_words', 'num_sentences']].describe()

Unnamed: 0,num_characters,num_words,num_sentences
count,4516.0,4516.0,4516.0
mean,70.459256,17.123782,1.820195
std,56.358207,13.49397,1.383657
min,2.0,1.0,1.0
25%,34.0,8.0,1.0
50%,52.0,13.0,1.0
75%,90.0,22.0,2.0
max,910.0,220.0,38.0


In [30]:
# Summary statistics of the length features for spam messages (target == 1)
df.loc[df['target'] == 1, ['num_characters', 'num_words', 'num_sentences']].describe()

Unnamed: 0,num_characters,num_words,num_sentences
count,653.0,653.0,653.0
mean,137.891271,27.667688,2.970904
std,30.137753,7.008418,1.488425
min,13.0,2.0,1.0
25%,132.0,25.0,2.0
50%,149.0,29.0,3.0
75%,157.0,32.0,4.0
max,224.0,46.0,9.0
