In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the tab-separated SMS dataset (this is a Colab-style absolute path;
# adjust it when running elsewhere).
path = '/content/spam.tsv'

# Each row carries a label, the message text, and its length / punctuation counts.
df = pd.read_csv(path, sep='\t')

# Exploratory Data Analysis

In [None]:
# Peek at the first ten rows to confirm the columns parsed as expected.
df.head(n=10)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
5,spam,FreeMsg Hey there darling it's been 3 week's n...,147,8
6,ham,Even my brother is not like to speak with me. ...,77,2
7,ham,As per your request 'Melle Melle (Oru Minnamin...,160,6
8,spam,WINNER!! As a valued network customer you have...,157,6
9,spam,Had your mobile 11 months or more? U R entitle...,154,2


In [None]:
# Count missing values per column.
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [None]:
# Share of each class as a percentage of all rows (shows the ham/spam imbalance).
df['label'].value_counts().div(len(df)).mul(100)

ham     86.593683
spam    13.406317
Name: label, dtype: float64

# Balancing the dataset

In [None]:
# Partition the rows by class so the majority class can be down-sampled.
ham = df.loc[df['label'].eq('ham')]
spam = df.loc[df['label'].eq('spam')]

In [None]:
# (rows, columns) of each class subset, ham first.
(ham.shape, spam.shape)

((4825, 4), (747, 4))

In [None]:
# Down-sample the majority class so ham and spam have the same number of rows.
# Sampling from `df` rather than from `ham` itself keeps this cell idempotent:
# re-running it always yields the same rows in the same order. The fixed
# random_state makes the balanced dataset identical on every run.
ham = df[df['label'] == 'ham'].sample(n=spam.shape[0], random_state=0)

In [None]:
# Confirm the ham subset now has as many rows as the spam subset.
ham.shape

(747, 4)

In [None]:
# Preview the down-sampled ham rows (the original row index from `df` is retained).
ham

Unnamed: 0,label,message,length,punct
3224,ham,Well that must be a pain to catch,33,0
3492,ham,Ok.,3,1
5425,ham,Otherwise had part time job na-tuition..,40,3
5032,ham,Hey... Very inconvenient for your sis a not huh?,48,4
4827,ham,"Haha, just what I was thinkin",29,1
...,...,...,...,...
4969,ham,Future is not what we planned for tomorrow.......,133,14
3249,ham,Also track down any lighters you can find,41,0
1028,ham,Are you not around or just still asleep? :V,43,2
1191,ham,We're done...,13,4


In [None]:
# Stack the ham and spam subsets into one balanced frame with a fresh 0..N-1 index.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
data = pd.concat([ham, spam], ignore_index=True)

  data = ham.append(spam, ignore_index = True)


In [None]:
# Inspect the combined frame: ham rows come first, followed by the spam rows.
data

Unnamed: 0,label,message,length,punct
0,ham,Well that must be a pain to catch,33,0
1,ham,Ok.,3,1
2,ham,Otherwise had part time job na-tuition..,40,3
3,ham,Hey... Very inconvenient for your sis a not huh?,48,4
4,ham,"Haha, just what I was thinkin",29,1
...,...,...,...,...
1489,spam,Want explicit SEX in 30 secs? Ring 02073162414...,90,3
1490,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...,158,5
1491,spam,Had your contract mobile 11 Mnths? Latest Moto...,160,8
1492,spam,REMINDER FROM O2: To get 2.50 pounds free call...,147,3


# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the balanced data for evaluation. The message text is the
# feature and the label is the target; the seed keeps the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.3,
    random_state=0,
    shuffle=True,
)

# Import libraries for the Classifier Pipeline

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Build the Classifier Pipeline

In [None]:
# Two-stage pipeline: raw text -> TF-IDF features -> 100-tree random forest.
# random_state pins the forest's internal randomness so that fitting on the
# same training data always produces the same model (and the same metrics).
classifier = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=0)),
])

# Train the Classifier

In [None]:
# Fit both pipeline stages (vectorizer, then forest) on the training split only.
classifier.fit(X_train, y_train)

# Test the Classifier

In [None]:
# Predict a label for every held-out message.
y_pred = classifier.predict(X_test)

In [None]:
# Display the predicted labels next to the true labels as a tuple.
y_pred, y_test

# Evaluate the Classifier

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Fraction of held-out messages whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.955456570155902

In [None]:
# Rows are true labels and columns are predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[227,   0],
       [ 20, 202]])

In [None]:
# Per-class precision / recall / F1; print() keeps the report's column alignment.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

         ham       0.92      1.00      0.96       227
        spam       1.00      0.91      0.95       222

    accuracy                           0.96       449
   macro avg       0.96      0.95      0.96       449
weighted avg       0.96      0.96      0.96       449



# Test the Classifier on Sample Text Messages

In [None]:
# Hand-written messages for a quick sanity check. Each one is wrapped in a
# single-element list because it is passed directly to classifier.predict.

# Ordinary conversational messages.
test1 = [
    'Hello, You are learning natural Language Processing',
]
test2 = [
    'Hope you are doing good and learning new things !',
]

# Lottery / prize style messages.
test3 = [
    'Congratulations, You won a lottery ticket worth $1 Million ! To claim call on 446677',
]
test4 = [
    'Play the megamillion lottery and win a lot of money in $$$ millions! Buy the ticket now! Call us for more information 123-456-789',
]


In [None]:
# Classify each sample message in turn and print the predicted label array.
for sample_message in (test1, test2, test3, test4):
    print(classifier.predict(sample_message))

['ham']
['ham']
['spam']
['spam']
