In [1]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
columns = ['Label','Text']
df = pd.read_csv(r"C:\Users\jainmiahsk\Desktop\DataSets\SpamClassifier\SMSSpamCollection",sep = '\t',names = columns)
df.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.isnull().sum()

Label    0
Text     0
dtype: int64

In [4]:
df.shape

(5572, 2)

In [5]:
len(df)

5572

## This data is not balanced as ham is more than spam. If this data is not balanced then we dont get accuracy

In [6]:
df.Label.value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

## Balancing This Data

In [7]:
ham = df[df.Label=='ham']
ham.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...


In [8]:
ham.shape

(4825, 2)

In [9]:
spam = df[df.Label == 'spam']
spam.head()

Unnamed: 0,Label,Text
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."


In [10]:
spam.shape

(747, 2)

In [11]:
ham = ham.sample(spam.shape[0])
ham.shape,spam.shape

((747, 2), (747, 2))

In [12]:
data = ham.append(spam, ignore_index = True)
data.tail(),data.shape

(     Label                                               Text
 1489  spam  Want explicit SEX in 30 secs? Ring 02073162414...
 1490  spam  ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
 1491  spam  Had your contract mobile 11 Mnths? Latest Moto...
 1492  spam  REMINDER FROM O2: To get 2.50 pounds free call...
 1493  spam  This is the 2nd time we have tried 2 contact u..., (1494, 2))

## Data Preparation

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

  from numpy.core.umath_tests import inner1d


In [35]:
X_train,X_test,y_train,y_test = train_test_split(data['Text'],data['Label'],test_size= 0.3,random_state=0,shuffle=True,stratify = data['Label'])

In [28]:
X_train.head()

1128    U’ve Bin Awarded £50 to Play 4 Instant Cash. C...
742     No it will reach by 9 only. She telling she wi...
477     Havent mus ask if u can 1st wat. Of meet 4 lun...
542     I like you peoples very much:) but am very shy...
1277    Ringtone Club: Gr8 new polys direct to your mo...
Name: Text, dtype: object

In [29]:
y_train.head()

1128    spam
742      ham
477      ham
542      ham
1277    spam
Name: Label, dtype: object

# Bag of Words

In [30]:
vectorizer = TfidfVectorizer()

In [31]:
X_train = vectorizer.fit_transform(X_train)

In [32]:
X_train.shape

(1045, 3800)

# Pipeline and Random Forest Classifier

In [33]:
clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',RandomForestClassifier(n_estimators=100,n_jobs=-1))])

In [49]:
X_train,X_test,y_train,y_test = train_test_split(data['Text'],data['Label'],test_size= 0.3,random_state=0,shuffle=True,stratify = data['Label'])

In [36]:
clf.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [37]:
y_pred = clf.predict(X_test)

In [38]:
confusion_matrix(y_test,y_pred)

array([[224,   1],
       [ 27, 197]], dtype=int64)

In [39]:
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

        ham       0.89      1.00      0.94       225
       spam       0.99      0.88      0.93       224

avg / total       0.94      0.94      0.94       449



In [40]:
accuracy_score(y_test,y_pred)

0.9376391982182628

# Testing The Model

In [42]:
clf.predict(['Dear user, this is gentle remainder of your Data Usage Bill for the month of December. Please pay Rs.450'])

array(['ham'], dtype=object)

In [46]:
clf.predict(['you won 1000000 prize please click here'])

array(['spam'], dtype=object)

In [47]:
clf.predict(['please send your bank details to win Rs.10000000000'])

array(['spam'], dtype=object)

In [48]:
clf.predict(['you have few missed calls on this number please check it'])

array(['ham'], dtype=object)

# SVM

In [67]:
clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',SVC(C=5000,gamma='auto'))])

In [68]:
X_train,X_test,y_train,y_test = train_test_split(data['Text'],data['Label'],test_size= 0.3,random_state=0,shuffle=True,stratify = data['Label'])

In [69]:
clf.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [70]:
y_pred = clf.predict(X_test)

In [71]:
confusion_matrix(y_test,y_pred)

array([[216,   9],
       [ 16, 208]], dtype=int64)

In [72]:
accuracy_score(y_test,y_pred)

0.9443207126948775