In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
# Load the tab-separated SMS spam collection into a DataFrame.
sms_df = pd.read_csv(
    'smsspamcollection.tsv',
    sep='\t',
)

In [27]:
# Preview the first rows to check the columns (label, message, length, punct).
sms_df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [28]:
# Features are the raw message text; the target is the ham/spam label column.
X, y = sms_df['message'], sms_df['label']

In [29]:
# Count missing messages before vectorizing (isna is the same check as isnull).
X.isna().sum()

0

In [30]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although spam is the minority class
# (246 of 1839 test rows in the report further down) - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [31]:
# Bag-of-words vectorizer with default settings; it is fitted in a later cell.
count_vect = CountVectorizer()

In [32]:
# Display the message Series.
# NOTE(review): this shows the whole Series; X.head() would keep the output short.
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
5       FreeMsg Hey there darling it's been 3 week's n...
6       Even my brother is not like to speak with me. ...
7       As per your request 'Melle Melle (Oru Minnamin...
8       WINNER!! As a valued network customer you have...
9       Had your mobile 11 months or more? U R entitle...
10      I'm gonna be home soon and i don't want to tal...
11      SIX chances to win CASH! From 100 to 20,000 po...
12      URGENT! You have won a 1 week FREE membership ...
13      I've been searching for the right words to tha...
14                    I HAVE A DATE ON SUNDAY WITH WILL!!
15      XXXMobileMovieClub: To use your credit, click ...
16                             Oh k...i'm watching here:)
17      Eh u r

In [33]:
# Fit the vectorizer on the training text (build the vocabulary and count the
# words), then transform the original text into a vector of counts per message.
X_train_count = count_vect.fit_transform(X_train)

In [34]:
# Shape of the count matrix: one row per training message, one column per vocabulary term.
X_train_count.shape

(3733, 7082)

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

In [37]:
# Fit on the training text only and turn it into TF-IDF features.
X_train_tfidf = vectorizer.fit_transform(X_train)

In [38]:
from sklearn.svm import LinearSVC

In [39]:
# Linear support vector classifier with default hyperparameters.
clf = LinearSVC()

In [40]:
# Train the classifier on the TF-IDF features; the fitted estimator's repr is the cell output.
clf.fit(X_train_tfidf, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [41]:
#Creating pipeline to vectorize and classify data
from sklearn.pipeline import Pipeline

In [42]:
# Chain vectorization and classification so raw text can be passed straight
# to fit/predict.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', LinearSVC()),
])

In [43]:
# Fit the whole pipeline on the raw training messages.
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [44]:
# Predicted labels for the held-out messages.
predict = text_clf.predict(X_test)

In [45]:
from sklearn.metrics import confusion_matrix,classification_report

In [46]:
# Rows are the true classes and columns the predicted ones, in ham/spam order.
print(confusion_matrix(y_test, predict))

[[1586    7]
 [  12  234]]


In [47]:
# Per-class precision, recall and F1 on the held-out set.
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [48]:
# Sanity-check the pipeline on two hand-written messages: a casual greeting
# and a loan advert.
text_clf.predict(
    [
        'hello how are u?',
        ' loan for £950 is approved for you if you receive this SMS. 1 min verification & cash in 1 hr at www.[redacted].co.uk to opt out reply stop',
    ]
)

array(['ham', 'spam'], dtype=object)