In [2]:
import pandas as pd

In [3]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named explicitly: `label` (ham/spam) and `message` (raw text).
# NOTE(review): read_csv's default quoting treats '"' as a quote character, so
# rows containing an unbalanced quote could be merged into one record — confirm
# the row count against the dataset's documented size (quoting=csv.QUOTE_NONE
# would disable quote handling).
messages=pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t', names=['label', 'message'])

In [4]:
# Peek at the first five rows to confirm the label/message columns parsed as expected.
print(messages.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
# Summary of both text columns: row count, distinct values, most frequent value and its frequency.
print(messages.describe())

       label                 message
count   5572                    5572
unique     2                    5169
top      ham  Sorry, I'll call later
freq    4825                      30


In [6]:
# Same summary broken down per label, which exposes the ham/spam class balance.
print(messages.groupby('label').describe())

      message                                                               
        count unique                                                top freq
label                                                                       
ham      4825   4516                             Sorry, I'll call later   30
spam      747    653  Please call our customer service representativ...    4


In [7]:
# Character count of every message. The vectorised string accessor replaces
# `.apply(len)`: it avoids a Python-level call per row and yields NaN for a
# missing message instead of raising TypeError.
messages['length'] = messages['message'].str.len()

In [8]:
import string
from nltk.corpus import stopwords

In [9]:
# Translation table that deletes every ASCII punctuation character.
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)

# Lazily-filled cache so the NLTK stop-word list is loaded at most once.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_process(mess, stop_words=None):
    """Tokenise a message: strip punctuation, split on whitespace, drop stop words.

    Parameters
    ----------
    mess : str
        Raw message text.
    stop_words : container of str, optional
        Lower-case words to discard. Anything supporting ``in`` works; a set
        is fastest. Defaults to NLTK's English stop-word list, which is
        loaded once and reused across calls.

    Returns
    -------
    list of str
        The remaining tokens in their original order and original casing.
    """
    if stop_words is None:
        # Previously the corpus reader was invoked for every single token;
        # a cached frozenset makes each membership test O(1).
        stop_words = _english_stopwords()

    # Punctuation is removed before the stop-word lookup, so a contraction
    # such as "don't" is compared in its stripped form ("dont").
    nopunc = mess.translate(_STRIP_PUNCTUATION)

    # Comparison is case-insensitive, but the token keeps its original case.
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC

In [11]:
# Three-stage text classifier:
#   bow        - token counts, with text_process as the analyzer
#   tfidf      - re-weights the counts with TF-IDF
#   classifier - SVM with a sigmoid kernel (gamma fixed at 1.0)
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC(kernel='sigmoid', gamma=1.0)),
])

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the split reproducible.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'],
    messages['label'],
    test_size=0.3,
    random_state=101,
)

In [14]:
# Train every stage of the pipeline (bag-of-words, TF-IDF, SVC) on the training split only.
pipeline.fit(msg_train, label_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x000001F611D3FD08>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocesso...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [15]:
# Predict a label for each held-out message using the fitted pipeline.
pred=pipeline.predict(msg_test)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix

In [17]:
# Rows are the true labels and columns the predicted ones, both in ham, spam order.
print(confusion_matrix(label_test, pred))

[[1472    3]
 [  29  168]]


In [18]:
# Per-class precision / recall / F1, plus support (number of test messages per label).
print(classification_report(label_test, pred))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1475
        spam       0.98      0.85      0.91       197

   micro avg       0.98      0.98      0.98      1672
   macro avg       0.98      0.93      0.95      1672
weighted avg       0.98      0.98      0.98      1672



In [19]:
from sklearn.metrics import accuracy_score
# Overall fraction of test messages labelled correctly.
print(accuracy_score(label_test, pred))

0.9808612440191388
