In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated SMS spam collection into a DataFrame.
dataframe = pd.read_csv("SMSSpamCollection.tsv", sep="\t")

In [3]:
# Peek at the first five rows to confirm the columns loaded as expected.
dataframe.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Class balance: number of messages per label.
dataframe.loc[:, "label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [5]:
# We have 5572 messages and 4825 of them are ham.
# Therefore 4825/5572 = about 86.6% are ham. A classifier that always predicted "ham" would already be correct about 86.6% of the time on this dataset.

# Therefore the text classifier needs to be better than about 86.6% accuracy to beat this majority-class baseline.


In [6]:
# Features are the raw message text; the target is the ham/spam label.
X, y = dataframe["message"], dataframe["label"]

In [7]:
# First five messages (feature Series).
X.head(n=5)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: message, dtype: object

In [8]:
# First five labels (target Series).
y.head(n=5)

0     ham
1     ham
2    spam
3     ham
4     ham
Name: label, dtype: object

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the messages for testing; random_state fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [10]:
# scikit-learn count vectorizer

In [11]:
#Import count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# CountVectorizer with default settings; it is fitted in a later cell.
count_vectorizer = CountVectorizer()

In [13]:
# Two options: call fit() and then transform() as separate steps,
# or let scikit-learn do both in a single call with fit_transform().

In [14]:
# Learn the vocabulary from the training messages and build the
# document-term count matrix in one step.
X_train_counts = count_vectorizer.fit_transform(raw_documents=X_train)

In [15]:
# Inspect the sparse count matrix produced by the vectorizer
# (its repr reports shape, dtype and number of stored entries).
X_train_counts

<3900x7155 sparse matrix of type '<class 'numpy.int64'>'
	with 51338 stored elements in Compressed Sparse Row format>

In [16]:
# Number of training messages; should equal the row count of X_train_counts.
X_train.shape

(3900,)

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer

# Transforms a count matrix into TF-IDF weighted frequencies.
tfidf_transformer = TfidfTransformer()

In [19]:
# Fit the IDF weights on the training counts, then reweight those same counts.
X_train_transform = tfidf_transformer.fit(X_train_counts).transform(X_train_counts)

In [20]:
# Shape check: expected to match the count matrix (same rows and features).
X_train_transform.shape

(3900, 7155)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer takes raw text straight to TF-IDF features,
# combining the counting and TF-IDF weighting steps in one object.
tfidf_vectorizer = TfidfVectorizer()

In [22]:
# Fit on the training messages only and produce their TF-IDF matrix.
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)

In [23]:
# Shape check for the one-step TF-IDF matrix (rows = training messages).
X_train_tfidf.shape

(3900, 7155)

In [24]:
from sklearn.svm import LinearSVC

In [25]:
# First five held-out messages; the index shows their original row positions.
X_test.head(n=5)

1078                         Yep, by the pretty sculpture
4028        Yes, princess. Are you going to make me moan?
958                            Welp apparently he retired
4642                                              Havent.
4674    I forgot 2 ask ü all smth.. There's a card on ...
Name: message, dtype: object

In [26]:
# Linear support-vector classifier with the library's default settings.
classifier = LinearSVC()

In [27]:
# Train on the TF-IDF features of the training messages.
classifier.fit(X=X_train_tfidf, y=y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [28]:
# Use transform (not fit_transform) so the test set is encoded with the
# vocabulary and IDF weights learned from the training data.
X_test_transform = tfidf_vectorizer.transform(raw_documents=X_test)

In [29]:
# Predict a label for every held-out message.
predictions = classifier.predict(X=X_test_transform)

In [30]:
# Sanity check: there should be one prediction per test message.
predictions.shape

(1672,)

In [31]:
from sklearn import metrics

# Confusion matrix of true labels (y_test) against predicted labels.
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[1437    5]
 [  20  210]]


In [32]:
# Per-class precision, recall and F1 on the held-out set.
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1442
        spam       0.98      0.91      0.94       230

   micro avg       0.99      0.99      0.99      1672
   macro avg       0.98      0.95      0.97      1672
weighted avg       0.98      0.99      0.98      1672



In [33]:
# Overall accuracy on the held-out set.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.9850478468899522


In [None]:
# The classifier was fitted on TF-IDF feature vectors, not raw strings, so a
# new message must first be encoded with the already-fitted vectorizer.
# Use transform (not fit_transform) to keep the training vocabulary and weights.
new_messages = ["I'm going to work soon"]
classifier.predict(tfidf_vectorizer.transform(new_messages))