In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
spam = pd.read_csv(r"C:\Users\Prasad\Desktop\ml\data_ml\spam.csv")
spam.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Feature Extraction
The first thing you need to do when creating a machine learning model is to decide what to use as features. You call features the pieces of information that you take from the text and give to the algorithm so it can work its magic. For example, if you were doing classification on health, some features could be a person’s height, weight, gender, and so on. You would exclude things that maybe are known but aren’t useful to the model, like a person’s name or favorite color.

In this case though, you don’t even have numeric features. You just have text. You need to somehow convert this text into numbers that you can do calculations on.

So what do you do? Simple! you use word frequencies. That is, you ignore word order and sentence construction, treating every sentence as a set of the words it contains. Our features will be the counts of each of these words. Even though it may seem too simplistic an approach, it works surprisingly well.



In [6]:
X = spam['text']
y = spam['class']

X.shape, y.shape

((5572,), (5572,))

In [23]:
vect = CountVectorizer(ngram_range=(1,3), min_df=5, max_features=8000)
X_vec = vect.fit_transform(X).toarray();
X_vec[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, random_state=0)

In [29]:
%%time

knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn.score(X_train, y_train), knn.score(X_test, y_test)

CPU times: total: 11.3 s
Wall time: 2.16 s


(0.9344340751375927, 0.914572864321608)

In [42]:
## SVM  
# %%time

svm = SVC()
svm.fit(X_train, y_train), svm.score(X_test, y_test)

(SVC(), 0.9791816223977028)

In [44]:
%%time

nb = GaussianNB()
nb.fit(X_train,y_train)
nb.score(X_train, y_train), nb.score(X_test, y_test)

CPU times: total: 1.22 s
Wall time: 1.26 s


(0.9492701603254368, 0.9389806173725772)

In [46]:
sample = 'You won 25 lakh'
vec = vect.transform([sample]).toarray()
print(vec)
nb.predict(vec)

[[0 0 0 ... 0 0 0]]


array(['spam'], dtype='<U4')

In [50]:
sample1 = 'Congratulations you are eligible for loan upto 1cr'
vec1 = vect.transform([sample1]).toarray()
print(vec1)
nb.predict(vec1)

[[0 0 0 ... 0 0 0]]


array(['spam'], dtype='<U4')

In [52]:
sample2 = 'Hi Nithin, I need your help.'
vec2 = vect.transform([sample]).toarray()
nb.predict(vec2)

array(['spam'], dtype='<U4')