# Text Classification with Naive Bayes

Done by Sahil Phule

In [1]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import naive_bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

# train_test_split moved to sklearn.model_selection in scikit-learn 0.18 and
# sklearn.cross_validation was removed in 0.20; prefer the new location and
# fall back to the old one only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [2]:
# Read the tab-separated SMS corpus. The file carries no header row, so the two
# columns are named here: 'spam' (the ham/spam label) and 'txt' (the message).
# NOTE(review): pandas' default quote handling may merge lines that contain
# unbalanced double quotes; if the row count looks short, compare against
# quoting=csv.QUOTE_NONE — confirm against the raw file.
df = pd.read_csv(
    "SMSSpamCollection",
    sep='\t',
    names=['spam', 'txt'],
)

In [3]:
# Peek at the first five rows to confirm both columns parsed as expected.
df.head(5)

Unnamed: 0,spam,txt
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Build the TF-IDF vectorizer: lowercase the text, fold accented characters to
# ASCII, and drop NLTK's English stop words.
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    strip_accents='ascii',
    # Recent scikit-learn releases validate this parameter and accept only
    # 'english', a list, or None, so hand over a list rather than the set.
    # Sorting just makes the list order deterministic.
    stop_words=sorted(stopset),
)

In [5]:
# List the distinct label values present in the raw data.
df['spam'].unique()

array(['ham', 'spam'], dtype=object)

There are only two classes (ham and spam), so a single indicator column is enough. To avoid the problem of collinearity, let's keep only the spam column and not use the ham column.

In [6]:
# One-hot encode the label and keep only the 'spam' indicator, so that
# spam = 1 and ham = 0.
# The dtype returned by get_dummies has changed across pandas versions (it is
# bool in pandas 2.x), so cast explicitly to keep the target an integer 0/1
# column regardless of the installed version.
# Run this cell once per fresh load: it expects the original 'ham'/'spam' strings.
df['spam'] = pd.get_dummies(df['spam'])['spam'].astype(int)

In [7]:
# Confirm the encoded label now takes only the values 0 and 1.
df['spam'].astype(int).unique()

array([0, 1])

In [8]:
# Target vector: the encoded spam label column.
y = df['spam']

In [9]:
# Learn the vocabulary and IDF weights from the messages and turn each message
# into a TF-IDF feature row.
# NOTE(review): this fits on the full corpus before the train/test split, so
# test-set vocabulary/IDF statistics leak into training — consider fitting on
# the training messages only.
X = vectorizer.fit_transform(df['txt'])

In [10]:
# Sanity-check the dimensions: one label per message, and one TF-IDF column per
# vocabulary term (the recorded run shows 5572 messages x 8587 terms).
# print() with a single argument behaves the same on Python 2 and Python 3.
print(y.shape)
print(X.shape)

(5572,)
(5572, 8587)


In [11]:
# Split features and labels into train and test partitions using the library's
# default split proportion; the fixed seed keeps the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

In [12]:
# Fit a multinomial Naive Bayes classifier on the training partition.
clf = naive_bayes.MultinomialNB()
clf.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
# Evaluate on the held-out messages with ROC AUC (a ranking metric, not plain
# accuracy), scored on the predicted probability in the second class column.
roc_auc_score(
    y_test,
    clf.predict_proba(X_test)[:, 1],
)

0.98589322144123448

That seems like a pretty good model. Let's test it.

### Testing The classifier

For testing purposes, I have added a spam text from the site https://www.adaptivemobile.com/press-centre/press-releases/five-top-spam-texts-for-2012-revealed-in-adaptivemobiles-ongoing-threat-ana


In [14]:
# Despite the variable names, the input here is an SMS message, not a movie review.
movie_reviews_array = np.array([
    "Our records indicate your Pension is under performing to see higher growth and up to 25% cash release reply PENSION for a free review. To opt out reply STOP",
])

# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# message is encoded with the vocabulary learned earlier.
movie_review_vector = vectorizer.transform(movie_reviews_array)

# print() with a single argument behaves the same on Python 2 and Python 3.
print(clf.predict(movie_review_vector))

[ 1.]


The classifier correctly predicted the text as spam.