In [1]:
# usual imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook
from sklearn.cross_validation import train_test_split
# Each is a different implemntation of a text transform tool: Bag of Words & Tfidf
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer



#### Let's first play with the Yelp data. Earlier, we performed sentiment analysis on this dataset using Random Forest. For this practice project, refer to our earlier code, i.e. [notebook 1](https://github.com/ga-students/DS-SF-24/blob/master/Code/Lecture13.ipynb) and [notebook 2](https://github.com/ga-students/DS-SF-24/blob/master/Code/Lecture13-Practice-Solution.ipynb).

In [2]:
# Read the tab-separated Yelp file into a two-column frame (review text and
# sentiment label) and drop rows containing missing values in the same expression.
url = "https://raw.githubusercontent.com/ga-students/DS-SF-24/master/Data/yelp_labelled.txt"
Yelp_data = pd.read_csv(
    url,
    sep="\t",
    names=['text', 'sentiment'],
).dropna()
Yelp_data.head()

Unnamed: 0,text,sentiment
0,Wow... Loved this place.,1
3,Crust is not good.,0
4,Not tasty and the texture was just nasty.,0
10,Stopped by during the late May bank holiday of...,1
11,The selection on the menu was great and so wer...,1


#### Split the data into an 80% training set and a 20% test set. (Use random_state = 24.)

In [3]:
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer the
# sklearn.model_selection location and fall back only on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
# Resetting our data: hold out 20% of the Yelp reviews as the test set.
# The fixed random_state makes the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(Yelp_data['text'],
                                                 Yelp_data['sentiment'],
                                                 test_size = 0.2,
                                                 random_state = 24)

#### Here are a few libraries we need from here on

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV now lives
# in sklearn.model_selection. Keep the old location as a fallback only.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB


#### Use Pipeline and define CountVectorizer() as 'vect' and MultiNomial Naive Bayes as your 'clf' - classifier. Then set your parameters to

'vect__min_df':[1,2,3,5,10], 

'vect__max_df':[50,100,150,200,500,1000,1200], 

'clf__alpha':[0,0.1,0.2,0.5,.8,1]


In [8]:
# Bag-of-words counts ('vect') feeding a multinomial Naive Bayes classifier ('clf').
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Grid to search: document-frequency cut-offs for the vectorizer and the
# smoothing strength for the classifier. Keys follow the '<step>__<param>' convention.
parameters = {
    'vect__min_df': [1, 2, 3, 5, 10],
    'vect__max_df': [50, 100, 150, 200, 500, 1000, 1200],
    'clf__alpha': [0, 0.1, 0.2, 0.5, 0.8, 1],
}

# n_jobs=-1 asks for all available cores during the search.
gs_clf = GridSearchCV(text_clf, param_grid=parameters, n_jobs=-1)

#### Using GridSearchCV find the best parameters and use it to calculate test error. Did you beat Random Forest?

In [9]:
# Run the search on the training split. fit() returns the search object itself,
# so fit_grid is simply another name for the fitted gs_clf.
gs_clf.fit(X_train, y_train)
fit_grid = gs_clf


In [10]:
# Score the fitted search object on the held-out test split
# (the value is displayed as the cell output).
fit_grid.score(X_test, y_test) 

0.77000000000000002

#### Use Pipeline and define CountVectorizer() as 'vect' and Bernoulli Naive Bayes as your 'clf' - classifier. Then set your parameters to

'vect__min_df':[1,2,3,5,10], 

'vect__max_df':[50,100,150,200,500,1000,1200], 

'clf__alpha':[0,0.1,0.2,0.5,.8,1]


In [11]:
# Bag-of-words counts ('vect') feeding a Bernoulli Naive Bayes classifier ('clf').
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', BernoulliNB())])

# Same grid as prescribed in the exercise text. 150 was missing from the
# max_df candidates, so this search did not cover the requested grid.
# Re-running may therefore give a score/best_params_ different from older outputs.
parameters = {'vect__min_df':[1,2,3,5,10],
              'vect__max_df':[50,100,150,200,500,1000,1200],
              'clf__alpha':[0,0.1,0.2,0.5,.8,1]}

# n_jobs=-1 asks for all available cores during the search.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs = -1)

#### Using GridSearchCV find the best parameters and use it to calculate test error. Did you beat Random Forest?


In [12]:
# Fit the Bernoulli search on the training split (fit() returns the search
# object itself), then display its score on the held-out test split.
gs_clf.fit(X_train, y_train)
fit_grid = gs_clf
fit_grid.score(X_test, y_test)


0.72999999999999998

#### What parameters are chosen by GridSearchCV?

In [13]:
# Parameter combination selected by the most recently fitted search
# (the Bernoulli NB pipeline on the Yelp training split).
gs_clf.best_params_

{'clf__alpha': 0.2, 'vect__max_df': 200, 'vect__min_df': 2}

#### Now it's time for a new dataset! Let's play with the SMS dataset. We would like to develop a model that filters spam/ham text messages. Let's explore this dataset first.

In [14]:
import pandas as pd
# Load the tab-separated SMS collection. header=0 treats the file's first row
# as a header line, and names=col_names replaces it with our own column labels.
url = "https://raw.githubusercontent.com/ga-students/DS-SF-24/master/Data/SMSSpamCollection.tsv"
col_names = ['label', 'message']
smsData = pd.read_csv(
    url,
    sep='\t',
    header=0,
    names=col_names,
)
# head() shows the first five rows by default.
smsData.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# (number of rows, number of columns) of the SMS frame.
smsData.shape

(5572, 2)

#### Repeat the procedure you applied on Yelp data on SMS data. Can you get better results by using Bernoulli Naive Bayes or MultiNomial Naive Bayes? What is the best score on test set using best tuning parameters?

In [16]:
# Hold out 20% of the SMS messages as the test set; the fixed seed keeps the
# split reproducible. This overwrites the Yelp split variables from earlier cells.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    smsData['message'],
    smsData['label'],
    test_size=0.2,
    random_state=24,
)

In [17]:
# Bag-of-words counts ('vect') feeding a multinomial Naive Bayes classifier ('clf').
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Candidate document-frequency cut-offs and smoothing strengths to search over.
parameters = {
    'vect__min_df': [1, 2, 3, 5, 10],
    'vect__max_df': [50, 100, 200, 500, 1000, 1200],
    'clf__alpha': [0, 0.1, 0.2, 0.5, 0.8, 1],
}

# n_jobs=-1 asks for all available cores during the search.
gs_clf = GridSearchCV(text_clf, param_grid=parameters, n_jobs=-1)

In [18]:
# Fit the multinomial search on the SMS training split (fit() returns the search
# object itself), then display its score on the held-out test split.
gs_clf.fit(X_train, y_train)
fit_grid = gs_clf
fit_grid.score(X_test, y_test)

0.98654708520179368

In [19]:
# Parameter combination selected by the multinomial NB search on the SMS training split.
fit_grid.best_params_

{'clf__alpha': 0.5, 'vect__max_df': 200, 'vect__min_df': 1}

In [20]:
# Bag-of-words counts ('vect') feeding a Bernoulli Naive Bayes classifier ('clf').
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', BernoulliNB()),
])

# Candidate document-frequency cut-offs and smoothing strengths to search over.
parameters = {
    'vect__min_df': [1, 2, 3, 5, 10],
    'vect__max_df': [50, 100, 200, 500, 1000, 1200],
    'clf__alpha': [0, 0.1, 0.2, 0.5, 0.8, 1],
}

# n_jobs=-1 asks for all available cores during the search.
gs_clf = GridSearchCV(text_clf, param_grid=parameters, n_jobs=-1)

In [21]:
# Fit the Bernoulli search on the SMS training split (fit() returns the search
# object itself), then display its score on the held-out test split.
gs_clf.fit(X_train, y_train)
fit_grid = gs_clf
fit_grid.score(X_test, y_test)

0.98565022421524662

In [22]:
# Parameter combination selected by the Bernoulli NB search on the SMS training split.
fit_grid.best_params_

{'clf__alpha': 0.1, 'vect__max_df': 500, 'vect__min_df': 1}

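Answer: Multinomial Naive Bayes does slightly better on the test set (accuracy ≈ 98.7%) than Bernoulli Naive Bayes (≈ 98.6%), so the accuracy of our best model is about 98.7%!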

#### Print out misclassified instances in your test set. 

In [23]:
#Misclassified instances
# Predict the whole test set once up front: calling predict() inside the loop
# would re-vectorize and re-classify every message on every iteration.
test_predictions = fit_grid.predict(X_test)
# Walk messages, predicted labels and true labels in lockstep and print each
# message whose predicted label differs from its true label.
for message, predicted, actual in zip(X_test.values, test_predictions, y_test.values):
    if predicted != actual:
        print (message)

2/2 146tf150p
SMS. ac sun0819 posts HELLO:"You seem cool, wanted to say hi. HI!!!" Stop? Send STOP to 62468
LookAtMe!: Thanks for your purchase of a video clip from LookAtMe!, you've been charged 35p. Think you can do better? Why not send a video in a MMSto 32323.
Hi this is Amy, we will be sending you a free phone number in a couple of days, which will give you an access to all the adult parties...
Thanks for the Vote. Now sing along with the stars with Karaoke on your mobile. For a FREE link just reply with SING now.
You can donate £2.50 to UNICEF's Asian Tsunami disaster support fund by texting DONATE to 864233. £2.50 will be added to your next bill
I want some cock! My hubby's away, I need a real man 2 satisfy me. Txt WIFE to 89938 for no strings action. (Txt STOP 2 end, txt rec £1.50ea. OTBox 731 LA1 7WS. )
Check Out Choose Your Babe Videos @ sms.shsex.netUN fgkslpoPW fgkslpo
Not heard from U4 a while. Call me now am here all night with just my knickers on. Make me beg for it like

In [30]:
# Fraction of all messages labelled 'ham', i.e. the accuracy obtained by
# always predicting 'ham'; useful as a baseline for the scores above.
smsData['label'].eq('ham').sum() / float(len(smsData))

0.86593682699210339