# GA Data Science (DAT19) - Lab 18

### Scratching the surface of NLP

In [1]:
# usual imports
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
import pandas as pd
from bokeh.plotting import figure,show,output_notebook

output_notebook()
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

from sklearn.naive_bayes import MultinomialNB
# Each is a different implemntation of a text transform tool: Bag of Words & Tfidf
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

We will be demonstrating this process on the standard sklearn text introduction (with one or two diversions). We'll be working with labeled review data

In [2]:
rows = []
with open('../data/amazon_cells_labelled.txt') as f:
    for i,line in enumerate(f.readlines()):
        row = line.split('\n')[0].split('\t')
        if row[1] == '':
            row[1] = np.nan
        else:
            row[1] = int(row[1])
        rows.append(row)

In [3]:
all_data = pd.DataFrame(rows,columns=['text','sentiment'])
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15004 entries, 0 to 15003
Data columns (total 2 columns):
text         15004 non-null object
sentiment    1000 non-null float64
dtypes: float64(1), object(1)
memory usage: 234.5+ KB


In [4]:
data = all_data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 2937
Data columns (total 2 columns):
text         1000 non-null object
sentiment    1000 non-null float64
dtypes: float64(1), object(1)
memory usage: 23.4+ KB


The data is sentiment on Amazon reviews! But in order to use it we need to create a feature space. Please review the documentation for these functions:
http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

You'll see that we have a ton of options available. I'll start out with the simplest default.

In [5]:
count_vect = count_vect = CountVectorizer(stop_words='english')
bag_o_words = count_vect.fit_transform(data['text'])
bag_o_words

<1000x1642 sparse matrix of type '<type 'numpy.int64'>'
	with 4702 stored elements in Compressed Sparse Row format>

Note that this is a sparse matrix! You can convert to dense matrices with `.todense()` (named for clarity, clearly)

In [6]:
bag_o_words.todense().shape

(1000, 1642)

In [7]:
#Note we can see what the words are!
count_vect.get_feature_names()
count_vect.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [8]:
X_train,X_test,y_train,y_test = train_test_split(bag_o_words,data['sentiment'], random_state=42,test_size=0.2)

In [9]:
clf = MultinomialNB().fit(X_train, y_train)
clf.score(X_test,y_test)

0.82999999999999996

In [10]:
wrongs = []
for p,t,i in zip(clf.predict(X_test),y_test.values,y_test.index):
    if p==t:
        pass
    else:
        wrongs.append(i)

In [11]:
data.loc[wrongs]['text']

1950                                  These are fabulous!
2768                                       Lousy product.
2614    The pairing of the two devices was so easy it ...
643     I did not bother contacting the company for fe...
2843    You also cannot take pictures with it in the c...
1838                                Better than expected.
244     If you plan to use this in a car forget about it.
987     Battery life still not long enough in Motorola...
1778    My experience was terrible..... This was my fo...
2570    It seems completely secure, both holding on to...
1814    It does everything the description said it would.
54      I have yet to run this new battery below two b...
67      This is a simple little phone to use, but the ...
2292          Plan on ordering from them again and again.
2123    Couldn't use the unit with sunglasses, not goo...
1012    As many people complained, I found this headse...
2016    The volume for the ringer is REAL good (you ha...
2072          

Let's try with a decisiontree

In [12]:
dtc = DecisionTreeClassifier(max_depth=30).fit(X_train, y_train)
dtc.score(X_test,y_test)

0.76000000000000001

KNeighbors?

In [13]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(5).fit(X_train, y_train)
knn.score(X_test,y_test)

0.69499999999999995

This is awesome. Why do we think that MultinomialNB performs relatively well here?

Why better than Decision Trees or KNeighbors?

## Exercise 1

1) Implement a MultinomialNB classifier using CountVectorizor with the options:
max_features, min_df, and max_df

In [14]:
# CountVectorizer(max_features=?)
count_vect_1 = CountVectorizer(max_features=1000,
                               min_df=3,
                               max_df=50
                              )
bag_o_words_1 = count_vect_1.fit_transform(data['text'])
bag_o_words_1

<1000x505 sparse matrix of type '<type 'numpy.int64'>'
	with 4565 stored elements in Compressed Sparse Row format>

2) Evaluate the performance

In [15]:
X_train_1,X_test_1,y_train_1,y_test_1 = train_test_split(bag_o_words_1,data['sentiment'], random_state=42,test_size=0.2)
clf_1 = MultinomialNB().fit(X_train_1, y_train_1)
clf_1.score(X_test_1,y_test_1)

0.80500000000000005

3) Try with Tfidf, using the same settings

In [16]:
count_vect_tfidf = TfidfVectorizer(max_features=1000,
                                   min_df=3,
                                   max_df=50
                                  )
bag_o_words_tfidf = count_vect_tfidf.fit_transform(data['text'])
bag_o_words_tfidf

X_train_tfidf,X_test_tfidf,y_train_tfidf,y_test_tfidf = train_test_split(bag_o_words_tfidf,data['sentiment'], random_state=42,test_size=0.2)
clf_tfidf = MultinomialNB().fit(X_train_tfidf, y_train_tfidf)
clf_tfidf.score(X_test_tfidf,y_test_tfidf)

0.81499999999999995

Which was better?

## Making this process faster

Ok so if you're still with me (I hope you are!) then you will have realized that Exercise 1 was rather tedious. Data Scientists agree. Thus there is a framework in sklearn that combines GridSearch with Pipelines!

In [17]:
# Resetting our data
X_train,X_test,y_train,y_test = train_test_split(data['text'],data['sentiment'], random_state=42,test_size=0.2)

In [18]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', MultinomialNB()) ])
fit_pipe = text_clf.fit(X_train,y_train)

In [19]:
fit_pipe.score(X_test,y_test)

0.83999999999999997

In [20]:
y_test.value_counts()

1.0    107
0.0     93
Name: sentiment, dtype: int64

In [21]:
from sklearn.metrics import classification_report,confusion_matrix
predicted = fit_pipe.predict(X_test)
print classification_report(y_test,predicted,target_names=['negative','positive'])

             precision    recall  f1-score   support

   negative       0.84      0.82      0.83        93
   positive       0.84      0.86      0.85       107

avg / total       0.84      0.84      0.84       200



Well, we've repeated exactly what we did before. What's the point? (Hint: grid search is the point)

In [22]:
from sklearn.grid_search import GridSearchCV

parameters = {'vect__min_df':[1,2,3],
              'vect__max_df':[5,10,100,1000],
             'clf__alpha':[.01,1,100]}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)

What's going on here? Note: if you have a windows machine, keep n_jobs=1

In [23]:
fit_grid = gs_clf.fit(X_train,y_train)

In [24]:
fit_grid.score(X_test,y_test)

0.84499999999999997

In [25]:
fit_grid.best_params_

{'clf__alpha': 1, 'vect__max_df': 100, 'vect__min_df': 1}

### Exercise 2
Rebuild the Pipeline using TfidfVectorizor and Decision Trees

In [26]:
text_clf = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', DecisionTreeClassifier()) ])
fit_pipe = text_clf.fit(X_train,y_train)
fit_pipe.score(X_test,y_test)

0.75

Fit your pipe with GridSearch and see how it performs!

In [27]:
parameters = {'vect__min_df':[1,2,3],
              'vect__max_df':[5,10,100,1000],
             'clf__max_depth':[1,30,100,1000]}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)
fit_grid = gs_clf.fit(X_train,y_train)

print fit_grid.score(X_test,y_test)

0.75


In [28]:
fit_grid.best_params_

{'clf__max_depth': 1000, 'vect__max_df': 100, 'vect__min_df': 1}

### Bonus!

There is extra data to test this on: imdb_labelled.txt and yelp_labelled.txt.

Try out the pipelines there!

In [29]:
# IMDB
rows = []
with open('../data/imdb_labelled.txt') as f:
    for i,line in enumerate(f.readlines()):
        row = line.split('\n')[0].split('\t')
        if row[1] == '':
            row[1] = np.nan
        else:
            row[1] = int(row[1])
        rows.append(row)

        
all_imdb_data = pd.DataFrame(rows,columns=['text','sentiment'])
#all_imdb_data.info()

data_imdb = all_data.dropna()
#data_imdb.info()

X_train,X_test,y_train,y_test = train_test_split(data_imdb['text'],data_imdb['sentiment'], random_state=42,test_size=0.2)

text_clf_imdb = Pipeline([('vect', CountVectorizer()),
                          ('clf', DecisionTreeClassifier()) ])
fit_pipe1 = text_clf_imdb.fit(X_train,y_train)
print "Pripeline with CountVectorizer Score: %.4f " % fit_pipe1.score(X_test,y_test)

text_clf_tfidf = Pipeline([('vect', TfidfVectorizer()),
                           ('clf', DecisionTreeClassifier()) ])
fit_pipe2 = text_clf_tfidf.fit(X_train,y_train)
print "Pripeline with TfidfVectorizer Score: %.4f " % fit_pipe2.score(X_test,y_test)

Pripeline with CountVectorizer Score: 0.7850 
Pripeline with TfidfVectorizer Score: 0.7200 


In [30]:
# YELP
rows = []
with open('../data/yelp_labelled.txt') as f:
    for i,line in enumerate(f.readlines()):
        row = line.split('\n')[0].split('\t')
        if row[1] == '':
            row[1] = np.nan
        else:
            row[1] = int(row[1])
        rows.append(row)

        
all_yelp_data = pd.DataFrame(rows,columns=['text','sentiment'])
#all_yelp_data.info()

data_yelp = all_data.dropna()
#data_yelp.info()

X_train,X_test,y_train,y_test = train_test_split(data_yelp['text'],data_yelp['sentiment'], random_state=42,test_size=0.2)


text_clf_yelp = Pipeline([('vect', CountVectorizer()),
                          ('clf', DecisionTreeClassifier()) ])
fit_pipe3 = text_clf_yelp.fit(X_train,y_train)
print "Pripeline with CountVectorizer Score: %.4f " % fit_pipe3.score(X_test,y_test)

text_clf_tfidf = Pipeline([('vect', TfidfVectorizer()),
                           ('clf', DecisionTreeClassifier()) ])
fit_pipe4 = text_clf_tfidf.fit(X_train,y_train)
print "Pripeline with TfidfVectorizer Score: %.4f " % fit_pipe4.score(X_test,y_test)

Pripeline with CountVectorizer Score: 0.8050 
Pripeline with TfidfVectorizer Score: 0.7050 
