In [40]:
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.base import TransformerMixin
import numpy as np

In [69]:
# Load five columns of the Yelp reviews file by position; per the preview
# below these are stars, text, cool, useful and funny.
url = '../data/yelp.csv'
yelp = pd.read_csv(
    url,
    encoding='unicode-escape',
    usecols=[3, 4, 7, 8, 9],
)
# Keep only the 1-star and 5-star reviews (a two-class problem).
yelp = yelp[yelp.stars.isin([1, 5])]

yelp.head()

Unnamed: 0,stars,text,cool,useful,funny
0,5,My wife took me here on my birthday for breakf...,2,5,0
1,5,I have no idea why some people give bad review...,0,0,0
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",1,2,0
4,5,General Manager Scott Petello is a good egg!!!...,0,0,0
6,5,Drop what you're doing and drive here. After I...,7,7,4


In [70]:
# Baseline model: raw token counts fed into multinomial naive Bayes,
# scored as the mean accuracy over 5 cross-validation folds.
pipe = Pipeline(steps=[
    ('count_vect', CountVectorizer()),
    ('nb', MultinomialNB()),
])
mean = np.mean(
    cross_val_score(pipe, yelp['text'], yelp['stars'], cv=5, scoring='accuracy')
)
mean

0.92340270862185181

In [71]:
# Concatenate two text representations (raw counts and tf-idf weights)
# side by side, then classify the combined features with naive Bayes.
pipe = Pipeline(steps=[
    ('features', FeatureUnion(transformer_list=[
        ('counts', CountVectorizer()),
        ('tf_idf', TfidfVectorizer()),
    ])),
    ('classifier', MultinomialNB()),
])
mean = np.mean(
    cross_val_score(pipe, yelp['text'], yelp['stars'], cv=5, scoring='accuracy')
)
mean

0.89207692255332383

In [73]:
class TextLengthExtractor(TransformerMixin):
    """Feature extractor that maps each text to its number of words.

    Input is a sequence of strings (typically a dataframe's text column);
    output is a single-column DataFrame holding one word count per text.
    The transformer is stateless: `fit` learns nothing.
    """

    def word_count(self, text):
        """Return the number of whitespace-separated tokens in `text`."""
        return len(text.split())

    def average_word_length(self, name):
        """Return the word count of `name`.

        Kept for backward compatibility: despite its name, this helper has
        always returned the number of words, not an average word length.
        Prefer `word_count` in new code.
        """
        return self.word_count(name)

    def transform(self, X, y=None):
        """Return a one-column DataFrame with the word count of each text in X.

        X may be a pandas Series or any sequence of strings that
        `pd.Series` accepts; a Series keeps its own index and name.
        `y` is ignored and only present for the scikit-learn API.
        """
        # Coerce lists / arrays so `.apply` is available; a Series is used as is.
        texts = X if isinstance(X, pd.Series) else pd.Series(X)
        # Go through the legacy-named hook so subclasses overriding it still work.
        return pd.DataFrame(texts.apply(self.average_word_length))

    def fit(self, X, y=None):
        """Nothing to learn from the training data; return `self` unchanged."""
        return self

In [74]:
# Sanity-check the extractor on its own: fit, transform, and peek at the
# first few word counts.
a = TextLengthExtractor()
a.fit(yelp['text']).transform(yelp['text']).head()

Unnamed: 0,text
0,155
1,257
3,76
4,86
6,292


In [75]:
# Use the word count as the only feature for naive Bayes and report the
# mean 5-fold cross-validated accuracy.
pipe = Pipeline(steps=[
    ('text_length', TextLengthExtractor()),
    ('nb', MultinomialNB()),
])
mean = np.mean(
    cross_val_score(pipe, yelp['text'], yelp['stars'], cv=5, scoring='accuracy')
)
mean

0.81669134642032881

In [78]:
# Combine all three feature sets (counts, tf-idf, word count) in a single
# FeatureUnion ahead of the naive Bayes classifier.
pipe = Pipeline(steps=[
    ('features', FeatureUnion(transformer_list=[
        ('counts', CountVectorizer()),
        ('tf_idf', TfidfVectorizer()),
        ('text_length', TextLengthExtractor()),
    ])),
    ('classifier', MultinomialNB()),
])
mean = np.mean(
    cross_val_score(pipe, yelp['text'], yelp['stars'], cv=5, scoring='accuracy')
)
mean

0.8176711385819917

In [80]:
# Pipeline can also be used with GridSearchCV
pipe = Pipeline([
  ('features', FeatureUnion([
        ('counts', CountVectorizer()),
        ('tf_idf', TfidfVectorizer())
  ])),
  ('classifier', MultinomialNB())
])
# Search for the best n-gram range of each vectorizer using GridSearchCV:
# unigrams only, (1, 1), versus unigrams plus bigrams, (1, 2).
gram_range = [(1, n) for n in range(1, 3)]
# Keys follow the <step>__<sub-step>__<parameter> convention, so each one
# addresses a vectorizer nested inside the 'features' union.
param_grid = {
    'features__counts__ngram_range': gram_range,
    'features__tf_idf__ngram_range': gram_range,
}
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid.fit(yelp['text'], yelp['stars'])
# Format explicitly so the same line runs under both Python 2 and Python 3
# (a bare `print a, b` statement is a syntax error on Python 3).
print('%s %s' % (grid.best_score_, grid.best_params_))

0.892070484581 {'features__counts__ngram_range': (1, 1), 'features__tf_idf__ngram_range': (1, 1)}
