In [1]:
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import TransformerMixin
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import metrics

In [2]:
# Toy corpus: six short "documents" used to demonstrate the vectorizers below

X = (
    'the pizza pizza beer copyright',
    'the pizza burger beer copyright',
    'the the pizza beer beer copyright',
    'the burger beer beer copyright',
    'the coke burger coke copyright',
    'the coke burger burger',
)

In [3]:
# EXERCISE (WAKE UP, IT'S CODING TIME)

# Use a count vectorizer to vectorize X and show a
# user-friendly DataFrame with the count of each word in each sentence.
# Recall that this is otherwise known as a document-term matrix -- go ahead and make it!








In [4]:
# ANSWER

# Fit a word-level CountVectorizer on X and wrap the resulting counts in a
# DataFrame whose columns are labelled with the learned vocabulary.
word_vect = CountVectorizer(analyzer='word')
words = pd.DataFrame(
    data=word_vect.fit_transform(X).toarray(),
    columns=word_vect.get_feature_names(),
)
words

Unnamed: 0,beer,burger,coke,copyright,pizza,the
0,1,0,0,1,2,1
1,1,1,0,1,1,1
2,2,0,0,1,1,2
3,2,1,0,1,0,1
4,0,1,2,1,0,1
5,0,2,1,0,0,1


In [5]:
# Same idea as the word counts, but counting individual characters instead.
char_vect = CountVectorizer(analyzer='char')
chars = pd.DataFrame(
    data=char_vect.fit_transform(X).toarray(),
    columns=char_vect.get_feature_names(),
)
chars

Unnamed: 0,Unnamed: 1,a,b,c,e,g,h,i,k,o,p,r,t,u,y,z
0,4,2,1,1,3,1,2,3,0,1,3,2,2,0,1,4
1,4,1,2,1,4,2,2,2,0,1,2,4,2,1,1,2
2,5,1,2,1,6,1,3,2,0,1,2,3,3,0,1,2
3,4,0,3,1,6,2,2,1,0,1,1,5,2,1,1,0
4,4,0,1,3,4,2,2,1,2,3,1,3,2,1,1,0
5,3,0,2,1,4,2,1,0,1,1,0,4,1,2,0,0


In [6]:
# This is a feature union. It automatically horizontally stacks the outputs of
# its transformers -- here a word-level and a character-level count vectorizer.
# http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html
fs = FeatureUnion([
    ("words", CountVectorizer(analyzer='word')),
    ("chars", CountVectorizer(analyzer='char')),
])

In [7]:
fs # show the FeatureUnion repr so we can see its parameters

FeatureUnion(n_jobs=1,
       transformer_list=[('words', CountVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,...  strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None))],
       transformer_weights=None)

In [8]:
# To see them put together: fit the union on X and label every column with the
# vocabulary of the union's own fitted vectorizers (word features first, then
# character features -- the order in which `fs` lists its transformers).
# The labels are read from `fs` itself rather than from the separately fitted
# `word_vect` / `char_vect`, so they cannot drift out of sync with the matrix.
stacked_counts = fs.fit_transform(X).toarray()
stacked_names = [
    feature
    for _, fitted_vect in fs.transformer_list
    for feature in fitted_vect.get_feature_names()
]
words_plus_chars = pd.DataFrame(stacked_counts, columns=stacked_names)
words_plus_chars

Unnamed: 0,beer,burger,coke,copyright,pizza,the,Unnamed: 7,a,b,c,...,h,i,k,o,p,r,t,u,y,z
0,1,0,0,1,2,1,4,2,1,1,...,2,3,0,1,3,2,2,0,1,4
1,1,1,0,1,1,1,4,1,2,1,...,2,2,0,1,2,4,2,1,1,2
2,2,0,0,1,1,2,5,1,2,1,...,3,2,0,1,2,3,3,0,1,2
3,2,1,0,1,0,1,4,0,3,1,...,2,1,0,1,1,5,2,1,1,0
4,0,1,2,1,0,1,4,0,1,3,...,2,1,2,3,1,3,2,1,1,0
5,0,2,1,0,0,1,3,0,2,1,...,1,0,1,1,0,4,1,2,0,0


In [9]:
# Sanity check: the union's column count should be words + chars.
# print(...) with a single argument behaves identically on Python 2 and 3,
# unlike the bare print statement, which is a syntax error on Python 3.
print(words.shape)
print("+")
print(chars.shape)
print("========")
print(words_plus_chars.shape)

(6, 6)
+
(6, 16)
========
(6, 22)


In [10]:
# Pipeline http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
# is a class that chains sklearn transformers and a final estimator together, in order

In [11]:
# Load the iris data to try out the various pipeline methods (anova).
iris = load_iris()
X, y = iris.data, iris.target

In [12]:
# SelectKBest followed by LogisticRegression.
# SelectKBest http://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection
# reduces the number of features by scoring each one with a univariate
# statistical test (including ANOVA or Chi Squared) and keeping the top k.

filter1 = SelectKBest(k=2)  # keep only the 2 best-scoring features
logreg = LogisticRegression()
pipe = Pipeline(steps=[
    ('anova', filter1),
    ('logistic', logreg),
])

# average accuracy over 10 cross-validation folds
mean = cross_val_score(pipe, X, y, scoring='accuracy', cv=10).mean()
mean

0.85999999999999999

In [13]:
# make_pipeline builds the same kind of pipeline without naming each step by hand
easy_pipe = make_pipeline(
    SelectKBest(k=2),
    LogisticRegression(),
)
mean = cross_val_score(easy_pipe, X, y, scoring='accuracy', cv=10).mean()
mean

0.85999999999999999

In [14]:
# Load the tab-separated SMS file; header=None means the first line is data,
# so the two column names are supplied explicitly.
df = pd.read_csv(
    '../data/sms.tsv',
    sep='\t',
    header=None,
    names=['label', 'msg'],
)
df.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# EXERCISE


# Cross-validate two naive Bayes models.
# One model should be fit to the count-vectorized text.
# The other should be fit to the tf-idf-transformed text.
# Each model should be evaluated with 10-fold cross-validation, scored on accuracy.









In [16]:

# Answers

# Naive Bayes on raw token counts: vectorize every message up front,
# then report the mean accuracy over 10 cross-validation folds.
vect = CountVectorizer()
all_dtm = vect.fit_transform(df['msg'])
nb = MultinomialNB()
mean = cross_val_score(nb, all_dtm, df['label'], scoring='accuracy', cv=10).mean()
mean

0.98026032061360824

In [17]:
# Same experiment, but with tf-idf weights instead of raw counts.
vect = TfidfVectorizer()
all_dtm = vect.fit_transform(df['msg'])
nb = MultinomialNB()
mean = cross_val_score(nb, all_dtm, df['label'], scoring='accuracy', cv=10).mean()
mean

0.96320881558493299

In [18]:
# A more complicated pipeline: count and tf-idf features stacked side by side
# by a FeatureUnion, then fed to a naive Bayes classifier.

text_features = FeatureUnion([
    ('counts', CountVectorizer()),
    ('tf_idf', TfidfVectorizer()),
])
pipe = Pipeline([
    ('features', text_features),
    ('classifier', MultinomialNB()),
])
# The raw messages go in here; the pipeline does the vectorizing itself.
mean = cross_val_score(pipe, df['msg'], df['label'], scoring='accuracy', cv=10).mean()
mean

0.9840269835443396

In [19]:
# Not only is this PROPER cross-validation of the vectorizers (they are re-fit inside each fold), it's also better than either model on its own!

In [21]:
# Pipeline can also be used with GridSearchCV, WARNING SLOWWW
pipe = Pipeline([
  ('features', FeatureUnion([
        ('counts', CountVectorizer()),
        ('tf_idf', TfidfVectorizer())
  ])),
  ('classifier', MultinomialNB())
])
# search for the best n-gram range of each vectorizer using GridSearchCV;
# nested parameters are addressed as <step>__<sub-step>__<parameter>
gram_range = [(1, n) for n in range(1, 3)]
param_grid = {
    'features__counts__ngram_range': gram_range,
    'features__tf_idf__ngram_range': gram_range,
}
# The print calls below are written so that they produce the same text on
# Python 2 and Python 3 (a bare print statement is a syntax error on Python 3).
print(param_grid)
print("")
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid.fit(df.msg, df.label)
print("%s %s" % (grid.best_score_, grid.best_params_))

{'features__counts__ngram_range': [(1, 1), (1, 2)], 'features__tf_idf__ngram_range': [(1, 1), (1, 2)]}

0.985642498205 {'features__counts__ngram_range': (1, 2), 'features__tf_idf__ngram_range': (1, 1)}


In [None]:
# We can also make our own custom transformations!

In [22]:
class DatetimeTransformer(TransformerMixin):
    """Stateless transformer that expands a ``date`` column into date parts.

    If the input has a ``date`` column, ``transform`` returns a new DataFrame
    with exactly three columns -- ``day``, ``month`` and ``year`` -- and the
    same index as the input. Input without a ``date`` column is passed
    through unchanged. The input object itself is never modified.
    """

    def transform(self, X, **transform_params):
        """Return the day/month/year parts of ``X['date']``.

        Parameters
        ----------
        X : container supporting ``'date' in X`` and ``X['date']``
            (a pandas DataFrame in this notebook).
        **transform_params : ignored; accepted for API compatibility.

        Returns
        -------
        A DataFrame with columns ``day``, ``month``, ``year`` (in that order)
        when ``X`` has a ``date`` column, otherwise ``X`` itself.
        """
        if 'date' in X:
            # to_datetime is a no-op for datetime64 columns and lets the
            # vectorized .dt accessor also work on object columns holding
            # datetime objects, replacing three per-row Python lambdas.
            dates = pd.to_datetime(X['date'])
            # Pin the column order explicitly: building from a dict alone
            # gives a different order depending on Python/pandas version.
            return pd.DataFrame(
                {'day': dates.dt.day, 'month': dates.dt.month, 'year': dates.dt.year},
                columns=['day', 'month', 'year'],
            )
        return X

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` so calls can be chained."""
        return self

In [23]:
# Load two columns of the Yelp reviews (selected by position), parse the dates,
# and keep only the best (5-star) and worst (1-star) ratings.
url = '../data/yelp.csv'
yelp = pd.read_csv(url, usecols=[1, 3], encoding='unicode-escape')
yelp['date'] = pd.to_datetime(yelp['date'])
yelp = yelp[yelp['stars'].isin([1, 5])]
yelp.head()

Unnamed: 0,date,stars
0,2011-01-26,5
1,2011-07-27,5
3,2010-05-27,5
4,2012-01-05,5
6,2010-02-12,5


In [24]:
# Try our custom transformer on the Yelp frame and preview the result.
h = DatetimeTransformer()
h.fit(yelp).transform(yelp).head()

Unnamed: 0,day,month,year
0,26,1,2011
1,27,7,2011
3,27,5,2010
4,5,1,2012
6,12,2,2010


In [25]:
yelp.head() # original data unscathed: the transformer returned a new frame instead of modifying yelp

Unnamed: 0,date,stars
0,2011-01-26,5
1,2011-07-27,5
3,2010-05-27,5
4,2012-01-05,5
6,2010-02-12,5


In [26]:
# pipeline with our custom transformer and logistic regression
# A fresh LogisticRegression is created here so this cell no longer depends on
# the `logreg` object defined many cells earlier for the iris example.

pipe = Pipeline([
    ('transform_datetime', DatetimeTransformer()),
    ('logistic', LogisticRegression()),
])
# features = every column except the target; mean accuracy over 10 folds
mean = cross_val_score(pipe, yelp.drop('stars', axis=1), yelp.stars, cv=10, scoring='accuracy').mean()
mean

0.81669161774511989

In [27]:
# Hmmmmmm not bad... but what was that null accuracy?

In [28]:
# Share of each star rating; the largest share is the null accuracy.
yelp['stars'].value_counts() / len(yelp)

5    0.816691
1    0.183309
Name: stars, dtype: float64

In [29]:
# It appears dates aren't helping at all..

In [30]:
# Read three columns of the UCI wine data (file positions 0, 10 and 13) into a
# DataFrame; the file has no header line, so the names are supplied here.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
col_names = ['label', 'color', 'proline']
wine = pd.read_csv(
    url,
    usecols=[0, 10, 13],
    names=col_names,
    header=None,
)
wine.head()

Unnamed: 0,label,color,proline
0,1,5.64,1065
1,1,4.38,1050
2,1,5.68,1185
3,1,7.8,1480
4,1,4.32,735


In [31]:
# Summary statistics per column; note how different the scales of color and proline are.
wine.describe()

Unnamed: 0,label,color,proline
count,178.0,178.0,178.0
mean,1.938202,5.05809,746.893258
std,0.775035,2.318286,314.907474
min,1.0,1.28,278.0
25%,1.0,3.22,500.5
50%,2.0,4.69,673.5
75%,3.0,6.2,985.0
max,3.0,13.0,1680.0


In [32]:
# Features (X) are color and proline; the target (y) is the wine label.
feature_cols = ['color', 'proline']
X, y = wine[feature_cols], wine['label']

In [33]:
# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


# standardize X_train
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)


# Recall that StandardScaler does a basic z-score normalization:
# for each element, x, of a column it computes z = (x - mean) / std
# This gives each column a mean of 0 and a standard deviation (and variance) of 1

# check that it standardized properly
# (print(...) with a single argument behaves identically on Python 2 and 3;
# the bare print statement is a syntax error on Python 3)

# First column of X_train_scaled
print(X_train_scaled[:, 0].mean())  # 0
print(X_train_scaled[:, 0].std())   # 1

# Second column of X_train_scaled
print(X_train_scaled[:, 1].mean())  # 0
print(X_train_scaled[:, 1].std())   # 1

-3.90664944003e-16
1.0
1.6027279754e-16
1.0


In [34]:
# standardize X_test with the scaler that was fit on X_train
# (only transform is called here -- the scaler is not re-fit on the test data)
X_test_scaled = scaler.transform(X_test)

In [35]:
# Is anything wrong here?
# (print(...) with a single argument behaves identically on Python 2 and 3)

# First column of X_test_scaled
print(X_test_scaled[:, 0].mean())
print(X_test_scaled[:, 0].std())


# Second column of X_test_scaled
print(X_test_scaled[:, 1].mean())
print(X_test_scaled[:, 1].std())

0.0305898576303
0.866822198488
0.0546533341088
1.14955947533


In [58]:
# EXERCISE

# Run a KNN (3 neighbors) on the training set and find the accuracy on the testing set.
# Do this TWICE: once for the unscaled data, and once for the scaled data.
# What do you notice about the accuracies?


# Thought experiment: Why am I not asking for a cross validation?








In [59]:
# Answers

# KNN accuracy on original data
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
# print(...) with a single argument works the same on Python 2 and 3
print(metrics.accuracy_score(y_test, y_pred_class))

0.644444444444


In [60]:
# KNN accuracy on scaled data
# (calling fit again re-trains the same 3-neighbor classifier on the scaled features)
knn.fit(X_train_scaled, y_train)
y_pred_class = knn.predict(X_test_scaled)
# print(...) with a single argument works the same on Python 2 and 3
print(metrics.accuracy_score(y_test, y_pred_class))

0.866666666667


In [61]:
# So we agree, scaling data helps us out in KNN

In [62]:
# EXERCISE


# Cross-validate a KNN with 3 neighbors on the (UNSCALED) X and y variables,
# using 5 folds, and output the average accuracy across the folds







In [63]:
# ANSWER: proper cross-validation on the original (unscaled) data

knn = KNeighborsClassifier(n_neighbors=3)
# mean accuracy over 5 folds
cross_val_score(knn, X, y, scoring='accuracy', cv=5).mean()

0.71983168041991563

In [64]:
# Why is this improper cross-validation on the scaled data?
# (Look at which rows the scaler is fit on before the folds are made.)

scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
cross_val_score(knn, X_scaled, y, scoring='accuracy', cv=5).mean()

0.90104247104247115

In [65]:
# EXERCISE

# Make a pipeline that scales the data and then runs a KNN with 3 neighbors.
# Cross-validate your pipeline using cv=5 with the accuracy metric.













In [66]:
# ANSWER


# Fix the cross-validation process with a Pipeline: the scaler now lives inside
# the estimator that cross_val_score fits on each fold.
pipe = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=3),
)
cross_val_score(pipe, X, y, scoring='accuracy', cv=5).mean()

# It is NOT as good... But it is a more honest estimate of the true accuracy (and also better than the unscaled data)

0.89516011810129448

In [36]:
# search for an optimal n_neighbors value using GridSearchCV
# make_pipeline names each step after its lower-cased class name, hence the
# 'kneighborsclassifier__n_neighbors' parameter key below.
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier())
# list(...) keeps this a plain list on Python 3 too, where range() is lazy
neighbors_range = list(range(1, 21))
param_grid = dict(kneighborsclassifier__n_neighbors=neighbors_range)
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid.fit(X, y)
# formatted so the output text is the same on Python 2 and Python 3
print("%s %s" % (grid.best_score_, grid.best_params_))

0.910112359551 {'kneighborsclassifier__n_neighbors': 1}
