In [92]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.cross_validation import cross_val_score
from textblob import TextBlob
from sklearn.naive_bayes import MultinomialNB

# Why do we make these pipelines?

## Let's start with why we don't..
* We don't use pipelines to chain together "normal" non-sklearn functions
    * pipe = make_pipeline(lambda x:x+1, lambda x:x*2) # does not work

In [42]:
# UCI wine data has no header row: read columns 0, 10 and 13 only and name
# them label / color / proline.
col_names = ['label', 'color', 'proline']
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine = pd.read_csv(url, names=col_names, usecols=[0, 10, 13], header=None)
wine.head()

Unnamed: 0,label,color,proline
0,1,5.64,1065
1,1,4.38,1050
2,1,5.68,1185
3,1,7.8,1480
4,1,4.32,735


In [43]:
# X holds the two feature columns, y the class label to predict
feature_cols = ['color', 'proline']
X, y = wine[feature_cols], wine.label

## Why we use Pipelines
* For cross validation modularization, what??
* for gridsearching (for the same reason)

In [118]:
# define X and y
feature_cols = ['color', 'proline']
X = wine[feature_cols]
y = wine.label

# The classifier has to be created in this cell: no other cell defines `knn`,
# so on a fresh kernel the call below would fail with a NameError.
# n_neighbors=3 matches the pipeline used in the "proper" version that follows,
# which keeps the two scores comparable.
knn = KNeighborsClassifier(n_neighbors=3)

# improper cross-validation on the scaled data. Why is that?
# The scaler is fit on ALL rows before any split happens, so the mean/std used
# to scale each training fold were computed with that fold's held-out rows too.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
cross_val_score(knn, X_scaled, y, cv=5, scoring='accuracy').mean()

0.90104247104247115

In [119]:
# proper cross validation: the scaler is a step of the estimator handed to
# cross_val_score, so scaling is fit together with the classifier on each split
pipe = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=3)
)
cross_val_score(pipe, X, y, scoring='accuracy', cv=5).mean()

0.89516011810129448

In [46]:
# see the steps: a list of (name, estimator) tuples, scaler first, then the classifier
pipe.steps

[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('kneighborsclassifier',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_neighbors=3, p=2, weights='uniform'))]

In [47]:
# see the steps keyed by name; these names are the prefixes used in grid-search
# parameter keys, e.g. kneighborsclassifier__n_neighbors
pipe.named_steps

{'kneighborsclassifier': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_neighbors=3, p=2, weights='uniform'),
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True)}

In [122]:
from sklearn.grid_search import GridSearchCV

# search for an optimal n_neighbors value using GridSearchCV
# (the key is "<step name>__<parameter name>", using the step names of `pipe`)
neighbors_range = range(1, 31)
param_grid = {'kneighborsclassifier__n_neighbors': neighbors_range}
grid = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=5)
grid.fit(X, y)
# best mean cross-validated accuracy, then the parameter setting that achieved it
print(grid.best_score_)
print(grid.best_params_)

0.910112359551
{'kneighborsclassifier__n_neighbors': 1}


In [127]:
# Note the nomenclature for the parameters: <step name>__<parameter name>

In [130]:
# Rebuild the winning model from the grid search. The search was run on a
# pipeline that scales the features before KNN, so the final model has to
# scale them too; a bare KNN fit on the raw columns is not the model that
# was actually selected.
best_k = grid.best_params_['kneighborsclassifier__n_neighbors']
optimal_model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=best_k))
optimal_model.fit(X, y)
# predict expects a 2-D array (one row per sample); this is a single sample
# with color=4 and proline=4
optimal_model.predict([[4, 4]])

array([2])

In [131]:
# Let's play with some spam/ham data

In [132]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# SMS data: tab-separated, no header row; columns are named label and msg
df = pd.read_csv('https://raw.githubusercontent.com/sinanuozdemir/DAT6/master/data/sms.tsv',
                 sep='\t', names=['label', 'msg'], header=None)
# count vectorizer over 1-word up to 5-word n-grams
vec = CountVectorizer(ngram_range=(1, 5))
df.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [133]:
# learn the 1- to 5-gram vocabulary and build the sparse document-term count matrix
vec.fit_transform(df['msg'])

<5572x207807 sparse matrix of type '<type 'numpy.int64'>'
	with 339886 stored elements in Compressed Sparse Row format>

In [134]:
# text pipeline: 1- to 5-gram counts fed straight into a 3-nearest-neighbours classifier
pipe = make_pipeline(
    CountVectorizer(ngram_range=(1, 5)),
    KNeighborsClassifier(n_neighbors=3)
)

In [135]:
# mean accuracy over 5 cross-validation folds, starting from the raw messages
cross_val_score(pipe, df['msg'], df['label'],
                scoring='accuracy', cv=5).mean()

0.90864995853829367

In [136]:
# Let's add a new analyzer by lemmatizing everything
def split_into_lemmas(message):
    """Lower-case a message and return the lemma of each of its words.

    message: the raw text, either UTF-8 encoded bytes or already-decoded text.
    Returns a list with one lemma ("base form") per word found by TextBlob.
    """
    # Decode only when we were handed bytes: forcing a decode on text that is
    # already decoded raises a TypeError.
    if isinstance(message, bytes):
        message = message.decode('utf8')
    words = TextBlob(message.lower()).words
    # for each word, take its "base form" = lemma
    return [word.lemma for word in words]

In [137]:
# sanity check: the plural "wolves" should come back as its lemma "wolf"
split_into_lemmas("hello there that is a pack of wolves")

[u'hello', u'there', u'that', u'is', u'a', u'pack', u'of', u'wolf']

In [138]:
# same KNN pipeline, but the vectorizer now splits messages with our lemmatizer
# NOTE(review): per the scikit-learn docs, ngram_range only applies when
# `analyzer` is not a callable, so (1, 5) is presumably ignored here - confirm
pipe = make_pipeline(
    CountVectorizer(ngram_range=(1, 5), analyzer=split_into_lemmas),
    KNeighborsClassifier(n_neighbors=3)
)

In [139]:
# mean 5-fold accuracy again, now with the lemmatizing analyzer in the pipeline
cross_val_score(pipe, df['msg'], df['label'],
                scoring='accuracy', cv=5).mean()

0.92964809879962329

In [163]:
# Exercise

# grid search over a pipeline of CountVectorizer and KNN to find the optimal combination of neighbors and ngram ranges,
# e.g. (1, 1), (1, 2), (1, 3), etc.
# Fill in every **replace me** placeholder below.

# search for the optimal n_neighbors / ngram_range combination using GridSearchCV
pipe = make_pipeline(CountVectorizer(), KNeighborsClassifier())
print pipe.named_steps
param_grid = dict(kneighborsclassifier__n_neighbors=**replace me**, countvectorizer__ngram_range=**replace me**)
print param_grid
grid = GridSearchCV(**replace me**, param_grid, cv=5, scoring='accuracy')
grid.fit(df['msg'], **replace me**)
print grid.best_score_
print grid.best_params_

{'countvectorizer': CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None), 'kneighborsclassifier': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')}
{'countvectorizer__ngram_range': [(1, 1), (1, 2)], 'kneighborsclassifier__n_neighbors': [1, 2]}
0.949030868629
{'countvectorizer__ngram_range': (1, 1), 'kneighborsclassifier__n_neighbors': 1}


# A bit more complicated...

In [166]:
# three explicitly named steps: token counts -> tf-idf weighting -> naive Bayes
pipeline = Pipeline(steps=[
    ('counts', CountVectorizer()),
    ('tf_idf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

In [167]:
# mean 5-fold accuracy of the counts -> tf-idf -> naive Bayes pipeline
cross_val_score(pipeline, df['msg'], df['label'],
                scoring='accuracy', cv=5).mean()

0.95800420252634633

In [142]:
# Won't work!!!! The pipeline has not been fit to any data yet, so its first
# step has no vocabulary to transform with (see the NotFittedError below)
pipeline.predict('hello')
# must think of pipeline as its own model: fit it first, then predict
# It can be pickled..

NotFittedError: CountVectorizer - Vocabulary wasn't fitted.

In [169]:
# Exercise: complete the code below (fill in each **replace me**) so that it will
# 1. countvectorize df['msg']
# 2. tfidf transform the data
# 3. fit a naive bayes to the entire data set
vec = CountVectorizer()
vec.fit(df['msg'])
t = TfidfTransformer()
tfidfed = t.fit_transform(**replace me**)
nb = MultinomialNB()
bayes = nb.fit(**replace me**, **replace me**)

In [146]:
def megaPredict(text):
    # Exercise: classify raw messages with the hand-built vec / t / bayes objects.
    # `text` is a list of message strings (see the call in the next cell);
    # presumably the placeholder must apply the same vectorizing steps that
    # produced the training features for `bayes`.
    return bayes.predict(**replace me**)

In [147]:
# one spam-looking message and one ordinary message; expected labels: spam, ham
megaPredict(['100 dating service cal;l 09064012103 box334sk38ch', "Hey ralph, you coming tonight?"])

array(['spam', 'ham'], 
      dtype='|S4')

In [170]:
# Same thing done in pipelines: one object that counts, re-weights and classifies

pipeline = Pipeline(steps=[
    ('counts', CountVectorizer()),
    ('tf_idf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])
# fit on the full data set; the fitted pipeline's repr is what the cell displays
pipeline.fit(df['msg'], df['label'])

Pipeline(steps=[('counts', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...         use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [149]:
# same two messages as the manual version; the fitted pipeline takes raw strings directly
pipeline.predict(['100 dating service cal;l 09064012103 box334sk38ch', "Hey ralph, you coming tonight?"])

array(['spam', 'ham'], 
      dtype='|S4')

In [150]:
# Using pipeline's FeatureUnion,
# which concatenates the outputs of different preprocessing steps together

# A feature union is itself a transformer, so it can be used as a step inside a pipeline

from sklearn.datasets import load_iris
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest

In [151]:
# load iris, then centre every feature column on zero (in place)
iris = load_iris()
X, y = iris.data, iris.target
X -= X.mean(axis=0)

In [152]:
svd = TruncatedSVD(n_components=2, random_state=0)
# A singular value decomposition (PCA-like, since X was mean-centred above)
svd.fit_transform(X)[:5]
# only outputs 2 columns (n_components=2)

array([[-2.68420713,  0.32660731],
       [-2.71539062, -0.16955685],
       [-2.88981954, -0.13734561],
       [-2.7464372 , -0.31112432],
       [-2.72859298,  0.33392456]])

In [153]:
select = SelectKBest(k=1)
# only one column: k=1 keeps the single best-scoring feature, judged against y
select.fit_transform(X, y)[:5]

array([[-2.35866667],
       [-2.35866667],
       [-2.45866667],
       [-2.25866667],
       [-2.35866667]])

In [154]:
# run both transformers on the same X and place their outputs side by side:
# 2 SVD columns + 1 selected column = 3 columns
fs = FeatureUnion([("svd", svd), ("select", select)])
fs.fit_transform(X, y)[:5]

array([[-2.68420713,  0.32660731, -2.35866667],
       [-2.71539062, -0.16955685, -2.35866667],
       [-2.88981954, -0.13734561, -2.45866667],
       [-2.7464372 , -0.31112432, -2.25866667],
       [-2.72859298,  0.33392456, -2.35866667]])

In [155]:
from sklearn.svm import LinearSVC
# A crazy pipeline within pipelines
# Pipe-ception
# I use LinearSVC because I know linear is the best kernel here (from previous work), and LinearSVC is faster

pipeline = Pipeline([
    ('feats', FeatureUnion([
        ('count', CountVectorizer()), # each entry can be either a pipeline
        ('tfidf', TfidfVectorizer()) # or a single transformer
    ])),
    ('clf', LinearSVC())  # classifier
])

In [156]:
# mean 5-fold accuracy of the FeatureUnion + LinearSVC pipeline on the raw messages
cross_val_score(pipeline, df['msg'], df['label'],
                scoring='accuracy', cv=5).mean()

0.98438608496832014

In [115]:
# fit the whole pipeline on the full data set so it can be used for prediction
pipeline.fit(df['msg'], df['label'])

Pipeline(steps=[('feats', FeatureUnion(n_jobs=1,
       transformer_list=[('count', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [116]:
# same two test messages; expected labels: spam, ham
pipeline.predict(['100 dating service cal;l 09064012103 box334sk38ch', "Hey ralph, you coming tonight?"])

array(['spam', 'ham'], dtype=object)

In [None]:
# Exercise
# 1. Make a pipeline that takes in a union of three PCA components and a tfidf vectorizer of up to 2-grams,
# coupled with a LinearSVC

# 2. Do a grid search to find the best params

# 3. fit the entire pipeline with the entire data set

