# Classifier Pipeline

**First**, let's import our libraries.

In [1]:
import pandas as pd
import nltk

In [2]:
# round of lib imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import LinearSVC

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report as clsr
from sklearn.cross_validation import train_test_split as tts
from sklearn.metrics import confusion_matrix



---

**data in...**

In [3]:
# Classifier data
f_out = '../data/processed/class/tw'
df = pd.read_csv(f_out)

df.head(2)

FileNotFoundError: File b'../data/processed/class/classifier-cleaned.csv' does not exist

In [4]:
# split here!
X_train, X_test, y_train, y_test = tts(df.X.values, df.y2.values)

**functions**

In [5]:
def test_model(X_test, y_test, clf):
    
    
    res = clf.predict(X_test)

    print (confusion_matrix(y_test, res))
    print (clsr(y_test, res))

---

**Pipeline**

In [8]:
# params
parameters = {
    # CountVector
    'vect__max_df': (0.25, 0.5, 1.0),
    'vect__max_features': (5000, 10000, 15000),
    'vect__ngram_range': ((1,1), (1,2), (1,3), (2,3)),
    'vect__stop_words': (None, 'english'),
    #'vect__preprocessor': ()
    
    # Tfidf Trans
    'tfidf__norm': ('l1', 'l2'),
    'tfidf__use_idf': (True, False),
    'tfidf__smooth_idf': (True, False),
    'tfidf__sublinear_tf': (True, False),
    
    # clf
    'clf__C': (0.5, 1, 2),
    'clf__class_weight': (None, 'balanced'),
    'clf__max_iter': (500, 1000, 2500, 5000)
}

parameters_test = {
    # CountVector
    'vect__max_df': (0.25, 0.5, 1.0)
}

In [9]:
# pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC())
])

In [11]:
# grid search
grid_search = GridSearchCV(pipeline, parameters_test, n_jobs=4, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed:    0.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'vect__max_df': (0.25, 0.5, 1.0)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

**Done!** -- Print out

In [12]:
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

print ()
# see detaailed scores
test_model(X_test, y_test, grid_search.best_estimator_)

Best score: 0.845
Best parameters set:
	clf__C: 1.0
	clf__class_weight: None
	clf__max_iter: 1000
	tfidf__norm: 'l2'
	tfidf__smooth_idf: True
	tfidf__sublinear_tf: False
	tfidf__use_idf: True
	vect__max_df: 0.25
	vect__max_features: None
	vect__ngram_range: (1, 1)
	vect__stop_words: None

[[2860  185]
 [ 379  203]]
             precision    recall  f1-score   support

          0       0.88      0.94      0.91      3045
          1       0.52      0.35      0.42       582

avg / total       0.83      0.84      0.83      3627



**Saving** -- to pickle

In [14]:
from sklearn.externals import joblib
out = '../data/processed/class/3-28/'

# Model
out_model = out + 'model.pkl'
joblib.dump(grid_search.best_estimator_, out_model)

# Log
out_log = out + 'params.txt'
with open(out_log, 'w') as f:
    for param_name in sorted(parameters.keys()):
        temp = param_name +': ' + str(best_parameters[param_name])
        f.write(temp)
    f.close()