In [1]:
import text_processing as text

In [2]:
import os

# Folder holding the preprocessed (pickled) corpus read by the cells below.
# Set the PICKLED_CORPUS_FOLDER environment variable to point the notebook at
# a copy on another machine; the fallback is the original hard-coded path.
pickled_corpus_folder = os.environ.get(
    "PICKLED_CORPUS_FOLDER",
    "/Users/schlinkertc/Flatiron/Lecture_Notes/nyc-ds-100719-lectures/week-11/NLP/classification-assessment/processed_corpus",
)

In [3]:
# Open the corpus stored under pickled_corpus_folder and collect every
# category label the reader reports.
reader = text.PickledCorpusReader(pickled_corpus_folder)
labels = reader.categories()

# Wrap the reader in a loader restricted to those labels. Later cells iterate
# the loader to obtain (X_train, X_test, y_train, y_test) tuples.
# NOTE(review): the positional 5 is presumably the number of folds -- confirm
# against text.CorpusLoader's signature.
loader = text.CorpusLoader(
    reader,
    5,
    shuffle=True,
    categories=labels,
)

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score


def identity(words):
    """Return ``words`` exactly as received.

    Used as the ``tokenizer`` callable for ``TfidfVectorizer`` in this
    notebook, so the vectorizer performs no tokenization of its own.
    """
    return words

In [5]:
def create_pipeline(estimator, reduction=False, n_components=10000):
    """Build a text-classification ``Pipeline`` that ends in ``estimator``.

    The pipeline always begins with a ``text.TextNormalizer`` step followed by
    a ``TfidfVectorizer`` that uses ``identity`` as its tokenizer and does not
    lowercase its input.

    Parameters
    ----------
    estimator
        Estimator instance installed as the final ``'classifier'`` step.
    reduction : bool, default False
        When truthy, a ``TruncatedSVD`` step named ``'reduction'`` is inserted
        between the vectorizer and the classifier.
    n_components : int, default 10000
        Output dimensionality passed to ``TruncatedSVD``. Ignored unless
        ``reduction`` is truthy. The default keeps the value that was
        previously hard-coded.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The assembled, unfitted pipeline.
    """
    steps = [
        ('normalize', text.TextNormalizer()),
        # NOTE(review): the identity tokenizer presumably relies on the
        # normalizer already emitting token sequences -- confirm against
        # text.TextNormalizer.
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        )),
    ]

    if reduction:
        # n_components must not exceed the number of TF-IDF features; lower
        # it for small vocabularies (see the TruncatedSVD documentation).
        steps.append((
            'reduction', TruncatedSVD(n_components=n_components)
        ))

    # The estimator is always the last step.
    steps.append(('classifier', estimator))
    return Pipeline(steps)

In [6]:
# Candidate pipelines: for each classifier type, one pipeline with the SVD
# reduction step followed by one without it. Every pipeline receives its own
# freshly constructed estimator instance.
models = []
for form in (RandomForestClassifier, SGDClassifier):
    models.extend(
        create_pipeline(form(), use_reduction)
        for use_reduction in (True, False)
    )


In [7]:
# from sklearn.metrics import f1_score,accuracy_score
# import numpy as np

# for model in models:
#     scores = []
    
#     for X_train, X_test, y_train, y_test in loader:
#         model.fit(X_train,y_train)
#         y_pred = model.predict(X_test)
        
#         score = f1_score(y_test,y_pred,average='weighted')
#         scores.append(score)
        
#     print("f1 of {} is {:0.3f}".format(model.named_steps['classifier'], np.mean(scores)))

Next, try a different approach: tune the SGD classifier's hyperparameters with scikit-learn's `GridSearchCV`.

In [17]:
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# Full text pipeline: normalize -> TF-IDF -> linear classifier trained by SGD.
# random_state is fixed so repeated fits of the classifier are reproducible.
SGD_pipe = Pipeline([
    ('normalize', text.TextNormalizer()),
    ('vectorizer', TfidfVectorizer(
        tokenizer=identity, preprocessor=None, lowercase=False)),
    ('classifier', SGDClassifier(random_state=42)),
])

# Hyperparameter grid. The ``classifier__`` prefix routes each setting to the
# pipeline's 'classifier' step; un-prefixed keys would be rejected because
# Pipeline itself has no ``loss``/``penalty``/``alpha`` parameters.
# NOTE(review): recent scikit-learn releases spell the logistic loss
# 'log_loss' instead of 'log' -- adjust if the installed version requires it.
SGD_params = {
    'classifier__loss': ['hinge', 'log'],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__alpha': (1e-2, 1e-3),
}

# Score candidates by weighted F1.
scorer = metrics.make_scorer(metrics.f1_score, average='weighted')

# Search over the whole pipeline (not a bare SGDClassifier) so that every
# cross-validation fold is normalized and vectorized before classification.
SGD_search = GridSearchCV(
    estimator=SGD_pipe,
    param_grid=SGD_params,
    cv=3,
    verbose=3,
    n_jobs=-1,
    scoring=scorer,
)

In [18]:
# Display the estimator object that the grid search was constructed with.
SGD_search.estimator

Pipeline(memory=None,
         steps=[('normalize', TextNormalizer(language=None)),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, str...
                ('classifier',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                              

In [20]:
import numpy as np

# Fit SGD_pipe on each train/test split yielded by the loader and record the
# weighted F1 on the held-out part. This evaluates the pipeline as configured
# in SGD_pipe; it does not use a grid-search result.
SGD_scores = []
for X_train, X_test, y_train, y_test in loader:
    # Pipeline.fit returns the pipeline itself, so best_SGD is SGD_pipe refit
    # on the current training split.
    best_SGD = SGD_pipe.fit(X_train, y_train)
    y_pred = best_SGD.predict(X_test)

    score = f1_score(y_test, y_pred, average='weighted')
    print(score)
    # Accumulate into the list initialised above (the original appended to an
    # undefined name ``scores``).
    SGD_scores.append(score)

print("f1 of best SGD is {:0.3f}".format(np.mean(SGD_scores)))

KeyboardInterrupt: 