In [3]:
import text_processing as text

In [4]:
pickled_corpus_folder = "/Users/schlinkertc/Flatiron/Lecture_Notes/nyc-ds-100719-lectures/week-11/NLP/classification-assessment/processed_corpus"

In [5]:
reader = text.PickledCorpusReader(pickled_corpus_folder)
labels = reader.categories()
loader = text.CorpusLoader(reader, 5, shuffle=True, categories=labels)

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score


def identity(words):
    return words

In [28]:
def create_pipeline(estimator, reduction=False):

    steps = [
        ('normalize', text.TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)

In [30]:
models = []
for form in (RandomForestClassifier, SGDClassifier):
    models.append(create_pipeline(form(), True))
    models.append(create_pipeline(form(), False))


In [59]:
from sklearn.metrics import f1_score,accuracy_score
import numpy as np

for model in models:
    scores = []
    
    for X_train, X_test, y_train, y_test in loader:
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        
        score = f1_score(y_test,y_pred,average='weighted')
        scores.append(score)
        
    print("f1 of {} is {:0.3f}".format(model.named_steps['classifier'], np.mean(scores)))

In [65]:
models[0]

Pipeline(memory=None,
         steps=[('normalize', TextNormalizer(language=None)),
                ('vectorize',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, stri...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
              

Trying a different approach to incorporate GridSearch

In [66]:

pipe = Pipeline([('normalize',text.TextNormalizer()),
                 ('vectorizer', TfidfVectorizer(
                     tokenizer=identity, preprocessor=None, lowercase=False)),
                 ('classifier', classifier)])


In [61]:
search_space = [{'normalize': [text.TextNormalizer()],
                 'vectorizer': [TfidfVectorizer(
                     tokenizer=identity, preprocessor=None, lowercase=False)],
                 'classifier': [LogisticRegression()],
                 'classifier__penalty': ['l2','l1'],
                 'classifier__C': np.logspace(0, 4, 10)},
                {'normalize': [text.TextNormalizer()],
                 'vectorizer': [TfidfVectorizer(
                     tokenizer=identity, preprocessor=None, lowercase=False)],
                 'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [100, 200],
                 'classifier__max_features': [3,4]},
                {'normalize': [text.TextNormalizer()],
                 'vectorizer': [TfidfVectorizer(
                     tokenizer=identity, preprocessor=None, lowercase=False)],
                 'classifier': [SGDClassifier()],
                 'classifier__max_depth': [6,8,10],
                 'classifier__n_estimators': [10,100,200],
                 'classifier__criterion': ['gini','entropy']}
        ]

from sklearn.model_selection import GridSearchCV
from sklearn import metrics


# Create grid search 
scorer = metrics.make_scorer(metrics.f1_score, average = 'weighted')
search = GridSearchCV(pipe, search_space, cv=3, verbose=3, n_jobs = -1, scoring=scorer)

In [62]:

for X_train, X_test, y_train, y_test in loader:
    best_model = search.fit(X_train,y_train)
    y_pred = best_model.predict(X_test)

    score = f1_score(y_test,y_pred,average='weighted')
    scores.append(score)
    
    print("f1 of {} is {:0.3f}".format(best_model.named_steps['classifier'], np.mean(scores)))

TypeError: Singleton array array(<generator object CorpusLoader.documents at 0x1a3c73c390>,
      dtype=object) cannot be considered a valid collection.

In [67]:
search.

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('normalize',
                                        TextNormalizer(language=None)),
                                       ('vectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                    