In [1]:
import numpy as np
import pandas as pd
from preprocessor import Cleaner, Stemmer
from util import Loader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_files 

In [2]:
# Read the classic3 dataset from its directory on disk. The object returned by
# the project-local Loader is later indexed with 'corpus' (model input) and
# 'class_index' (target) when the grid search is fitted.
# NOTE(review): the exact layout Loader.from_files expects under this path is
# not visible in this notebook - confirm in util.Loader.
CORPUS_PATH = "datasets/classic3"

loader = Loader()
data = loader.from_files(path=CORPUS_PATH)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Text-classification pipeline: clean -> stem -> TF-IDF -> multinomial Naive Bayes.
#
# The step names are part of the grid-search contract: the parameter grid
# addresses them as '<step name>__<parameter>', so the (misspelled) name
# 'cleanning' has to stay exactly as written here.
# The keyword `max_word_lenght` is likewise kept verbatim because it is the
# spelling passed to the project-local Cleaner.
# NOTE(review): presumably it is a word-length threshold - confirm in
# preprocessor.Cleaner.
#
# The max_df / min_df values below are only the pipeline's starting
# configuration; the grid search substitutes its own candidates for them.
steps = [
    (
        'cleanning',
        Cleaner(
            remove_accents=True,
            remove_alpha_numeric=True,
            max_word_lenght=2,
        ),
    ),
    (
        'stemmer',
        Stemmer(lang='english'),
    ),
    (
        'tfidf',
        TfidfVectorizer(max_df=0.90, min_df=0.05),
    ),
    (
        'mnb',
        MultinomialNB(),
    ),
]

pipe = Pipeline(steps=steps)

In [4]:
# Hyper-parameter grid for the pipeline, keyed as '<step name>__<parameter>'.
# Candidate count: 2 * 2 * 1 * 2 * 2 * 3 = 48 combinations.
params = {
    # Cleaner switches ('cleanning' matches the pipeline step name as spelled there).
    'cleanning__remove_accents': [True, False],
    'cleanning__remove_alpha_numeric': [True, False],
    # Stemmer: a single language, listed so it appears in the results table.
    'stemmer__lang': ['english'],
    # TF-IDF document-frequency cut-offs (floats, i.e. proportions of documents).
    'tfidf__max_df': [1.0, 0.95],
    'tfidf__min_df': [0.0, 0.01],
    # Additive smoothing strength of the Naive Bayes classifier.
    'mnb__alpha': [0.001, 0.01, 0.1],
}

# Metrics computed for every candidate; the grid-search cell selects which
# one drives the final refit.
scoring = [
    'accuracy',
    'f1_micro',
    'f1_macro',
]

In [5]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the parameter grid with 10-fold cross-validation.
# Every candidate is scored on all metrics in `scoring`; 'accuracy' is the
# metric used to pick the configuration that is refit at the end.
# Cost: 48 candidates x 10 folds = 480 pipeline fits - the recorded run took
# roughly 100 minutes on 4 workers, so re-run this cell deliberately.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring=scoring,
    refit='accuracy',
    cv=10,
    n_jobs=-1,               # use every available core
    verbose=True,
    return_train_score=True,  # also record training-fold scores in cv_results_
)
clf.fit(data['corpus'], data['class_index'])

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 41.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 94.6min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 101.7min finished


In [6]:
# Tabulate the complete cross-validation results (one row per candidate) and
# save them next to the notebook so the search does not have to be repeated
# just to inspect the scores.
df = pd.DataFrame(data=clf.cv_results_)
df.to_csv(path_or_buf='resultado.csv')