In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [3]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB

In [4]:
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
dataset = fetch_20newsgroups(subset='train', categories=categories, random_state=42)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
X = dataset.data
y = dataset.target

In [10]:
pipeline = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())
])

In [15]:
params = {
    'clf__alpha': [1.0, 2.0, 3.0, 4.0],
    'clf__fit_prior': [True, False],
}

In [16]:
gs_cv = GridSearchCV(estimator=pipeline, param_grid=params, cv=6, n_jobs=-1)

In [17]:
gs_cv.fit(X, y)

GridSearchCV(cv=6, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [18]:
gs_cv.best_score_

0.9233532272325377

In [19]:
gs_cv.best_params_

{'clf__alpha': 1.0, 'clf__fit_prior': False}

In [20]:
pipeline = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', ComplementNB())
])

In [21]:
params = {
    'clf__alpha': [1.0, 2.0, 3.0, 4.0, 5.0],
    'clf__fit_prior': [True, False],
    'clf__norm': [True, False],
}

In [22]:
gs_cv = GridSearchCV(estimator=pipeline, param_grid=params, cv=6, n_jobs=-1)

In [23]:
gs_cv.fit(X, y)

GridSearchCV(cv=6, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [24]:
gs_cv.best_score_

0.9552493086517297

In [25]:
gs_cv.best_params_

{'clf__alpha': 1.0, 'clf__fit_prior': True, 'clf__norm': False}

In [26]:
pipeline = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', BernoulliNB())
])

In [27]:
params = {
    'clf__alpha': [1.0, 1.2, 2.0, 2.2, 3.0],
    'clf__binarize': [1.0, 2.0, 3.0],
    'clf__fit_prior': [True, False],
}

In [28]:
gs_cv = GridSearchCV(estimator=pipeline, param_grid=params, cv=6, n_jobs=-1)

In [29]:
gs_cv.fit(X, y)

GridSearchCV(cv=6, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [30]:
gs_cv.best_score_

0.2653966081607314