In [11]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import fetch_20newsgroups
import numpy as np


from sklearn.model_selection import RandomizedSearchCV

In [3]:
data = fetch_20newsgroups(
            subset='all', 
            categories=[
                'rec.autos',
                'rec.motorcycles',
                'rec.sport.baseball',
                'rec.sport.hockey'
            ], 
            remove=('headers', 'footers', 'quotes')
        )

In [4]:
pipeline = Pipeline([
                     ("vectorizer", CountVectorizer((1,2))),
                     ("tf", TfidfTransformer()),
                     ("algo", LogisticRegression())
                    ])

In [50]:
params={
    "vectorizer__ngram_range": [[1,2]],
    "vectorizer__max_df": [1.0],
    "vectorizer__min_df": [0],
    "vectorizer__analyzer": ['word'],
    "tf__norm": ['l2'],
    "algo__solver": ['saga'],
    "algo__class_weight": ['balanced', None],
    "algo__C": np.arange(30, 60., 5),
    "algo__random_state": [42]
}

In [51]:
random_search = RandomizedSearchCV(pipeline, param_distributions=params, n_iter=10, scoring='accuracy', cv=3)

In [52]:
%%time
res = random_search.fit(X=data.data, y=data.target)



Wall time: 15min 30s


In [53]:
print(res.best_score_)
print(res.best_params_)

0.8585071626036692
{'vectorizer__ngram_range': [1, 2], 'vectorizer__min_df': 0, 'vectorizer__max_df': 1.0, 'vectorizer__analyzer': 'word', 'tf__norm': 'l2', 'algo__solver': 'saga', 'algo__random_state': 42, 'algo__class_weight': None, 'algo__C': 50.0}
