In [5]:
import numpy as np
import os
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import joblib
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

import optuna

In [3]:
print(optuna.__version__)

2.10.1


In [4]:
data = fetch_20newsgroups()

X = data['data'][:5000]
y = data['target'][:5000]

In [6]:
model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),   
    ('rf', RandomForestClassifier())
])

In [8]:
def objective(trial):    
    
    joblib.dump(study, 'study.pkl')
    
    tfidf__analyzer = trial.suggest_categorical('tfidf__analyzer', ['word', 'char', 'char_wb']) 
    tfidf__lowercase = trial.suggest_categorical('tfidf__lowercase', [False, True]) 
    tfidf__max_features = trial.suggest_int('tfidf__max_features', 500, 10_000) 
    rf__n_estimators = trial.suggest_int('rf__num_estimators', 300, 500) 
    rf__max_depth = trial.suggest_int('rf__max_depth', 5, 15) 
    rf__min_samples_split = trial.suggest_int('rf__min_samples_split', 10, 30) 
    
   
    

    params = {
        'tfidf__analyzer': tfidf__analyzer,
        'tfidf__lowercase': tfidf__lowercase,
        'tfidf__max_features': tfidf__max_features,
        'rf__n_estimators': rf__n_estimators,
        'rf__max_depth': rf__max_depth,
        'rf__min_samples_split': rf__min_samples_split,
       
    }
    
    model.set_params(**params)

    return  -np.mean(cross_val_score(model, X, y, cv=3, n_jobs=-1,scoring='neg_log_loss'))

In [12]:
# by default, the direction is to minimizae, but can set it to maximize too
#study = optuna.create_study(direction='minimize')
study = optuna.create_study()


#study.optimize(objective, timeout=3600)
study.optimize(objective, n_trials=20)


# to recoard the value for the last time
joblib.dump(study, 'study.pkl')

[32m[I 2022-07-02 18:02:45,482][0m A new study created in memory with name: no-name-93fec9e1-ee73-43cd-874d-86cec23855b8[0m
[32m[I 2022-07-02 18:02:56,828][0m Trial 0 finished with value: 2.547156150826899 and parameters: {'tfidf__analyzer': 'char_wb', 'tfidf__lowercase': False, 'tfidf__max_features': 9902, 'rf__num_estimators': 314, 'rf__max_depth': 7, 'rf__min_samples_split': 30}. Best is trial 0 with value: 2.547156150826899.[0m
[32m[I 2022-07-02 18:03:04,139][0m Trial 1 finished with value: 2.613899734292334 and parameters: {'tfidf__analyzer': 'char', 'tfidf__lowercase': True, 'tfidf__max_features': 1537, 'rf__num_estimators': 385, 'rf__max_depth': 6, 'rf__min_samples_split': 11}. Best is trial 0 with value: 2.547156150826899.[0m
[32m[I 2022-07-02 18:03:15,746][0m Trial 2 finished with value: 2.5160821688053225 and parameters: {'tfidf__analyzer': 'char', 'tfidf__lowercase': False, 'tfidf__max_features': 4277, 'rf__num_estimators': 442, 'rf__max_depth': 8, 'rf__min_sample

['study.pkl']