# Dependency management

In [None]:
import pandas as pd
import os
import mlflow
from mlflow.models import infer_signature

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import (accuracy_score, mean_absolute_error,
                             precision_score, recall_score, matthews_corrcoef)
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# MinIO bucket setup

In [None]:
# Endpoint URL of the MinIO server, taken from the environment; a missing
# MINIO_ENDPOINT variable raises KeyError right here.
MINIO_ENDPOINT = os.environ['MINIO_ENDPOINT']

In [None]:
# Bucket name and the bucket-qualified object path of the training CSV.
bucket_name = 'datasets'
filepath = bucket_name + '/20news/train.csv'

# Load pandas dataframe from S3

In [None]:
# Read the training CSV through the s3:// protocol; storage_options hands
# endpoint_url=MINIO_ENDPOINT to the underlying S3 client.
df = pd.read_csv(
    's3://' + filepath,
    storage_options=dict(client_kwargs=dict(endpoint_url=MINIO_ENDPOINT)),
)

# Start your code here

In [None]:
# Text classifier: token counts -> TF-IDF weighting -> multinomial naive Bayes.
# The step names ('vect', 'tfidf', 'clf') are the prefixes used to address
# each step's hyper-parameters, e.g. 'clf__alpha'.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ],
)

In [None]:
# Hyper-parameter grid; every key is '<pipeline step>__<parameter name>'.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    tfidf__use_idf=(True, False),        # with and without IDF re-weighting
    clf__alpha=(1e-2, 1e-3),             # candidate values for the classifier's alpha
)

In [None]:
# 5-fold cross-validated grid search over `parameters`. Accuracy and macro F1
# are both scored; macro F1 is the metric named in `refit`.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    cv=5,
    scoring=['accuracy', 'f1_macro'],
    refit='f1_macro',
    n_jobs=-1,
    verbose=4,
)

In [None]:
# Run the grid search with the 'text' column as input and 'target' as labels.
gs_clf.fit(df['text'], df['target'])

In [None]:
# Cross-validation results collected by the fitted search.
results = gs_clf.cv_results_

In [None]:
# Tabular view of the results; the logging loop further down reads one row
# (params plus mean/std test scores) per iteration.
df_results = pd.DataFrame(results)

In [None]:
# Infer the model signature (input/output schema) from the 'text' column as
# model input and the 'target' column as model output.
signature = infer_signature(df['text'], df['target'])

In [None]:
# Echo the inferred signature as the cell output for a quick visual check.
signature

## Log results to MLflow

In [None]:
# MLflow client handle; used further down to set a tag on a run by its run id.
client = mlflow.MlflowClient()

In [None]:
# Make '20news_clf' the active experiment for the runs started below.
mlflow.set_experiment('20news_clf')

In [None]:
# Log the grid search to MLflow: one parent run for the whole search and one
# nested child run per row of df_results (i.e. per evaluated parameter set).
with mlflow.start_run(run_name='MultinomialNB'):
    for idx, row in df_results.iterrows():
        with mlflow.start_run(run_name=f'HP_{idx}', nested=True):
            mlflow.log_params(row['params'])

            # Gather the mean/std test score of every scorer and send them in
            # a single batch call instead of two calls per scorer.
            cv_metrics = {}
            for metric in gs_clf.scorer_:
                cv_metrics[f'mean_{metric}'] = row[f'mean_test_{metric}']
                cv_metrics[f'std_{metric}'] = row[f'std_test_{metric}']
            mlflow.log_metrics(cv_metrics)

            # Only the winning parameter set gets the best estimator attached
            # as a model artifact.
            if idx == gs_clf.best_index_:
                this_run = mlflow.sklearn.log_model(
                    gs_clf.best_estimator_,
                    artifact_path='model',
                    signature=signature,
                )
                # Tag the run that holds the model with evaluated=False.
                client.set_tag(this_run.run_id, 'evaluated', False)