# Dependency management

In [None]:
import pandas as pd
import os
import mlflow
import json

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import (accuracy_score, mean_absolute_error, f1_score,
                             precision_score, recall_score, matthews_corrcoef)
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# MinIO bucket setup

In [None]:
# Runtime configuration is read from the environment: the endpoint has no
# fallback (None when unset); the experiment name falls back to '20news_clf'.
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT')
MLFLOW_EXP_NAME = os.getenv('MLFLOW_EXP_NAME', default='20news_clf')

In [None]:
# Object key of the CSV inside the bucket, written as "<bucket>/<key>" so it
# can be appended directly to an "s3://" prefix.
bucket_name = 'datasets'
filepath = '/'.join((bucket_name, '20news', 'test.csv'))

# Load pandas dataframe from S3

In [None]:
# Read the CSV at `filepath` over the S3 protocol, pointing the underlying
# client at the endpoint configured in MINIO_ENDPOINT.
s3_storage_options = {'client_kwargs': {'endpoint_url': MINIO_ENDPOINT}}
df = pd.read_csv('s3://' + filepath, storage_options=s3_storage_options)

# Start your code here

In [None]:
from mlflow import MlflowClient

# Shared MLflow client used by the cells below to look up the experiment,
# search and tag runs, and create registered models / model versions.
# No tracking or registry URI is passed here, so the client relies on MLflow's
# default resolution (presumably the MLFLOW_TRACKING_URI environment
# variable -- confirm for this deployment).
client = MlflowClient()

In [None]:
# Score every run of the experiment that is still tagged evaluated='False'
# against the loaded dataframe (`text` -> `target`) and remember the run with
# the highest macro-averaged F1. Each scored run is then tagged as evaluated.
best_run = None
best_score = -9999  # sentinel; macro F1 lies in [0, 1], so any scored run beats it

# get_experiment_by_name returns None for an unknown name; fail with a clear
# message instead of an AttributeError on `.experiment_id`.
experiment = client.get_experiment_by_name(MLFLOW_EXP_NAME)
if experiment is None:
    raise RuntimeError(f"MLflow experiment {MLFLOW_EXP_NAME!r} does not exist")
experiment_id = experiment.experiment_id


for run in client.search_runs(experiment_id, filter_string="tag.`evaluated` = 'False'"):
    # The 'mlflow.log-model.history' tag holds a JSON list; the first entry's
    # artifact_path locates the model under the run. A run without that tag
    # (or with an empty list) cannot be scored, so skip it rather than abort
    # the whole loop with a KeyError/IndexError after earlier runs have
    # already been tagged as evaluated.
    model_history = json.loads(run.data.tags.get('mlflow.log-model.history', '[]'))
    if not model_history:
        print(f"Skipping run {run.info.run_id}: no logged model found")
        continue

    artifact_path = model_history[0]['artifact_path']
    clf = mlflow.sklearn.load_model(model_uri=f'runs:/{run.info.run_id}/{artifact_path}')
    y_pred = clf.predict(df['text'])
    score = f1_score(df['target'], y_pred, average='macro')
    if score > best_score:
        best_score = score
        best_run = run
    # Mark the run so the 'evaluated' = 'False' filter no longer selects it.
    client.set_tag(run.info.run_id, 'evaluated', True)

## Register the highest-performing model

In [None]:
# Make sure a registered model named after the experiment exists before a
# version is added to it.
try:
    client.create_registered_model(MLFLOW_EXP_NAME)
except mlflow.exceptions.MlflowException as exc:
    # An already-registered name is the only expected failure here; anything
    # else (connectivity, permissions, invalid name) must not be hidden.
    if exc.error_code != 'RESOURCE_ALREADY_EXISTS':
        raise

In [None]:
# Register the best-scoring run's model as a new version of the registered
# model named after the experiment; report when no run was selected.
if best_run is not None:
    # Resolve the artifact path from the winning run itself. The module-level
    # `artifact_path` left behind by the evaluation loop belongs to the last
    # run iterated, which is not necessarily the best one.
    best_artifact_path = json.loads(
        best_run.data.tags['mlflow.log-model.history']
    )[0]['artifact_path']
    client.create_model_version(
        MLFLOW_EXP_NAME,
        f'runs:/{best_run.info.run_id}/{best_artifact_path}',
        best_run.info.run_id,
        description="Selected by model evaluation step"
    )
else:
    print("No run selected")