In [None]:
# Parameters
# Path of the CSV file that holds the training data.
input_csv_file = "poc/data/data_train.csv"

# Values of `C` tried for the LogisticRegression classifier.
C_list = [0.1, 1.0]

# Values of `max_features` tried for the CountVectorizer.
max_features_list = [
    500,
    1500,
    3000,
]

# Seed handed to the repeated stratified K-fold splitter.
random_state = 0

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
import mlflow
from itertools import product

In [None]:
# Read the training CSV and keep only the rows that have no missing value
# in any column (axis=0 / how='any' are the dropna defaults, spelled out).
df = (
    pd.read_csv(input_csv_file)
    .dropna(axis=0, how='any')
)

In [None]:
def log_results(d):
    """Log the mean and standard deviation of each entry of ``d`` to MLflow.

    For every ``(name, values)`` pair, two metrics are recorded on the
    active MLflow run: ``name + '_avg'`` with ``values.mean()`` and
    ``name + '_std'`` with ``values.std()``.

    Parameters
    ----------
    d : dict
        Maps a metric name (str) to an object exposing ``mean()`` and
        ``std()``. In this notebook it is the result of ``cross_validate``.
    """
    # (suffix appended to the metric name, name of the statistic method)
    summaries = (('_avg', 'mean'), ('_std', 'std'))
    for name, scores in d.items():
        for suffix, statistic in summaries:
            mlflow.log_metric(name + suffix, getattr(scores, statistic)())

In [None]:
# Sweep every (C, max_features) combination; each one is tracked as its own
# MLflow run holding the two parameters and the cross-validation summaries.
for C, max_features in product(C_list, max_features_list):
    with mlflow.start_run():
        # Record the hyper-parameters that identify this run.
        mlflow.log_param('C', C)
        mlflow.log_param('max_features', max_features)

        # Build the two pipeline stages for this combination.
        vectorizer = CountVectorizer(stop_words='english',
                                     max_features=max_features)
        classifier = LogisticRegression(solver='lbfgs',
                                        multi_class='multinomial',
                                        C=C)

        # The classifier step is named after the text that precedes the
        # first '(' in the estimator's repr.
        classifier_step = repr(classifier).partition('(')[0]
        pipeline = Pipeline([
            ('vectorizer', vectorizer),
            (classifier_step, classifier),
        ])

        # 5 stratified folds repeated 3 times, seeded with `random_state`.
        cv_splitter = RepeatedStratifiedKFold(n_splits=5,
                                              n_repeats=3,
                                              random_state=random_state)
        scoring_names = ['accuracy', 'precision_macro', 'f1_micro', 'f1_macro']

        d = cross_validate(pipeline,
                           X=df['data'],
                           y=df['target'],
                           scoring=scoring_names,
                           cv=cv_splitter)

        # Push the mean/std of every entry returned by cross_validate.
        log_results(d)
        
    