In [None]:
# One-time setup: installs the `jupyterlab-plotly` JupyterLab extension via a
# shell command. Only needs to be run the first time you run the notebook.
# NOTE(review): presumably required so plotly figures render in JupyterLab --
# confirm whether your installed plotly/JupyterLab versions still need it.
!jupyter labextension install jupyterlab-plotly

In [None]:
from data import load_data
import numpy as np
import plotly.express as px
import plotly.io as pio
from pathlib import Path
import mlflow
import os
import pandas as pd
# Make "jupyterlab" plotly's default renderer, so fig.show() calls in later
# cells use it without passing a renderer explicitly.
# NOTE(review): assumes the notebook is opened in JupyterLab -- confirm.
pio.renderers.default = "jupyterlab"


In [None]:
# Seed passed to load_data() and recorded as a parameter of the MLflow run.
data_seed = 81

# Tag the run with its purpose so it can be filtered on `tags.objective` later.
# NOTE(review): no explicit mlflow.start_run() is visible before this call, so
# this presumably relies on MLflow starting a run implicitly -- confirm.
mlflow.set_tag(key='objective', value='dataset exploration')

data = load_data(seed=data_seed)
mlflow.log_param(key='random_seed_dataset', value=data_seed)

In [None]:
from sklearn.manifold import TSNE

# Only the hyperparameters in this dict are logged to MLflow; `init` and
# `learning_rate` are passed directly to the constructor and are not logged.
tsne_params = dict(random_state=25, perplexity=150)
tsne = TSNE(init='random', learning_rate='auto', **tsne_params)

# Fit t-SNE on data.X_all and keep the resulting embedding for plotting.
x_tsne = tsne.fit_transform(data.X_all)

# Record the hyperparameters together with the KL divergence reported by the
# fitted estimator.
mlflow.log_params(tsne_params)
mlflow.log_metric('kl_divergence', tsne.kl_divergence_)

In [None]:
# Labels are converted to strings: a string-valued color column gives better
# colors in the scatter plot.
labels = list(map(str, data.y_all))
plot_df = pd.DataFrame({'x': x_tsne[:, 0], 'y': x_tsne[:, 1], 'category': labels})

# Fix the legend/color order to the sorted category names.
ordered_categories = sorted(plot_df['category'].unique())
fig = px.scatter(
    plot_df,
    x='x',
    y='y',
    color='category',
    category_orders={'category': ordered_categories},
    title='t-SNE projection',
)
fig.update_traces(marker_size=10)
fig.show()

# In the general case, mlflow.log_artifact() can log any file that already
# exists on disk. MLflow also provides the convenience functions log_dict(),
# log_figure(), log_image() and log_text(), which log Python objects as
# artifacts directly, without first saving them to disk and then calling
# log_artifact().
mlflow.log_figure(fig, 't-sne.html')

# Make sure the dataset-exploration run is closed here; the classification
# cell opens its own run with mlflow.start_run().
mlflow.end_run()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Sweep the number of trees in a random forest, select the model with the best
# validation accuracy, and log that model, its hyperparameter, its metrics and
# a plot of the whole sweep to a dedicated MLflow run named 'random-forest'.
with mlflow.start_run(run_name='random-forest'):
    mlflow.set_tag('objective', 'classification')
    n_est = [1, 3, 5, 7]  # number of estimators during parameter sweep

    # metrics to keep track of
    train_accs = []
    val_accs = []
    best_val_acc = -1.0  # below any possible accuracy, so the first model always wins
    best_train_acc = -1.0
    best_model = None
    best_n_est = 0

    # parameter sweep
    for n in n_est:
        # initialize and train model
        model = RandomForestClassifier(n_estimators=n, random_state=22)
        model.fit(data.X_train, data.y_train)

        # evaluate performance
        yp_train = model.predict(data.X_train)
        yp_val = model.predict(data.X_val)
        train_acc = accuracy_score(data.y_train, yp_train)
        val_acc = accuracy_score(data.y_val, yp_val)

        # record training and validation accuracy scores
        train_accs.append(train_acc)
        val_accs.append(val_acc)

        # keep track of best performing model
        # (strict '>' means that on a tie the earlier, smaller forest is kept)
        if val_acc > best_val_acc:
            best_model = model
            best_val_acc = val_acc
            best_train_acc = train_acc
            best_n_est = n

    # Evaluate the *selected* model on the test set. `model` still refers to
    # the last forest trained in the sweep, which is not necessarily the best
    # one, so `best_model` must be used here.
    yp_test = best_model.predict(data.X_test)
    test_acc = accuracy_score(data.y_test, yp_test)

    # log parameters, metrics and the selected model, so that everything
    # recorded on this run describes the same estimator
    mlflow.log_param('n_estimators', best_n_est)
    mlflow.log_metrics({'train_acc': best_train_acc,
                        'val_acc': best_val_acc,
                        'test_acc': test_acc})
    mlflow.sklearn.log_model(best_model, 'estimator')

    # save plot of training and validation accuracy scores for parameter sweep
    # (long format: one row per (n_estimators, dataset) pair)
    df_plot = pd.DataFrame({
        'n_estimators': [*n_est, *n_est],
        'accuracy': [*train_accs, *val_accs],
        'dataset': ['train'] * len(n_est) + ['val'] * len(n_est),
    })
    fig = px.bar(df_plot, x='n_estimators', y='accuracy', color='dataset', barmode='group')
    mlflow.log_figure(fig, 'acc_vs_n.html')
    fig.show()

# Activity
Try changing the hyperparameters or using a different model for the classification.
A good model to try next is scikit-learn's SVM classifier (`sklearn.svm.SVC`), tuning its
regularization parameter `C`. To do this, replace the `RandomForestClassifier` with `SVC`
and vary `C` between values of 0.1 and 10 (instead of changing the number of trees in the forest).

In [None]:
# Look up the experiment named 'Default', then list the runs in it whose
# `objective` tag is "classification". The search result is the cell's last
# expression, so it is displayed as the cell output.
default_experiment = mlflow.get_experiment_by_name('Default')
mlflow.search_runs(
    experiment_ids=[default_experiment.experiment_id],
    filter_string='tags.objective="classification"',
)