# Tracking Machine Learning experiments

`SQLiteTracker` provides a simple yet powerful way to track ML experiments using a SQLite database.

In [1]:
from sklearn_evaluation import SQLiteTracker

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Open the tracker on a SQLite database file; this example file is deleted
# again in the last cell of the notebook.
tracker = SQLiteTracker('my_experiments.db')

In [3]:
# Parameters
# NOTE(review): this looks like a parameters cell injected by the notebook
# build tooling rather than tutorial content; `product` is not referenced by
# any other cell shown here — confirm before removing.
product = "source/user_guide/SQLiteTracker.ipynb"


In [4]:
# Load iris as pandas objects and hold out a third of the rows for evaluation.
iris = load_iris(as_frame=True)
X = iris['data']
y = iris['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,  # fixed seed: the split is the same on every run
)

# Candidate regressors, all with default hyperparameters. The random forest is
# left unseeded, so its score can differ slightly between runs.
models = [
    RandomForestRegressor(),
    LinearRegression(),
    Lasso(),
]

In [5]:
for estimator in models:
    model_name = type(estimator).__name__
    print(f'Fitting {model_name}')

    # register the run up front: .new() creates the db entry and returns its uuid
    experiment_id = tracker.new()

    estimator.fit(X_train, y_train)
    test_mse = mean_squared_error(y_test, estimator.predict(X_test))

    # attach data with .update(uuid, {'param': 'value'}): our metric and the
    # model name first, then every hyperparameter of the estimator
    record = {'mse': test_mse, 'model': model_name}
    record.update(estimator.get_params())
    tracker.update(experiment_id, record)

Fitting RandomForestRegressor
Fitting LinearRegression
Fitting Lasso


Or use `.insert(uuid, params)` to supply your own ID:

In [6]:
# Fit one more model; this run is stored under an ID we choose ourselves
svr = SVR()
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# metric and model name first, followed by the estimator's hyperparameters
svr_record = {'mse': mse, 'model': type(svr).__name__}
svr_record.update(svr.get_params())

# .insert() takes the ID as its first argument instead of generating one
tracker.insert('my_uuid', svr_record)

Displaying `tracker` shows the most recent experiments by default:

In [7]:
# Leaving the tracker as the cell's last expression renders its table of experiments
tracker

uuid,created,parameters,comment
3d1a84bb82b842ec8d32dc7f33e890d4,2020-12-14 23:48:17,"{""mse"": 0.008444000000000002, ""model"": ""RandomForestRegressor"", ""bootstrap"": true, ""ccp_alpha"": 0.0, ""criterion"": ""mse"", ""max_depth"": null, ""max_features"": ""auto"", ""max_leaf_nodes"": null, ""max_samples"": null, ""min_impurity_decrease"": 0.0, ""min_impurity_split"": null, ""min_samples_leaf"": 1, ""min_samples_split"": 2, ""min_weight_fraction_leaf"": 0.0, ""n_estimators"": 100, ""n_jobs"": null, ""oob_score"": false, ""random_state"": null, ""verbose"": 0, ""warm_start"": false}",
feb6bfbdc11f422db2c66e90467a570f,2020-12-14 23:48:17,"{""mse"": 0.04260034113761788, ""model"": ""LinearRegression"", ""copy_X"": true, ""fit_intercept"": true, ""n_jobs"": null, ""normalize"": false}",
b2cd46c95d274bbeb106b04e61d50989,2020-12-14 23:48:17,"{""mse"": 0.4317655183287657, ""model"": ""Lasso"", ""alpha"": 1.0, ""copy_X"": true, ""fit_intercept"": true, ""max_iter"": 1000, ""normalize"": false, ""positive"": false, ""precompute"": false, ""random_state"": null, ""selection"": ""cyclic"", ""tol"": 0.0001, ""warm_start"": false}",
my_uuid,2020-12-14 23:48:17,"{""mse"": 0.030419125413621413, ""model"": ""SVR"", ""C"": 1.0, ""cache_size"": 200, ""coef0"": 0.0, ""degree"": 3, ""epsilon"": 0.1, ""gamma"": ""scale"", ""kernel"": ""rbf"", ""max_iter"": -1, ""shrinking"": true, ""tol"": 0.001, ""verbose"": false}",


## Querying experiments

In [8]:
# Rank every experiment by its mean squared error (lowest first); model and
# mse are pulled out of the JSON stored in the `parameters` column
mse_ranking_sql = """
SELECT uuid,
       json_extract(parameters, '$.model') AS model,
       json_extract(parameters, '$.mse') AS mse
FROM experiments
ORDER BY json_extract(parameters, '$.mse') ASC
"""
ordered = tracker.query(mse_ranking_sql)
ordered

Unnamed: 0_level_0,model,mse
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1
3d1a84bb82b842ec8d32dc7f33e890d4,RandomForestRegressor,0.008444
my_uuid,SVR,0.030419
feb6bfbdc11f422db2c66e90467a570f,LinearRegression,0.0426
b2cd46c95d274bbeb106b04e61d50989,Lasso,0.431766


The query method returns a data frame with "uuid" as the index:

In [9]:
# Check the concrete type returned by tracker.query()
type(ordered)

pandas.core.frame.DataFrame

## Adding comments


In [10]:
# `ordered` is sorted by ascending mse and indexed by uuid, so index[0] is the
# uuid of the experiment with the lowest error
tracker.comment(ordered.index[0], 'Best performing experiment')

Use `tracker[uuid]` to get a single experiment:

In [11]:
# Look up a single experiment by uuid: here, the first row of `ordered`
tracker[ordered.index[0]]

Unnamed: 0_level_0,created,parameters,comment
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3d1a84bb82b842ec8d32dc7f33e890d4,2020-12-14 23:48:17,"{""mse"": 0.008444000000000002, ""model"": ""Random...",Best performing experiment


## Getting recent experiments

The recent method also returns a data frame:

In [12]:
# Keep the result in a variable to show it is an ordinary data frame, then display it
recent_experiments = tracker.recent()
recent_experiments

Unnamed: 0_level_0,created,parameters,comment
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3d1a84bb82b842ec8d32dc7f33e890d4,2020-12-14 23:48:17,"{""mse"": 0.008444000000000002, ""model"": ""Random...",Best performing experiment
feb6bfbdc11f422db2c66e90467a570f,2020-12-14 23:48:17,"{""mse"": 0.04260034113761788, ""model"": ""LinearR...",
b2cd46c95d274bbeb106b04e61d50989,2020-12-14 23:48:17,"{""mse"": 0.4317655183287657, ""model"": ""Lasso"", ...",
my_uuid,2020-12-14 23:48:17,"{""mse"": 0.030419125413621413, ""model"": ""SVR"", ...",


Pass `normalize=True` to convert the nested JSON dictionary into columns:

In [13]:
# Same listing, with each key of the stored parameters expanded into its own column
recent_normalized = tracker.recent(normalize=True)
recent_normalized

Unnamed: 0_level_0,created,mse,model,bootstrap,ccp_alpha,criterion,max_depth,max_features,max_leaf_nodes,max_samples,...,tol,C,cache_size,coef0,degree,epsilon,gamma,kernel,shrinking,comment
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3d1a84bb82b842ec8d32dc7f33e890d4,2020-12-14 23:48:17,0.008444,RandomForestRegressor,True,0.0,mse,,auto,,,...,,,,,,,,,,Best performing experiment
feb6bfbdc11f422db2c66e90467a570f,2020-12-14 23:48:17,0.0426,LinearRegression,,,,,,,,...,,,,,,,,,,
b2cd46c95d274bbeb106b04e61d50989,2020-12-14 23:48:17,0.431766,Lasso,,,,,,,,...,0.0001,,,,,,,,,
my_uuid,2020-12-14 23:48:17,0.030419,SVR,,,,,,,,...,0.001,1.0,200.0,0.0,3.0,0.1,scale,rbf,True,


In [14]:
# delete our example database
from pathlib import Path

# Only unlink when the file is actually there, so re-running this cell (or
# running it after the file was already removed) does not raise
# FileNotFoundError.
example_db = Path('my_experiments.db')
if example_db.exists():
    example_db.unlink()