In [15]:
import os
import pickle

### Preprocess train-val-test splits of data

In [26]:
# run the preprocessing script on the data under --inp_data_path; the preprocessed split pickles
# land in --dest_path and the tf-idf vectorizer pickle lives at --fileout_vect
!python ./scripts/preprocess_data.py --inp_data_path ./data --dest_path ./data/experiment_tracking --fileout_vect ./artifacts/experiment_tracking/tfidf_vectorizer.pkl

INFO : loading pickle files
INFO : loading tf-idf vectorizer
INFO : preprocessing data splits
INFO : saving preprocessed pickle files


In [30]:
# check preprocessed files
# (os.listdir returns entries in arbitrary order, so sort them to keep this listing reproducible)
print(sorted(os.listdir("./data/experiment_tracking/")))

# check tfidf vectorizer pickle file
print(sorted(os.listdir("./artifacts/experiment_tracking")))

['test.pkl', 'train.pkl', 'val.pkl']
['tfidf_vectorizer.pkl']


In [33]:
# sanity-check the preprocessed training split: unpack the pickled 4-tuple and inspect it
# NOTE: pickle.load can execute arbitrary code; only open pickles written by this project's scripts
with open("./data/experiment_tracking/train.pkl", mode="rb") as ftrain:
    dfx, dfy, dfx_cols, dfy_cols = pickle.load(ftrain)

# container types first, then sizes (array shapes followed by the column-name counts)
print(*map(type, (dfx, dfy, dfx_cols, dfy_cols)))
print(dfx.shape, dfy.shape, *map(len, (dfx_cols, dfy_cols)))

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'list'> <class 'list'>
(32345, 796) (32345,) 796 1


### Experiment Tracking using mlflow

Run the command below in a terminal, then open the URL where the mlflow UI is being served to track the model training experiments:

```bash
mlflow ui --backend-store-uri sqlite:///mlflow.db
```

In [23]:
# train the baseline sklearn models via scripts/train.py (no arguments passed, so the script's defaults apply)
!python ./scripts/train.py



In [25]:
# random forest classifiers tuned using hyperopt (scripts/hpo.py, no arguments passed)
# long-running: the captured progress log shows 15 trials, with the first 11 taking over an hour
!python ./scripts/hpo.py


  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]
  7%|▋         | 1/15 [01:26<20:14, 86.77s/trial, best loss: -0.7213797733001989]
 13%|█▎        | 2/15 [04:40<32:23, 149.50s/trial, best loss: -0.7365723367822792]
 20%|██        | 3/15 [08:37<37:57, 189.82s/trial, best loss: -0.7365723367822792]
 27%|██▋       | 4/15 [20:12<1:11:21, 389.23s/trial, best loss: -0.7365723367822792]
 33%|███▎      | 5/15 [27:56<1:09:20, 416.03s/trial, best loss: -0.7365723367822792]
 40%|████      | 6/15 [31:02<50:39, 337.73s/trial, best loss: -0.7466880356220631]  
 47%|████▋     | 7/15 [43:06<1:01:52, 464.10s/trial, best loss: -0.7466880356220631]
 53%|█████▎    | 8/15 [47:35<46:53, 401.89s/trial, best loss: -0.7639681589301875]  
 60%|██████    | 9/15 [48:10<28:44, 287.43s/trial, best loss: -0.7639681589301875]
 67%|██████▋   | 10/15 [1:00:08<35:01, 420.25s/trial, best loss: -0.7639681589301875]
 73%|███████▎  | 11/15 [1:07:56<28:59, 434.92s/trial, best loss: -0.7639681589301875]
 80%|████████  |

### Model Registry

In [119]:
# register top model from best 5 candidate models
# NOTE(review): the "5" is not set on this command line — presumably a default inside
# register_model.py; confirm before relying on it
!python ./scripts/register_model.py --tracked_exps hyperopt_rfc --candidate_exp candidate_mtest2

2024/08/19 21:48:27 INFO mlflow.tracking.fluent: Experiment with name 'candidate_mtest2' does not exist. Creating a new experiment.
Successfully registered model 'reviewsentinel_dev'.
Created version '1' of model 'reviewsentinel_dev'.


In [None]:
# TODO: code to delete a registered model (and its versions); to be implemented later in the scripts
# NOTE(review): "sk-learn-random-forest-reg-model" below is a placeholder name — the model registered
# by this notebook is 'reviewsentinel_dev'; substitute the real name before enabling this cell.
# NOTE(review): MlflowClient is not imported anywhere in the visible cells; the import is added here.
# The client presumably also needs the same tracking URI the scripts use — confirm.

# from mlflow.tracking import MlflowClient

# Delete versions 1, 2, and 3 of the model
# client = MlflowClient()
# versions = [1, 2, 3]
# for version in versions:
#     client.delete_model_version(
#         name="sk-learn-random-forest-reg-model", version=version
#     )

# Delete a registered model along with all its versions
# client.delete_registered_model(name="sk-learn-random-forest-reg-model")

In [None]:
# TODO: code to remove deleted runs from the mlflow directory; to be implemented later in the scripts
# Caveat: this only removes the run folders from the (Windows) filesystem; the runs' "deleted"
# entries are not removed from the mlflow lifecycle.

# import mlflow
# import shutil
    
# Drops the first 8 and the last 10 characters of the artifact URI — presumably the "file:///"
# prefix and the "/artifacts" suffix, leaving the run directory; verify against real URIs.
# def get_run_dir(artifacts_uri):
#     return artifacts_uri[8:-10]
    
    
# ignore_errors=True: folders that are missing or cannot be removed are skipped silently
# def remove_run_dir(run_dir):
#     shutil.rmtree(run_dir, ignore_errors=True)
    
# experiment_id = 1
# NOTE(review): 2 is presumably mlflow's ViewType.DELETED_ONLY — confirm, and prefer the named constant
# deleted_runs = 2
    
# NOTE(review): this tracking URI differs from the sqlite:///mlflow.db store used for `mlflow ui`
# earlier in this notebook — confirm which store the runs actually live in
# exp = mlflow.tracking.MlflowClient(tracking_uri='./mlflow/mlruns')
    
# runs = exp.search_runs(str(experiment_id), run_view_type=deleted_runs)
    
# _ = [remove_run_dir(get_run_dir(run.info.artifact_uri)) for run in runs]