### Import libraries

In [1]:
import pathlib
import warnings

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
)
from sklearn.utils import shuffle, parallel_backend
from sklearn.exceptions import ConvergenceWarning
from joblib import dump

import sys

sys.path.append("../utils")
from split_utils import get_features_data
from train_utils import get_dataset, get_X_y_data

### Specify results directory, load training data

In [2]:
# seed numpy's global random number generator so random operations are reproducible
np.random.seed(0)

# create the directory that results are written to (no error if it already exists)
results_dir = pathlib.Path("models/multi_class_models/")
results_dir.mkdir(parents=True, exist_ok=True)

# locations of the data split indexes and the labeled features dataframe
data_split_path = pathlib.Path("../1.split_data/indexes/data_split_indexes.tsv")
features_dataframe_path = pathlib.Path("../0.download_data/data/labeled_data.csv.gz")

# load the labeled features
# NOTE(review): get_features_data is expected to keep only the labeled data we want
# (excluding certain phenotypic classes) -- confirm against split_utils
features_dataframe = get_features_data(features_dataframe_path)
# tab-separated split indexes; the first column of the file is used as the index
data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)

# get the "train" subset of the labeled data and display it
training_data = get_dataset(features_dataframe, data_split_indexes, "train")
training_data

Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,0.052590,-0.053785,-0.504303,-0.414150,-0.124684,1.236054,18.393635,-0.671193,0.808065,2.137708
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-1.651595,-1.081079,-0.292396,0.281349,0.392258,3.295728,-1.171027,-0.789511,-0.786297,0.567598
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-1.345569,0.710935,-0.239072,-0.246432,0.569382,1.540583,-1.195304,0.088027,-0.908981,2.376957
3,Large,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-1.625424,-0.694173,-0.057779,0.038326,0.676392,2.434467,-0.458172,-0.346555,-1.230486,0.737356
4,Large,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-1.096411,-0.049845,0.220380,1.158381,-0.902400,6.174941,-0.740166,-0.455844,-0.975201,0.667681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,OutOfFocus,380728fc-28b0-423f-b8a7-07be1af590d9,383,219,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,-1.025056,10.113665,-1.352879,-2.820126,-0.238044,0.290222,-1.544321,0.599942,-0.986115,-0.153870
2912,OutOfFocus,30ed67c7-8de2-4d78-bce9-3fa1aff28565,975,294,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.662933,4.916376,-0.743990,-0.576664,-0.323437,-1.545750,-1.283859,0.159241,-1.476267,-2.449875
2913,OutOfFocus,2960b13e-6090-4592-b2a9-d1c4c1b24b50,898,302,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-1.511415,10.815635,-1.186674,-0.887601,0.070245,-0.428093,-0.974360,0.142016,-1.912889,0.607889
2914,OutOfFocus,fbc9ce6a-2b29-4115-b218-4ee5b8c50ac1,946,281,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-1.352269,12.909424,-2.151344,-0.970625,-0.443021,-0.515716,-2.719972,-1.273280,-0.928599,-1.974504


### Train model on each combination of model type and feature type

In [3]:
# specify model types and feature types
model_types = ["final", "shuffled_baseline"]
feature_types = ["CP", "DP", "CP_and_DP"]

# stratified 10-fold cross validation; samples are not shuffled before splitting
straified_k_folds = StratifiedKFold(n_splits=10, shuffle=False)

# logistic regression with an elastic-net penalty, fit with the saga solver
log_reg_model = LogisticRegression(
    penalty="elasticnet", solver="saga", max_iter=100, n_jobs=-1, random_state=0
)

# hyperparameter grid: 7 log-spaced values of C (1e-3 .. 1e3) and
# 11 evenly spaced values of l1_ratio (0 .. 1)
parameters = {"C": np.logspace(-3, 3, 7), "l1_ratio": np.linspace(0, 1, 11)}
print(f"Parameters being tested during grid search: {parameters}\n")

# grid search over the parameters above, scored with weighted F1 on the stratified folds
grid_search_cv = GridSearchCV(
    log_reg_model,
    parameters,
    cv=straified_k_folds,
    n_jobs=-1,
    scoring="f1_weighted",
)

# train one model for each combination of model type and feature type
for model_type in model_types:
    for feature_type in feature_types:
        print(f"Training {model_type} model on {feature_type} features...")

        X, y = get_X_y_data(training_data, feature_type)
        print(f"X has shape {X.shape}, y has shape {y.shape}")

        # shuffle each column of X (each feature) independently to create the shuffled baseline
        if model_type == "shuffled_baseline":
            # np.random.shuffle works in place, so shuffle a private copy: if
            # get_X_y_data returned a view, shuffling X directly would also
            # scramble training_data for every later use
            X = np.array(X)
            # iterating over X.T yields one view per column of X
            for column in X.T:
                np.random.shuffle(column)

        # fit grid search cv to X and y data
        # ConvergenceWarning means the solver stopped at max_iter before converging;
        # it is silenced here only because it takes up lots of space in the output
        # NOTE(review): confirm the unconverged fits are acceptable, or raise max_iter
        # NOTE(review): the multiprocessing backend is presumably what lets the worker
        # processes inherit this warning filter -- confirm before changing it
        with parallel_backend("multiprocessing"):
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore", category=ConvergenceWarning, module="sklearn"
                )
                # fit() updates the search object in place, no rebinding needed
                grid_search_cv.fit(X, y)

        # print info for best estimator
        print(f"Best parameters: {grid_search_cv.best_params_}")
        print(f"Score of best estimator: {grid_search_cv.best_score_}\n")

        # save the best estimator found for this model type / feature type
        dump(
            grid_search_cv.best_estimator_,
            results_dir / f"{model_type}__{feature_type}.joblib",
        )

Parameters being tested during grid search: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}

Training final model on CP features...
X has shape (2432, 157), y has shape (2432,)
Best parameters: {'C': 0.01, 'l1_ratio': 0.0}
Score of best estimator: 0.36433278877280373

Training final model on DP features...
X has shape (2432, 1280), y has shape (2432,)
Best parameters: {'C': 0.1, 'l1_ratio': 0.0}
Score of best estimator: 0.7508505527902235

Training final model on CP_and_DP features...
X has shape (2432, 1437), y has shape (2432,)
Best parameters: {'C': 0.1, 'l1_ratio': 0.0}
Score of best estimator: 0.5998805071421007

Training shuffled_baseline model on CP features...
X has shape (2432, 157), y has shape (2432,)
Best parameters: {'C': 0.01, 'l1_ratio': 0.6000000000000001}
Score of best estimator: 0.09416572394187064

Training shuffled_baseline model on DP features...
X has shape (2432, 12