In [None]:
import pandas as pd
import numpy as np
from fastprogress.fastprogress import progress_bar 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, balanced_accuracy_score, make_scorer, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, precision_score, recall_score, accuracy_score
from matplotlib import pyplot as plt
import os
import pickle

In [None]:
# get data and split into test and train

def get_data(model_name, n_clusters, test_size=0.3, random_state=1, n_features=12):
    """Load a k-means label table, clean two columns and split it.

    Reads ``analysis/triplicate_batch_{model_name}_kmeans{n_clusters}_labels.csv``,
    recodes ``convergence`` as 0/1, bounds ``IC50`` and returns a stratified
    train/test split.

    Parameters
    ----------
    model_name : str
        Name embedded in the CSV file name (e.g. ``'VGG19'``).
    n_clusters : int
        Cluster count embedded in the CSV file name.
    test_size : float, default 0.3
        Fraction of rows held out for testing.
    random_state : int, default 1
        Seed passed to ``train_test_split`` so the split is reproducible.
    n_features : int, default 12
        Number of leading columns used as the feature matrix.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The features are the first
        ``n_features`` columns and the target is the last column of the CSV.
    """
    data = pd.read_csv(f'analysis/triplicate_batch_{model_name}_kmeans{n_clusters}_labels.csv')

    # Recode the convergence flag: 1 where the value equals True, 0 otherwise.
    data['convergence'] = np.where(data['convergence'] == True, 1, 0)

    # Bound IC50 in two passes: anything not strictly below 30 becomes 30,
    # then anything not strictly above 1e-7 becomes 0 (not 1e-7).
    # NOTE(review): comparisons against NaN are False, so a missing IC50 is
    # replaced by 30 in the first pass -- confirm that this is intended.
    ic50_cap = 30
    ic50_floor = 0.0000001
    data['IC50'] = data['IC50'].where(data['IC50'] < ic50_cap, ic50_cap)
    data['IC50'] = data['IC50'].where(data['IC50'] > ic50_floor, 0)

    # Features are selected by position, the target is the last column.
    # Presumably 'convergence' and 'IC50' are among the leading columns --
    # verify against the CSV schema.
    X = data.iloc[:, :n_features]
    y = data.iloc[:, -1]

    # Stratify on the target so both splits keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    return X_train, X_test, y_train, y_test

In [None]:
# Build the train/test split for the VGG19 features clustered into 40 groups.
# These four names are notebook-level globals; model_predict() below reads
# X_train, y_train and X_test directly, so this cell must run first.
X_train, X_test, y_train, y_test = get_data(model_name='VGG19', n_clusters=40)

In [None]:
# train a model and return predictions

def model_predict(model_name, random_state=1):
    """Fit a random forest, pickle it and predict on the test split.

    The training and test data are NOT passed in: this function reads the
    notebook-level globals ``X_train``, ``y_train`` and ``X_test``, so the
    cell that creates them must have been run first.

    Parameters
    ----------
    model_name : str
        Base name of the pickle file written to ``supervised/{model_name}.pkl``.
    random_state : int or None, default 1
        Seed for the forest's bootstrap sampling and feature selection, so
        repeated runs give the same model. Pass ``None`` for an unseeded
        forest.

    Returns
    -------
    tuple
        ``(y_pred, rf)``: the predictions for ``X_test`` and the fitted
        ``RandomForestClassifier``.
    """
    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=75,
        min_samples_leaf=1,
        min_samples_split=3,
        random_state=random_state,
    )
    rf.fit(X_train, y_train)

    # Persist the fitted model; exist_ok avoids a separate existence check.
    os.makedirs('supervised', exist_ok=True)
    with open(f"supervised/{model_name}.pkl", "wb") as f:
        pickle.dump(rf, f)

    y_pred = rf.predict(X_test)

    return y_pred, rf

In [None]:
# Train the forest on the global training split, save it as supervised/RF.pkl,
# and keep both the test-set predictions and the fitted model for later cells.
y_pred, rf = model_predict(model_name='RF')

In [None]:
# score using multiple metrics

def get_metrics(y_test, y_pred):
    """Print classification metrics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        Each score is printed rounded to 4 decimal places.
    """
    # Precision, recall and F1 are support-weighted averages over the classes.
    # For single-label targets, weighted recall is numerically equal to plain
    # accuracy, so those two lines are expected to match.
    scores = [
        ('Accuracy:', accuracy_score(y_test, y_pred)),
        ('Balanced Accuracy:', balanced_accuracy_score(y_test, y_pred)),
        ('Precision:', precision_score(y_test, y_pred, average='weighted')),
        ('Recall:', recall_score(y_test, y_pred, average='weighted')),
        ('F1 Score:', f1_score(y_test, y_pred, average='weighted')),
    ]

    for label, value in scores:
        print(label, np.round(value, 4))
    
# Report the scores of the random forest predictions on the held-out test split.
get_metrics(y_test, y_pred)

In [None]:
# function to display the confusion matrix

def cm_display(y_true, y_pred, model_name):
    """Plot the confusion matrix and save it as a PNG.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    model_name : str
        Prefix of the output file
        ``supervised/{model_name}_confusion_matrix.png``.

    Returns
    -------
    None
        The figure is left open so the notebook can render it.
    """
    # Sorted union of every label seen in the truth or in the predictions.
    combined = pd.concat([pd.Series(y_true), pd.Series(y_pred)])
    labels = np.sort(combined.unique())

    # Pass the labels explicitly so the matrix rows/columns are guaranteed to
    # be in the same order as the tick labels shown on the plot.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)

    fig, ax = plt.subplots(figsize=(10, 10))
    disp.plot(ax=ax, xticks_rotation='vertical')
    # Title the matrix axes directly rather than whichever axes is "current".
    ax.set_title('Random Forest Confusion Matrix')

    os.makedirs('supervised', exist_ok=True)
    fig.savefig(f'supervised/{model_name}_confusion_matrix.png', bbox_inches='tight')
    

In [None]:
# Plot the confusion matrix of the test-set predictions and save it as
# supervised/RF_confusion_matrix.png.

cm_display(y_test, y_pred, model_name='RF')

In [None]:
# extract the feature importances

def feature_importances(X, trained_rf, model_name):
    """Plot the forest's feature importances as a bar chart and save it.

    Parameters
    ----------
    X : pandas.DataFrame
        Only ``X.columns`` is used, to label the bars. The columns must be in
        the same order as the features the model was fitted on.
    trained_rf : fitted estimator
        Model exposing ``feature_importances_``.
    model_name : str
        Prefix of the output file ``supervised/{model_name}_features.png``.

    Returns
    -------
    None
        The figure is left open so the notebook can render it.
    """
    # Draw on a dedicated figure so the bars never land on a figure left
    # current by an earlier plot.
    fig, ax = plt.subplots()

    # Scale by 100 so the x axis reads as a percentage.
    ax.barh(X.columns, trained_rf.feature_importances_ * 100)
    ax.set_xlabel('% Importance')
    ax.set_title('Random Forest Feature Importance')

    os.makedirs('supervised', exist_ok=True)
    fig.savefig(f'supervised/{model_name}_features.png', bbox_inches='tight')


In [None]:
# Plot the feature importances of the fitted forest and save them as
# supervised/RF_features.png. X_test is passed only for its column names.

feature_importances(X=X_test, trained_rf=rf, model_name='RF')