In [107]:
import pandas as pd
import numpy as np
from fastprogress.fastprogress import progress_bar 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score, balanced_accuracy_score, make_scorer
import seaborn as sns
from matplotlib import pyplot as plt

In [3]:
data = pd.read_csv('triplicate_batch_images_labels.csv')
data.drop('Unnamed: 0', axis=1, inplace=True)
data

Unnamed: 0,cell,drug,min_conc,max_conc,min_norm_intensity,max_norm_intensity,mean_sd,convergence,upper_limit,IC50,slope,lower_limit,file_path,image_name,cluster,label
0,8570,1034,0.007812,2.0,0.074097,0.664380,0.044709,True,0.901533,0.010218,-2.155016,0.152275,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_3.png,12,Mediocre
1,8570,1034,0.007812,2.0,0.087338,0.672330,0.040231,True,1.610422,0.005150,-1.883030,0.167002,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_6.png,12,Mediocre
2,8570,1034,0.007812,2.0,0.068806,0.717342,0.050703,True,14.883637,0.000010,-0.010746,-12.938230,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_9.png,29,Bad
3,8570,1034,0.007812,2.0,0.071687,0.569093,0.038261,True,0.744103,0.011272,-1.882265,0.129222,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_12.png,12,Mediocre
4,8570,1034,0.007812,2.0,0.058225,0.714683,0.055069,True,0.654130,0.026046,-6.163449,0.129686,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_15.png,19,Mediocre
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63231,6412,1192,0.009766,10.0,0.307581,1.335618,0.078498,True,1.165328,6.381202,-1.442833,-0.115036,/Users/akfay/Documents/Capstone/triplicate_bat...,HCC2218_1192_347_Glo_2000_4_3.png,38,Mediocre
63232,6413,1192,0.009766,10.0,0.671477,1.099878,0.064007,True,1.042716,22.063667,-0.497382,0.277841,/Users/akfay/Documents/Capstone/triplicate_bat...,AU565_1192_347_Glo_500_4_3.png,37,Mediocre
63233,6290,1192,0.009766,10.0,0.640827,1.023175,0.069889,True,0.911329,0.957016,-9.194456,0.793217,/Users/akfay/Documents/Capstone/triplicate_bat...,HCC1187_1192_347_Glo_3000_4_3.png,1,No Response
63234,6310,1192,0.009766,10.0,0.572646,1.129622,0.119326,True,0.928182,1782.928576,-0.634245,-6.424923,/Users/akfay/Documents/Capstone/triplicate_bat...,MDA-MB-231_1192_347_Glo_100_4_3.png,6,Mediocre


In [66]:
data = data[~data.duplicated('image_name', keep='first')]
data

Unnamed: 0,cell,drug,min_conc,max_conc,min_norm_intensity,max_norm_intensity,mean_sd,convergence,upper_limit,IC50,slope,lower_limit,file_path,image_name,cluster,label
0,8570,1034,0.007812,2.0,0.074097,0.664380,0.044709,True,0.901533,0.010218,-2.155016,0.152275,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_3.png,12,mediocre
1,8570,1034,0.007812,2.0,0.087338,0.672330,0.040231,True,1.610422,0.005150,-1.883030,0.167002,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_6.png,12,mediocre
2,8570,1034,0.007812,2.0,0.068806,0.717342,0.050703,True,14.883637,0.000010,-0.010746,-12.938230,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_9.png,29,bad
3,8570,1034,0.007812,2.0,0.071687,0.569093,0.038261,True,0.744103,0.011272,-1.882265,0.129222,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_12.png,12,mediocre
4,8570,1034,0.007812,2.0,0.058225,0.714683,0.055069,True,0.654130,0.026046,-6.163449,0.129686,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_15.png,19,mediocre
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63231,6412,1192,0.009766,10.0,0.307581,1.335618,0.078498,True,1.165328,6.381202,-1.442833,-0.115036,/Users/akfay/Documents/Capstone/triplicate_bat...,HCC2218_1192_347_Glo_2000_4_3.png,38,mediocre
63232,6413,1192,0.009766,10.0,0.671477,1.099878,0.064007,True,1.042716,22.063667,-0.497382,0.277841,/Users/akfay/Documents/Capstone/triplicate_bat...,AU565_1192_347_Glo_500_4_3.png,37,mediocre
63233,6290,1192,0.009766,10.0,0.640827,1.023175,0.069889,True,0.911329,0.957016,-9.194456,0.793217,/Users/akfay/Documents/Capstone/triplicate_bat...,HCC1187_1192_347_Glo_3000_4_3.png,1,no response
63234,6310,1192,0.009766,10.0,0.572646,1.129622,0.119326,True,0.928182,1782.928576,-0.634245,-6.424923,/Users/akfay/Documents/Capstone/triplicate_bat...,MDA-MB-231_1192_347_Glo_100_4_3.png,6,mediocre


In [63]:
data.to_csv('triplicate_batch_images_labels_updated.csv')

In [77]:
data['convergence'] = np.where(data['convergence']==True, 1, 0)
data

Unnamed: 0,cell,drug,min_conc,max_conc,min_norm_intensity,max_norm_intensity,mean_sd,convergence,upper_limit,IC50,slope,lower_limit,file_path,image_name,cluster,label
0,8570,1034,0.007812,2.0,0.074097,0.664380,0.044709,1,0.901533,0.010218,-2.155016,0.152275,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_3.png,12,mediocre
1,8570,1034,0.007812,2.0,0.087338,0.672330,0.040231,1,1.610422,0.005150,-1.883030,0.167002,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_6.png,12,mediocre
2,8570,1034,0.007812,2.0,0.068806,0.717342,0.050703,1,14.883637,0.000010,-0.010746,-12.938230,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_9.png,29,bad
3,8570,1034,0.007812,2.0,0.071687,0.569093,0.038261,1,0.744103,0.011272,-1.882265,0.129222,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_12.png,12,mediocre
4,8570,1034,0.007812,2.0,0.058225,0.714683,0.055069,1,0.654130,0.026046,-6.163449,0.129686,/Users/akfay/Documents/Capstone/triplicate_bat...,PC-14_1034_419_Glo_250_4_15.png,19,mediocre
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63231,6412,1192,0.009766,10.0,0.307581,1.335618,0.078498,1,1.165328,6.381202,-1.442833,-0.115036,/Users/akfay/Documents/Capstone/triplicate_bat...,HCC2218_1192_347_Glo_2000_4_3.png,38,mediocre
63232,6413,1192,0.009766,10.0,0.671477,1.099878,0.064007,1,1.042716,22.063667,-0.497382,0.277841,/Users/akfay/Documents/Capstone/triplicate_bat...,AU565_1192_347_Glo_500_4_3.png,37,mediocre
63233,6290,1192,0.009766,10.0,0.640827,1.023175,0.069889,1,0.911329,0.957016,-9.194456,0.793217,/Users/akfay/Documents/Capstone/triplicate_bat...,HCC1187_1192_347_Glo_3000_4_3.png,1,no response
63234,6310,1192,0.009766,10.0,0.572646,1.129622,0.119326,1,0.928182,1782.928576,-0.634245,-6.424923,/Users/akfay/Documents/Capstone/triplicate_bat...,MDA-MB-231_1192_347_Glo_100_4_3.png,6,mediocre


In [154]:
data['IC50'] = data['IC50'].where(data['IC50']<30, 30)
data['IC50'] = data['IC50'].where(data['IC50']>0.0000001, 0)

In [155]:
X = data.iloc[:,:12]
y = data.iloc[:,-1]

In [156]:
random_state = 1
test_size = 0.3

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)

In [157]:
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)

f1 = make_scorer(f1_score, average='weighted')

In [158]:
n_estimators = [10,50,100,500]
max_depth = [1,5,10,50]
min_samples_split = [2,5]
min_samples_leaf = [1,2,5]

random_grid = {'n_estimators':n_estimators,
              'max_depth':max_depth,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf}

rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    verbose=1,
    cv=5,
    n_iter=20,
    scoring=f1,
    random_state=0)

rf_random.fit(X_train, y_train)

print(rf_random.best_score_)
print(rf_random.best_estimator_)
print(rf_random.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
0.8605853568259313
RandomForestClassifier(max_depth=50, min_samples_split=5)
{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 50}
