In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pickle
from tqdm import tqdm
mpl.rcParams['figure.dpi']= 200

In [2]:
data = pd.read_csv("Breast_GSE70947.csv")
data = data.to_numpy()
y = data[:,1]
y = 1 * (y == 'normal')
X = data[:,2:]
# splitting the data set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [6]:
# checking the sets are balanced (so checking proportions of each class)
np.unique(y_train, return_counts=True)[1]/y_train.shape[0]
np.unique(y_test, return_counts=True)[1]/y_test.shape[0]
# compare against original dataset proportions
np.unique(y, return_counts=True)[1]/y.shape[0]

array([0.49480969, 0.50519031])

## Random Forest Classifier

In [7]:
# on a bunch of models, first do 5-fold CV, then choose "best" one, finally test model on test data
depths = np.int64(np.linspace(1, 50, 25, endpoint=True))
rfc_cv_scores = []
rfc_models = []
for depth in tqdm(depths):
    rfc = RandomForestClassifier(max_depth=depth, n_estimators=15, max_features=1, bootstrap=True, random_state=0) # check what random state's for
    score = np.mean(cross_val_score(rfc, X_train, y_train, cv=5))
    print("depth " + str(depth) + ": " + str(score))
    rfc_cv_scores = rfc_cv_scores + [score]
    rfc_models = rfc_models + [rfc]
    # if depth == depths[-1]:
    #     pickle.dump(rfc, open("rf_model", "wb"))
rfc_optim_model = rfc_models[np.argmax(rfc_cv_scores)]
pickle.dump(rfc_optim_model, open("rf_model", "wb"))

  4%|▍         | 1/25 [00:02<00:52,  2.19s/it]

depth 1: 0.7316067653276956


  8%|▊         | 2/25 [00:04<00:50,  2.18s/it]

depth 3: 0.7916490486257929


 12%|█▏        | 3/25 [00:06<00:48,  2.18s/it]

depth 5: 0.8010570824524313


 16%|█▌        | 4/25 [00:08<00:45,  2.18s/it]

depth 7: 0.7964059196617337


 20%|██        | 5/25 [00:10<00:43,  2.16s/it]

depth 9: 0.7823467230443975


 24%|██▍       | 6/25 [00:13<00:41,  2.16s/it]

depth 11: 0.828752642706131


 28%|██▊       | 7/25 [00:15<00:38,  2.15s/it]

depth 13: 0.8195560253699788


 32%|███▏      | 8/25 [00:17<00:36,  2.16s/it]

depth 15: 0.8288583509513741


 36%|███▌      | 9/25 [00:19<00:34,  2.16s/it]

depth 17: 0.8288583509513741


 40%|████      | 10/25 [00:21<00:32,  2.14s/it]

depth 19: 0.8288583509513741


 44%|████▍     | 11/25 [00:23<00:30,  2.15s/it]

depth 21: 0.8288583509513741


 48%|████▊     | 12/25 [00:25<00:27,  2.14s/it]

depth 23: 0.8288583509513741


 52%|█████▏    | 13/25 [00:28<00:25,  2.14s/it]

depth 25: 0.8288583509513741


 56%|█████▌    | 14/25 [00:30<00:23,  2.14s/it]

depth 27: 0.8288583509513741


 60%|██████    | 15/25 [00:32<00:21,  2.13s/it]

depth 29: 0.8288583509513741


 64%|██████▍   | 16/25 [00:34<00:19,  2.13s/it]

depth 31: 0.8288583509513741


 68%|██████▊   | 17/25 [00:36<00:17,  2.13s/it]

depth 33: 0.8288583509513741


 72%|███████▏  | 18/25 [00:38<00:14,  2.13s/it]

depth 35: 0.8288583509513741


 76%|███████▌  | 19/25 [00:40<00:12,  2.13s/it]

depth 37: 0.8288583509513741


 80%|████████  | 20/25 [00:42<00:10,  2.13s/it]

depth 39: 0.8288583509513741


 84%|████████▍ | 21/25 [00:45<00:08,  2.13s/it]

depth 41: 0.8288583509513741


 88%|████████▊ | 22/25 [00:47<00:06,  2.13s/it]

depth 43: 0.8288583509513741


 92%|█████████▏| 23/25 [00:49<00:04,  2.13s/it]

depth 45: 0.8288583509513741


 96%|█████████▌| 24/25 [00:51<00:02,  2.13s/it]

depth 47: 0.8288583509513741


100%|██████████| 25/25 [00:53<00:00,  2.14s/it]

depth 50: 0.8288583509513741





In [8]:
#  now we try on test data
loaded_model = pickle.load(open("rf_model", "rb"))
loaded_model.fit(X_train, y_train)
loaded_model.score(X_test, y_test)

0.8082191780821918

In [None]:
sns.set(rc={"figure.figsize":(12,5)})
sns.lineplot(x=depths,y=cv_scores, marker="o")
plt.xlabel("Forest depth")
plt.ylabel("Average 5-fold CV score")
plt.title("Determining RF depth via CV")
plt.savefig("rfs_cv.png")

## AdaBoost

In [9]:
# on a bunch of models, first do 5-fold CV, then choose "best" one, finally test model on test data
estimators = [1, 5, 15, 25, 50, 75]
ada_cv_scores = []
ada_models = []
for estimator in tqdm(estimators):
    abc = AdaBoostClassifier(n_estimators=estimator)    
    score = np.mean(cross_val_score(abc, X_train, y_train, cv=5))
    print("estimator " + str(estimator) + ": " + str(score))
    ada_cv_scores = ada_cv_scores + [score]
    ada_models = ada_models + [abc]
ada_optim_model = ada_models[np.argmax(ada_cv_scores)]
pickle.dump(ada_optim_model, open("ada_model", "wb"))

 17%|█▋        | 1/6 [00:06<00:33,  6.67s/it]

estimator 1: 0.8242071881606764


 33%|███▎      | 2/6 [00:32<01:12, 18.10s/it]

estimator 5: 0.8101479915433405


 50%|█████     | 3/6 [01:47<02:12, 44.11s/it]

estimator 15: 0.8747357293868921


 67%|██████▋   | 4/6 [03:51<02:30, 75.41s/it]

estimator 25: 0.8839323467230443


 83%|████████▎ | 5/6 [07:59<02:17, 137.74s/it]

estimator 50: 0.9025369978858351


100%|██████████| 6/6 [14:12<00:00, 142.07s/it]

estimator 75: 0.8932346723044396





In [10]:
#  now we try on test data
loaded_model = pickle.load(open("ada_model", "rb"))
loaded_model.fit(X_train, y_train)
loaded_model.score(X_test, y_test)

0.8767123287671232

In [None]:
a