In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas
import numpy as np
import random

#read data csv
expression_data = pandas.read_csv("GSE89843_TEP_Count_Matrix.txt", header=0, sep=" ")
expression_data = expression_data.iloc[:,1:].T

patient_data = pandas.read_csv("PatientData.csv", header=0, sep="\t")
patient_group = patient_data.loc[:,["GEO ID (GSE89843)", "Classification group"]]
patient_group.set_index("GEO ID (GSE89843)", inplace=True)

merged_data = pandas.concat([expression_data, patient_group], axis=1, join="inner")
recoded_y = [1 if i == "NSCLC" else 0 for i in merged_data["Classification group"]]
merged_data.drop(columns=["Classification group"], inplace=True)

in_data = merged_data.to_numpy()
out_data = np.array(recoded_y)

def tvt_split(x, y, train_size, val_split, random_seed):
    rng = np.random.default_rng(seed=random_seed)
    rng.shuffle(x)
    rng = np.random.default_rng(seed=random_seed)
    rng.shuffle(y)
    
    train_boundary = int(train_size * len(x))
    valid_boundary = int(train_boundary + (len(x) - train_boundary) * val_split)
    
    train_x = x[0:train_boundary]
    train_y = y[0:train_boundary]
    valid_x = x[train_boundary:valid_boundary]
    valid_y = y[train_boundary:valid_boundary]
    test_x = x[valid_boundary:]
    test_y = y[valid_boundary:]
    
    return train_x, valid_x, test_x, train_y, valid_y, test_y 

#test-train split
x_train, x_val, x_test, y_train, y_val, y_test = tvt_split(in_data, out_data, train_size=0.6, val_split=0.5, random_seed=333)

#fit random forest
best_validation_score = 0.0
best_hyperparameters = {
    "n_estimators": 8,
    "max_features": "sqrt",
    "criterion": "gini",
    "class_weight": None
}

grid_options = {
    "n_estimators": [8, 16, 32, 64, 128],
    "max_features": ["sqrt", "log2", None],
    "criterion": ["gini", "entropy"],
    "class_weight": [None, "balanced", "balanced_subsample"]
}

for key in grid_options:
    for i in grid_options[key]:
        
        hyperparameters = best_hyperparameters.copy()
        hyperparameters[key] = i

        model = RandomForestClassifier(n_estimators=hyperparameters["n_estimators"],
                                       criterion=hyperparameters["criterion"],
                                       max_features=hyperparameters["max_features"],
                                       random_state=333,
                                       class_weight=hyperparameters["class_weight"])
        model.fit(x_train, y_train)

        hyperparameters_score = model.score(x_val, y_val)

        if hyperparameters_score > best_validation_score:
            best_validation_score = hyperparameters_score
            best_hyperparameters[key] = i

#evaluate random forest
model = RandomForestClassifier(n_estimators=best_hyperparameters["n_estimators"],
                               criterion=best_hyperparameters["criterion"],
                               max_features=best_hyperparameters["max_features"],
                               random_state=333,
                               class_weight=best_hyperparameters["max_features"])
model.fit(x_train, y_train)

print(model.score(x_test, y_test))
print(best_hyperparameters)

0.8205128205128205
{'n_estimators': 128, 'max_features': None, 'criterion': 'gini', 'class_weight': 'balanced_subsample'}
