In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.feature_selection import f_classif, SelectKBest, f_regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

In [46]:
# Load the three input tables. GDSC_info and GDSC_data are merged into the
# training set further down; CCLE_PDX supplies the test features.
GDSC_info, GDSC_data, CCLE_PDX = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/hw2-pdx/GDSC_PDX_Paclitaxel_info.csv",
        "../input/hw2-pdx/GDSC_PDX_Paclitaxel.csv",
        "../input/hw2-pdx/CCLE_PDX_Paclitaxel.csv",
    )
)

# Data cleaning

In [49]:
# Drop the leading column of the info table, but only on the first run of this
# cell: once "label" exists the column has already been removed, and slicing
# again on a re-run would silently discard a real column.
if "label" not in GDSC_info.columns:
    GDSC_info = GDSC_info.iloc[:, 1:]

# Label a cell line "S" when IC50 is below MAX_CONC_MICROMOLAR, otherwise "R".
# np.where maps the boolean mask directly instead of relying on an `is True`
# identity check against every element.
GDSC_info["label"] = np.where(
    GDSC_info["IC50"] < GDSC_info["MAX_CONC_MICROMOLAR"], "S", "R"
)
print(GDSC_info.columns)
print(GDSC_info.head())

In [50]:
# Rename only the first column to "CELL_LINE_NAME" (the key used for the merge
# with GDSC_info); every other column name is kept as it is.
correct_cols = ["CELL_LINE_NAME", *GDSC_data.columns[1:]]
GDSC_data.columns = correct_cols

print(GDSC_data.columns)
GDSC_data.head()

In [51]:
# Join expression data with the info table on the cell-line name (inner join,
# the pandas default), split off the target column, then drop the remaining
# non-feature columns.
X = GDSC_data.merge(GDSC_info, on="CELL_LINE_NAME")
y = X.pop("label")
X = X.drop(columns=["MAX_CONC_MICROMOLAR", "IC50", "CELL_LINE_NAME"])

# Feature selection - RF

In [7]:
# Hyper-parameter grid for the random-forest grid search.

# Number of trees: 5 evenly spaced values from 100 to 250, truncated to int
n_estimators = np.linspace(start=100, stop=250, num=5).astype(int).tolist()
# Maximum tree depth: 5, 6, ..., 10
max_depth = np.linspace(5, 10, num=6).astype(int).tolist()
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

params_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    oob_score=[True],
)
print(params_grid)

In [84]:
# Exhaustive 5-fold grid search over params_grid, ranking candidates by ROC AUC
# and using every available core.
RFC = RandomForestClassifier(random_state=201)
RFC_CV = GridSearchCV(
    estimator=RFC,
    param_grid=params_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
RFC_CV.fit(X, y)

In [85]:
# The grid search in this section is scored with 'roc_auc', so best_score_ and
# RFC_CV.score(...) are both ROC AUC values rather than accuracies; the labels
# below say so.
print("Best estimators after Cross Validation is")
print(RFC_CV.best_params_)
# Scale to percent before rounding so the printed number carries no float noise
print(f"with cross validation ROC AUC = {round(RFC_CV.best_score_ * 100, 2)}%")
print(f"Training ROC AUC after refit= {round(RFC_CV.score(X, y) * 100, 4)}%")

In [86]:
# Rank every feature by the importance assigned by the best random forest and
# list the top `num_selected_features` names.
num_selected_features = 20
RF_df = pd.DataFrame(
    {
        "feature_names": X.columns,
        "feature_importances": RFC_CV.best_estimator_.feature_importances_,
    }
).sort_values(by="feature_importances", ascending=False)
print(f"Top {num_selected_features} selected by RF classification are")
# Slice with the variable so the printed list always matches the headline count
print(RF_df.head(num_selected_features)["feature_names"].tolist())

In [87]:
# Keep every feature whose importance is strictly above the cut-off.
threshold = 0.0015
features = RF_df.loc[RF_df["feature_importances"] > threshold, "feature_names"].tolist()
print(f"There are {len(features)} features selected by threshold {threshold}")
print(features)

In [88]:
# Plot importances in the order stored in RF_df (sorted descending earlier)
# against their rank position.
x = list(range(X.shape[1]))
fig, ax = plt.subplots()
ax.plot(x, RF_df["feature_importances"])
ax.set_xlabel("Features sorted by features importance")
ax.set_ylabel("Feature importance")
plt.show()

# Selecting features and data preprocessing

In [8]:
# Hard-coded gene lists, one per feature-selection method, plus a set view of
# each list for the union / intersection computed in the next cell.
L1_features = [
    'ANKFY1', 'C2orf68', 'CEP128', 'CHST2', 'GABPA', 'GYS2', 'HDAC1', 'ITGA4',
    'JARID2', 'MDM1', 'OLFML2A', 'PPP3CB', 'PRIM2', 'PRPF4B', 'PTGES3', 'RAP2B',
    'SMC6', 'TIMELESS', 'TRDMT1', 'TRIM25', 'TSLP', 'UBE2G1', 'ZNF318',
]
Anova_features = [
    'ABCB1', 'DOK4', 'SEZ6L2', 'PRPF4B', 'TBX3', 'RGS5', 'GPR22', 'COLEC11',
    'SLC6A2', 'TFAP2B', 'ASTN2', 'C1QL4', 'PRIM2', 'PCSK1N', 'TRIM67', 'PNMA3',
    'PODXL2', 'PHYHIPL', 'CRH', 'TMEM59L',
]
RF_features = [
    'ACTL7A', 'ALDH18A1', 'ABCB1', 'GYS2', 'KERA', 'TACR2', 'GPR22', 'THY1',
    'FOXR1', 'C17orf64', 'MRPL14', 'NACC1', 'HSPA8', 'RBX1', 'MAP7D2', 'ATF1',
    'PLCD4', 'DNAH10', 'CRH', 'HEATR4', 'ARHGDIA', 'RCC2', 'UCN3', 'FGF16',
    'TMBIM6', 'PRRT4', 'SNCG', 'TRIM25', 'PRPSAP1', 'RECQL', 'TMEM203', 'VASP',
    'PTOV1', 'ESPN', 'SLC7A11', 'LGALSL', 'IRAK1BP1', 'POLD2',
]
L1_set, Anova_set, RF_set = (
    set(names) for names in (L1_features, Anova_features, RF_features)
)

In [59]:
# Features picked by at least one method vs. features picked by all three.
union_features = set().union(L1_set, Anova_set, RF_set)
intersect_features = L1_set.intersection(Anova_set, RF_set)
print(f"Number of union features = {len(union_features)}")
print(f"Number of intersect features = {len(intersect_features)}")

In [60]:
#remove two features not in testing dataset
# Remove the two features not in the testing dataset. difference_update is a
# no-op when they are already gone, so re-running this cell no longer raises
# KeyError the way set.remove() does.
union_features.difference_update({'ZNF318', 'TMEM203'})

# Fix the column order: iterating a set of strings can give a different order
# in every kernel session, which would change the feature layout seen by the
# seeded models below from run to run.
selected_features = sorted(union_features)
X_train = X[selected_features]
X_test = CCLE_PDX[selected_features]

In [63]:
# Learn the scaling statistics from the training features only, then apply the
# same transformation to both the training and the test matrix.
SS = StandardScaler().fit(X_train)
X_train, X_test = SS.transform(X_train), SS.transform(X_test)

In [78]:
# Show the dimensions of the scaled test matrix.
X_test.shape

# Level One Model
- Ridge (L2-penalized logistic regression)
- Random Forest
- XGBoost
- Support Vector Machine
- KNN

### Ridge (L2-penalized logistic regression)

In [134]:
# L2-penalised logistic regression; the regularisation strength C is chosen by
# the estimator's built-in 5-fold cross-validation.
clf_LG = LogisticRegressionCV(cv=5, penalty="l2", random_state=5080)
clf_LG.fit(X_train, y)
acc = clf_LG.score(X_train, y)

print("Best model hyperparameter is")
print(clf_LG.C_)
print(f"Training acc of best model = {round(acc * 100, 4)}%")
print("Cross Validation result in each parameter sets")
# scores_["S"] holds one row per fold and one column per candidate C;
# averaging over axis 0 gives the mean CV score of each C.
pd.DataFrame(
    {
        "C": clf_LG.Cs_,
        "Cross valid acc": clf_LG.scores_["S"].mean(axis=0),
    }
)

### Random Forest

In [129]:
# Hyper-parameter grid for the level-one random-forest search.

# Number of trees: 10 evenly spaced values from 100 to 500, truncated to int
n_estimators = np.linspace(start=100, stop=500, num=10).astype(int).tolist()
# Maximum tree depth: 5, 6, ..., 10
max_depth = np.linspace(5, 10, num=6).astype(int).tolist()
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

params_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    oob_score=[True],
)

In [135]:
# 5-fold grid search over params_grid on the scaled, selected features. No
# `scoring` is passed, so candidates are ranked by the classifier's default
# score (accuracy).
RFC = RandomForestClassifier(random_state=201)
RFC_CV = GridSearchCV(
    estimator=RFC,
    param_grid=params_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
RFC_CV.fit(X_train, y)

In [137]:
# Report the winning hyper-parameters with their cross-validated and training
# accuracy, both as percentages.
print("Best estimators after Cross Validation is")
print(RFC_CV.best_params_)
# Scale to percent before rounding; rounding first and multiplying afterwards
# can print values such as 70.19999999999999
print(f"with cross validation acc = {round(RFC_CV.best_score_ * 100, 2)}%")
print(f"Training accuracy after refit= {round(RFC_CV.score(X_train, y) * 100, 4)}%")

### SVM

In [13]:
# Search space for the SVC: regularisation strength, kernel coefficient and
# kernel type.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf', 'poly', 'sigmoid'],
)

In [14]:
# Grid search over the SVC parameter grid; refit=True retrains the best
# configuration on all of X_train once the search finishes.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    refit=True,
    verbose=1,
)
grid.fit(X_train, y)

In [15]:
# Show the SVC refit with the best parameter combination found by the search.
print(grid.best_estimator_)

### KNN

In [30]:
# Candidate neighbour counts: 3, 6, 9, ... up to (but excluding) the rounded
# square root of the number of training samples.
k_max = round(np.sqrt(len(X_train)))
k_range = [*range(3, k_max, 3)]
param_grid = dict(
    n_neighbors=k_range,
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan"],
)

In [31]:
# 5-fold grid search over the KNN parameter grid on all available cores.
KNN = KNeighborsClassifier()
GS = GridSearchCV(
    estimator=KNN,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y)

In [32]:
# Training accuracy of the best KNN. The search was fitted on X_train, so it is
# scored on the same matrix; `X_selected_SS` is not defined anywhere in this
# notebook and raised NameError on a fresh kernel.
GS.best_estimator_.score(X_train, y)

In [26]:
# Show the KNN classifier refit with the best parameter combination found.
print(GS.best_estimator_)

# Stacking model

In [72]:
# Level-one learners with hard-coded hyper-parameters.
# NOTE(review): these values presumably come from the searches above
# (e.g. 188 is one of the n_estimators candidates) -- confirm they are current.
Ridge = LogisticRegression(penalty="l2", C=0.04641589)
RF = RandomForestClassifier(
    n_estimators=188,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=1,
    oob_score=True,
    random_state=201,
)
SVM = SVC(kernel='sigmoid', C=1, gamma=0.01)
KNN = KNeighborsClassifier(n_neighbors=3, metric='manhattan')

# (name, estimator) pairs in the order expected by StackingClassifier
base_learners = list(
    zip(("Ridge", "RF", "SVM", "KNN"), (Ridge, RF, SVM, KNN))
)

# A default logistic regression combines the base learners' 10-fold
# out-of-fold predictions.
clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
    cv=10,
    n_jobs=-1,
    verbose=1,
)
clf.fit(X_train, y)

In [73]:
# Mean accuracy of the fitted stacking classifier on the data it was trained
# on (not a held-out estimate).
clf.score(X_train, y)

In [70]:
# Number of training samples per label ("S" / "R").
y.value_counts()

In [82]:
# Fit the L2-penalised logistic regression on its own, print its training
# accuracy, and display its predictions for the test matrix.
Ridge = LogisticRegression(penalty="l2", C=0.04641589).fit(X_train, y)
print(Ridge.score(X_train, y))
Ridge.predict(X_test)

In [71]:
# Number of test samples predicted per label. `y_pred` is never assigned in
# this notebook (NameError on a fresh kernel), so the predictions are taken
# from the Ridge model fitted in the previous cell.
# NOTE(review): confirm Ridge is the model whose predictions were meant here.
pd.Series(Ridge.predict(X_test)).value_counts()