In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

### Import preprocessed mRNA expression level profile and TNM staging details for each patient

Meaning of TMN cancer stages: 
https://www.cancer.gov/about-cancer/diagnosis-staging/staging

Idea behind this analysis is to try and test classification algorithms, to see if mRNA expression profile contains enough information for TMN cancer staging.

Each patient sample is assigned a stage for T, N, and M, indicating the stage of cancer progression for the primary tumor, whether the cancer has spread to lymph nodes, or if there is distant metastasis. 

In [None]:
mRNA_stage_df = pd.read_csv("mRNA_levels_cancer_stage.txt")

Separating X and Y features

In [None]:
mRNA_df = mRNA_stage_df.drop(["PATIENT_ID", "M_STAGE", "N_STAGE", "T_STAGE"], axis = 1)
N_stage = mRNA_stage_df["N_STAGE"].values
M_stage = mRNA_stage_df["M_STAGE"].values
T_stage = mRNA_stage_df["T_STAGE"].values
mRNA_df.head()

Unnamed: 0.1,Unnamed: 0,A1BG,A1CF,A1CF.1,A1CF.2,A1CF.3,A2LD1,A2LD1.1,A2M,A2ML1,A4GALT,A4GNT,AAAS,AAAS.1,AACS,AADAC,AADACL2,AADACL3,AADACL3.1,AADACL4,AADAT,AADAT.1,AAGAB,AAK1,AAMP,AANAT,AANAT.1,AARS,AARS2,AARSD1,AARSD1.1,AARSD1.2,AARSD1.3,AASDH,AASDHPPT,AASS,AATF,AATK,AATK.1,ABAT,...,ZSCAN18.1,ZSCAN18.2,ZSCAN18.3,ZSCAN2,ZSCAN2.1,ZSCAN2.2,ZSCAN20,ZSCAN21,ZSCAN22,ZSCAN23,ZSCAN29,ZSCAN30,ZSCAN30.1,ZSCAN4,ZSCAN5A,ZSCAN5B,ZSWIM1,ZSWIM2,ZSWIM3,ZSWIM4,ZSWIM5,ZSWIM6,ZSWIM7,ZSWIM7.1,ZUFSP,ZW10,ZWILCH,ZWINT,ZWINT.1,ZWINT.2,ZXDA,ZXDB,ZXDC,ZXDC.1,ZYG11A,ZYG11B,ZYX,ZYX.1,ZZEF1,ZZZ3
0,1,2.1932,-0.1078,-0.136,-0.2424,-0.2237,-0.7489,-0.2228,-1.0624,-0.4599,0.5182,-0.4939,0.8676,-0.9954,2.4721,-0.9421,-0.3942,-1.3825,-2.4736,-1.8556,-0.4129,-1.2897,0.4027,0.4582,1.3838,-0.3868,-0.1841,2.8035,0.0887,-1.0011,-0.6256,-1.0288,-0.6351,-0.7955,-0.8185,-1.0664,0.412,0.4122,-1.4868,-0.8146,...,1.8305,1.1972,1.2219,0.3238,-0.1784,-0.1947,1.4346,-1.4931,3.0452,0.7853,-0.1509,-0.246,1.3714,-0.4527,1.4072,0.5062,0.2234,-0.7314,-0.4031,-1.0053,3.2261,-0.8413,-0.254,-0.485,-1.9779,-0.5921,-0.2813,-0.0488,1.0234,1.9311,-0.361,-0.7975,2.1574,0.3231,-0.287,0.7711,-0.4013,-2.187,0.5752,1.7372
1,2,0.6506,-0.2093,-0.1355,-0.2424,-0.2237,-0.3498,-0.4392,-0.6533,-0.4599,1.1816,-0.4656,-0.457,-0.3944,0.8892,-0.74,-0.3942,-1.3825,-2.4736,-1.8556,0.8198,1.9559,0.4076,-0.6589,1.2138,-1.4208,-0.1778,-0.3303,0.0979,-0.0633,-0.572,-1.0288,-0.6352,0.5704,0.7716,-1.1267,1.2436,1.9421,2.3481,-0.9512,...,-0.5611,-0.3543,-0.1608,-1.9698,-0.1821,-0.428,-0.555,-0.3113,-0.8029,-0.2386,-1.0892,-0.9522,1.0471,-0.5828,-1.8275,-0.7843,-1.1866,0.8646,1.1539,-1.2877,0.4721,-1.3617,-0.8319,0.5386,1.0649,0.9843,0.3566,2.013,2.4538,1.1699,0.1536,0.3212,-1.1306,-0.0323,0.3615,-0.1175,-0.2012,0.7754,-1.3275,0.279
2,3,0.0698,-0.2539,-0.1404,-0.2424,-0.2237,-0.1526,3.0165,-0.4834,-0.4599,-0.0993,-1.0189,-1.0672,-0.8412,0.5574,-0.9421,-0.3942,0.1461,-0.9113,-1.8556,0.5457,0.2024,0.6843,1.8559,0.7741,0.3106,0.2212,-0.52,-0.315,-0.8948,1.3483,0.1411,-0.6352,-2.3327,-0.0571,-0.4561,-0.1005,1.4877,0.6089,0.0763,...,0.8959,0.7554,-0.4042,1.1274,1.399,2.0919,0.3766,-0.0104,0.2402,0.5753,0.1959,0.1196,1.05,-0.4622,0.7947,0.3235,1.2099,-0.4878,0.6603,0.2226,0.1249,0.2583,-1.922,-1.1985,-1.3681,-0.4272,1.4475,-0.7796,0.1694,0.3336,-0.2242,-0.2974,0.9671,0.3481,0.0919,1.3154,-0.0333,-0.1646,-0.1237,-0.6929
3,4,0.7134,-0.2692,-0.1378,-0.2424,-0.2236,-0.5624,-0.4932,0.2838,-0.4599,0.2084,-0.9007,-1.2652,-0.8102,-0.9537,-0.5492,-0.3942,-1.3825,-2.4736,-1.8556,-0.4778,-0.1564,0.3517,-0.1247,-0.3633,-1.4208,-0.1993,-0.6659,0.5792,2.3373,-0.6256,-1.0288,-0.6352,-0.1186,1.6809,-0.8189,0.9866,-1.2695,-1.1678,-0.8862,...,0.382,0.0918,-0.3456,0.1891,0.3572,-0.2044,-0.5732,0.5979,0.1677,-0.2296,-0.0679,-2.129,-0.5531,-0.2512,-0.3776,-1.1377,-1.2387,-0.0699,-0.9347,-0.2334,0.4008,-1.3862,0.8607,0.3867,-0.3005,0.138,-0.158,0.2439,0.2344,0.3252,1.4015,2.1052,-0.1877,-0.9042,-0.6519,-0.0488,0.3487,-0.065,-0.4777,0.7618
4,5,0.4841,-0.3077,-0.1463,-0.2424,-0.2237,-0.0745,-0.6359,0.7415,-0.4599,-0.8501,-1.2229,0.9368,0.7829,-0.7958,0.9748,-0.3567,-1.3825,-2.4736,-1.8556,-0.1455,0.3675,1.8164,-0.3934,1.2561,-0.584,-0.1594,1.0826,0.2447,0.182,-0.6256,-1.0287,-0.6352,1.2247,0.62,-0.0418,-0.6588,0.1631,1.3545,-1.2004,...,0.9198,1.026,0.937,0.8617,-0.2778,0.1761,0.2619,1.3396,0.1745,0.8796,1.0738,0.1469,0.6825,0.1471,1.0386,1.1796,-0.4296,-0.933,1.3948,0.0981,0.3879,1.0778,-0.8023,0.4182,0.5022,0.5231,1.8318,0.2996,0.6521,0.7297,0.5347,-0.0776,0.2938,-0.1048,-0.4315,0.1188,0.7458,0.5608,-0.0328,-0.2793


### KNN classifier based on lymph node stage (N_stage)

CV scores not great for lymph node staging. 

In [27]:
X_train, X_test, y_train, y_test = train_test_split(mRNA_df, N_stage)

In [28]:
knn_classify = KNeighborsClassifier(n_neighbors = 5)
knn_classify.fit(X_train, y_train)

print("KNN classifier, n_neighbors = 5, KFold CV:")
print("Mean training set CV score:\n{:.3f}".format(
    np.mean(cross_val_score(knn_classify, X_train, y_train))))
print("Test set CV score:\n{:.3f}".format(
    np.mean(cross_val_score(knn_classify, X_test, y_test))))


KNN classifier, n_neighbors = 5, KFold CV:




Mean training set CV score:
0.353




Test set CV score:
0.583


GridSearchCV for KNN classifier based on lymph node stage

In [30]:
# stratified kfold
skfold = StratifiedKFold(n_splits = 5, shuffle = True)
# kfold with shuffling
kfold = KFold(n_splits = 5, shuffle = True)

knn_param_grid = {'n_neighbors': np.arange(1, 10, 2)}

knn_k_fold_grid = GridSearchCV(KNeighborsClassifier(), 
                        param_grid = knn_param_grid, 
                        cv = kfold)

knn_sk_fold_grid = GridSearchCV(KNeighborsClassifier(), 
                        param_grid = knn_param_grid, 
                        cv = skfold)

knn_k_fold_grid.fit(X_train, y_train)
knn_sk_fold_grid.fit(X_train, y_train)

print("KNN Classifier, lymph node stage, k-fold:")
print("Best mean CV score: {:.3f}".format(knn_k_fold_grid.best_score_))
print("Best parameters: {}".format(knn_k_fold_grid.best_params_))
print("Test-set score: {:.3f}".format(knn_k_fold_grid.score(X_test, y_test)))
print("\n")
print("KNN Classifier, lymph node stage, stratified k-fold:")
print("Best mean CV score: {:.3f}".format(knn_sk_fold_grid.best_score_))
print("Best parameters: {}".format(knn_sk_fold_grid.best_params_))
print("Test-set score: {:.3f}".format(knn_sk_fold_grid.score(X_test, y_test)))



KNN Classifier, lymph node stage, k-fold:
Best mean CV score: 0.427
Best parameters: {'n_neighbors': 7}
Test-set score: 0.389


KNN Classifier, lymph node stage, stratified k-fold:
Best mean CV score: 0.518
Best parameters: {'n_neighbors': 9}
Test-set score: 0.389


### KNN classifier based on Metastasis stage (M_stage)

- Good CV scores

In [24]:
X_train, X_test, y_train, y_test = train_test_split(mRNA_df, M_stage)

In [25]:
knn_classify = KNeighborsClassifier(n_neighbors = 5)
knn_classify.fit(X_train, y_train)

print("KNN classifier, n_neighbors = 5, KFold CV:")
print("Mean training set CV score:\n{:.3f}".format(
    np.mean(cross_val_score(knn_classify, X_train, y_train))))
print("Test set CV score:\n{:.3f}".format(
    np.mean(cross_val_score(knn_classify, X_test, y_test))))


KNN classifier, n_neighbors = 5, KFold CV:
Mean training set CV score:
0.871




Test set CV score:
0.950


This possibly means the mRNA expression profile is adequete for predicting whether the cancer has metastasized. 

Will we have better performance after feature selection? 

GridSearchCV for KNN classifier based on Metastasis stage

- comparing two CV strategies, stratified kfold and kfold (both with shuffling)
- Best parameter setting is 3 neighbors. 

In [26]:
# stratified kfold
skfold = StratifiedKFold(n_splits = 5, shuffle = True)
# kfold with shuffling
kfold = KFold(n_splits = 5, shuffle = True)

knn_param_grid = {'n_neighbors': np.arange(1, 10, 2)}

knn_k_fold_grid = GridSearchCV(KNeighborsClassifier(), 
                        param_grid = knn_param_grid, 
                        cv = kfold)

knn_sk_fold_grid = GridSearchCV(KNeighborsClassifier(), 
                        param_grid = knn_param_grid, 
                        cv = skfold)

knn_k_fold_grid.fit(X_train, y_train)
knn_sk_fold_grid.fit(X_train, y_train)

print("KNN Classifier, Metastasis stage, k-fold:")
print("Best mean CV score: {:.3f}".format(knn_k_fold_grid.best_score_))
print("Best parameters: {}".format(knn_k_fold_grid.best_params_))
print("Test-set score: {:.3f}".format(knn_k_fold_grid.score(X_test, y_test)))
print("\n")
print("KNN Classifier, Metastasis stage, stratified k-fold:")
print("Best mean CV score: {:.3f}".format(knn_sk_fold_grid.best_score_))
print("Best parameters: {}".format(knn_sk_fold_grid.best_params_))
print("Test-set score: {:.3f}".format(knn_sk_fold_grid.score(X_test, y_test)))

KNN Classifier, Metastasis stage, k-fold:
Best mean CV score: 0.891
Best parameters: {'n_neighbors': 3}
Test-set score: 0.944


KNN Classifier, Metastasis stage, stratified k-fold:
Best mean CV score: 0.871
Best parameters: {'n_neighbors': 3}
Test-set score: 0.944


### KNN classifier based on primary tumor stage (T_stage)

CV scores not great for primary tumor staging. 

In [18]:
X_train, X_test, y_train, y_test = train_test_split(mRNA_df, T_stage)

In [20]:
knn_classify = KNeighborsClassifier(n_neighbors = 5)
knn_classify.fit(X_train, y_train)

print("KNN classifier, n_neighbors = 5, KFold CV:")
print("Mean training set CV score:\n{:.3f}".format(
    np.mean(cross_val_score(knn_classify, X_train, y_train))))
print("Test set CV score:\n{:.3f}".format(
    np.mean(cross_val_score(knn_classify, X_test, y_test))))

KNN classifier, n_neighbors = 5, KFold CV:
Mean training set CV score:
0.480




Test set CV score:
0.333


GridSearchCV for KNN classifier based on primary tumor stage (T_stage)

In [22]:
# stratified kfold
skfold = StratifiedKFold(n_splits = 5, shuffle = True)
# kfold with shuffling
kfold = KFold(n_splits = 5, shuffle = True)

knn_param_grid = {'n_neighbors': np.arange(1, 10, 2)}

knn_k_fold_grid = GridSearchCV(KNeighborsClassifier(), 
                        param_grid = knn_param_grid, 
                        cv = kfold)

knn_sk_fold_grid = GridSearchCV(KNeighborsClassifier(), 
                        param_grid = knn_param_grid, 
                        cv = skfold)

knn_k_fold_grid.fit(X_train, y_train)
knn_sk_fold_grid.fit(X_train, y_train)

print("KNN Classifier, primary tumor stage, k-fold:")
print("Best mean CV score: {:.3f}".format(knn_k_fold_grid.best_score_))
print("Best parameters: {}".format(knn_k_fold_grid.best_params_))
print("Test-set score: {:.3f}".format(knn_k_fold_grid.score(X_test, y_test)))
print("\n")
print("KNN Classifier, primary tumor stage, stratified k-fold:")
print("Best mean CV score: {:.3f}".format(knn_sk_fold_grid.best_score_))
print("Best parameters: {}".format(knn_sk_fold_grid.best_params_))
print("Test-set score: {:.3f}".format(knn_sk_fold_grid.score(X_test, y_test)))

KNN Classifier, primary tumor stage, k-fold:
Best mean CV score: 0.478
Best parameters: {'n_neighbors': 1}
Test-set score: 0.222


KNN Classifier, primary tumor stage, stratified k-fold:
Best mean CV score: 0.482
Best parameters: {'n_neighbors': 1}
Test-set score: 0.222
