In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.svm import SVC

In [2]:
# trunc_illuminaga_rna_data has the same illuminaga RNA expression data as that downloaded from 
# class project website, though the keys in the header of trunc_illuminaga_rna_data.tsv now match
# patient.bcr_patient_barcode from the clinical data, which gives use better mapping between the
# clinical patients dataset and the RNA dataset.
gene_exp = pd.read_table('data/trunc_illuminahiseq_rna_data.tsv',
                                     header=0,
                                     index_col=0)

### Preprocessing and getting variances of gene across all patients

In [3]:
log_gene_exp_df = np.log(gene_exp.copy())

In [4]:
log_gene_exp_df.replace([np.inf, -np.inf], np.nan, inplace=True)  # must replace all -inf with NaN so that .min() will work.
min_fpkm_per_patient = list(log_gene_exp_df.min())
min_fpkm = min(min_fpkm_per_patient)
fpkm_floor = math.floor(min_fpkm)
log_gene_exp_df.replace(np.nan, fpkm_floor, inplace=True)

In [5]:
log_gene_exp_var_df = log_gene_exp_df.copy()
log_gene_exp_var_df['var'] = log_gene_exp_var_df.var(axis=1)
log_gene_exp_var_df = log_gene_exp_var_df.sort_values(by='var', ascending=False)
variance_df = log_gene_exp_var_df[['var']]
sorted_log_gene_exp_df = log_gene_exp_var_df.ix[:, :-1]
variance_list = [[x] for x in list(variance_df['var'])]

### Clustering

In [6]:
from sklearn.cluster import KMeans

In [7]:
# kmeans = KMeans(init='k-means++', n_clusters=100, n_init=10)
kmeans = KMeans(init='k-means++', n_clusters=10000, n_init=10)

In [8]:
kmeans.fit(variance_list)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=10000,
    n_init=10, n_jobs=1, precompute_distances='auto', random_state=None,
    tol=0.0001, verbose=0)

In [9]:
kmeans.labels_

array([ 59, 623,  15, ...,  73,  73,  73], dtype=int32)

In [10]:
def get_cluster_idx_dict(k_means_results):
    cluster_dict = {}
    for cluster_num in set(k_means_results.labels_):
        cluster_dict[cluster_num] = []
    for variance_idx in range(0, len(k_means_results.labels_)):
        cluster_num = k_means_results.labels_[variance_idx]
        cluster_dict[cluster_num].append(variance_idx)
    return cluster_dict

In [11]:
cluster_idx_dict = get_cluster_idx_dict(kmeans)

### Take PCA of gene clusters

In [12]:
# asdf = {}
# for key, values in cluster_idx_dict.items():
#     asdf[key] = []
#     for value in values:
#         if list(variance_df['var'])[value] > 30.0:
#             print(str(key)+"\n"+str(list(variance_df['var'])[value]))
#         asdf[key].append(list(variance_df['var'])[value])

# for key in asdf.keys():
#     print(str(key) + "\n" + str(asdf[key][1]) + "\t" + str(asdf[key][-1]))

In [13]:
# !!!This is really slow (minutes): there has got to be a way to return multiple rows from a df simultaneously with a list of indeces.
def get_cluster_df(cluster_index_list, gene_exp_df):
    df = pd.DataFrame()
    for idx in cluster_index_list:
        df = df.append(gene_exp_df.iloc[idx])
    return df

In [14]:
# cluster_gene_exp_df_dict = {}
# for cluster_key, cluster_gene_idx_list in cluster_idx_dict.items():
#     cluster_gene_exp_df_dict[cluster_key] = get_cluster_df(cluster_idx_dict[cluster_key], sorted_log_gene_exp_df)

pca = PCA(n_components=1)
# pca.fit(sorted_log_gene_exp_df.transpose())
cluster_gene_exp_pca_dict = {}
for cluster_key, cluster_gene_idx_list in cluster_idx_dict.items():
    cluster_df = get_cluster_df(cluster_idx_dict[cluster_key], sorted_log_gene_exp_df)
    # insert scaling below before PCA to see if this enhances classification performance.
    cluster_gene_exp_pca_dict[cluster_key] = pca.fit_transform(scale(cluster_df.transpose()))
#     cluster_gene_exp_pca_dict[cluster_key] = pca.transform(cluster_df.transpose())

In [15]:
cluster_pca_patient_df = pd.DataFrame()
for cluster_key in cluster_gene_exp_pca_dict.keys():
    cluster_patient_pca_dict = {}
    for idx in range(0, len(sorted_log_gene_exp_df.columns.values)):
        patient = sorted_log_gene_exp_df.columns.values[idx]
        pca_exp = cluster_gene_exp_pca_dict[cluster_key][idx][0]
        cluster_patient_pca_dict[patient] = pca_exp
    cluster_pca_patient_df = cluster_pca_patient_df.append(cluster_patient_pca_dict, ignore_index=True)
cluster_pca_patient_df

Unnamed: 0,TCGA-3L-AA1B,TCGA-4N-A93T,TCGA-4T-AA8H,TCGA-5M-AAT4,TCGA-5M-AAT5,TCGA-5M-AAT6,TCGA-5M-AATA,TCGA-5M-AATE,TCGA-A6-2671,TCGA-A6-2675,...,TCGA-QG-A5YV,TCGA-QG-A5YW,TCGA-QG-A5YX,TCGA-QG-A5Z1,TCGA-QG-A5Z2,TCGA-QL-A97D,TCGA-RU-A8FL,TCGA-SS-A7HO,TCGA-T9-A92H,TCGA-WS-AB45
0,-1.889674,-1.889674,-1.889674,0.893596,-0.102498,0.363660,0.428492,-1.889674,0.384869,0.171930,...,-1.889674,-1.889674,-1.889674,-1.889674,-0.086443,-0.206332,0.447000,0.138800,-1.889674,-1.889674
1,1.058762,1.961845,0.458783,0.949521,2.127280,-0.943269,0.256520,0.647792,-0.275091,-3.397365,...,2.149114,0.584426,2.066127,1.572017,-0.170506,1.559716,-0.997846,0.931365,0.671732,2.582814
2,0.727968,-0.036701,-0.852154,0.619169,0.259729,-0.025120,0.804763,0.876629,0.007869,-0.494059,...,0.381834,0.796650,0.200957,0.398247,0.233881,0.369281,-0.542983,0.638315,0.552395,1.123075
3,0.686536,-0.034740,0.709893,0.628460,0.679755,1.414341,-0.385568,0.768572,-0.073508,-1.761337,...,0.022995,-0.233516,0.779996,-1.761337,0.813279,0.787496,1.013707,0.352787,0.733773,-0.135214
4,-1.931400,0.446669,0.733146,0.785080,0.644602,0.362887,1.022221,0.872608,-0.916751,-0.892110,...,0.583937,0.917455,0.261157,0.900308,-0.238831,0.984702,0.375792,0.594104,1.178251,1.488263
5,-0.542956,-0.542956,1.660746,-0.542956,1.777909,2.156083,-0.542956,-0.542956,2.139617,1.555836,...,-0.542956,-0.542956,-0.542956,-0.542956,-0.542956,-0.542956,-0.542956,-0.542956,-0.542956,-0.542956
6,-1.248809,-1.248809,0.638075,-0.165596,0.154871,-1.248809,-0.182198,-1.248809,1.740623,1.594874,...,-0.011035,-0.209886,-1.248809,0.385410,-1.248809,-1.248809,-1.248809,-1.248809,-1.248809,-1.248809
7,-1.032136,0.588914,-1.032136,0.552893,-1.032136,-1.032136,0.528600,0.696474,1.356093,1.656844,...,-1.032136,0.488086,-1.032136,-1.032136,-1.032136,-1.032136,1.240670,-1.032136,-1.032136,0.937231
8,-1.478255,0.435344,1.058734,0.666554,1.058970,-1.702807,-0.547548,1.348319,-0.532874,-0.978540,...,0.914510,1.305216,0.188655,0.598838,-0.629258,2.016586,-0.590023,-0.484085,0.659702,0.111546
9,0.198103,-0.586780,-0.370532,-0.265091,-0.272118,-1.201708,-0.024915,-0.032688,1.842635,2.648844,...,-1.716376,-0.809119,-0.643657,-0.462641,-0.472761,-0.480839,1.859573,-1.649891,0.051632,1.931793


## Classification

### Get features and labels

In [16]:
clinical_data_df = pd.read_table('data/clinical/COADREAD.clin.merged.txt', index_col=0)

In [17]:
# Don't try to print all contents of patient_dict; too big! Will freeze browser.
patient_dict = {}
patient_dict['colon'] = {}
patient_dict['rectum'] = {}

tumor_tissue_site_nan_count = 0
patient_rna_exp_barcode_nan_count = 0
patient_rna_exp_barcode_not_in_rna_dataset_count = 0

column_header_list = list(clinical_data_df.columns.values)
for column_header in column_header_list:

    tumor_tissue_site = str(clinical_data_df.loc["patient.tumor_tissue_site"][column_header])
    if tumor_tissue_site != "nan":  # We only want patients which have a label.
        
        bcr_patient_barcode = clinical_data_df.loc["patient.bcr_patient_barcode"][column_header]
        bcr_patient_barcode = bcr_patient_barcode.upper()
        
        if bcr_patient_barcode in cluster_pca_patient_df.keys():
            patient_exp_list = list(cluster_pca_patient_df[bcr_patient_barcode])
            patient_dict[tumor_tissue_site][bcr_patient_barcode] = patient_exp_list
            
    # investigating quality of my data mapping
    else:
        tumor_tissue_site_nan_count += 1
        
print(tumor_tissue_site_nan_count)

4


#### Balancing training set between colon and rectum tissue.

In [18]:
print(len(patient_dict['rectum']))
print(len(patient_dict['colon']))

95
298


In [19]:
rectum_dict = patient_dict['rectum']

training_patient_list = list(rectum_dict.keys())[:60]
testing_patient_list = list(rectum_dict.keys())[60:]

training_feature_list = list(rectum_dict.values())[:60]
training_label_list = ['rectum' ] * len(training_feature_list)

testing_feature_list = list(rectum_dict.values())[60:]
testing_label_list = ['rectum'] * len(testing_feature_list)


colon_dict = patient_dict['colon']

training_patient_list += list(colon_dict.keys())[:60]
testing_patient_list += list(colon_dict.keys())[60:]

training_feature_list += list(colon_dict.values())[:60]
training_label_list += ['colon' ] * len(list(colon_dict.values())[:60])

testing_feature_list += list(colon_dict.values())[60:]
testing_label_list += ['colon'] * len(list(colon_dict.values())[60:])

print(len(training_feature_list))
print(len(training_patient_list))

print(len(testing_feature_list))
print(len(testing_patient_list))

120
120
273
273


### Execute classification

In [20]:
svm_classifier = SVC()
svm_classifier.fit(training_feature_list, training_label_list)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
training_predictions = svm_classifier.predict(training_feature_list)
testing_predictions = svm_classifier.predict(testing_feature_list)

In [22]:
testing_predictions

array(['rectum', 'rectum', 'rectum', 'rectum', 'colon', 'rectum', 'colon',
       'rectum', 'rectum', 'colon', 'rectum', 'rectum', 'rectum', 'rectum',
       'colon', 'rectum', 'rectum', 'rectum', 'rectum', 'colon', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'colon', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'colon', 'rectum', 'rectum', 'rectum', 'rectum', 'colon',
       'colon', 'rectum', 'rectum', 'colon', 'rectum', 'colon', 'colon',
       'colon', 'rectum', 'colon', 'colon', 'rectum', 'colon', 'rectum',
       'colon', 'rectum', 'colon', 'rectum', 'rectum', 'colon', 'colon',
       'colon', 'rectum', 'rectum', 'rectum', 'colon', 'rectum', 'colon',
       'colon', 'rectum', 'colon', 'rectum', 'colon', 'colon', 'rectum',
       'rectum', 'rectum', 'colon', 'rectum', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'rectum', 'colon', 'rectum', 'rectum',
       'rectum', 'colon', 'colon', 'rectum',

In [23]:
np.array(testing_label_list)

array(['rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colo

### Investigate Classification Statistics

In [24]:
from sklearn import metrics

In [25]:
metrics.accuracy_score(testing_label_list, testing_predictions)

0.49450549450549453

In [26]:
# Essentially returns the Recall
def get_class_accuracy(class_name, label_list, predictions):
    correct_prediction_count = 0
    class_count = 0
    for index in range(0, len(label_list)):
        if label_list[index] == class_name:
            class_count += 1
            if predictions[index] == label_list[index]:
                correct_prediction_count += 1    
    return correct_prediction_count/class_count

In [27]:
get_class_accuracy('rectum', testing_label_list, testing_predictions)

0.8285714285714286

In [28]:
get_class_accuracy('colon', testing_label_list, testing_predictions)

0.44537815126050423

In [29]:
# Essentially returns the True Prediction Rate.
def get_average_per_class_accuracy(testing_label_list, testing_predictions):
    rectum_acc = get_class_accuracy('rectum', testing_label_list, testing_predictions)
    colon_acc = get_class_accuracy('colon', testing_label_list, testing_predictions)
    return (rectum_acc + colon_acc) / 2

In [30]:
get_average_per_class_accuracy(testing_label_list, testing_predictions)

0.6369747899159665

In [31]:
def get_recall(class_name, y_true, y_pred):
    bin_y_true = [1 if label == class_name else 0 for label in y_true]
    bin_y_pred = [1 if label == class_name else 0 for label in y_pred]
    return metrics.recall_score(bin_y_true, bin_y_pred)

In [32]:
rectum_true_prediction_rate = get_recall('rectum', testing_label_list, testing_predictions)
print(rectum_true_prediction_rate)

0.828571428571


In [33]:
colon_true_prediction_rate = get_recall('colon', testing_label_list, testing_predictions)
print(colon_true_prediction_rate)

0.445378151261


In [34]:
def get_precision(class_name, y_true, y_pred):
    bin_y_true = [1 if label == class_name else 0 for label in y_true]
    bin_y_pred = [1 if label == class_name else 0 for label in y_pred]
    return metrics.precision_score(bin_y_true, bin_y_pred)

In [35]:
rectum_precision = get_precision('rectum', testing_label_list, testing_predictions)
print(rectum_precision)

0.180124223602


In [36]:
colon_precision = get_precision('colon', testing_label_list, testing_predictions)
print(colon_precision)

0.946428571429


In [37]:
# Balanced Error Rate (BER)
rectum_false_prediction_rate = 1 - rectum_true_prediction_rate
colon_false_prediction_rate = 1 - colon_true_prediction_rate
print(0.5 * (rectum_false_prediction_rate + colon_false_prediction_rate))

0.363025210084
