In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.svm import SVC

In [2]:
# trunc_illuminaga_rna_data has the same illuminaga RNA expression data as that downloaded from the
# class project website, but the keys in the header of trunc_illuminaga_rna_data.tsv now match
# patient.bcr_patient_barcode from the clinical data, which gives us a better mapping between the
# clinical patients dataset and the RNA dataset.
# NOTE(review): the file loaded below is trunc_illuminahiseq_rna_data.tsv, not the illuminaga file
# named in this comment -- confirm which platform is intended.
# Tab-separated expression table: the first row supplies the column labels and the
# first column supplies the row index.
gene_exp = pd.read_csv(
    'data/trunc_illuminahiseq_rna_data.tsv',
    sep='\t',
    header=0,
    index_col=0,
)

### Preprocessing and computing the variance of each gene across all patients

In [3]:
# Natural log of every expression value. np.log returns a new frame, so gene_exp itself
# is left untouched; a value of 0 maps to -inf.
log_gene_exp_df = np.log(gene_exp)

In [4]:
# Turn every +/-inf (log(0) gives -inf) into NaN so that the minimum below is taken over
# finite values only. Assigning the result instead of using inplace=True keeps the cell
# free of hidden in-place mutation.
log_gene_exp_df = log_gene_exp_df.replace([np.inf, -np.inf], np.nan)

# Column-wise minimum; pandas skips NaN inside each column.
min_fpkm_per_patient = list(log_gene_exp_df.min())

# np.nanmin ignores NaN entries. The built-in min() returns an order-dependent result
# when a column consists solely of NaN, which could hand NaN to math.floor below.
min_fpkm = np.nanmin(min_fpkm_per_patient)
fpkm_floor = math.floor(min_fpkm)

# Every missing / non-finite entry is replaced by one integer at or below the smallest
# finite log value.
log_gene_exp_df = log_gene_exp_df.fillna(fpkm_floor)

In [5]:
# Work on a copy so the extra 'var' column never leaks into log_gene_exp_df.
log_gene_exp_var_df = log_gene_exp_df.copy()

# Row-wise variance: one value per row of the expression table.
log_gene_exp_var_df['var'] = log_gene_exp_var_df.var(axis=1)

# Highest-variance rows first.
log_gene_exp_var_df = log_gene_exp_var_df.sort_values(by='var', ascending=False)
variance_df = log_gene_exp_var_df[['var']]

# 'var' was appended last, so dropping the final column restores the expression columns.
# .iloc replaces the .ix indexer, which was removed in pandas 1.0.
sorted_log_gene_exp_df = log_gene_exp_var_df.iloc[:, :-1]

# One single-element list per row, i.e. a column of 1-D samples.
variance_list = [[x] for x in list(variance_df['var'])]

### Clustering

In [6]:
from sklearn.cluster import KMeans

In [7]:
# NOTE(review): no random_state is passed, so the k-means++ initialisation -- and therefore
# the cluster labels -- can differ from one run to the next.
kmeans = KMeans(init='k-means++', n_clusters=20000, n_init=10)

In [8]:
# Each sample in variance_list is a one-element list holding a single variance value, so
# the clustering is one-dimensional.
kmeans.fit(variance_list)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=20000,
    n_init=10, n_jobs=1, precompute_distances='auto', random_state=None,
    tol=0.0001, verbose=0)

In [9]:
kmeans.labels_

array([ 68,  15, 615, ...,  93,  93,  93], dtype=int32)

In [10]:
def get_cluster_idx_dict(k_means_results):
    """Group sample positions by the cluster label they were assigned.

    Parameters
    ----------
    k_means_results : object exposing a ``labels_`` sequence
        ``labels_[i]`` is the cluster label of sample ``i``.

    Returns
    -------
    dict
        Maps each distinct label to the list of positions carrying that label,
        in ascending position order.
    """
    labels = k_means_results.labels_
    # Create every bucket up front so the appends below never need a key check.
    positions_by_cluster = {label: [] for label in set(labels)}
    for position, label in enumerate(labels):
        positions_by_cluster[label].append(position)
    return positions_by_cluster

In [11]:
cluster_idx_dict = get_cluster_idx_dict(kmeans)

### Take PCA of gene clusters

In [12]:
# asdf = {}
# for key, values in cluster_idx_dict.items():
#     asdf[key] = []
#     for value in values:
#         if list(variance_df['var'])[value] > 30.0:
#             print(str(key)+"\n"+str(list(variance_df['var'])[value]))
#         asdf[key].append(list(variance_df['var'])[value])

# for key in asdf.keys():
#     print(str(key) + "\n" + str(asdf[key][1]) + "\t" + str(asdf[key][-1]))

In [13]:
def get_cluster_df(cluster_index_list, gene_exp_df):
    """Return the rows of ``gene_exp_df`` found at the given integer positions.

    A single positional selection replaces the former row-by-row
    ``DataFrame.append`` loop. That loop copied the growing frame on every
    iteration (which is why it took minutes), and ``DataFrame.append`` was
    removed in pandas 2.0.

    Parameters
    ----------
    cluster_index_list : iterable of int
        Zero-based row positions to select, in the order they should appear.
    gene_exp_df : pandas.DataFrame
        Frame to select rows from.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected rows. Row labels and the column order
        of ``gene_exp_df`` are preserved; an empty index list yields a frame
        with zero rows and the same columns.
    """
    return gene_exp_df.iloc[list(cluster_index_list)]

In [14]:
# cluster_gene_exp_df_dict = {}
# for cluster_key, cluster_gene_idx_list in cluster_idx_dict.items():
#     cluster_gene_exp_df_dict[cluster_key] = get_cluster_df(cluster_idx_dict[cluster_key], sorted_log_gene_exp_df)

# A single one-component PCA object is re-fitted for every cluster, so each cluster is
# reduced to one value per row of the transposed cluster frame.
pca = PCA(n_components=1)
cluster_gene_exp_pca_dict = {}
for cluster_key, cluster_gene_idx_list in cluster_idx_dict.items():
    cluster_df = get_cluster_df(cluster_gene_idx_list, sorted_log_gene_exp_df)
    # Transpose so the original columns become the samples, then standardise before
    # projecting (scaling was added to see whether it helps classification).
    scaled_cluster = scale(cluster_df.transpose())
    cluster_gene_exp_pca_dict[cluster_key] = pca.fit_transform(scaled_cluster)

In [15]:
# Build the cluster-by-patient table with one constructor call. Growing a frame with
# DataFrame.append inside a loop copies the whole frame on every iteration, and the method
# was removed in pandas 2.0.
patient_ids = list(sorted_log_gene_exp_df.columns.values)
cluster_pca_patient_df = pd.DataFrame(
    [
        # Each PCA result has one row per patient and a single component column;
        # take that column as this cluster's row of the table.
        cluster_gene_exp_pca_dict[cluster_key][:, 0]
        for cluster_key in cluster_gene_exp_pca_dict
    ],
    # Columns follow the order of the expression table, which is the order the PCA rows
    # were produced in. Later cells look patients up by column name.
    columns=patient_ids,
)
cluster_pca_patient_df

Unnamed: 0,TCGA-3L-AA1B,TCGA-4N-A93T,TCGA-4T-AA8H,TCGA-5M-AAT4,TCGA-5M-AAT5,TCGA-5M-AAT6,TCGA-5M-AATA,TCGA-5M-AATE,TCGA-A6-2671,TCGA-A6-2675,...,TCGA-QG-A5YV,TCGA-QG-A5YW,TCGA-QG-A5YX,TCGA-QG-A5Z1,TCGA-QG-A5Z2,TCGA-QL-A97D,TCGA-RU-A8FL,TCGA-SS-A7HO,TCGA-T9-A92H,TCGA-WS-AB45
0,-1.134993,0.764021,0.241175,-0.581230,0.267780,-0.124229,-1.823020,-1.104369,-0.032865,-0.696228,...,-0.237299,-0.548983,0.179960,0.737596,1.294118,0.980651,-0.490598,-0.392007,0.363794,-0.649105
1,0.411814,0.662045,0.957312,0.619764,0.816298,0.918318,0.591248,1.038366,0.360339,0.963428,...,0.385031,0.293612,-1.490819,0.339074,1.017751,0.889626,0.676838,0.824211,-1.490819,-1.490819
2,-0.092406,0.206522,0.695894,0.064364,-1.526762,0.289438,0.042866,0.002887,1.416519,0.774471,...,-0.112597,-1.526762,0.490853,0.290486,-1.526762,-1.526762,-0.081167,-0.080305,-1.526762,0.294190
3,0.277820,1.355355,0.094005,0.827228,0.691474,0.648245,1.487790,-0.329091,-0.028849,0.330364,...,1.474605,-0.445498,-0.113495,-4.376331,0.003925,1.362373,0.763576,0.302715,0.102715,0.438089
4,0.514814,0.335677,-0.002643,0.928492,0.221978,1.236064,0.813499,0.759551,-2.049884,-1.251634,...,-0.112198,-0.723841,0.269492,0.916986,-1.896247,0.309006,0.488063,0.139072,0.199228,1.651120
5,-1.228440,0.129799,0.799685,0.018401,0.716388,-0.039395,0.149305,-0.029774,1.669915,1.460167,...,-1.228440,-0.174276,-0.115714,-1.228440,0.058204,-1.228440,-1.228440,-1.228440,0.134705,0.198497
6,-1.001582,-1.001582,-1.001582,-1.001582,-1.001582,1.359708,1.291355,0.800128,-1.001582,0.733560,...,0.664104,1.156922,0.893015,-1.001582,-1.001582,-1.001582,1.522875,-1.001582,0.695416,1.766648
7,0.603804,-0.559079,1.024010,0.071699,0.036264,-0.015613,0.358592,1.285311,-0.326581,0.050245,...,0.322228,-3.252792,-0.288585,0.948627,-0.408051,-0.851146,0.238482,0.403674,0.622645,0.989464
8,0.080933,0.785874,0.386583,0.228962,-0.503947,-0.212377,0.026709,0.430795,2.033225,1.033459,...,-0.015689,-0.014207,0.116331,0.022984,-0.309683,-0.920406,-0.558170,0.747411,-0.389064,0.881431
9,-0.940606,0.611841,1.056781,-0.940606,-0.940606,-0.940606,-0.940606,0.714848,2.135171,1.265808,...,-0.940606,-0.940606,0.596158,0.552371,1.040404,0.527765,0.827948,0.828822,-0.940606,-0.940606


## Classification

### Get features and labels

In [16]:
# Tab-separated clinical table; the first column becomes the row index.
clinical_data_df = pd.read_csv(
    'data/clinical/COADREAD.clin.merged.txt',
    sep='\t',
    index_col=0,
)

In [17]:
# Don't try to print all contents of patient_dict; too big! Will freeze browser.
# Layout: tissue site -> {patient barcode -> list of that patient's per-cluster PCA values}.
patient_dict = {}
patient_dict['colon'] = {}
patient_dict['rectum'] = {}

# Counters for investigating the quality of the clinical <-> RNA mapping.
tumor_tissue_site_nan_count = 0
patient_rna_exp_barcode_nan_count = 0
patient_rna_exp_barcode_not_in_rna_dataset_count = 0

# Fetch the two clinical rows once instead of re-slicing the frame for every column.
tissue_site_row = clinical_data_df.loc["patient.tumor_tissue_site"]
barcode_row = clinical_data_df.loc["patient.bcr_patient_barcode"]

column_header_list = list(clinical_data_df.columns.values)
for column_header in column_header_list:

    tumor_tissue_site = str(tissue_site_row[column_header])
    if tumor_tissue_site == "nan":  # We only want patients which have a label.
        tumor_tissue_site_nan_count += 1
        continue

    bcr_patient_barcode = barcode_row[column_header]
    if not isinstance(bcr_patient_barcode, str):
        # A missing barcode is read as a float NaN; calling .upper() on it would raise.
        patient_rna_exp_barcode_nan_count += 1
        continue
    bcr_patient_barcode = bcr_patient_barcode.upper()

    if bcr_patient_barcode in cluster_pca_patient_df.keys():
        patient_exp_list = list(cluster_pca_patient_df[bcr_patient_barcode])
        patient_dict[tumor_tissue_site][bcr_patient_barcode] = patient_exp_list
    else:
        patient_rna_exp_barcode_not_in_rna_dataset_count += 1

print(tumor_tissue_site_nan_count)

4


#### Balancing training set between colon and rectum tissue.

In [18]:
print(len(patient_dict['rectum']))
print(len(patient_dict['colon']))

95
298


In [19]:
# Number of patients taken from each tissue class for training; everything after that
# position goes to the test set.
TRAIN_PER_CLASS = 60


def _split_by_position(class_dict, n_train):
    """Split a {barcode: features} dict into ((train_ids, train_X), (test_ids, test_X)).

    The split is purely positional (dict iteration order); nothing is shuffled.
    """
    patients = list(class_dict.keys())
    features = list(class_dict.values())
    return (patients[:n_train], features[:n_train]), (patients[n_train:], features[n_train:])


rectum_dict = patient_dict['rectum']
colon_dict = patient_dict['colon']

training_patient_list, training_feature_list, training_label_list = [], [], []
testing_patient_list, testing_feature_list, testing_label_list = [], [], []

# Rectum first, then colon, so every list keeps the same class ordering as its labels.
for tissue_label, tissue_dict in (('rectum', rectum_dict), ('colon', colon_dict)):
    (train_ids, train_features), (test_ids, test_features) = _split_by_position(
        tissue_dict, TRAIN_PER_CLASS)

    training_patient_list += train_ids
    training_feature_list += train_features
    training_label_list += [tissue_label] * len(train_features)

    testing_patient_list += test_ids
    testing_feature_list += test_features
    testing_label_list += [tissue_label] * len(test_features)

print(len(training_feature_list))
print(len(training_patient_list))

print(len(testing_feature_list))
print(len(testing_patient_list))

120
120
273
273


### Execute classification

In [20]:
# Support-vector classifier built with the library's default hyper-parameters and fitted
# on the training lists (an equal number of rectum and colon patients).
svm_classifier = SVC()
svm_classifier.fit(training_feature_list, training_label_list)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
# Predict labels for both splits with the fitted classifier.
# NOTE(review): training_predictions is not evaluated in any cell visible here -- confirm
# whether it is still needed.
training_predictions = svm_classifier.predict(training_feature_list)
testing_predictions = svm_classifier.predict(testing_feature_list)

In [22]:
testing_predictions

array(['rectum', 'colon', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'colon', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'colon', 'colon', 'colon', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'colon', 'colon',
       'colon', 'rectum', 'colon', 'colon', 'colon', 'rectum', 'colon',
       'rectum', 'colon', 'colon', 'rectum', 'colon', 'rectum', 'colon',
       'rectum', 'rectum', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'rectum', 'colon', 'colon', 'colon', 'colon', 'colon',
       'rectum', 'rectum', 'colon', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'colon', 'rectum', 'rectum', 'colon', 'rectum', 'rectum',
       'colon', 'rectum', 'rectum', 'colon', 'rectum', 'colon', 'colon',
       'colon', 'rectum', 'colon', 'colon', 'rectum', 'rectum', 'colon',
       'colon', 'rectum', 'colon', 'colon', 're

In [23]:
np.array(testing_label_list)

array(['rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colo

### Investigate Classification Statistics

In [24]:
from sklearn import metrics

In [25]:
metrics.accuracy_score(testing_label_list, testing_predictions)

0.60073260073260071

In [26]:
def get_class_accuracy(class_name, label_list, predictions):
    """Return the fraction of ``class_name`` samples that were predicted correctly.

    This is the recall of ``class_name``: only positions whose true label equals
    ``class_name`` are counted.

    Parameters
    ----------
    class_name : label to evaluate.
    label_list : sequence of true labels.
    predictions : sequence of predicted labels, indexed in step with ``label_list``.

    Raises
    ------
    ZeroDivisionError
        If ``class_name`` never occurs in ``label_list``.
    """
    hits = 0
    members = 0
    for position, true_label in enumerate(label_list):
        if true_label != class_name:
            continue
        members += 1
        if predictions[position] == true_label:
            hits += 1
    return hits / members

In [27]:
get_class_accuracy('rectum', testing_label_list, testing_predictions)

0.7714285714285715

In [28]:
get_class_accuracy('colon', testing_label_list, testing_predictions)

0.5756302521008403

In [29]:
def get_average_per_class_accuracy(testing_label_list, testing_predictions,
                                   class_names=('rectum', 'colon')):
    """Return the unweighted mean of the per-class accuracies.

    Each per-class accuracy comes from ``get_class_accuracy`` (the recall of that
    class), so the result weights every class equally regardless of how many
    samples it has.

    Parameters
    ----------
    testing_label_list : sequence of true labels.
    testing_predictions : sequence of predicted labels, aligned with the true labels.
    class_names : iterable of labels to average over. Defaults to the two tissue
        classes used in this notebook, which reproduces the original behaviour.

    Raises
    ------
    ZeroDivisionError
        If ``class_names`` is empty, or (from ``get_class_accuracy``) if one of
        the classes never occurs in ``testing_label_list``.
    """
    per_class_accuracy = [
        get_class_accuracy(name, testing_label_list, testing_predictions)
        for name in class_names
    ]
    return sum(per_class_accuracy) / len(per_class_accuracy)

In [30]:
get_average_per_class_accuracy(testing_label_list, testing_predictions)

0.6735294117647059

In [31]:
def get_recall(class_name, y_true, y_pred):
    """Return the recall of ``class_name`` treated as the positive class.

    Both label sequences are first collapsed to 1 (equals ``class_name``) / 0
    (anything else) and then passed to ``metrics.recall_score``.
    """
    def as_binary(labels):
        # 1 marks the class of interest, 0 marks every other label.
        return [1 if label == class_name else 0 for label in labels]

    truth_flags = as_binary(y_true)
    predicted_flags = as_binary(y_pred)
    return metrics.recall_score(truth_flags, predicted_flags)

In [32]:
rectum_true_prediction_rate = get_recall('rectum', testing_label_list, testing_predictions)
print(rectum_true_prediction_rate)

0.771428571429


In [33]:
colon_true_prediction_rate = get_recall('colon', testing_label_list, testing_predictions)
print(colon_true_prediction_rate)

0.575630252101


In [34]:
def get_precision(class_name, y_true, y_pred):
    """Return the precision of ``class_name`` treated as the positive class.

    Both label sequences are first collapsed to 1 (equals ``class_name``) / 0
    (anything else) and then passed to ``metrics.precision_score``.
    """
    truth_flags = []
    for label in y_true:
        truth_flags.append(1 if label == class_name else 0)

    predicted_flags = []
    for label in y_pred:
        predicted_flags.append(1 if label == class_name else 0)

    return metrics.precision_score(truth_flags, predicted_flags)

In [35]:
rectum_precision = get_precision('rectum', testing_label_list, testing_predictions)
print(rectum_precision)

0.2109375


In [36]:
colon_precision = get_precision('colon', testing_label_list, testing_predictions)
print(colon_precision)

0.944827586207


In [37]:
# Balanced Error Rate (BER): the mean of the two per-class miss rates, where each miss
# rate is 1 minus that class's recall as computed by get_recall.
rectum_false_prediction_rate = 1 - rectum_true_prediction_rate
colon_false_prediction_rate = 1 - colon_true_prediction_rate
print(0.5 * (rectum_false_prediction_rate + colon_false_prediction_rate))

0.326470588235
