In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# trunc_illuminaga_rna_data.tsv holds the same Illumina GA RNA expression data as the
# download from the class project website, except that the keys in its header now match
# patient.bcr_patient_barcode from the clinical data. That gives us a better mapping
# between the clinical patients dataset and the RNA dataset.
gene_exp = pd.read_csv('data/trunc_illuminaga_rna_data.tsv', sep='\t', header=0, index_col=0)

## Preprocessing

###  Normalize by taking log of FPKM expression values.

In [3]:
# np.log returns a new frame, so gene_exp itself is left untouched.
# Zero expression values become -inf here.
log_gene_exp_df = np.log(gene_exp)

### Replace all -inf with the floor of the smallest finite logFPKM value, so that we don't get weird results with the variance.

In [4]:
# Mask +/-inf as NaN first so that the pandas min (which skips NaN) sees only finite values.
finite_log_gene_exp_df = log_gene_exp_df.replace([np.inf, -np.inf], np.nan)

# Per-patient (per-column) minima; an all-NaN column yields NaN here.
min_fpkm_per_patient = list(finite_log_gene_exp_df.min())

# Use the NaN-skipping pandas reduction for the global minimum. Python's built-in min()
# gives order-dependent results when the list contains NaN, which would make
# math.floor() fail on a NaN.
min_fpkm = finite_log_gene_exp_df.min().min()
fpkm_floor = math.floor(min_fpkm)

# Every masked entry gets the floor value, which is <= every finite logFPKM.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
log_gene_exp_df = finite_log_gene_exp_df.replace(np.nan, fpkm_floor)

### Take top 10000 genes with highest variance

In [5]:
# Number of highest-variance genes (rows) kept as features.
N_TOP_VARIANCE_GENES = 10000

# Copy of the log-expression frame with a trailing 'var' column holding each row's variance.
log_gene_exp_var_df = log_gene_exp_df.copy()
log_gene_exp_var_df['var'] = log_gene_exp_var_df.var(axis=1)

# Rank rows by variance (largest first), keep the top N, and drop the trailing 'var'
# column again. `.iloc` replaces `.ix`, which was deprecated and later removed from pandas.
filtered_log_gene_exp = (
    log_gene_exp_var_df
    .sort_values(by='var', ascending=False)
    .iloc[:N_TOP_VARIANCE_GENES, :-1]
)

In [17]:
# Show only a bounded preview rather than the whole filtered frame.
filtered_log_gene_exp.head(10)

Unnamed: 0_level_0,TCGA-A6-2671,TCGA-A6-2672,TCGA-A6-2674,TCGA-A6-2676,TCGA-A6-2677,TCGA-A6-2678,TCGA-A6-2679,TCGA-A6-2680,TCGA-A6-2681,TCGA-A6-2682,...,TCGA-AG-A01Y,TCGA-AG-A020,TCGA-AG-A023,TCGA-AG-A025,TCGA-AG-A026,TCGA-AG-A02G,TCGA-AG-A02N,TCGA-AG-A02X,TCGA-AG-A032,TCGA-AG-A036
Hybridization REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TMEM189-UBE2V1|387522,7.498745,5.561514,-5.000000,4.824196,-5.000000,-5.000000,7.501010,6.954354,6.542009,6.427028,...,-5.000000,-5.000000,-5.000000,-5.000000,-5.000000,-5.000000,-5.000000,-5.000000,-5.000000,2.760016
ZNHIT2|741,5.473908,-5.000000,5.639528,-5.000000,-5.000000,-5.000000,7.016961,5.255777,3.531205,-5.000000,...,-5.000000,-5.000000,4.778681,4.505774,4.982993,2.116858,-5.000000,4.254335,5.985465,5.691897
EIF1AY|9086,3.615010,-5.000000,6.266924,-5.000000,-5.000000,0.823430,-5.000000,1.212893,-5.000000,6.357417,...,1.884703,2.874519,-5.000000,-5.000000,5.562619,6.455079,5.857061,6.683879,5.847421,6.691775
XIST|7503,-5.000000,6.935519,-5.000000,7.360341,9.405358,8.705276,3.379374,7.283354,7.541415,0.428596,...,7.157197,8.851360,9.020012,8.974108,3.082433,2.717261,5.119000,1.178778,3.824758,3.315992
PRAC|84366,-5.000000,-5.000000,-5.000000,0.166023,-5.000000,3.867973,-5.000000,2.193730,2.311713,4.522932,...,7.553284,7.029133,7.981574,7.832861,6.459365,6.490273,6.486777,6.702029,2.902981,7.085491
KDM5D|8284,4.699635,-5.000000,6.522544,-5.000000,-5.000000,-5.000000,-5.000000,-5.000000,0.606990,6.103382,...,2.896299,3.734722,-5.000000,-5.000000,6.429689,7.194169,7.351867,7.296984,6.436669,6.989180
PPAN-P2RY11|692312,-5.000000,-5.000000,4.262643,-5.000000,3.732559,-5.000000,5.633819,4.951919,-5.000000,5.526710,...,3.794795,3.513978,-5.000000,5.993831,3.612506,3.031543,-5.000000,-5.000000,5.916677,-5.000000
UTY|7404,2.921865,-5.000000,5.295757,-5.000000,-5.000000,-5.000000,-5.000000,-5.000000,-5.000000,5.512650,...,0.903854,2.918970,-5.000000,-0.043743,5.495928,6.197777,5.809433,6.648461,5.857241,6.353414
CYorf15A|246126,2.382865,-5.000000,5.779290,-5.000000,-5.000000,-5.000000,2.686227,-5.000000,1.012437,5.897250,...,1.751163,2.874519,-5.000000,-5.000000,5.540879,6.121588,6.036055,6.365197,5.463112,6.394160
MAGEA2|4101,1.130111,-5.000000,4.074541,-5.000000,7.073712,8.305007,-5.000000,-5.000000,-5.000000,-5.000000,...,7.874911,3.472358,1.883868,1.054835,2.416315,1.893714,3.733470,-5.000000,2.209834,3.720012


## Formatting Data

### Get features and labels

#### NOTE: mRNA data patients are a subset of those included in COADREAD.clin.merged.txt

In [6]:
# Tab-separated clinical table; the first column becomes the row index.
clinical_data_df = pd.read_csv('data/clinical/COADREAD.clin.merged.txt', sep='\t', index_col=0)

In [7]:
# Don't try to print all contents of patient_dict; too big! Will freeze browser.
# Layout: patient_dict[tissue site][patient barcode] -> list of that patient's values
# from filtered_log_gene_exp.
patient_dict = {'colon': {}, 'rectum': {}}

# Counters for investigating the quality of the clinical <-> RNA data mapping.
tumor_tissue_site_nan_count = 0                       # clinical column has no tissue-site label
patient_rna_exp_barcode_nan_count = 0                 # clinical column has no patient barcode
patient_rna_exp_barcode_not_in_rna_dataset_count = 0  # barcode is not a column of the RNA frame

rna_patient_barcodes = set(filtered_log_gene_exp.columns)

column_header_list = list(clinical_data_df.columns.values)
for column_header in column_header_list:

    tumor_tissue_site = clinical_data_df.loc["patient.tumor_tissue_site", column_header]
    if pd.isnull(tumor_tissue_site):
        # We only want patients which have a label.
        tumor_tissue_site_nan_count += 1
        continue
    tumor_tissue_site = str(tumor_tissue_site)

    bcr_patient_barcode = clinical_data_df.loc["patient.bcr_patient_barcode", column_header]
    if pd.isnull(bcr_patient_barcode):
        # A missing barcode cannot be matched (and has no .upper()), so skip it.
        patient_rna_exp_barcode_nan_count += 1
        continue
    # Upper-cased before matching against the RNA frame's column names.
    bcr_patient_barcode = str(bcr_patient_barcode).upper()

    if bcr_patient_barcode not in rna_patient_barcodes:
        patient_rna_exp_barcode_not_in_rna_dataset_count += 1
        continue

    # A tissue site other than 'colon'/'rectum' raises KeyError here, as before.
    patient_exp_list = list(filtered_log_gene_exp[bcr_patient_barcode])
    patient_dict[tumor_tissue_site][bcr_patient_barcode] = patient_exp_list

print(tumor_tissue_site_nan_count)

4


#### Balancing training set between colon and rectum tissue.

In [8]:
# Number of labelled patients per tissue site (rectum first, then colon).
for tissue_site in ('rectum', 'colon'):
    print(len(patient_dict[tissue_site]))

71
190


In [9]:
# Patients taken from EACH class for training, so the training set is balanced.
# Everything beyond that goes to the test set.
N_TRAIN_PER_CLASS = 60

rectum_dict = patient_dict['rectum']
colon_dict = patient_dict['colon']

training_patient_list, training_feature_list, training_label_list = [], [], []
testing_patient_list, testing_feature_list, testing_label_list = [], [], []

# Same order as before: rectum patients first, then colon patients.
for class_label, class_dict in (('rectum', rectum_dict), ('colon', colon_dict)):
    class_patients = list(class_dict.keys())
    class_features = list(class_dict.values())  # same order as keys()

    train_patients = class_patients[:N_TRAIN_PER_CLASS]
    test_patients = class_patients[N_TRAIN_PER_CLASS:]

    training_patient_list += train_patients
    training_feature_list += class_features[:N_TRAIN_PER_CLASS]
    training_label_list += [class_label] * len(train_patients)

    testing_patient_list += test_patients
    testing_feature_list += class_features[N_TRAIN_PER_CLASS:]
    testing_label_list += [class_label] * len(test_patients)

print(len(training_feature_list))
print(len(training_patient_list))

print(len(testing_feature_list))
print(len(testing_patient_list))


120
120
141
141


## Classification

In [10]:
from sklearn.svm import SVC

In [11]:
# Hyperparameters are pinned to the values this notebook was originally run with
# (see the recorded estimator repr), so results do not shift with library defaults.
svm_classifier = SVC(C=1.0, kernel='rbf', gamma='auto')
svm_classifier.fit(training_feature_list, training_label_list)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
# Predict on the training set first, then on the held-out set.
training_predictions, testing_predictions = [
    svm_classifier.predict(feature_rows)
    for feature_rows in (training_feature_list, testing_feature_list)
]

In [13]:
# We don't want all predictions to simply be the same. We want variety in our predictions.
# Display the predicted labels for the test set to eyeball that both classes appear.
testing_predictions

array(['colon', 'rectum', 'colon', 'colon', 'rectum', 'colon', 'colon',
       'colon', 'colon', 'rectum', 'rectum', 'rectum', 'colon', 'colon',
       'rectum', 'rectum', 'colon', 'colon', 'rectum', 'rectum', 'colon',
       'rectum', 'rectum', 'colon', 'colon', 'colon', 'rectum', 'rectum',
       'colon', 'colon', 'colon', 'colon', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'rectum', 'rectum',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'rectum',
       'rectum', 'rectum', 'colon', 'rectum', 'colon', 'rectum', 'colon',
       'colon', 'colon', 'colon', 'colon', 'rectum', 'rectum', 'rectum',
       'rectum', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'rectum', 'colon', 'rectum', 'colon',
       'rectum', 'colon', 'rectum', 'colon', 'colon', 'rectum', 'colon',
       'rectum', 'colon', 'colon', 'colon', 'colon', 're

In [14]:
# Display the true test labels as an array, for comparison with the predictions.
np.array(testing_label_list)

array(['rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'c

### Output Classification to TSV

In [15]:
# Training patients come first, then testing patients. Predictions are concatenated in
# the same order so that each column lines up with its patient.
patient_list = list(training_patient_list)
patient_list.extend(testing_patient_list)

prediction_list = list(training_predictions)
prediction_list.extend(testing_predictions)

# One row of predictions, one column per patient.
prediction_row = np.array([prediction_list])
output_tsv_df = pd.DataFrame(prediction_row, columns=patient_list)

In [16]:
# Write the patient -> predicted class table as tab-separated text.
output_tsv_df.to_csv(path_or_buf='data/illuminaga_patient_classification.tsv', sep='\t')

### Investigate Classification Statistics

In [17]:
from sklearn import metrics

In [18]:
# Accuracy on the data the classifier was fitted on.
metrics.accuracy_score(y_true=training_label_list, y_pred=training_predictions)

1.0

In [19]:
# Accuracy on the held-out test set.
metrics.accuracy_score(y_true=testing_label_list, y_pred=testing_predictions)

0.66666666666666663

In [20]:
def get_class_accuracy(class_name, label_list, predictions):
    """Return the fraction of samples labelled `class_name` that were predicted correctly.

    Parameters
    ----------
    class_name : the class whose samples are scored.
    label_list : sequence of true labels.
    predictions : sequence of predicted labels, aligned with `label_list`.

    Returns
    -------
    float
        correct / total over the samples whose true label is `class_name`;
        NaN when no sample carries that label (the accuracy is undefined).

    Raises
    ------
    ValueError
        If `label_list` and `predictions` differ in length.
    """
    if len(label_list) != len(predictions):
        raise ValueError(
            "label_list and predictions must have the same length "
            "(got %d and %d)" % (len(label_list), len(predictions))
        )

    class_count = 0
    correct_prediction_count = 0
    for label, prediction in zip(label_list, predictions):
        if label == class_name:
            class_count += 1
            if prediction == label:
                correct_prediction_count += 1

    # Avoid ZeroDivisionError when the class never occurs among the true labels.
    if class_count == 0:
        return float('nan')
    return correct_prediction_count / class_count

In [21]:
# Per-class accuracy for the rectum samples of the test set.
get_class_accuracy(class_name='rectum', label_list=testing_label_list, predictions=testing_predictions)

0.7272727272727273

In [22]:
# Per-class accuracy for the colon samples of the test set.
get_class_accuracy(class_name='colon', label_list=testing_label_list, predictions=testing_predictions)

0.6615384615384615

In [23]:
def get_average_per_class_accuracy(testing_label_list, testing_predictions,
                                   class_names=('rectum', 'colon')):
    """Return the unweighted mean of the per-class accuracies.

    Parameters
    ----------
    testing_label_list : sequence of true labels.
    testing_predictions : sequence of predicted labels, aligned with the true labels.
    class_names : classes to average over; defaults to ('rectum', 'colon').

    Returns
    -------
    float
        Mean of get_class_accuracy(...) over `class_names`.
    """
    class_accuracies = [
        get_class_accuracy(class_name, testing_label_list, testing_predictions)
        for class_name in class_names
    ]
    return sum(class_accuracies) / len(class_accuracies)

In [24]:
# Mean of the rectum and colon per-class accuracies on the test set.
get_average_per_class_accuracy(
    testing_label_list=testing_label_list,
    testing_predictions=testing_predictions,
)

0.6944055944055945

In [25]:
def get_recall(class_name, y_true, y_pred):
    """Recall of `class_name` treated as the positive class (one-vs-rest).

    Both label sequences are turned into 0/1 indicators (1 where the label equals
    `class_name`) and passed to metrics.recall_score.
    """
    def as_indicator(labels):
        return [int(label == class_name) for label in labels]

    return metrics.recall_score(as_indicator(y_true), as_indicator(y_pred))

In [26]:
# Recall for the rectum class on the test set.
rectum_true_prediction_rate = get_recall(
    class_name='rectum',
    y_true=testing_label_list,
    y_pred=testing_predictions,
)
print(rectum_true_prediction_rate)

0.727272727273


In [27]:
# Recall for the colon class on the test set.
colon_true_prediction_rate = get_recall(
    class_name='colon',
    y_true=testing_label_list,
    y_pred=testing_predictions,
)
print(colon_true_prediction_rate)

0.661538461538


In [28]:
def get_precision(class_name, y_true, y_pred):
    """Precision of `class_name` treated as the positive class (one-vs-rest).

    Each label becomes 1 if it equals `class_name` and 0 otherwise, for both the true
    and predicted sequences; the indicators are then scored with metrics.precision_score.
    """
    true_indicator = []
    for label in y_true:
        true_indicator.append(1 if label == class_name else 0)

    predicted_indicator = []
    for label in y_pred:
        predicted_indicator.append(1 if label == class_name else 0)

    return metrics.precision_score(true_indicator, predicted_indicator)

In [29]:
# Precision for the rectum class on the test set.
rectum_precision = get_precision(class_name='rectum', y_true=testing_label_list, y_pred=testing_predictions)
print(rectum_precision)

0.153846153846


In [30]:
# Precision for the colon class on the test set.
colon_precision = get_precision(class_name='colon', y_true=testing_label_list, y_pred=testing_predictions)
print(colon_precision)

0.966292134831


In [31]:
# Balanced Error Rate (BER): the mean of the per-class miss rates, where each miss
# rate is 1 minus that class's recall.
rectum_false_prediction_rate = 1 - rectum_true_prediction_rate
colon_false_prediction_rate = 1 - colon_true_prediction_rate
balanced_error_rate = 0.5 * (rectum_false_prediction_rate + colon_false_prediction_rate)
print(balanced_error_rate)

0.305594405594
