In [1]:
import pandas as pd
import numpy as np
import math
from collections import OrderedDict

In [2]:
# trunc_illuminaga_rna_data.tsv holds the same illuminaga RNA expression data as the file
# downloaded from the class project website, except that the keys in its header now match
# patient.bcr_patient_barcode from the clinical data. This gives us a direct mapping between
# the clinical patients dataset and the RNA dataset.
# The first row is read as the column header and the first column as the row index.
gene_exp = pd.read_table('data/trunc_illuminaga_rna_data.tsv',
                                     header=0,
                                     index_col=0)

## Preprocessing

### Normalize by taking the natural log of the FPKM expression values.

In [3]:
# Natural-log transform of the expression values. np.log returns a new object and
# leaves gene_exp untouched, so no defensive copy of the full table is needed.
# Entries equal to 0 map to -inf here.
log_gene_exp_df = np.log(gene_exp)

### Replace all -inf with the floor of the smallest finite logFPKM value, so that zero-expression entries don't give weird results with the variance.

In [4]:
# Mask +/-inf as NaN first so that .min() only sees finite values.
# Rebinding the name (instead of inplace=True) keeps the step explicit.
log_gene_exp_df = log_gene_exp_df.replace([np.inf, -np.inf], np.nan)
min_fpkm_per_patient = list(log_gene_exp_df.min())  # per-column minimum; pandas skips NaN
# np.nanmin ignores NaN entries (e.g. from a column with no finite value),
# whereas the builtin min() gives an order-dependent answer when NaN is present.
min_fpkm = float(np.nanmin(min_fpkm_per_patient))
fpkm_floor = math.floor(min_fpkm)
# Every masked entry (and any NaN already present in the table) is set to the floor value.
log_gene_exp_df = log_gene_exp_df.fillna(fpkm_floor)

### Take top 10000 genes with highest variance

In [50]:
# Number of highest-variance rows to keep as classifier features.
N_TOP_VARIANCE_GENES = 10000

log_gene_exp_var_df = log_gene_exp_df.copy()
# Variance of each row across all columns, stored in a temporary helper column.
log_gene_exp_var_df['var'] = log_gene_exp_var_df.var(axis=1)
filtered_log_gene_exp = log_gene_exp_var_df.sort_values(by='var', ascending=False)[:N_TOP_VARIANCE_GENES]
# Remove the variance column (it is the last one). `.iloc` replaces `.ix`,
# which was deprecated in pandas 0.20 and removed in pandas 1.0.
filtered_log_gene_exp = filtered_log_gene_exp.iloc[:, :-1]

## Formatting Data

### Get features and labels

#### NOTE: mRNA data patients are a subset of those included in COADREAD.clin.merged.txt

In [51]:
# Clinical table, indexed by its first column. Later cells look rows up by field name
# (e.g. "patient.tumor_tissue_site") and iterate over the columns, so each column
# presumably corresponds to one patient record.
clinical_data_df = pd.read_table('data/clinical/COADREAD.clin.merged.txt', index_col=0)

In [52]:
# Don't try to print all contents of patient_dict; too big! Will freeze browser.
# patient_dict maps tissue site -> {patient barcode -> list of that patient's
# values from filtered_log_gene_exp}.
patient_dict = OrderedDict()
patient_dict['colon'] = {}
patient_dict['rectum'] = {}

# Counters for investigating the quality of the clinical <-> RNA mapping.
tumor_tissue_site_nan_count = 0
patient_rna_exp_barcode_nan_count = 0
patient_rna_exp_barcode_not_in_rna_dataset_count = 0

# Look the two clinical rows up once instead of once per column.
tumor_tissue_site_row = clinical_data_df.loc["patient.tumor_tissue_site"]
bcr_patient_barcode_row = clinical_data_df.loc["patient.bcr_patient_barcode"]

column_header_list = list(clinical_data_df.columns.values)
for column_header in column_header_list:

    tumor_tissue_site = str(tumor_tissue_site_row[column_header])
    if tumor_tissue_site == "nan":  # We only want patients which have a label.
        tumor_tissue_site_nan_count += 1
        continue

    bcr_patient_barcode = bcr_patient_barcode_row[column_header]
    if pd.isnull(bcr_patient_barcode):
        # A missing barcode is not a string, so calling .upper() on it would raise.
        patient_rna_exp_barcode_nan_count += 1
        continue
    bcr_patient_barcode = str(bcr_patient_barcode).upper()

    if bcr_patient_barcode in filtered_log_gene_exp.columns:
        patient_exp_list = list(filtered_log_gene_exp[bcr_patient_barcode])
        patient_dict[tumor_tissue_site][bcr_patient_barcode] = patient_exp_list
    else:
        # Labelled patient with no matching column in the RNA expression table.
        patient_rna_exp_barcode_not_in_rna_dataset_count += 1

print(tumor_tissue_site_nan_count)

4


#### Balancing training set between colon and rectum tissue.

In [53]:
# Number of mapped patients per tissue site before balancing: rectum first, then colon.
print(len(patient_dict['rectum']))
print(len(patient_dict['colon']))

71
190


In [54]:
# The first `train_per_class` patients of each tissue site (in dict iteration order)
# form the balanced training set; every remaining patient goes to the testing set.
train_per_class = 60

rectum_dict = patient_dict['rectum']
rectum_features = list(rectum_dict.values())
rectum_train = rectum_features[:train_per_class]
rectum_test = rectum_features[train_per_class:]

colon_dict = patient_dict['colon']
colon_features = list(colon_dict.values())
colon_train = colon_features[:train_per_class]
colon_test = colon_features[train_per_class:]

# Rectum samples come first, then colon samples; labels follow the same order.
training_feature_list = rectum_train + colon_train
training_label_list = ['rectum'] * len(rectum_train) + ['colon'] * len(colon_train)

testing_feature_list = rectum_test + colon_test
testing_label_list = ['rectum'] * len(rectum_test) + ['colon'] * len(colon_test)

print(len(training_feature_list))
print(len(testing_feature_list))

120
141


## Classification

In [55]:
from sklearn.svm import SVC

In [56]:
# Fit a support vector classifier with scikit-learn's default hyperparameters
# on the balanced training set. The fit() call is the last expression so that
# the fitted estimator's repr is displayed as the cell output.
svm_classifier = SVC()
svm_classifier.fit(training_feature_list, training_label_list)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [57]:
# Predict labels for both the training split and the held-out testing split.
training_predictions = svm_classifier.predict(training_feature_list)
testing_predictions = svm_classifier.predict(testing_feature_list)

In [58]:
# Sanity check: we don't want all predictions to simply be the same class.
# Eyeball the output for a mix of both labels.
testing_predictions

array(['rectum', 'colon', 'colon', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'colon', 'rectum', 'colon', 'rectum', 'colon', 'colon',
       'colon', 'rectum', 'rectum', 'colon', 'colon', 'colon', 'colon',
       'colon', 'rectum', 'rectum', 'rectum', 'rectum', 'colon', 'colon',
       'colon', 'colon', 'rectum', 'colon', 'rectum', 'rectum', 'colon',
       'rectum', 'rectum', 'colon', 'colon', 'rectum', 'colon', 'colon',
       'rectum', 'colon', 'colon', 'rectum', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'rectum', 'colon', 'rectum',
       'colon', 'colon', 'colon', 'rectum', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'rectum', 'rectum', 'rectum', 'rectum', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'rectum',
       'rectum', 'rectum', 'colon', 'colon', 'rectum', 'rect

In [59]:
# True labels in the same order as testing_predictions, for side-by-side comparison.
np.array(testing_label_list)

array(['rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'rectum',
       'rectum', 'rectum', 'rectum', 'rectum', 'rectum', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'colon', 'colon', 'colon', 'colon', 'colon', 'colon',
       'colon', 'c

### Investigate Classification Statistics

In [60]:
def get_accuracy(label_list, predictions):
    """Return the fraction of predictions that equal their true label.

    Args:
        label_list: sequence of true labels.
        predictions: sequence of predicted labels, aligned with label_list.

    Returns:
        float: number of matching positions divided by len(predictions).

    Raises:
        ValueError: if the two sequences differ in length. zip() would otherwise
            silently truncate to the shorter one while the denominator stayed
            len(predictions), giving a wrong accuracy.
        ZeroDivisionError: if predictions is empty.
    """
    if len(label_list) != len(predictions):
        raise ValueError(
            "label_list and predictions must have the same length "
            "(got %d and %d)" % (len(label_list), len(predictions)))
    correct_prediction_count = sum(
        1 for label, prediction in zip(label_list, predictions) if label == prediction)
    return float(correct_prediction_count) / len(predictions)

In [61]:
# Overall accuracy on the testing split. The testing split is not class-balanced
# (it holds whatever is left after taking 60 training samples per class), so this
# number is weighted towards the larger class.
print(get_accuracy(testing_label_list, testing_predictions))

0.7021276595744681


In [67]:
def get_class_accuracy(class_name, label_list, predictions):
    """Return the accuracy restricted to samples whose true label is class_name.

    Args:
        class_name: the label value to evaluate.
        label_list: sequence of true labels.
        predictions: indexable sequence of predicted labels, aligned with label_list.

    Returns:
        float: correctly predicted samples of class_name divided by the number
        of samples whose true label is class_name.

    Raises:
        ZeroDivisionError: if class_name does not occur in label_list.
        IndexError: if predictions is too short to cover a sample of class_name.
    """
    # One entry per sample of the requested class: True when it was predicted correctly.
    class_hits = [predictions[index] == label
                  for index, label in enumerate(label_list)
                  if label == class_name]
    # float() matches get_accuracy and avoids integer division on Python 2.
    return float(sum(class_hits)) / len(class_hits)

In [68]:
# Fraction of the true rectum samples in the testing split that were predicted as rectum.
get_class_accuracy('rectum', testing_label_list, testing_predictions)

0.6363636363636364

In [69]:
# Fraction of the true colon samples in the testing split that were predicted as colon.
get_class_accuracy('colon', testing_label_list, testing_predictions)

0.7076923076923077

In [70]:
def get_average_per_class_accuracy(testing_label_list, testing_predictions,
                                   class_names=('rectum', 'colon')):
    """Return the unweighted mean of the per-class accuracies.

    Each class contributes equally regardless of how many samples it has.

    Args:
        testing_label_list: sequence of true labels.
        testing_predictions: sequence of predicted labels, aligned with the labels.
        class_names: classes to average over; defaults to the two tissue
            sites used in this notebook.

    Returns:
        float: mean of get_class_accuracy over class_names.
    """
    per_class_accuracy = [
        get_class_accuracy(class_name, testing_label_list, testing_predictions)
        for class_name in class_names
    ]
    return sum(per_class_accuracy) / float(len(per_class_accuracy))

In [71]:
# Mean of the rectum and colon accuracies; unlike the overall accuracy, each class counts equally.
get_average_per_class_accuracy(testing_label_list, testing_predictions)

0.672027972027972