In [302]:
import pandas as pd
import numpy as np
import math
from sklearn.decomposition import PCA
# from sklearn.preprocessing import scale  #TODO: check to see if scaling cluster DFs before PCA improves classification pref.
from sklearn.svm import SVC

In [2]:
# trunc_illuminaga_rna_data has the same illuminaga RNA expression data as that downloaded from 
# class project website, though the keys in the header of trunc_illuminaga_rna_data.tsv now match
# patient.bcr_patient_barcode from the clinical data, which gives use better mapping between the
# clinical patients dataset and the RNA dataset.
gene_exp = pd.read_table('data/trunc_illuminahiseq_rna_data.tsv',
                                     header=0,
                                     index_col=0)

### Preprocessing and getting variances of gene across all patients

In [3]:
log_gene_exp_df = np.log(gene_exp.copy())

In [4]:
log_gene_exp_df.replace([np.inf, -np.inf], np.nan, inplace=True)  # must replace all -inf with NaN so that .min() will work.
min_fpkm_per_patient = list(log_gene_exp_df.min())
min_fpkm = min(min_fpkm_per_patient)
fpkm_floor = math.floor(min_fpkm)
log_gene_exp_df.replace(np.nan, fpkm_floor, inplace=True)

In [135]:
log_gene_exp_var_df = log_gene_exp_df.copy()
log_gene_exp_var_df['var'] = log_gene_exp_var_df.var(axis=1)
log_gene_exp_var_df = log_gene_exp_var_df.sort_values(by='var', ascending=False)
variance_df = log_gene_exp_var_df[['var']]
sorted_log_gene_exp_df = log_gene_exp_var_df.ix[:, :-1]
variance_list = [[x] for x in list(variance_df['var'])]
variance_list

[[30.618654021862952],
 [28.974371650594708],
 [28.929956680101505],
 [28.842896635383152],
 [28.5199628889232],
 [27.736073353362229],
 [27.483888861659718],
 [25.839786985286402],
 [25.457111012747745],
 [25.192914401193921],
 [23.74672007850074],
 [23.709285375713808],
 [23.307332811575648],
 [22.715607442712759],
 [22.495503627180703],
 [22.063558444041252],
 [21.844442370269466],
 [21.502566058745874],
 [21.417557059686285],
 [21.310708210957358],
 [21.174531864806998],
 [21.020128577003586],
 [20.585842336427365],
 [20.525636586370517],
 [20.517760275080104],
 [20.141464370420135],
 [19.532812088933966],
 [19.436718155282641],
 [19.4119417638132],
 [19.094815208125102],
 [19.047534532552724],
 [18.501416290106931],
 [18.160985173831939],
 [18.062041059580263],
 [17.904212178476577],
 [17.88488685835388],
 [17.582895519620461],
 [17.540843837373249],
 [17.539255334429111],
 [17.509870061502667],
 [17.421453251104257],
 [17.111579483874412],
 [17.106544391695834],
 [17.106046563643

### Clustering

In [53]:
from sklearn.cluster import KMeans

In [68]:
# kmeans = KMeans(init='k-means++', n_clusters=100, n_init=10)
kmeans = KMeans(init='k-means++', n_clusters=10, n_init=10)

In [69]:
kmeans.fit(variance_list)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=10, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [70]:
kmeans.labels_

array([5, 5, 5, ..., 0, 0, 0], dtype=int32)

In [185]:
def get_cluster_idx_dict(k_means_results):
    cluster_dict = {}
    for cluster_num in set(k_means_results.labels_):
        cluster_dict[cluster_num] = []
    for variance_idx in range(0, len(k_means_results.labels_)):
        cluster_num = k_means_results.labels_[variance_idx]
        cluster_dict[cluster_num].append(variance_idx)
    return cluster_dict

In [186]:
cluster_idx_dict = get_cluster_idx_dict(kmeans)

### Take PCA of gene clusters

In [116]:
# asdf = {}
# for key, values in cluster_idx_dict.items():
#     asdf[key] = []
#     for value in values:
#         if list(variance_df['var'])[value] > 30.0:
#             print(str(key)+"\n"+str(list(variance_df['var'])[value]))
#         asdf[key].append(list(variance_df['var'])[value])

# for key in asdf.keys():
#     print(str(key) + "\n" + str(asdf[key][1]) + "\t" + str(asdf[key][-1]))

In [290]:
# !!!This is really slow (minutes): there has got to be a way to return multiple rows from a df simultaneously with a list of indeces.
def get_cluster_df(cluster_index_list, gene_exp_df):
    df = pd.DataFrame()
    for idx in cluster_index_list:
        df = df.append(gene_exp_df.iloc[idx])
    return df

In [229]:
# cluster_gene_exp_df_dict = {}
# for cluster_key, cluster_gene_idx_list in cluster_idx_dict.items():
#     cluster_gene_exp_df_dict[cluster_key] = get_cluster_df(cluster_idx_dict[cluster_key], sorted_log_gene_exp_df)

In [281]:
pca = PCA(n_components=1)
cluster_gene_exp_pca_dict = {}
for cluster_key, cluster_gene_idx_list in cluster_idx_dict.items():
    cluster_df = get_cluster_df(cluster_idx_dict[cluster_key], sorted_log_gene_exp_df)
    # insert scaling below before PCA to see if this enhances classification performance.
    cluster_gene_exp_pca_dict[cluster_key] = pca.fit_transform(cluster_df.transpose())

## Classification

### Get features and labels

In [303]:
clinical_data_df = pd.read_table('data/clinical/COADREAD.clin.merged.txt', index_col=0)

In [292]:
# Don't try to print all contents of patient_dict; too big! Will freeze browser.
patient_dict = {}
patient_dict['colon'] = {}
patient_dict['rectum'] = {}

tumor_tissue_site_nan_count = 0
patient_rna_exp_barcode_nan_count = 0
patient_rna_exp_barcode_not_in_rna_dataset_count = 0

column_header_list = list(clinical_data_df.columns.values)
for column_header in column_header_list:

    tumor_tissue_site = str(clinical_data_df.loc["patient.tumor_tissue_site"][column_header])
    if tumor_tissue_site != "nan":  # We only want patients which have a label.
        
        bcr_patient_barcode = clinical_data_df.loc["patient.bcr_patient_barcode"][column_header]
        bcr_patient_barcode = bcr_patient_barcode.upper()
        
        if bcr_patient_barcode in sorted_log_gene_exp_df.keys():
            patient_exp_list = list(sorted_log_gene_exp_df[bcr_patient_barcode])
            patient_dict[tumor_tissue_site][bcr_patient_barcode] = patient_exp_list
            
    # investigating quality of my data mapping
    else:
        tumor_tissue_site_nan_count += 1
        
print(tumor_tissue_site_nan_count)

4


#### Balancing training set between colon and rectum tissue.

In [293]:
print(len(patient_dict['rectum']))
print(len(patient_dict['colon']))

95
298


In [294]:
rectum_dict = patient_dict['rectum']

training_patient_list = list(rectum_dict.keys())[:60]
testing_patient_list = list(rectum_dict.keys())[60:]

training_feature_list = list(rectum_dict.values())[:60]
training_label_list = ['rectum' ] * len(training_feature_list)

testing_feature_list = list(rectum_dict.values())[60:]
testing_label_list = ['rectum'] * len(testing_feature_list)


colon_dict = patient_dict['colon']

training_patient_list += list(colon_dict.keys())[:60]
testing_patient_list += list(colon_dict.keys())[60:]

training_feature_list += list(colon_dict.values())[:60]
training_label_list += ['colon' ] * len(list(colon_dict.values())[:60])

testing_feature_list += list(colon_dict.values())[60:]
testing_label_list += ['colon'] * len(list(colon_dict.values())[60:])

print(len(training_feature_list))
print(len(training_patient_list))

print(len(testing_feature_list))
print(len(testing_patient_list))

120
120
273
273


### Execute classification

In [298]:
svm_classifier = SVC()
svm_classifier.fit(training_feature_list, training_label_list)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
training_predictions = svm_classifier.predict(training_feature_list)
testing_predictions = svm_classifier.predict(testing_feature_list)