# Predicting ASD diagnosis from Genetic Data

Author: Rachael Caelie "Rocky" Aikens

Created: Dec 7, 2017

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import random
from sklearn.model_selection import train_test_split

## Data preprocessing

We have genotype information for siblings from the Agre and Simons Simplex Collection, which has been featurized into a binary matrix (described below). In addition to that, we have imputed ASD/non-ASD labels and ADOS/ADI-R scores for a subset of those individuals.

### Feature Data (Genotype)

The input data is represented as a binary matrix.  There are a couple of different representations we can use here, but to start I will use 1 = loss of function variant (compound het or homozygous alternate), 0 = no loss of function variant.

In [None]:
# load input feature dataset for Agre
Agre_asd = pd.read_csv("../../../iHART/kpaskov/CGT/data/v34_lof_asd_af0.50.txt", index_col=0).transpose()
Agre_ctrl = pd.read_csv("../../../iHART/kpaskov/CGT/data/v34_lof_typical_af0.50.txt", index_col=0).transpose()

print "Cases: ", Agre_asd.shape[0]
print "Controls: ", Agre_ctrl.shape[0]

In [None]:
# load input feature dataset for SSC
SSC_asd = pd.read_csv("../../../iHART/kpaskov/CGT/data/SSC_lof_asd_af0.50.txt", index_col=0).transpose()
SSC_ctrl = pd.read_csv("../../../iHART/kpaskov/CGT/data/SSC_lof_typical_af0.50.txt", index_col=0).transpose()

print "Cases: ", SSC_asd.shape[0]
print "Controls: ", SSC_ctrl.shape[0]

In [None]:
# merge SSC and Agre data
X_asd = pd.concat([SSC_asd, Agre_asd], axis = 0).fillna(0)
X_ctrl = pd.concat([SSC_ctrl, Agre_ctrl], axis = 0).fillna(0)
X = pd.concat([X_asd, X_ctrl], axis=0)
print "Total cases: ", X_asd.shape[0]
print "Total controls: ", X_ctrl.shape[0]
print "Features (ie. genes): ", X.shape[1]
print "Missing Values: ", int(X.isnull().values.any())

### Target Data (ASD/non-ASD diagnosis)

We have a file that Kelley has made with inferred Autism/Control diagnosis for the individuals in the iHart study.  We will try to predict diagnosis: 0 = Control, 1 = Autism.

In [None]:
# Load the phenotype labels.  Only the 'identifier' and 'diagnosis' columns
# are read; later cells use the resulting index as the subject ID.
y = pd.read_csv(
    "../../../iHART/kpaskov/PhenotypeGLRM/data/all_samples_filtered_labels.csv",
    index_col=0,
    usecols=['identifier', 'diagnosis'],
)

In [None]:
# recode diagnosis as an integer flag: 1 where the label is 'Autism',
# 0 for every other value
y["diagnosis"] = (y["diagnosis"] == 'Autism').astype(int)

### Filtering for Overlap

Our phenotype labels y may not perfectly overlap with our genotype data, X.

In [None]:
# subject IDs found in the genotype matrices (all / cases / controls)
# and in the phenotype label table
m_x = X.index.tolist()
m_x_asd = X_asd.index.tolist()
m_x_ctrl = X_ctrl.index.tolist()
m_y = y.index.tolist()

# report how many genotyped subjects have no phenotype label
print("%d subjects in X are not in y.  Of these, %d are cases and %d are controls." % (
    len(set(m_x).difference(m_y)),
    len(set(m_x_asd).difference(m_y)),
    len(set(m_x_ctrl).difference(m_y)),
))

# subjects that have both genotype and phenotype information
subjects = list(set(m_x) & set(m_y))
print("This leaves %d subjects: %d cases and %d controls." % (
    len(subjects),
    len(set(m_x_asd).intersection(m_y)),
    len(set(m_x_ctrl).intersection(m_y)),
))

**Note:** The sets of "cases" and "controls" appear to be defined differently between the iHart Phenotype labels (i.e. our `y` labels) and the CGT matrix labels (i.e. our `X` features).

You can see that the majority of controls don't appear in our phenotype information dataset. This is because ADOS/ADI-R was not administered to many controls from SSC and Agre. Since we're interested in classifying ASD/non-ASD, for our purposes it is not necessary to exclude these individuals because we do not necessarily need any phenotype information outside of diagnosis. Rather, we can infer that all individuals in a 'control' CGT matrix without ADOS/ADI-R information have a non-ASD diagnosis.

In [None]:
# add in controls without explicit diagnosis: anyone in a control genotype
# matrix who has no phenotype record is given diagnosis 0
to_add = list(set(m_x_ctrl) - set(m_y))
y_ctrl = pd.DataFrame(np.zeros(len(to_add),), columns = ['diagnosis'],index = to_add)
y = pd.concat([y, y_ctrl], axis = 0)
subjects = subjects + to_add

# redefine X and Y to contain only the subjects we want, in the order given
# by `subjects`.  `.loc` is the label-based replacement for the deprecated
# `.ix` indexer; every ID in `subjects` was taken from these indexes.
X = X.loc[subjects]
y = y.loc[subjects]

# check we have the same subject IDs in the same order for X and Y
print(y.index.values.tolist() == X.index.values.tolist())
# keep only the first column, by position, so y becomes a Series
y = y.iloc[:, 0]
print(y.value_counts())

One thing that's probably going to be an issue for this experiment is that there are very few controls for whom we have both genetic and ADOS/ADI-R information.  This is going to mean that a random classifier performs with fairly high accuracy, just because classifying most or all individuals as autistic is an effective strategy when we have so few negatives.

## Data Splitting

Since we have ~1,600 examples, I'm going to hold out 20% of the data as a test set and then do 5 fold cross validation using built-in sklearn methods.

In [None]:
# fix the seed of Python's `random` module before building the splitter
random.seed(143)
from class_SibKFold import SibKFold

# Build a SibKFold splitter with 5 as its first argument and use the first
# (train, test) pair it yields as the train / hold-out partition.
skf = SibKFold(5, X)
train_ids, test_ids = skf.split(X)[0]

# NOTE(review): `.ix` is deprecated in pandas; whether `.loc` or `.iloc` is
# the right replacement depends on whether SibKFold returns index labels or
# row positions - confirm before changing.
X_train, X_test = X.ix[train_ids], X.ix[test_ids]
y_train, y_test = y.ix[train_ids], y.ix[test_ids]

In [None]:
# sizes of the training and hold-out partitions
print(len(train_ids))
print(len(test_ids))

# Gradient Boosted Classifier

In [None]:
# Import EvalGBM.  The module is reloaded before the class is imported so
# that edits made to class_EvalGBM since its last import are picked up.
import class_EvalGBM
reload(class_EvalGBM)
from class_EvalGBM import EvalGBM

In [None]:
# Baseline: EvalGBM with its default settings on the training split,
# evaluated through kfold without the `resample` option.
evalgbm = EvalGBM(X_train, y_train)
scores = evalgbm.kfold(7, True)
print(scores)

In [None]:
# average of the `Test_score` values returned by kfold
print(np.mean(scores.Test_score))

In [None]:
# Repeat the evaluation on the same EvalGBM instance, this time asking
# kfold to resample.
scores = evalgbm.kfold(7, True, resample = True)
print(scores)

In [None]:
# average of the `Test_score` values from the resampled run
print(np.mean(scores.Test_score))

# Tuning parameters

One possible parameter we can tune is `n_trees`, which determines the number of estimators to build. The code below measures model performance for a variety of hyperparameter values.

In [None]:
# plot mean cross-validation score against the number of estimators
def reg_plot(ntree_vals, X_train, y_train, resample = True):
    """Cross-validate a GBM for each estimator count and plot the results.

    Parameters
    ----------
    ntree_vals : sequence of int
        Candidate values, each passed to EvalGBM as `n_estimators`.
    X_train, y_train
        Training features and labels, handed to EvalGBM unchanged.
    resample : bool
        Forwarded to `EvalGBM.kfold` as its fourth positional argument.

    Returns
    -------
    The entry of `ntree_vals` with the highest mean `Test_score`
    (the first such entry if several tie).

    Raises
    ------
    ValueError
        If `ntree_vals` is empty.
    """
    if len(ntree_vals) == 0:
        # fail early with a clear message instead of the bare max() error
        raise ValueError("ntree_vals must contain at least one value")

    ntree_scores = []
    print("Running 7-fold cross validation for:")
    for n_trees in ntree_vals:
        # label the progress line with what is actually being varied
        print("n_trees = %d" % n_trees)
        evalgbm = EvalGBM(X_train, y_train, n_estimators = n_trees)
        ntree_scores.append(np.mean(evalgbm.kfold(7, False, False, resample).Test_score))

    # Create the figure at its final size *before* drawing.  Calling
    # plt.figure(figsize=...) after plt.show() only opens a new, empty figure.
    plt.figure(figsize=(10,10))
    plt.ylabel('mean cross validation f1 score')
    plt.xlabel('Number of weak estimators')
    plt.plot(ntree_vals, ntree_scores, linestyle = '-')
    plt.show()
    return ntree_vals[ntree_scores.index(max(ntree_scores))]

In [None]:
# estimator counts to sweep; reg_plot returns the best-scoring one
ntree_vals = [
    20, 40, 50,
    100, 150, 200,
    250, 300, 350,
]
reg_plot(ntree_vals, X_train, y_train)

These results suggest that a smaller number of estimators may increase test performance, so let's retrain a model with 20 trees:

In [None]:
# re-evaluate with 20 estimators, again with resampling enabled
evalgbm = EvalGBM(X_train, y_train, n_estimators = 20)
scores = evalgbm.kfold(7, True, resample = True)
print(scores)

In [None]:
print scores
print "Train:", np.mean(scores.Train_score)
print "Test:", np.mean(scores.Test_score)

This is a good start, but this classifier type has a large number of tuneable hyperparameters which may help us. In this project, we focus just on `n_trees` and `max_depth` due to computational and time constraints.

In [None]:
# measure performance of gbm using different values for ntrees and max_depth
def param_search(ntree_vals, md_vals, X_train, y_train, resample = True):
    scores = np.zeros((len(ntree_vals), len(md_vals)))
    print "Running 7-fold cross validation for:"
    for i in range(len(ntree_vals)):
        for j in range(len(md_vals)):
            print "n_trees = %d," % ntree_vals[i],
            print "max_depth = %d," % md_vals[j]
            evalgbm = EvalGBM(X_train, y_train, n_estimators = ntree_vals[i], max_depth = md_vals[j], metric = 'roc')
            scores[i,j] = np.mean(evalgbm.kfold(7, False, False, resample).Test_score)

    print scores
    score_opt = scores.max()
    params_opt = np.where(scores==score_opt)
    
    return (score_opt, params_opt)

In [None]:
# 2D grid search over a small set of hyperparameter values:
# estimator counts along one axis, tree depths along the other
ntree_vals = (20, 40, 60, 80)
md_vals = (1, 2, 3, 4)
param_search(
    ntree_vals,
    md_vals,
    X_train,
    y_train,
    resample = True,
)

In [None]:
# Preliminary look at feature selection: fit a small GBM on the resampled
# training data and print the name of the column with the largest
# feature importance.
evalgbm = EvalGBM(X_train, y_train, n_estimators = 20, max_depth = 2)
rXtrain, rytrain = evalgbm.resample(X_train, y_train, False)
evalgbm.gbm.fit(rXtrain, rytrain)

features = list(X_train.columns.values)
print(features[np.argmax(evalgbm.gbm.feature_importances_)])

# Testing

Based on the tuning results above, a model of 40 estimators with a max-depth of two should outperform our basic model. Here, we train and test that model.

In [None]:
evalgbm = EvalGBM(X_train, y_train, n_estimators = 40, max_depth = 2)
rXtrain, rytrain = evalgbm.resample(X_train, y_train, False)
rXtest, rytest = evalgbm.resample(X_test, y_test, False)
evalgbm.setTrain(rXtrain, np.asarray(rytrain))
evalgbm.setTest(rXtest, rytest)


from sklearn.metrics import roc_curve, f1_score, confusion_matrix, roc_auc_score, accuracy_score
# fit to train data
evalgbm.gbm.fit(rXtrain, rytrain)

test_probs = evalgbm.gbm.predict_proba(rXtest)[:,1]
train_probs = evalgbm.gbm.predict_proba(rXtrain)[:,1]

fpr, tpr, thresholds = roc_curve(rytrain, train_probs, pos_label = 1)
f1s = [f1_score(rytrain, (train_probs>t).astype(int), average = 'binary') for t in thresholds]
f_i = np.argmax(np.asarray(f1s))

test_probs_f = (test_probs>thresholds[f_i]).astype(int)
print "\nOptimum threshold to maximize f1:", thresholds[f_i]
trainscore = f1_score(rytrain, (train_probs>thresholds[f_i]).astype(int), average = 'binary')
testscore = f1_score(rytest, test_probs_f, average = 'binary')

print "Train Score: %f Test Score: %f" % (trainscore, testscore)

print "AU-ROC:", roc_auc_score(rytest, test_probs)
print "Accuracy:", accuracy_score(rytest, test_probs_f)
print "Confusion Matrix:\n", confusion_matrix(rytest, test_probs_f)

plt.figure()
lw = 2


fpr, tpr, thresholds = roc_curve(rytest, test_probs, pos_label = 1)
plt.plot(fpr, tpr, color = 'darkorange', lw = lw, label = 'ROC curve')
plt.plot([0,1], [0,1], color = 'navy', lw = lw, linestyle = '--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.legend(loc='lower right')
plt.show()
plt.figure(figsize = (10,10))


In [None]:
# dump the ROC points left behind by the most recent roc_curve call
print(list(fpr))
print(list(tpr))

# Bagging

Perhaps we can combine the LR and GBM classifiers to achieve even better performance.

In [None]:
# Import EvalLR
import class_EvalLR
reload(class_EvalLR)
from class_EvalLR import EvalLR

# train our best LR classifier
evalr = EvalLR(X_train, y_train, c=0.125)
evalr.lr.fit(rXtrain, rytrain)
test_probs_lr = evalr.lr.predict_proba(rXtest)[:,1]
train_probs_lr = evalr.lr.predict_proba(rXtrain)[:,1]

# average predictions for both classifiers
test_probs_bg = (test_probs_lr + test_probs)/2
train_probs_bg = (train_probs_lr + train_probs)/2

# evaluate bagged classifier
fpr, tpr, thresholds = roc_curve(rytrain, train_probs_bg, pos_label = 1)
f1s = [f1_score(rytrain, (train_probs_bg>t).astype(int), average = 'binary') for t in thresholds]
f_i = np.argmax(np.asarray(f1s))

test_probs_f = (test_probs_bg>thresholds[f_i]).astype(int)
print "\nOptimum threshold to maximize f1:", thresholds[f_i]
trainscore = f1_score(rytrain, (train_probs_bg>thresholds[f_i]).astype(int), average = 'binary')
testscore = f1_score(rytest, test_probs_f, average = 'binary')

print "Train Score: %f Test Score: %f" % (trainscore, testscore)

print "AU-ROC:", roc_auc_score(rytest, test_probs_bg)
print "Accuracy:", accuracy_score(rytest, test_probs_f)
print "Confusion Matrix:\n", confusion_matrix(rytest, test_probs_f)

plt.figure()
lw = 2


fpr, tpr, thresholds = roc_curve(rytest, test_probs_bg, pos_label = 1)
plt.plot(fpr, tpr, color = 'darkorange', lw = lw, label = 'ROC curve')
plt.plot([0,1], [0,1], color = 'navy', lw = lw, linestyle = '--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.legend(loc='lower right')
plt.show()
plt.figure(figsize = (10,10))

In [None]:
# dump the ROC points left behind by the most recent roc_curve call
print(list(fpr))
print(list(tpr))

In [None]:
# subjects kept for modelling, split by the cohort whose matrices they came from
ssc_subjects = list(set(SSC_asd.index.values.tolist() + SSC_ctrl.index.values.tolist()) & set(subjects))
agre_subjects = list(set(Agre_asd.index.values.tolist() + Agre_ctrl.index.values.tolist()) & set(subjects))

# of those, the ones that appear in the (resampled) hold-out labels
ssc_test_subjects = list(set(ssc_subjects) & set(rytest.index.values.tolist()))
agre_test_subjects = list(set(agre_subjects) & set(rytest.index.values.tolist()))

# cohort sizes (the original referenced a misspelled name, `scc_subjects`,
# which raised NameError)
print(len(ssc_subjects))
print(len(agre_subjects))

# Per-cohort hold-out sets.  `.loc` is the label-based replacement for the
# deprecated `.ix` indexer; the IDs used here come from rytest's own index.
ssc_ytest = rytest.loc[ssc_test_subjects]
ssc_Xtest = rXtest.loc[ssc_test_subjects]

agre_ytest = rytest.loc[agre_test_subjects]
agre_Xtest = rXtest.loc[agre_test_subjects]

As a final check, measure performance separately on the SSC and Agre members of the test set.

In [None]:
# results for ssc
test_probs_lr = evalr.lr.predict_proba(ssc_Xtest)[:,1]
test_probs = evalgbm.gbm.predict_proba(ssc_Xtest)[:,1]

# average predictions for both classifiers
test_probs_bg = (test_probs_lr + test_probs)/2

print "AU-ROC:", roc_auc_score(ssc_ytest, test_probs_bg)

plt.figure()
lw = 2

fpr, tpr, thresholds = roc_curve(ssc_ytest, test_probs_bg, pos_label = 1)
plt.plot(fpr, tpr, color = 'darkorange', lw = lw, label = 'ROC curve')
plt.plot([0,1], [0,1], color = 'navy', lw = lw, linestyle = '--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.legend(loc='lower right')
plt.show()
plt.figure(figsize = (10,10))

In [None]:
# results for agre
test_probs_lr = evalr.lr.predict_proba(agre_Xtest)[:,1]
test_probs = evalgbm.gbm.predict_proba(agre_Xtest)[:,1]

# average predictions for both classifiers
test_probs_bg = (test_probs_lr + test_probs)/2

print "AU-ROC:", roc_auc_score(agre_ytest, test_probs_bg)

plt.figure()
lw = 2

fpr, tpr, thresholds = roc_curve(agre_ytest, test_probs_bg, pos_label = 1)
plt.plot(fpr, tpr, color = 'darkorange', lw = lw, label = 'ROC curve')
plt.plot([0,1], [0,1], color = 'navy', lw = lw, linestyle = '--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.legend(loc='lower right')
plt.show()
plt.figure(figsize = (10,10))