# Optimal Age Range

Our preliminary assumption is that there is sufficient information in the microbiome to predict Autism status. But are prediction accuracies uniformly spread across ages, or are there subsets of ages in which the samples are more separable? To test this, we first perform a sweep of different age ranges: subsetting data and training different models.

In [22]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os, sys
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RepeatedStratifiedKFold

sys.path.insert(0, '..')

from utils import UnityScaler


In [24]:
reads_threshold = 6000

data_directory = '../data/'
data_filename = 'metaanalysis_data.pickle'
metadata_filename = 'metaanalysis_metadata.pickle' 

data = pickle.load(open(os.path.join(data_directory, data_filename), 'rb'))
metadata = pickle.load(open(os.path.join(data_directory, metadata_filename), 'rb'))

# remove samples with fewer than reads_threshold 
data = data.loc[data.sum(axis = 1) >= reads_threshold]
metadata = metadata.loc[data.index]

# remove samples with valid ages
metadata = metadata.loc[~metadata['Age'].isna()]
data = data.loc[metadata.index]

# label encoder
label_encoder = LabelEncoder()
metadata['Status'] = label_encoder.fit_transform(metadata['Status'])

# normalize each sample
data = UnityScaler().fit_transform(data)

  df /= np.sum(df, axis = self.axis)[:, None]


In [54]:
start_range = [1, 16]
end_range = [3, 18]
model = GradientBoostingClassifier()
results = {(start,end): [] for start in range(start_range[0], start_range[1]) for end in range(end_range[0], end_range[1])}

kf = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 1, random_state = 1)


In [None]:
for start in range(start_range[0], start_range[1]):
    for end in range(end_range[0], end_range[1]):
        print(start, end)
        if start >= end-1: 
            continue
        inclusion = (metadata['Age'] >= start) & (metadata['Age'] < end)
        iteration = 0
        while iteration < iterations:
            try:
                for train_index, test_index in kf.split(data.loc[inclusion], metadata.loc[inclusion, 'Status'].values):
                    xtrain, xtest = data.iloc[train_index], data.iloc[test_index]
                    ytrain, ytest = metadata.iloc[train_index]['Status'], metadata.iloc[test_index]['Status'].values, 
                    model.fit(xtrain, ytrain)
                    results[start,end].append(roc_auc_score(model.predict(xtest), ytest))
            except Exception as e:
                print(iteration, e)
                iteration -= 1
                pass
            iteration += 1

1 3
17 Only one class present in y_true. ROC AUC score is not defined in that case.
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
2 3
2 4
2 5
2 6
2 7
2 8
2 9
2 10
2 11
2 12
2 13
2 14
2 15
2 16
2 17


In [None]:
pickle.dump(results, open('optimal_ages.pickle', 'wb'))

In [None]:
results = pickle.load(open('optimal_ages.pickle', 'rb'))

In [None]:
def get_grid(results):
    grid = np.zeros((start_range[1] - start_range[0], end_range[1] - end_range[0], iterations))
    for (start, end), value in results.items():
        temp = value.copy()
        try:
            grid[start - start_range[0], end - end_range[0]] = temp
        except:
            continue
    grid = np.nanmean(grid,axis = 2)[:-1, 1:]
    
    for i in range(1, 14):
        grid[i:,i-1] = np.nan
    return grid

def display_grid(grid, title, cmap = 'jet'):

    plt.imshow(grid, cmap = 'jet', )
    plt.ylabel('Lower bound (years)')
    plt.xlabel('Upper bound (years)')
    plt.yticks(np.arange(grid.shape[0]), np.arange(grid.shape[0]) + start_range[0])
    plt.xticks(np.arange(grid.shape[1]), np.arange(grid.shape[1]) + end_range[0])
    plt.colorbar()
    plt.savefig(f'{title}.pdf', dpi = 1200)

In [None]:
display_grid(get_grid(results),  title = 'OptimalAge')

In [None]:
# get sample size grid
sample_size = {}
for start in range(start_range[0], start_range[1]):
    for end in range(end_range[0], end_range[1]):
        if start >= end: 
            continue
        inclusion = (metaanalysis.metadata['Age'] >= start) & (metaanalysis.metadata['Age'] < end)
        sample_size[start,end] = [inclusion.sum()]
display_grid(get_grid(sample_size),  title = 'SampleSize')

In [None]:
df = pd.DataFrame([get_grid(sample_size).flatten(), get_grid(results).flatten()], index = ['Sample Size', 'AUC']).T
df = df.dropna()

sns.lmplot(data = df, x = 'Sample Size', y = 'AUC', order = 2)
plt.savefig('Relationship between sample size and AUC.pdf', dpi = 1200)