# Predict age using AFQ-Insight

In [1]:
import afqinsight as afqi
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os.path as op
import pandas as pd
import pickle

from mpl_toolkits.mplot3d import Axes3D

from bokeh.io import output_notebook
from bokeh.embed import file_html
from bokeh.layouts import row, column, widgetbox
from bokeh.models import BoxSelectTool, ColorBar, CustomJS, HoverTool, Legend, Range1d, Title, Whisker
from bokeh.models.tickers import FixedTicker
from bokeh.models.annotations import LegendItem
from bokeh.palettes import Spectral10, Cividis256, Category10_10
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.resources import CDN
from bokeh.models.mappers import LinearColorMapper

from sklearn.decomposition import PCA

%matplotlib notebook

In [2]:
# Configure Bokeh so that subsequent `show(...)` calls render inline in the notebook.
output_notebook()

## Load the data

In [3]:
# Load the dataset from disk, requesting 'age' as the only target column.
afq_data = afqi.load_afq_data('../data/raw/als_data', target_cols=['age'])

# Pull the pieces used throughout the notebook into short names.
x = afq_data.x
y = afq_data.y['age']  # the regression target
groups = afq_data.groups
columns = afq_data.columns
# `bias_index` is passed to the fit below and is also used to drop the matching
# column from `x` and entry from `beta_hat` before plotting.
bias_index = afq_data.bias_index

## Find the optimal feature coefficients $\widehat{\beta}$

We search for the optimal coefficients using two different loss types: square loss and Huber loss.

In [4]:
# Cross-validated hyperparameter search using the square loss, scored by RMSE.
# The result is iterated over below, one entry per CV split; each entry exposes
# `alpha1`, `alpha2`, `beta_hat`, and `train` / `test` attributes.
hp_cv_res_square = afqi.fit_hyperparams_cv(
    x,
    y,
    groups,
    bias_index=bias_index,
    max_evals_per_cv=1000,
    loss_type='square',
    score='rmse',
    trials_pickle_dir='./trials_age_regression/cv10_rs42_square_rmse',
    verbose=1,
    random_state=42,
    # NOTE(review): a classification threshold is passed to a regression fit;
    # presumably it is ignored here -- confirm.
    clf_threshold=0.5,
)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [5]:
# The (alpha1, alpha2) hyperparameter pair stored on each CV result.
[(fold.alpha1, fold.alpha2) for fold in hp_cv_res_square]

[(0.006588269939251683, 0.1844564478401641),
 (0.030567512170753625, 0.22765972873339727),
 (0.15997228115472426, 0.6392659086427099),
 (0.09799946026492448, 0.6696910798616379),
 (0.013601594112556509, 0.4807439358064541),
 (0.001168840138901625, 0.857115863630093),
 (4.8675271246158, 0.11259447095586812),
 (0.27874451176625414, 4.533084243443717),
 (0.5595391753533444, 0.6200126473803248),
 (0.043208488051826255, 0.3706039961484397)]

In [6]:
def print_results_summary(hp_cv_results):
    """Print a small table of the mean and variance of RMSE and R2.

    Parameters
    ----------
    hp_cv_results : iterable
        Cross-validation results. Each element must expose ``test`` and
        ``train`` attributes, each of which in turn exposes ``rmse`` and
        ``r2``.

    Returns
    -------
    None
        The table is written to standard output. Rows appear in the order
        test RMSE, test R2, train RMSE, train R2.
    """
    row_format = '{stat:15s} {mean:7.5g} ({var:7.5g})'

    # Collect the per-split score objects first, then the individual metrics,
    # so every value is gathered before anything is printed.
    per_split = (
        ('test', [res.test for res in hp_cv_results]),
        ('train', [res.train for res in hp_cv_results]),
    )
    rows = []
    for split_name, split_scores in per_split:
        for label, attr in (('RMSE', 'rmse'), ('R2', 'r2')):
            values = [getattr(score, attr) for score in split_scores]
            rows.append((split_name + ' ' + label, values))

    print('Statistic         mean   (variance)')
    print('--------------  ------- ------------')
    for stat_name, values in rows:
        print(row_format.format(
            stat=stat_name,
            mean=np.mean(values),
            var=np.var(values),
        ))

In [7]:
# Training-set R2 of each CV result, in split order.
[fold.train.r2 for fold in hp_cv_res_square]

[0.9962007291675848,
 0.9939254517652314,
 0.9471927618036642,
 0.9515312503323857,
 0.9767032102833956,
 0.932322309491079,
 0.8704162021538943,
 -0.3205232941473397,
 0.9276402755698001,
 0.9819704729892975]

In [8]:
# Mean and variance of the train/test RMSE and R2 across the square-loss CV results.
print_results_summary(hp_cv_res_square)

Statistic         mean   (variance)
--------------  ------- ------------
test RMSE        8.9557 ( 1.1805)
test R2         -1.2435 ( 2.4423)
train RMSE         2.43 ( 5.6588)
train R2        0.82574 (0.14728)


In [9]:
# Held-out predictions: for every CV result, multiply its test-split features
# by that result's fitted coefficients, keep the test-split index as labels,
# and stack the per-split series into one series named 'yhat'.
test_set_y_hat = pd.concat(
    [
        pd.Series(
            fold.test.x.dot(fold.beta_hat),
            index=fold.test.y.index,
            name='yhat',
        )
        for fold in hp_cv_res_square
    ]
)

In [10]:
# Put the observed target and the held-out predictions side by side.
df_y = pd.concat([y, test_set_y_hat], axis='columns', sort=True)

# Integer row position, used as the x-coordinate in the per-subject plot.
df_y['index'] = np.arange(len(y), dtype=np.int32)

# Residual (observed minus predicted) divided by the standard deviation of the
# residuals; the residuals are scaled but not mean-centered.
df_y['stdres'] = (
    df_y['age']
    .sub(df_y['yhat'])
    .pipe(lambda residual: residual / np.std(residual))
)

# Copy the row labels into a column so that hover tooltips can show them.
df_y['subject_id'] = df_y.index
df_y.head()

Unnamed: 0,age,yhat,index,stdres,subject_id
subject_000,54,61.135609,0,-0.834308,subject_000
subject_001,69,58.052394,1,1.280013,subject_001
subject_002,55,60.428694,2,-0.634732,subject_002
subject_003,52,52.789566,3,-0.092317,subject_003
subject_004,58,61.737714,4,-0.43702,subject_004


In [12]:
# Standardized residual of every subject, plotted against its row position.
source = ColumnDataSource(data=df_y)

p = figure(plot_width=600, plot_height=600, toolbar_location='above')
p.title.text = 'Residuals by subject'
p.xaxis.axis_label = 'Subject'
p.yaxis.axis_label = 'Standardized Residuals'

p.circle(x='index', y='stdres', radius=0.4, line_color=None, source=source)

# Tooltip showing the subject id and age of the hovered point.
hover = HoverTool(
    tooltips=[("Subject", "@subject_id"), ("Age", "@age")],
    point_policy='snap_to_data',
    line_policy='nearest',
)
p.add_tools(hover)

# Uncomment to export the figure as standalone HTML:
# html = file_html(p, CDN, 'my plot')
# with open(op.abspath('../docs/img/bokeh_plots/regression_residuals_by_subject.html'), 'w') as fp:
#     fp.write(html)
show(p)

In [14]:
# Two side-by-side panels that share one data source:
#   left  -- actual age against predicted age, with the identity line;
#   right -- standardized residual against predicted age.
source = ColumnDataSource(data=df_y)

# A single hover tool instance is attached to both figures below.
hover = HoverTool(
    tooltips=[("Subject", "@subject_id"),
              ("Age", "@age"),
              ("Residual", "@stdres")],
    point_policy='snap_to_data',
    line_policy='nearest',
)

# Left panel: predicted vs. actual.
p = figure(plot_width=500, plot_height=500, toolbar_location='above')
p.title.text = 'Predicted vs. Actual Age for test splits'
p.xaxis.axis_label = 'Predicted Age'
p.yaxis.axis_label = 'Actual Age'
p.scatter(x='yhat', y='age', radius=0.2, line_color=None, source=source)
# Identity line spanning the observed age range; perfect predictions fall on it.
p.line(x=[y.min(), y.max()], y=[y.min(), y.max()], line_color="black")
p.add_tools(hover)

# Right panel: residuals.
p_res = figure(plot_width=500, plot_height=500, toolbar_location='above')
p_res.title.text = 'Residuals for test splits'
p_res.xaxis.axis_label = 'Predicted Age'
p_res.yaxis.axis_label = 'Standardized Residuals'
p_res.scatter(x='yhat', y='stdres', radius=0.2, line_color=None, source=source)
p_res.add_tools(hover)

layout = row([p, p_res])

# Uncomment to export the layout as standalone HTML:
# html = file_html(layout, CDN, 'my plot')
# with open(op.abspath('../docs/img/bokeh_plots/regression_residuals.html'), 'w') as fp:
#     fp.write(html)
show(layout)

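Using the hover tool on the chart above, we can see that subjects 05, 07, 16, 19, 30, 32, 35, 36 are all hard to classify (they are consistently closer to the classification threshold of 0.5). We should fire up the AFQ-Browser and look at how these subjects compare to the rest of the subjects in their group.

**TODO:** This paragraph and the links below refer to a classification threshold and to false negatives/positives, whereas this notebook fits a regression of age. Confirm that the subject list still applies to the residual plots above, or update it.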

Here are links to a running instance of AFQ-Browser with the hard to classify subjects:
- [False negatives](https://yeatmanlab.github.io/Sarica_2017/?table[prevSort][count]=2&table[prevSort][order]=ascending&table[prevSort][key]=&table[sort][count]=2&table[sort][order]=ascending&table[sort][key]=class&table[selectedRows][subject_005]=true&table[selectedRows][subject_007]=true&table[selectedRows][subject_016]=true&table[selectedRows][subject_019]=true&table[selectedRows][subject_030]=false&table[selectedRows][subject_032]=false&table[selectedRows][subject_035]=false&table[selectedRows][subject_036]=false&plots[checkboxes][right-corticospinal]=true&plots[zoom][rd][scale]=1&plots[zoom][rd][translate][0]=-3&plots[zoom][rd][translate][1]=-21&plots[zoom][fa][scale]=2.1140360811227614&plots[zoom][fa][translate][0]=-27.244995845837778&plots[zoom][fa][translate][1]=-106.10468474511174&plots[plotKey]=fa&plots[errorType]=stderr&plots[lineOpacity]=0.09355440414507772)
- [False positives](https://yeatmanlab.github.io/Sarica_2017/?table[prevSort][count]=2&table[prevSort][order]=ascending&table[prevSort][key]=&table[sort][count]=2&table[sort][order]=ascending&table[sort][key]=class&table[selectedRows][subject_005]=false&table[selectedRows][subject_007]=false&table[selectedRows][subject_016]=false&table[selectedRows][subject_019]=false&table[selectedRows][subject_030]=true&table[selectedRows][subject_032]=true&table[selectedRows][subject_035]=true&table[selectedRows][subject_036]=true&plots[checkboxes][right-corticospinal]=true&plots[zoom][rd][scale]=1&plots[zoom][rd][translate][0]=-3&plots[zoom][rd][translate][1]=-21&plots[zoom][fa][scale]=2.1140360811227614&plots[zoom][fa][translate][0]=-27.244995845837778&plots[zoom][fa][translate][1]=-106.10468474511174&plots[plotKey]=fa&plots[errorType]=stderr&plots[lineOpacity]=0.09355440414507772)

# Feature Importance

Let's sort the features by their importance.

In [11]:
feature_dicts = afqi.multicol2dicts(columns, tract_symmetry=False)

# Average the coefficients over all CV results after removing the entry at
# `bias_index`, so that only feature coefficients remain.
mean_beta = np.array(
    [np.delete(res.beta_hat, bias_index) for res in hp_cv_res_square]
).mean(axis=0)

# Same average with the CV result at position 7 left out. That is the split
# whose training R2 is negative in the output above; presumably it is treated
# as not converged -- confirm.
mean_beta_converged = np.array(
    [
        np.delete(res.beta_hat, bias_index)
        for position, res in enumerate(hp_cv_res_square)
        if position != 7
    ]
).mean(axis=0)

sorted_features = afqi.sort_features(feature_dicts, mean_beta)

# Show the first 50 entries of the sorted list.
sorted_features[:50]

[({'metric': 'torsion', 'tractID': 'Left Arcuate', 'nodeID': 36},
  -0.690501064666338),
 ({'metric': 'curvature', 'tractID': 'Left Corticospinal', 'nodeID': 67},
  -0.605071327838971),
 ({'metric': 'curvature', 'tractID': 'Left Corticospinal', 'nodeID': 68},
  -0.6005316879598295),
 ({'metric': 'fa', 'tractID': 'Right Uncinate', 'nodeID': 62},
  0.3910632116504863),
 ({'metric': 'curvature', 'tractID': 'Callosum Forceps Major', 'nodeID': 10},
  0.38926944937429),
 ({'metric': 'torsion', 'tractID': 'Left Thalamic Radiation', 'nodeID': 98},
  0.37824894383580937),
 ({'metric': 'fa', 'tractID': 'Right Uncinate', 'nodeID': 61},
  0.3407473862515903),
 ({'metric': 'torsion', 'tractID': 'Right Uncinate', 'nodeID': 79},
  0.33064886440276936),
 ({'metric': 'fa', 'tractID': 'Right IFOF', 'nodeID': 93},
  -0.3044669777525109),
 ({'metric': 'torsion', 'tractID': 'Right Cingulum Cingulate', 'nodeID': 3},
  0.29951299948347593),
 ({'metric': 'fa', 'tractID': 'Left Uncinate', 'nodeID': 0},
  -0.28

It's nice to see the top few features in a sorted list, but let's plot the features to get a feel for their distributions.

In [12]:
# Regroup the averaged coefficients by feature group, once for the mean over
# all CV results and once for the mean that leaves one result out.
beta_hats = afqi.beta_hat_by_groups(
    mean_beta,
    columns=columns,
    drop_zeros=True,
)
beta_hats_converged = afqi.beta_hat_by_groups(
    mean_beta_converged,
    columns=columns,
    drop_zeros=True,
)

First, let's plot the coefficients themselves.

In [13]:
# Plot the grouped coefficients computed from the mean over all CV results.
afqi.plot.plot_betas(beta_hat=beta_hats, columns=columns)

In [14]:
# Rearrange the mean coefficients for the "unfolded" plot in the next cell.
unfolded_beta = afqi.transform.unfold_beta_hat_by_metrics(beta_hat=mean_beta, columns=columns)

In [15]:
# Plot the unfolded coefficients at 1500 x 800.
# NOTE(review): `output_html` presumably names an HTML file to write -- confirm.
afqi.plot.plot_unfolded_beta(
    unfolded_beta=unfolded_beta,
    output_html='../docs/img/bokeh_plots/regression_unfolded_beta.html',
    width=1500,
    height=800
)

In [16]:
# PCA-space plot for the age regression.
# The column at `bias_index` is removed from `x` so that the feature matrix lines
# up with `mean_beta`, which was computed with the `bias_index` entry removed.
afqi.plot.plot_pca_space(
    x=np.delete(x, bias_index, axis=1),
    y=y,
    beta=mean_beta,
    target_name='Age',
    plot_type='regression',
    output_html='../docs/img/bokeh_plots/regression_pca_both.html',
    width=750,
    height=750
)

In [17]:
# Same PCA-space plot as the previous cell, but with `plot_both=False` and a
# different output file.
# NOTE(review): judging by the output file names ("both" vs. "sgl_only"), this
# variant presumably shows only one of the two projections -- confirm.
afqi.plot.plot_pca_space(
    x=np.delete(x, bias_index, axis=1),
    y=y,
    beta=mean_beta,
    target_name='Age',
    plot_type='regression',
    plot_both=False,
    output_html='../docs/img/bokeh_plots/regression_pca_sgl_only.html',
    width=750,
    height=750
)