In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install pynrrd

In [None]:
from tqdm import tqdm
import os
from random import randint

import numpy as np
import pandas as pd

import nibabel as nib
import pydicom as pdm
import nilearn as nl
import nilearn.plotting as nlplt
import nrrd
import h5py

import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.animation as anim

import imageio
from skimage.transform import resize
from skimage.util import montage

from IPython.display import Image as show_gif

import warnings
warnings.simplefilter("ignore")

from glob import glob
from os.path import join as opj

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
sns.set_context('notebook')

from tqdm.notebook import tqdm

In [None]:
# Inspect one subject's fMRI file through h5py (keys, values, shape of 'SM_feature').
sample_filename = '../input/trends-assessment-prediction/fMRI_train/10001.mat'
# Open explicitly read-only; the handle is left open because a later cell
# reads 'SM_feature' from it again.
matlab_file = h5py.File(sample_filename, 'r')
print(matlab_file.keys())
print(matlab_file.values())
print(matlab_file['SM_feature'][()].shape)

In [None]:
# Reference mask image; its affine and header are reused for the new image
fmri_mask = nl.image.load_img('../input/trends-assessment-prediction/fMRI_mask.nii')

# Reverse the axis order of the stored array (axis 0 <-> 3, axis 1 <-> 2)
spatial_maps = np.transpose(matlab_file['SM_feature'][()], (3, 2, 1, 0))

# Wrap the reoriented array as an image that mirrors the mask
spatial_maps_niimg = nl.image.new_img_like(
    ref_niimg=fmri_mask,
    data=spatial_maps,
    affine=fmri_mask.affine,
    copy_header=True,
)

In [None]:
# Take only the first volume instead of materialising every volume in a list.
img = next(iter(nl.image.iter_img(spatial_maps_niimg)))
# Build the title from the file that was actually loaded so the label cannot
# drift away from the data being shown.
view = nlplt.view_img_on_surf(
    img,
    title=f'{os.path.basename(sample_filename)} Spatial Map 0 view_img_on_surf',
    title_fontsize=20,
    threshold=1,
    black_bg=False,
)
view.open_in_browser()
view

In [None]:
# Root folder of the competition data; later cells join file names onto it with `opj`.
path = '/kaggle/input/trends-assessment-prediction/'

In [None]:
# Read the training scores, using the subject Id as the index
targets = pd.read_csv('/kaggle/input/trends-assessment-prediction/train_scores.csv', index_col='Id')
targets.head()

In [None]:
# Missing-value count per target column, shown as a table and as a bar chart
n_nulls = targets.isna().sum()
display(n_nulls)
n_nulls.plot(kind='barh');

Observation 1
So the first hunch is correct. Domain 1 and domain 2 seem to be connected, as they both contain the same amount of missing values.

In [None]:
# Keep only the rows in which every target is present
targets = targets.dropna()

In [None]:
# Pairwise target correlations, annotated in percent
sns.heatmap(
    targets.corr().mul(100),
    annot=True,
    fmt='.0f',
    square=True,
);

Observation 2
While there is a slight correlation within domain 2, there doesn't seem to be one within domain 1. So targets measured in the same domain do not necessarily encode a connected property; the domain 2 pair is the only exception.

In [None]:
# One marker-only panel per target, plotted in row order
targets.plot(
    subplots=True,
    figsize=(14, 8),
    lw=0,
    marker='.',
    markersize=1,
);

Just looking at the targets, most of them seem to be normal. Age is the exception: something about it looks unusual, so let us have a closer look.

In [None]:
# Same marker-only view, restricted to the age column
targets['age'].plot(
    figsize=(14, 4),
    lw=0,
    marker='.',
    markersize=1,
);

In [None]:
# Number of distinct age values (missing values are not counted)
targets['age'].nunique(dropna=True)

In [None]:
# Age values in ascending order, plotted against their rank
plt.plot(targets['age'].sort_values().to_numpy());

If we try to predict this feature, it might be worth it to restrict the predictions to these 33 unique values.

In [None]:
# Scatter matrix of all targets with small, semi-transparent markers
sns.pairplot(
    targets,
    plot_kws={'s': 5, 'alpha': 0.5},
);

We can see again the stratification of age, but what is more interesting is the relationship within domain2

In [None]:
# Scatter the first two columns from position 3 onwards against each other
plt.figure(figsize=(6, 6))
d2 = targets.dropna().iloc[:, 3:].to_numpy()
plt.scatter(*d2[:, :2].T, s=3);

#### Function to rotate a point around the origin (0, 0).

In [None]:
def rotate_origin(x, y, radians):
    """Rotate the points (x, y) around the origin by `radians`.

    Returns the rotated coordinates stacked as ``np.array([xx, yy]).T`` where
    ``xx = x*cos + y*sin`` and ``yy = -x*sin + y*cos``; for 1-D inputs of
    length n this is an (n, 2) array.
    """
    cos_r = np.cos(radians)
    sin_r = np.sin(radians)
    rotated_x = x * cos_r + y * sin_r
    rotated_y = -x * sin_r + y * cos_r
    return np.asarray([rotated_x, rotated_y]).transpose()

#### To find the best rotation, we plot the number of unique values of the rotated second variable for every rotation angle and take the angle at which that number is smallest.

In [None]:
# Function to plot unique values in a given range

def plot_unique(start,end,d2,noOfVals):
    """Scatter, for each angle in [start, end], the number of distinct rotated values.

    `noOfVals` evenly spaced angles are tried. For each one the first two
    columns of `d2` are rotated with `rotate_origin` and the distinct values
    of the second rotated coordinate (rounded to 6 decimals) are counted.
    """
    def count_distinct(angle):
        second_coord = rotate_origin(d2[:, 0], d2[:, 1], angle)[:, 1]
        return len(np.unique(np.round(second_coord, 6)))

    n_uniques = np.array([[angle, count_distinct(angle)]
                          for angle in np.linspace(start, end, noOfVals)])

    plt.figure(figsize=(14, 2))
    plt.scatter(n_uniques[:, 0], n_uniques[:, 1], s=3);

In [None]:
# Coarse sweep: rotation angles from 0.85 to 0.95 radians
plot_unique(start=0.85, end=0.95, d2=d2, noOfVals=5000)

In [None]:
# The coarse sweep shows a sudden drop, so zoom in on 0.905 to 0.910 radians
plot_unique(start=0.905, end=0.910, d2=d2, noOfVals=5000)

In [None]:
# Finest sweep: 0.90771 to 0.907715 radians
plot_unique(start=0.90771, end=0.907715, d2=d2, noOfVals=5000)

So the best value to rotate is taken as 0.90771256655

In [None]:
# Rotate the columns at positions 3 and 4 by the angle chosen above and
# store both rotated coordinates as extra target columns
rot = 0.90771256655
d2 = rotate_origin(targets.iloc[:, 3].values, targets.iloc[:, 4].values, rot)
targets['d21_rot'], targets['d22_rot'] = d2[:, 0], d2[:, 1]

#### Plotting Distribution of Target variables

In [None]:
from scipy.stats import norm

# One small figure per target: histogram with KDE and a fitted normal curve
for col in targets.columns:
    plt.figure(figsize=(8, 2))
    sns.distplot(
        targets[col],
        kde=True,
        fit=norm,
    )
    plt.show()

Age is almost normally distributed but for the other targets power transformations can be applied.

In [None]:
# Exponents for the power transform, one per target column (position-wise)
pow_age, pow_d1v1, pow_d1v2 = 1.0, 1.5, 1.5
pow_d2v1, pow_d2v2 = 1.5, 1.5
pow_d21, pow_d22 = 1.5, 1
powers = [pow_age, pow_d1v1, pow_d1v2, pow_d2v1, pow_d2v2, pow_d21, pow_d22]

from scipy.stats import norm

# Preview each transformed distribution against a fitted normal curve
for i, col in enumerate(targets.columns):
    plt.figure(figsize=(8, 2))
    sns.distplot(
        np.power(targets[col], powers[i]),
        kde=True,
        fit=norm,
    )
    plt.show()

In [None]:
# Overwrite every column with its power-transformed values
# (re-running this cell applies the exponent again)
for i, col in enumerate(targets.columns):
    exponent = powers[i]
    targets[col] = np.power(targets[col], exponent)

### Data Scaling

Reload the targets and apply the same rotation and power transformations as before, this time without dropping the rows that contain NULL values.

In [None]:
# Fresh copy of the training scores, indexed by subject Id
targets = pd.read_csv(opj(path, 'train_scores.csv'), index_col='Id')

In [None]:
# Add the rotated versions of the columns at positions 3 and 4
rot = 0.90771256655
d2 = rotate_origin(targets.iloc[:, 3].values, targets.iloc[:, 4].values, rot)
targets['d21_rot'], targets['d22_rot'] = d2[:, 0], d2[:, 1]

In [None]:
# Exponents for the power transform, one per target column (position-wise)
pow_age, pow_d1v1, pow_d1v2 = 1.0, 1.5, 1.5
pow_d2v1, pow_d2v2 = 1.5, 1.5
pow_d21, pow_d22 = 1.5, 1
powers = [pow_age, pow_d1v1, pow_d1v2, pow_d2v1, pow_d2v2, pow_d21, pow_d22]

# Overwrite every column with its power-transformed values
for i, col in enumerate(targets.columns):
    exponent = powers[i]
    targets[col] = np.power(targets[col], exponent)

In [None]:
from sklearn.preprocessing import StandardScaler

# Finally, standardise every target column with a StandardScaler
scaler = StandardScaler().fit(targets)
targets.iloc[:, :] = scaler.transform(targets)
targets.head()

In [None]:
# Train ids come from the targets index; test ids are the numeric prefix
# (text before the first '_') of the sample-submission Ids
train_id = targets.index.values
sample_submission = pd.read_csv(opj(path, 'sample_submission.csv'))
test_id = np.unique(sample_submission['Id'].str.split('_').str[0].astype('int'))
print(train_id.shape, test_id.shape)

In [None]:
# Read the loading (IC) features and split the rows by train / test id
df_ic = pd.read_csv(opj(path, 'loading.csv'))
ic_train = df_ic.loc[df_ic['Id'].isin(train_id)].set_index('Id')
ic_test = df_ic.loc[df_ic['Id'].isin(test_id)].set_index('Id')
print(ic_train.shape, ic_test.shape)

In [None]:
# Read the FNC features and split the rows by train / test id
df_fnc = pd.read_csv(opj(path, 'fnc.csv'))
fnc_train = df_fnc.loc[df_fnc['Id'].isin(train_id)].set_index('Id')
fnc_test = df_fnc.loc[df_fnc['Id'].isin(test_id)].set_index('Id')
print(fnc_train.shape, fnc_test.shape)

### Feature Exploration

In [None]:
def plot_corr_matrix(df_train, df_test, c_restrict=200):
    """Show train, test and (train - test) correlation heatmaps side by side.

    Parameters
    ----------
    df_train, df_test : DataFrame
        Feature frames to compare; only the first `c_restrict` columns are used.
    c_restrict : int
        Number of leading columns included in the correlation matrices.
    """
    # Compute each correlation matrix once and reuse it for the difference panel
    corr_train = df_train.iloc[:, :c_restrict].corr()
    corr_test = df_test.iloc[:, :c_restrict].corr()

    fig, ax = plt.subplots(ncols=3, figsize=(20, 10))
    abs_max = 1.0
    sns.heatmap(corr_train, square=True, vmin=-abs_max, vmax=abs_max, cbar=False, ax=ax[0]);
    sns.heatmap(corr_test, square=True, vmin=-abs_max, vmax=abs_max, cbar=False, ax=ax[1]);
    # The difference panel uses a tighter colour range than the two raw panels
    sns.heatmap(corr_train - corr_test,
                square=True, vmin=-0.33, vmax=0.33, cbar=False, ax=ax[2]);
    ax[0].set_title('Train')
    ax[1].set_title('Test')
    ax[2].set_title('Difference (Train - Test)');

In [None]:
# Train / test correlation comparison for the first 100 IC features
plot_corr_matrix(df_train=ic_train, df_test=ic_test, c_restrict=100)

In [None]:
# Train / test correlation comparison for the first 100 FNC features
plot_corr_matrix(df_train=fnc_train, df_test=fnc_test, c_restrict=100)

In [None]:
def plot_corr_matrix_target(targets, df_train, c_restrict=100):
    """Heatmap of the correlations between the target columns and the features.

    Parameters
    ----------
    targets : DataFrame
        Target columns, indexed by 'Id'.
    df_train : DataFrame
        Feature columns, indexed by 'Id'.
    c_restrict : int
        Number of leading columns of the merged frame that are kept. The
        target columns come first, so they count towards this limit.
    """
    # Take the number of target columns from the frame instead of hard-coding it
    n_targets = targets.shape[1]

    # Merge target and feature matrix; the targets end up as the leading columns
    df_temp = pd.merge(targets.reset_index(), df_train.reset_index())
    df_temp = df_temp.set_index('Id').iloc[:, :c_restrict]

    # Rows are the targets, columns are the features
    plt.figure(figsize=(16, 3))
    sns.heatmap(df_temp.corr().iloc[:n_targets, n_targets:], square=True,
                vmin=-0.5, vmax=0.5, cbar=False, cmap='Spectral');

In [None]:
# Target-vs-feature correlations for the IC features
plot_corr_matrix_target(targets=targets, df_train=ic_train, c_restrict=100)

In [None]:
# Target-vs-feature correlations for the FNC features
plot_corr_matrix_target(targets=targets, df_train=fnc_train, c_restrict=100)

Observation 1
For both feature types, the correlation with age seems to be the highest. Let's explore this in a bit more detail. What is the highest correlation the features can reach with each target (the 5 original targets plus the 2 rotated domain 2 targets)?

In [None]:
# Highest absolute correlations between the IC features and each target column.
# Iterating over targets.columns keeps this in step with the target frame
# instead of relying on a hard-coded column count.
df_corr = pd.concat(
    [
        ic_train.corrwith(targets[col]).abs().sort_values(ascending=False).reset_index(drop=True)
        for col in targets.columns
    ],
    axis=1,
)
df_corr.columns = targets.columns
df_corr.head(5)

In [None]:
# Highest absolute correlations between the FNC features and each target column.
# Iterating over targets.columns keeps this in step with the target frame
# instead of relying on a hard-coded column count.
df_corr = pd.concat(
    [
        fnc_train.corrwith(targets[col]).abs().sort_values(ascending=False).reset_index(drop=True)
        for col in targets.columns
    ],
    axis=1,
)
df_corr.columns = targets.columns
df_corr.head(5)

In [None]:
# Pairwise relationships among the first few IC features
# Number of columns to investigate
n_invest = 10
sns.pairplot(
    ic_train.iloc[:, :n_invest],
    corner=True,
    diag_kind="kde",
);

In [None]:
# Same pairwise view for the first `n_invest` FNC features
sns.pairplot(
    fnc_train.iloc[:, :n_invest],
    corner=True,
    diag_kind="kde",
);

In [None]:
def plot_markers(key, df_temp, ncolmarker=5, split_at=5, plot_max=15):
    """Plot the first `plot_max` columns of `df_temp` as marker-only subplots.

    Parameters
    ----------
    key : str
        Label printed before the column positions of each figure.
    df_temp : DataFrame
        Frame whose leading columns are plotted.
    ncolmarker : int
        Number of subplot columns per figure.
    split_at : int
        Number of frame columns drawn per figure.
    plot_max : int
        Only the first `plot_max` columns are plotted.
    """
    # Restrict dataframe to first X features
    df_temp = df_temp.iloc[:, :plot_max]

    # Column positions, cut into consecutive groups of `split_at`
    ncolumns = np.arange(df_temp.shape[1])
    selecter = np.split(ncolumns, ncolumns[::split_at][1:])

    for s in selecter:
        chunk = df_temp.iloc[:, s]
        if chunk.shape[1] == 0:
            # Nothing to draw (e.g. an empty input frame)
            continue

        print(key, s)
        # Height follows the number of subplot rows, rounded up. The previous
        # `4 * n // ncolmarker` gave a height of 0 for a single-column chunk.
        n_rows = -(-chunk.shape[1] // ncolmarker)
        chunk.plot(kind='line', subplots=True, sharex=True, marker='.', lw=0,
                   ms=10, markeredgecolor='k', markeredgewidth=0.3,
                   figsize=(5 * ncolmarker, 4 * n_rows), layout=(-1, ncolmarker));
        plt.show()

In [None]:
# Marker plots for the leading IC features
plot_markers(key='Visualization of IC features:', df_temp=ic_train)

Conclusion 1
The datasets seem to contain a few outliers. We will take care of them at the very end.

## Prediction

In [None]:
import warnings
warnings.filterwarnings("ignore")

!pip install pycaret --quiet

In [None]:
from sklearn.model_selection import KFold

from pycaret.regression import *

### Loading datasets and making train and test datasets

In [None]:
# Folder holding the competition CSV files
BASE_PATH = '../input/trends-assessment-prediction'

# Raw feature tables (FNC, loading) and the training labels
fnc_df = pd.read_csv(f"{BASE_PATH}/fnc.csv")
loading_df = pd.read_csv(f"{BASE_PATH}/loading.csv")
labels_df = pd.read_csv(f"{BASE_PATH}/train_scores.csv")

In [None]:
# Feature names are every column after the first one of each table
fnc_features = fnc_df.columns[1:].tolist()
loading_features = loading_df.columns[1:].tolist()

# Flag labelled rows, then left-join the labels onto the merged features so
# that rows without labels get a missing `is_train`
labels_df["is_train"] = True
df = fnc_df.merge(loading_df, on="Id").merge(labels_df, on="Id", how="left")

is_labelled = df["is_train"] == True
test_df = df[~is_labelled].copy()
df = df[is_labelled].copy()
print(f'Shape of train data: {df.shape}, Shape of test data: {test_df.shape}')

In [None]:
target_cols = ['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']
# The helper flag is no longer needed; the test frame also loses the (empty) target columns
df = df.drop(columns=['is_train'])
test_df = test_df.drop(columns=[*target_cols, 'is_train'])


# Giving less importance to FNC features since they are easier to overfit due to high dimensionality.
# Note: re-running this cell scales the FNC columns again.
FNC_SCALE = 1/500
df[fnc_features] = df[fnc_features] * FNC_SCALE
test_df[fnc_features] = test_df[fnc_features] * FNC_SCALE

Because pycaret predicts only a single target column at a time, a utility function is defined that returns the training data with just the requested target column (the other target columns are dropped).

In [None]:
def get_train_data(target):
    """Return a copy of `df` without the target columns other than `target`."""
    columns_to_drop = [name for name in target_cols if name != target]
    return df.drop(columns=columns_to_drop)

A few regression model types are excluded because they take a long time to train.

In [None]:
# Estimator ids passed as `exclude=` to compare_models() in the cells below
blacklist_models = ['ransac', 'tr', 'rf', 'et', 'ada', 'gbr', 'xgboost', 'catboost']

### Firstly taking the Age column for prediction

In [None]:
# Compare candidate regressors for the first target in target_cols (age)
target = target_cols[0]
train_df = get_train_data(target)

# 80/20 train/hold-out split, mean imputation for numeric columns
setup_reg = setup(data=train_df, target=target, train_size=0.8,
                  numeric_imputation='mean', silent=True)

# 7-fold comparison ranked by MAE, skipping the blacklisted estimators
compare_models(exclude=blacklist_models, fold=7, sort='MAE', turbo=True)

#### Creating a Bayesian Ridge model

In [None]:
# Bayesian Ridge ('br') evaluated with 7-fold cross-validation
br_age = create_model(estimator='br', fold=7)

#### Tuning the Bayesian Ridge model to optimize on MAE (metric for the competition)

In [None]:
# Hyper-parameter search for the age model, optimised for MAE
tuned_br_age = tune_model(br_age, optimize='MAE')

#### Plotting the Learning Curve

In [None]:
# Learning-curve plot for the tuned age model
plot_model(tuned_br_age,plot = 'learning')

#### Plotting the Residuals

In [None]:
# Residuals plot for the tuned age model
plot_model(tuned_br_age, plot = 'residuals')

#### Plotting the feature importance

In [None]:
# Feature-importance plot for the tuned age model
plot_model(tuned_br_age,plot = 'feature')

#### Prediction on age for test dataset

In [None]:
# Score the test frame with the tuned age model
predictions =  predict_model(tuned_br_age, data=test_df)

In [None]:
# First rows of the subject Id next to the 'Label' column of the prediction frame
predictions[['Id','Label']].head()

### Predicting the other targets:

#### domain1_var1

In [None]:
# Same comparison as for age, now for target_cols[1] (domain1_var1)
target = target_cols[1]
train_df = get_train_data(target)

# 80/20 train/hold-out split, mean imputation for numeric columns
setup_reg = setup(data=train_df, target=target, train_size=0.8,
                  numeric_imputation='mean', silent=True)

# 7-fold comparison ranked by MAE, skipping the blacklisted estimators
compare_models(exclude=blacklist_models, fold=7, sort='MAE', turbo=True)

#### domain1_var2

In [None]:
# Same comparison as for age, now for target_cols[2] (domain1_var2)
target = target_cols[2]
train_df = get_train_data(target)

# 80/20 train/hold-out split, mean imputation for numeric columns
setup_reg = setup(data=train_df, target=target, train_size=0.8,
                  numeric_imputation='mean', silent=True)

# 7-fold comparison ranked by MAE, skipping the blacklisted estimators
compare_models(exclude=blacklist_models, fold=7, sort='MAE', turbo=True)

#### domain2_var1

In [None]:
# Same comparison as for age, now for target_cols[3] (domain2_var1)
target = target_cols[3]
train_df = get_train_data(target)

# 80/20 train/hold-out split, mean imputation for numeric columns
setup_reg = setup(data=train_df, target=target, train_size=0.8,
                  numeric_imputation='mean', silent=True)

# 7-fold comparison ranked by MAE, skipping the blacklisted estimators
compare_models(exclude=blacklist_models, fold=7, sort='MAE', turbo=True)

#### domain2_var2

In [None]:
# Same comparison as for age, now for target_cols[4] (domain2_var2)
target = target_cols[4]
train_df = get_train_data(target)

# 80/20 train/hold-out split, mean imputation for numeric columns
setup_reg = setup(data=train_df, target=target, train_size=0.8,
                  numeric_imputation='mean', silent=True)

# 7-fold comparison ranked by MAE, skipping the blacklisted estimators
compare_models(exclude=blacklist_models, fold=7, sort='MAE', turbo=True)

**OBSERVATIONS:**
1. age          - Bayesian Ridge
2. domain1_var1 - Linear Regression
3. domain1_var2 - Lasso Least Angle Regression
4. domain2_var1 - Linear Regression
5. domain2_var2 - Linear Regression


#### Function to tune and ensemble (Bagging) best model for each target:

In [None]:
# Collected by the loop in the next cell, one entry per target
models = []

# Estimator id chosen for each target (see the observations above)
target_models_dict = {
    'age': 'br',
    'domain1_var1':'lr',
    'domain1_var2':'llar',
    'domain2_var1':'lr',
    'domain2_var2':'lr',
}

def tune_and_ensemble(target):
    """Set up an experiment for `target`, then create, tune and ensemble its model.

    The estimator id is looked up in `target_models_dict`. Every step uses
    7 folds and tuning/ensembling optimise MAE. The result of
    `ensemble_model(..., choose_better=True)` is returned.
    """
    setup(
        data=get_train_data(target),
        target=target,
        train_size=0.8,
        numeric_imputation='mean',
        silent=True,
    )
    base_model = create_model(target_models_dict[target], fold=7)
    tuned = tune_model(base_model, fold=7, optimize='MAE')
    return ensemble_model(tuned, fold=7, optimize='MAE', choose_better=True)

In [None]:
# Build one tuned + ensembled model per target, in target_cols order
for target in target_cols:
    models.append(tune_and_ensemble(target))

#### Function to finalize and save model

In [None]:
# Show the list of models built above
models

In [None]:
def finalize_model_pipeline(model, target):
    """Refit `model` for `target` on all training data, save it and predict the test set.

    Parameters
    ----------
    model : estimator
        Model returned by `tune_and_ensemble(target)`.
    target : str
        Name of the target column the model predicts.

    Side effects: writes the model file `<target>_<estimator id>` and stores the
    test predictions in `test_df[target]`.
    """
    # finalize_model() refits against the experiment created by the most recent
    # setup() call, so re-create the experiment for this target first.
    # Otherwise the model would be refit on the last target that was tuned.
    setup(
        data = get_train_data(target),
        target = target,
        train_size=0.8,
        numeric_imputation = 'mean',
        silent = True
    )
    # finalize_model() returns the estimator trained on the full data (hold-out
    # included) and does not update `model` in place, so keep the return value.
    final_model = finalize_model(model)
    save_model(final_model, f'{target}_{target_models_dict[target]}', verbose=True)
    # Predict from the input features only. Target columns written to test_df
    # by earlier calls must not be passed to the model as inputs.
    feature_df = test_df.drop(columns=target_cols, errors='ignore')
    predictions = predict_model(final_model, data=feature_df)
    test_df[target] = predictions['Label'].values

In [None]:
# Finalize, save and predict with each model, matched to its target by position
for index, target in enumerate(target_cols):
    finalize_model_pipeline(models[index], target)

### Creating submission csv file

In [None]:
# Long format: one row per (subject, target) pair
sub_df = test_df[["Id", "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"]].melt(
    id_vars=["Id"],
    value_name="Predicted",
)
# Submission ids have the form "<subject id>_<target name>"
sub_df["Id"] = sub_df["Id"].astype("str") + "_" +  sub_df["variable"].astype("str")

sub_df = sub_df.drop(columns="variable").sort_values("Id")
# Five predictions are expected per test subject
assert len(sub_df) == 5 * len(test_df)

sub_df.to_csv("submission1.csv", index=False)

sub_df.head(15)

In [None]:
# Parameters of the first model in the list (built for target_cols[0])
models[0].get_params()

In [None]:
# Predict on the age training frame itself; this is an in-sample check
age_train_df = get_train_data('age')
preds = predict_model(models[0], data=age_train_df)
preds


In [None]:
# Sum of absolute errors divided by the sum of the true ages (in-sample)
np.mean((preds['age'] - preds['Label']).abs().sum(axis=0) / preds['age'].sum(axis=0))

In [None]:
# In-sample score per target (sum of absolute errors / sum of true values),
# followed by the plain, unweighted mean over the five targets
predictions = []
op = 'Label'
overall = 0.0
for i in range(5):
    train_df = get_train_data(target_cols[i])
    preds = predict_model(models[i], data=train_df)
    predictions.append(preds)
    truth = preds[target_cols[i]]
    abs_error_sum = np.sum(np.abs(truth - preds[op]), axis=0)
    score = np.mean(abs_error_sum / np.sum(truth, axis=0))
    overall += score
    print(f'{target_cols[i]}: \t{score}')
print(f"Overall score = {overall/5}")