In [None]:
# Download the `ch2better.nii` template file from the DPABI GitHub repository
# into the current working directory (shell command; requires network access and `wget`).
!wget https://github.com/Chaogan-Yan/DPABI/raw/master/Templates/ch2better.nii

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import re
import h5py
from tqdm import tqdm

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

import lightgbm as lgb

import nilearn as nl
import nilearn.plotting as nlplt
import nibabel as nib

SEED = 1337

In [None]:
# Read the three competition tables: FNC features, SBM loadings and training targets.
df_fnc = pd.read_csv('../input/trends-assessment-prediction/fnc.csv')
df_loading = pd.read_csv('../input/trends-assessment-prediction/loading.csv')
df_train_scores = pd.read_csv('../input/trends-assessment-prediction/train_scores.csv')

# Feature name lists; the first column of each feature table is skipped
# (presumably the `Id` key used for the merges below).
fnc_features = df_fnc.columns[1:].tolist()
loading_features = df_loading.columns[1:].tolist()
target_features = ['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']

# Rows that have scores are training rows; rows without a match after the
# left merge get NaN in `is_train`, which is turned into 0 below.
df_train_scores['is_train'] = 1

df = df_fnc.merge(df_loading, on='Id').merge(df_train_scores, how='left', on='Id')
df['is_train'] = df['is_train'].fillna(0).astype(np.uint8)
df['Id'] = df['Id'].astype(np.uint16)

# Shape / memory report for every source table, then for the merged frame.
for _label, _frame in (('Static FNC Correlation', df_fnc),
                       ('sMRI SBM Loadings', df_loading),
                       ('Train Scores', df_train_scores)):
    print(f'{_label} Shape = {_frame.shape}')
    print(f'{_label} Memory Usage = {_frame.memory_usage().sum() / 1024 ** 2:.2f} MB')
print('-------------------------------------')
print(f'Train & Test Set Shape = {df.shape}')
print(f'Train & Test Set Memory Usage = {df.memory_usage().sum() / 1024 ** 2:.2f} MB')

# Drop the source tables (and the loop references to them) so only `df` stays in memory.
del df_fnc, df_loading, df_train_scores, _label, _frame

## **1. Train Scores (Targets)**
`train_scores.csv` is the file which consists of target features. Those features are `age`, `domain1_var1`, `domain1_var2`, `domain2_var1` and `domain2_var2`. There are 5 target features to predict and submissions are scored using feature-weighted, normalized absolute errors.

**score** ${= \Large \sum\limits_{f} w_{f} \Big( \frac{\sum_{i} \big| y_{f,i}  - \hat{y}_{f,i}\big| }{\sum_{i} \hat{y}_{f,i}} \Big) }$

The weights are `[.3, .175, .175, .175, .175]` corresponding to features `[age, domain1_var1, domain1_var2, domain2_var1, domain2_var2]`. This means every target's normalized absolute error is independent of the others. The targets can be trained and predicted with a single model or with 5 different models.

Another important thing to consider is that the values in `train_scores.csv` are not the original age and raw assessment values. They have been transformed and de-identified to help protect subject identity and minimize the risk of unethical usage of the data. Nonetheless, they are directly derived from the original assessment values and, thus, associations with the provided features are equally likely.

**Before transformation, the age in the training set is rounded to nearest year for privacy reasons. However, age is not rounded to year (higher precision) in the test set. Thus, heavily overfitting to the training set age will very likely have a negative impact on your submissions.**

### **1.1 Target Distributions**

Even though some of the target features are slightly tailed, all of them approximately follow a normal distribution. Their descriptive statistical summaries are very similar to each other.

A small percentage of values are missing in all target features except `age`. Target features in the same domain have the same number of missing values, and they are missing in the same samples. Those are skipped in the score calculation. However, every row should be predicted in the submission file.

In [None]:
def plot_target(target_feature):
    """Print summary statistics and plot the distribution of one target column.

    Reads the notebook-level frame `df`. Statistics are computed on the whole
    column of `df`; the pandas reductions used here skip NaN by default, so
    rows without a value for the target do not contribute.

    Parameters
    ----------
    target_feature : str
        Name of a target column in `df`.

    Output
    ------
    Prints mean/median/std, quantiles, skew/kurtosis and the number of missing
    values among training rows (`is_train == 1`), then shows a two-panel
    figure: distribution plot (left) and normal probability plot (right).
    """

    values = df[target_feature]
    # probplot does not drop NaN itself: NaNs inflate the sample size used for
    # the theoretical quantiles and turn the fitted line into NaN, so only the
    # observed values are passed to the plots.
    observed = values.dropna()

    # Underline is exactly as wide as the header for any feature name.
    header = f'Target feature {target_feature} Statistical Analysis'
    print(f'{header}\n{"-" * len(header)}')

    print(f'Mean: {values.mean():.4}  -  Median: {values.median():.4}  -  Std: {values.std():.4}')
    print(f'Min: {values.min():.4}  -  25%: {values.quantile(0.25):.4}  -  50%: {values.quantile(0.5):.4}  -  75%: {values.quantile(0.75):.4}  -  Max: {values.max():.4}')
    print(f'Skew: {values.skew():.4}  -  Kurtosis: {values.kurtosis():.4}')

    is_train = df['is_train'] == 1
    missing_values_count = int((is_train & values.isnull()).sum())
    training_samples_count = int(is_train.sum())
    print(f'Missing Values: {missing_values_count}/{training_samples_count} ({missing_values_count * 100 / training_samples_count:.4}%)')

    fig, axes = plt.subplots(ncols=2, figsize=(18, 6), dpi=100)

    sns.distplot(observed, label=target_feature, ax=axes[0])
    stats.probplot(observed, plot=axes[1])

    for ax in axes:
        ax.tick_params(axis='x', labelsize=12)
        ax.tick_params(axis='y', labelsize=12)
        ax.set_xlabel('')
        ax.set_ylabel('')
    axes[0].set_title(f'{target_feature} Distribution in Training Set')
    axes[1].set_title(f'{target_feature} Probability Plot')
    plt.show()

for target_feature in target_features:
    plot_target(target_feature)

### **1.2 Age**

First and the most important target variable is `age`. It has the highest weight in the competition metric. It is different than other target variables because `age` in training set is rounded to nearest year for privacy reasons. However, `age` in test set is not rounded. Therefore, overfitting to those rounded values might be disastrous.

Cardinality of `age` in training set is very low due to rounding. There are **33** unique values in `age`, so adding random noise to it might help models to generalize better.

In [None]:
# Value counts of the `age` target (rows without an age are not counted).
# `plt.subplots` returns a (figure, axes) pair, so unpack both and draw on the
# axes explicitly instead of relying on the pyplot "current axes" state.
fig, ax = plt.subplots(figsize=(12, 6), dpi=100)

# No `label=` here: the original passed a loop variable left over from an
# earlier cell, which named a different target and broke on a fresh run of
# this cell alone.
sns.countplot(y=df['age'], ax=ax)

ax.set_xlabel('Value Counts', size=12)
ax.set_ylabel('')
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.set_title('age Value Counts in Training Set', size=12, pad=12)

plt.show()

### **1.3 Domain 1 Variables**
The second group of target variables consists of `domain1_var1` and `domain1_var2`. They are dependent on each other, and both of them have **438** missing values in the same samples.

There are anomalies in `domain1_var1` and `domain1_var2` values. Every unique value in `domain1_var1` can be another unique value in `domain1_var2`, and vice versa. Both of those target features have **5328** unique values. At first glance this could be attributed to the high precision of the target features, but there are duplicate samples, which can be seen below. Thus, `domain1_var1` and `domain1_var2` look like unique value pairs rather than continuous data.

In [None]:
# Rows whose `domain1_var1` equals one of two specific values (exact float
# match), restricted to the target columns and ordered by `domain1_var1`.
df_domain1_anomaly = (
    df.loc[df['domain1_var1'].isin([47.7159246378591, 40.0354975451987]), target_features]
      .sort_values(by='domain1_var1')
)
df_domain1_anomaly.style.background_gradient(cmap='viridis', subset=['age', 'domain1_var1', 'domain1_var2'])

Even though `domain1_var1` and `domain1_var2` are independent, adding them to each other creates a new target feature which has higher predictive power and stronger linear relationships. The distribution of the new target is smooth and less tailed.

In [None]:
# Compare `domain1_var1`, `domain1_var2` and their sum.
# The sum lives in a small local frame so the shared `df` is never modified
# (the original added a temporary `domain1` column and dropped it at the end,
# which left `df` polluted whenever the cell failed part-way through).
df_domain1 = df[['domain1_var1', 'domain1_var2']].copy()
df_domain1['domain1'] = df_domain1['domain1_var1'] + df_domain1['domain1_var2']

for target_feature in ['domain1_var1', 'domain1_var2', 'domain1']:
    print(f'Target feature {target_feature} Statistical Analysis\n{"-" * (36 + len(target_feature))}')
    print(f'Mean: {df_domain1[target_feature].mean():.4}  -  Median: {df_domain1[target_feature].median():.4}  -  Std: {df_domain1[target_feature].std():.4}')
    print(f'Min: {df_domain1[target_feature].min():.4}  -  25%: {df_domain1[target_feature].quantile(0.25):.4}  -  50%: {df_domain1[target_feature].quantile(0.5):.4}  -  75%: {df_domain1[target_feature].quantile(0.75):.4}  -  Max: {df_domain1[target_feature].max():.4}')
    print(f'Skew: {df_domain1[target_feature].skew():.4}  -  Kurtosis: {df_domain1[target_feature].kurtosis():.4}\n')

fig, axes = plt.subplots(ncols=3, figsize=(18, 6), dpi=100)

domain1_panels = [
    ('domain1_var1', 'domain1_var1 Distribution'),
    ('domain1_var2', 'domain1_var2 Distribution'),
    ('domain1', 'domain1_var1 + domain1_var2 Distribution'),
]
for ax, (column, title) in zip(axes, domain1_panels):
    sns.distplot(df_domain1[column], ax=ax)
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(title, size=15, pad=15)

# Lay out the panels after they are drawn and titled, so the padding accounts for them.
plt.tight_layout()
plt.show()

### **1.4 Domain 2 Variables**

The third group of target variables consists of `domain2_var1` and `domain2_var2`. They are dependent on each other as well, and both of them have **39** missing values in the same samples.

The anomalies in `domain1_var1` and `domain1_var2` values exist in `domain2_var1` and `domain2_var2` and it is more severe. Both of those target features have **2038** unique values. They also look like unique value pairs rather than continuous data. 

In [None]:
# Rows whose `domain2_var1` equals one of three specific values (exact float
# match), restricted to the target columns and ordered by `domain2_var1`.
df_domain2_anomaly = (
    df.loc[df['domain2_var1'].isin([39.350140365296397, 46.244030185775202, 43.239463678844999]), target_features]
      .sort_values(by='domain2_var1')
)

df_domain2_anomaly.style.background_gradient(cmap='viridis', subset=['age', 'domain2_var1', 'domain2_var2'])

Target features `domain2_var1` and `domain2_var2` are independent, and adding them to each other creates a new target which has a higher predictive power and stronger linear relationships as well. The distribution of the new target becomes more bumpy and tailed.

In [None]:
# Compare `domain2_var1`, `domain2_var2` and their sum.
# The sum lives in a small local frame so the shared `df` is never modified
# (the original added a temporary `domain2` column and dropped it at the end,
# which left `df` polluted whenever the cell failed part-way through).
df_domain2 = df[['domain2_var1', 'domain2_var2']].copy()
df_domain2['domain2'] = df_domain2['domain2_var1'] + df_domain2['domain2_var2']

for target_feature in ['domain2_var1', 'domain2_var2', 'domain2']:
    print(f'Target feature {target_feature} Statistical Analysis\n{"-" * (36 + len(target_feature))}')
    print(f'Mean: {df_domain2[target_feature].mean():.4}  -  Median: {df_domain2[target_feature].median():.4}  -  Std: {df_domain2[target_feature].std():.4}')
    print(f'Min: {df_domain2[target_feature].min():.4}  -  25%: {df_domain2[target_feature].quantile(0.25):.4}  -  50%: {df_domain2[target_feature].quantile(0.5):.4}  -  75%: {df_domain2[target_feature].quantile(0.75):.4}  -  Max: {df_domain2[target_feature].max():.4}')
    print(f'Skew: {df_domain2[target_feature].skew():.4}  -  Kurtosis: {df_domain2[target_feature].kurtosis():.4}\n')

fig, axes = plt.subplots(ncols=3, figsize=(18, 6), dpi=100)

domain2_panels = [
    ('domain2_var1', 'domain2_var1 Distribution'),
    ('domain2_var2', 'domain2_var2 Distribution'),
    ('domain2', 'domain2_var1 + domain2_var2 Distribution'),
]
for ax, (column, title) in zip(axes, domain2_panels):
    sns.distplot(df_domain2[column], ax=ax)
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(title, size=15, pad=15)

# Lay out the panels after they are drawn and titled, so the padding accounts for them.
plt.tight_layout()
plt.show()

### **1.5 Target Correlations**

Target features are not correlated with each other too much. The strongest correlations are between `age` and `domain1_var1` (**0.34**), and between  `age` and `domain2_var1` (**0.23**). There might be a relationship between  `age` and var1 features.

Correlation between `domain1_var1` and `domain1_var2` is **0** which strongly indicates that Domain 1 is a single variable and it has two independent components.

Correlation between `domain2_var1` and `domain2_var2` is **0.18**, which is too high for components produced by either PCA or ICA, but they are probably two independent components of a single variable as well.

In [None]:
# Annotated correlation heatmap of the five target columns
# (`DataFrame.corr` with its default settings).
fig = plt.figure(figsize=(10, 10), dpi=100)

target_corr = df[['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']].corr()
sns.heatmap(
    target_corr,
    cmap='coolwarm',
    square=True,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 14},
)

# Style the axes that are current after the heatmap call.
ax = plt.gca()
ax.tick_params(axis='x', labelsize=14, rotation=45)
ax.tick_params(axis='y', labelsize=14, rotation=0)
ax.set_title('Target Features Correlations', size=18, pad=18)

plt.show()

## **2. Source-based Morphometry Loadings**

The first set of features are source-based morphometry (SBM) loadings. These are subject-level weights from a group-level ICA decomposition of gray matter concentration maps from structural MRI (sMRI) scans. Those features are for both training and test samples.

There are **26** features in the `loading.csv` file, not counting `Id`. Those features are named from `IC_01` to `IC_30`, but `IC_19`, `IC_23`, `IC_25` and `IC_27` don't exist. They correspond to parts of the brain, and their descriptions are listed below:

```
IC_01 - Cerebellum
IC_02 - ACC+mpfc
IC_03 - Caudate
IC_04 - Cerebellum
IC_05 - Calcarine
IC_06 - Calcarine
IC_07 - Precuneus+PCC
IC_08 - Frontal
IC_09 - IPL+AG
IC_10 - MTG
IC_11 - Frontal
IC_12 - SMA
IC_13 - Temporal Pole
IC_14 - Temporal Pole + Fusiform
IC_15 - STG
IC_16 - Middle Occipital?
IC_17 - Cerebellum
IC_18 - Cerebellum
IC_20 - MCC
IC_21 - Temporal Pole + Cerebellum
IC_22 - Insula + Caudate
IC_24 - IPL+Postcentral
IC_26 - Inf+Mid Frontal
IC_28 - Calcarine
IC_29 - MTG
IC_30 - Inf Frontal
```

### **2.1 Source-based Morphometry Loadings' Distributions**

All of the loading features follow a normal distribution. Their distributions and descriptive statistical summary in training and test samples are very similar except `IC_20`. It is an exception because the distribution of `IC_20` in test samples is shifted. This feature may require some preprocessing.

None of the loading features have a visible relationship with any of the targets. Data points of loading features are scattered around the means of `domain1_var1`, `domain1_var2`, `domain2_var1`, `domain2_var2`, however `age` has a tiny relationship with loading features. This relationship is not easy to detect but it looks like `age` is easier to predict than other target features.

There are some data points which can be classified as mild outliers in all loading features. Models used for predicting targets should be robust to outliers. The most extreme outlier in loading features belongs to `IC_02`. It is a single data point and its locations are very close in every target.

In [None]:
def plot_loading(loading_feature):
    """Print train/test summary statistics and plot one loading feature.

    Reads the notebook-level frame `df` and splits it on `is_train`
    (1 = training rows, 0 = test rows).

    Parameters
    ----------
    loading_feature : str
        Name of a loading column in `df`.

    Output
    ------
    Prints mean/median/std, min/max, skew/kurtosis and missing-value counts
    separately for training and test rows, then shows a 2x3 figure: the
    train vs. test distribution of the feature, followed by one scatter plot
    of the feature against each of the five target columns.
    """

    df_train = df.loc[df['is_train'] == 1]
    df_test = df.loc[df['is_train'] == 0]
    train_values = df_train[loading_feature]
    test_values = df_test[loading_feature]

    # Underline is exactly as wide as the header for any feature name.
    header = f'Loading feature {loading_feature} Statistical Analysis'
    print(f'{header}\n{"-" * len(header)}')

    print(f'Training Mean: {float(train_values.mean()):.4}  - Training Median: {float(train_values.median()):.4} - Training Std: {float(train_values.std()):.4}')
    print(f'Test Mean: {float(test_values.mean()):.4}  - Test Median: {float(test_values.median()):.4} - Test Std: {float(test_values.std()):.4}')
    print(f'Training Min: {float(train_values.min()):.4}  - Training Max: {float(train_values.max()):.4}')
    # The original labelled the test maximum as "Training Max".
    print(f'Test Min: {float(test_values.min()):.4}  - Test Max: {float(test_values.max()):.4}')
    print(f'Training Skew: {float(train_values.skew()):.4}  - Training Kurtosis: {float(train_values.kurtosis()):.4}')
    print(f'Test Skew: {float(test_values.skew()):.4}  - Test Kurtosis: {float(test_values.kurtosis()):.4}')
    training_missing_values_count = int(train_values.isnull().sum())
    test_missing_values_count = int(test_values.isnull().sum())
    training_samples_count = df_train.shape[0]
    test_samples_count = df_test.shape[0]
    print(f'Training Missing Values: {training_missing_values_count}/{training_samples_count} ({training_missing_values_count * 100 / training_samples_count:.4}%)')
    print(f'Test Missing Values: {test_missing_values_count}/{test_samples_count} ({test_missing_values_count * 100 / test_samples_count:.4}%)')

    fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(25, 12), dpi=100, constrained_layout=True)
    title_size = 18
    label_size = 18

    # Panel 1: training vs. test distribution of the loading feature
    dist_ax = axes[0][0]
    sns.distplot(train_values, label='Training', ax=dist_ax)
    sns.distplot(test_values, label='Test', ax=dist_ax)
    dist_ax.set_xlabel('')
    dist_ax.tick_params(axis='x', labelsize=label_size)
    dist_ax.tick_params(axis='y', labelsize=label_size)
    dist_ax.legend()
    dist_ax.set_title(f'{loading_feature} Distribution in Training and Test Set', size=title_size, pad=title_size)

    # Panels 2-6: loading feature vs. each target, filled row by row
    scatter_targets = ('age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2')
    for ax, target in zip(axes.flatten()[1:], scatter_targets):
        # x/y are passed by keyword: newer seaborn releases reject positional
        # data vectors, and the keyword form also works on older ones.
        sns.scatterplot(x=df[loading_feature], y=df[target], ax=ax)
        ax.set_title(f'{loading_feature} vs {target}', size=title_size, pad=title_size)
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.tick_params(axis='x', labelsize=label_size)
        ax.tick_params(axis='y', labelsize=label_size)

    plt.show()

for loading_feature in sorted(loading_features):
    plot_loading(loading_feature)


### **2.2 Source-based Morphometry Loadings' Correlations**

There are strong correlations between `age` and some loading features that exceed **-0.4**, but none of the loading features are correlated with other targets.

Loading features have decent correlations among themselves, some of which exceed **0.5** or fall below **-0.5**. Some of those high positive correlations belong to feature groups which are from the same part of the brain. However, not all features from the same part of the brain are correlated with each other, so there is no pattern here.

In [None]:
# Annotated correlation heatmap of all loading features (sorted by name)
# followed by the five target columns.
loading_target_features = [*sorted(loading_features), 'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']

fig = plt.figure(figsize=(30, 30), dpi=100)

loading_target_corr = df[loading_target_features].corr()
sns.heatmap(
    loading_target_corr,
    cmap='coolwarm',
    square=True,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 15},
)

# Style the axes that are current after the heatmap call.
ax = plt.gca()
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=18, rotation=45)
ax.set_title('Target and Loading Features Correlations', size=25, pad=25)

plt.show()

## **3. Static Functional Network Connectivity**
The second set of features are static functional network connectivity (FNC) matrices. These are the subject-level cross-correlation values among 53 component timecourses estimated from GIG-ICA of resting state functional MRI (fMRI).

There are **1378** features in `fnc.csv` file without `Id`. Those features are named as `Network1(X)_vs_Network2(Y)` and there are **7** different networks. Network names and abbreviations are listed below:

```
SCN - Sub-cortical Network
ADN - Auditory Network
SMN - Sensorimotor Network
VSN - Visual Network
CON - Cognitive-control Network
DMN - Default-mode Network
CBN - Cerebellar Network
```

Those groups might be useful to analyze **1378** features part by part.

### **3.1. Sub-cortical Network (SCN) Features**

First FNC feature sub-group is SCN features. This is the smallest sub-group of fnc features and there are **10** features in it. This sub-group has cross-correlations with only itself.

SCN features are strongly correlated with each other, but none of the SCN features are correlated with targets. Correlations between target and SCN features are very weak which can be seen at the bottom and right end.

In [None]:
# FNC columns whose name starts with an SCN component, i.e. `SCN(n)_vs_XXX(m)`.
scn_pattern = r'SCN\(\d+\)_vs_[A-Z]+\(\d+\)'
scn_features = list(filter(re.compile(scn_pattern).match, fnc_features))
scn_target_features = [*sorted(scn_features), 'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']

fig = plt.figure(figsize=(25, 25), dpi=100)

# Annotated correlation matrix of the SCN features plus the targets.
scn_target_corr = df[scn_target_features].corr()
sns.heatmap(
    scn_target_corr,
    cmap='coolwarm',
    square=True,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 15},
)

# Style the axes that are current after the heatmap call.
ax = plt.gca()
ax.tick_params(axis='x', labelsize=18, rotation=75)
ax.tick_params(axis='y', labelsize=18, rotation=0)
ax.set_title('Target and SCN Features Correlations', size=25, pad=25)

plt.show()

### **3.2. Auditory Network (ADN) Features**

Second FNC feature sub-group is ADN features. This is also the second smallest sub-group of FNC features and there are **11** features in it. This sub-group has cross-correlations with SCN (10) and itself (1).

ADN features are strongly correlated with each other only when ADN is cross-correlated with SCN. There are 10 ADN(2) vs SCN(5) features and they have very strong correlations. There is only one ADN vs ADN feature (`ADN(56)_vs_ADN(21)`) and it doesn't have any significant correlation with anything. Correlations between target and ADN features are very weak, which can be seen at the bottom and right end.

In [None]:
# FNC columns whose name starts with an ADN component, i.e. `ADN(n)_vs_XXX(m)`.
adn_pattern = r'ADN\(\d+\)_vs_[A-Z]+\(\d+\)'
adn_features = [name for name in fnc_features if re.match(adn_pattern, name) is not None]
adn_target_features = sorted(adn_features)
adn_target_features.extend(['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2'])

fig = plt.figure(figsize=(25, 25), dpi=100)

# Annotated correlation matrix of the ADN features plus the targets.
heatmap_options = dict(annot=True, square=True, cmap='coolwarm', annot_kws={'size': 15}, fmt='.2f')
sns.heatmap(df[adn_target_features].corr(), **heatmap_options)

# Style the axes that are current after the heatmap call.
ax = plt.gca()
ax.tick_params(axis='x', labelsize=18, rotation=75)
ax.tick_params(axis='y', labelsize=18, rotation=0)
ax.set_title('Target and ADN Features Correlations', size=25, pad=25)

plt.show()

### **3.3. Sensorimotor Network (SMN) Features**

Third FNC feature sub-group is SMN features. This is a large sub-group compared to previous ones and there are **99** features in it. This sub-group has cross-correlations with SCN (45), ADN (18) and itself (36).

It is hard to identify correlations at feature level on this scale but it still gives lots of information. Since the feature names are sorted by alphabetical order, the correlations of feature groups are easy to detect.

Every cross-correlation of SMN have very strong correlations inside the second network groups. This explains the bright red blocks near the diagonal axis.

* `SMN(X)_vs_ADN(Y)`: `X` is strongly correlated with every different `Y` value for `ADN` 
* `SMN(X)_vs_SCN(Y)`: `X` is strongly correlated with every different `Y` value for `SCN` 
* `SMN(X)_vs_SMN(Y)`: `X` is strongly correlated with every different `Y` value for `SMN`

Every cross-correlation of SMN have moderate correlations inside the first network groups. This explains the repeating orange and blue color blocks along the vertical and horizontal axis.

* `SMN(X)_vs_ADN(Y)`: `Y` is moderately correlated with every different `X` value for `ADN` 
* `SMN(X)_vs_SCN(Y)`: `Y` is moderately correlated with every different `X` value for `SCN` 
* `SMN(X)_vs_SMN(Y)`: `Y` is moderately correlated with every different `X` value for `SMN`

Correlations between target and SMN features are very weak which can be seen at the bottom and right end.

In [None]:
# FNC columns whose name starts with an SMN component, i.e. `SMN(n)_vs_XXX(m)`.
smn_pattern = r'SMN\(\d+\)_vs_[A-Z]+\(\d+\)'
smn_features = list(filter(re.compile(smn_pattern).match, fnc_features))
smn_target_features = [*sorted(smn_features), 'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']

fig = plt.figure(figsize=(40, 40), dpi=100)

# Correlation matrix of the SMN features plus the targets, drawn without
# cell annotations or tick labels.
smn_target_corr = df[smn_target_features].corr()
sns.heatmap(
    smn_target_corr,
    cmap='coolwarm',
    square=True,
    annot=False,
    xticklabels=False,
    yticklabels=False,
)

plt.gca().set_title('Target and SMN Features Correlations', size=50, pad=50)
plt.show()

### **3.4. Visual Network (VSN) Features** 

Fourth FNC feature sub-group is VSN features. This is a very large sub-group compared to previous ones and there are **180** features in it. This sub-group has cross-correlations with SCN (45), ADN (18), SMN (81) and itself (36).

It is hard to identify correlations at feature level on this scale but it still gives lots of information. Since the feature names are sorted by alphabetical order, the correlations of feature groups are easy to detect. The same pattern from SMN features can be seen on VSN features as well.

Every cross-correlation of VSN have very strong correlations inside the second network groups. This explains the bright red blocks near the diagonal axis.

* `VSN(X)_vs_ADN(Y)`: `X` is strongly correlated with every different `Y` value for `ADN` 
* `VSN(X)_vs_SCN(Y)`: `X` is strongly correlated with every different `Y` value for `SCN` 
* `VSN(X)_vs_SMN(Y)`: `X` is strongly correlated with every different `Y` value for `SMN`
* `VSN(X)_vs_VSN(Y)`: `X` is strongly correlated with every different `Y` value for `VSN`

Every cross-correlation of VSN have moderate correlations inside the first network groups. This explains the repeating orange and blue color blocks along the vertical and horizontal axis.

* `VSN(X)_vs_ADN(Y)`: `Y` is moderately correlated with every different `X` value for `ADN` 
* `VSN(X)_vs_SCN(Y)`: `Y` is moderately correlated with every different `X` value for `SCN` 
* `VSN(X)_vs_SMN(Y)`: `Y` is moderately correlated with every different `X` value for `SMN`
* `VSN(X)_vs_VSN(Y)`: `Y` is moderately correlated with every different `X` value for `VSN`

Correlations between target and VSN features are very weak which can be seen at the bottom and right end.

In [None]:
# FNC columns whose name starts with a VSN component, i.e. `VSN(n)_vs_XXX(m)`.
vsn_pattern = r'VSN\(\d+\)_vs_[A-Z]+\(\d+\)'
vsn_features = [name for name in fnc_features if re.match(vsn_pattern, name) is not None]
vsn_target_features = sorted(vsn_features)
vsn_target_features.extend(['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2'])

fig = plt.figure(figsize=(40, 40), dpi=100)

# Correlation matrix of the VSN features plus the targets, drawn without
# cell annotations or tick labels.
heatmap_options = dict(annot=False, square=True, cmap='coolwarm', yticklabels=False, xticklabels=False)
sns.heatmap(df[vsn_target_features].corr(), **heatmap_options)

plt.gca().set_title('Target and VSN Features Correlations', size=50, pad=50)
plt.show()

### **3.5. Cognitive-control Network (CON) Features** 

Fifth FNC feature sub-group is CON features. This is the largest sub-group and there are **561** features in it. This sub-group has cross-correlations with SCN (85), ADN (34), SMN (153), VSN (153) and itself (136).

It is hard to identify correlations even at block level on this scale, but it still gives lots of information. Since the feature names are sorted in alphabetical order, the correlations of feature groups are easy to detect. The same pattern from SMN and VSN features can be seen on CON features as well.

Every cross-correlation of CON have very strong correlations inside the second network groups. This explains the bright red blocks near the diagonal axis.

* `CON(X)_vs_ADN(Y)`: `X` is strongly correlated with every different `Y` value for `ADN` 
* `CON(X)_vs_SCN(Y)`: `X` is strongly correlated with every different `Y` value for `SCN` 
* `CON(X)_vs_SMN(Y)`: `X` is strongly correlated with every different `Y` value for `SMN`
* `CON(X)_vs_VSN(Y)`: `X` is strongly correlated with every different `Y` value for `VSN`
* `CON(X)_vs_CON(Y)`: `X` is strongly correlated with every different `Y` value for `CON`

Every cross-correlation of CON has weak correlations inside the first network groups. This explains the light orange and light blue color blocks everywhere except near the diagonal axis.

Correlations between target and CON features are very weak which can be seen at the bottom and right end.

In [None]:
# FNC columns whose name starts with a CON component, i.e. `CON(n)_vs_XXX(m)`.
con_pattern = r'CON\(\d+\)_vs_[A-Z]+\(\d+\)'
con_features = list(filter(re.compile(con_pattern).match, fnc_features))
con_target_features = [*sorted(con_features), 'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']

fig = plt.figure(figsize=(50, 50), dpi=100)

# Correlation matrix of the CON features plus the targets, drawn without
# cell annotations or tick labels.
con_target_corr = df[con_target_features].corr()
sns.heatmap(
    con_target_corr,
    cmap='coolwarm',
    square=True,
    annot=False,
    xticklabels=False,
    yticklabels=False,
)

plt.gca().set_title('Target and CON Features Correlations', size=50, pad=50)
plt.show()

### **3.6. Default-mode Network (DMN) Features** 

Sixth FNC feature sub-group is DMN features. This is the second largest sub-group and there are **315** features in it. This sub-group has cross-correlations with SCN (35), ADN (14), SMN (63), VSN (63), CON (119) and itself (21).

It is hard to identify correlations at feature level on this scale but it still gives lots of information. Since the feature names are sorted by alphabetical order, the correlations of feature groups are easy to detect. The same pattern from SMN, VSN and CON features can be seen on DMN features as well.

Every cross-correlation of DMN have very strong correlations inside the second network groups. This explains the bright red and blue blocks near the diagonal axis.

* `DMN(X)_vs_ADN(Y)`: `X` is strongly correlated with every different `Y` value for `ADN` 
* `DMN(X)_vs_SCN(Y)`: `X` is strongly correlated with every different `Y` value for `SCN` 
* `DMN(X)_vs_SMN(Y)`: `X` is strongly correlated with every different `Y` value for `SMN`
* `DMN(X)_vs_VSN(Y)`: `X` is strongly correlated with every different `Y` value for `VSN`
* `DMN(X)_vs_CON(Y)`: `X` is strongly correlated with every different `Y` value for `CON`
* `DMN(X)_vs_DMN(Y)`: `X` is strongly correlated with every different `Y` value for `DMN`

Every cross-correlation of DMN has weak correlations inside the first network groups. This explains the light orange and light blue color blocks everywhere except near the diagonal axis. There are also repeating light orange and light blue diagonal lines.

Correlations between target and DMN features are very weak which can be seen at the bottom and right end.

In [None]:
# FNC columns whose name starts with a DMN component, i.e. `DMN(n)_vs_XXX(m)`.
dmn_pattern = r'DMN\(\d+\)_vs_[A-Z]+\(\d+\)'
dmn_features = [name for name in fnc_features if re.match(dmn_pattern, name) is not None]
dmn_target_features = sorted(dmn_features)
dmn_target_features.extend(['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2'])

fig = plt.figure(figsize=(50, 50), dpi=100)

# Correlation matrix of the DMN features plus the targets, drawn without
# cell annotations or tick labels.
heatmap_options = dict(annot=False, square=True, cmap='coolwarm', yticklabels=False, xticklabels=False)
sns.heatmap(df[dmn_target_features].corr(), **heatmap_options)

plt.gca().set_title('Target and DMN Features Correlations', size=50, pad=50)
plt.show()

### **3.7. Cerebellar Network (CBN) Features** 

Seventh FNC feature sub-group is CBN features. This is a large sub-group and there are **202** features in it. This sub-group has cross-correlations with SCN (20), ADN (8), SMN (36), VSN (36), CON (68), DMN (28) and itself (6).

It is hard to identify correlations at feature level on this scale but it still gives lots of information. Since the feature names are sorted by alphabetical order, the correlations of feature groups are easy to detect. The same pattern from SMN, VSN, CON and DMN features can be seen on CBN features as well.

Every cross-correlation of CBN have very strong correlations inside the second network groups. This explains the bright red and blue blocks near the diagonal axis.

* `CBN(X)_vs_ADN(Y)`: `X` is strongly correlated with every different `Y` value for `ADN` 
* `CBN(X)_vs_SCN(Y)`: `X` is strongly correlated with every different `Y` value for `SCN` 
* `CBN(X)_vs_SMN(Y)`: `X` is strongly correlated with every different `Y` value for `SMN`
* `CBN(X)_vs_VSN(Y)`: `X` is strongly correlated with every different `Y` value for `VSN`
* `CBN(X)_vs_CON(Y)`: `X` is strongly correlated with every different `Y` value for `CON`
* `CBN(X)_vs_DMN(Y)`: `X` is strongly correlated with every different `Y` value for `DMN`
* `CBN(X)_vs_CBN(Y)`: `X` is strongly correlated with every different `Y` value for `CBN`

Every cross-correlation of CBN have moderate correlations inside the first network groups. This explains the repeating orange and blue color blocks along the vertical and horizontal axis, and repeating bright red and orange diagonal lines.

* `CBN(X)_vs_ADN(Y)`: `Y` is strongly correlated with every different `X` value for `ADN` 
* `CBN(X)_vs_SCN(Y)`: `Y` is strongly correlated with every different `X` value for `SCN` 
* `CBN(X)_vs_SMN(Y)`: `Y` is strongly correlated with every different `X` value for `SMN`
* `CBN(X)_vs_VSN(Y)`: `Y` is strongly correlated with every different `X` value for `VSN`
* `CBN(X)_vs_CON(Y)`: `Y` is strongly correlated with every different `X` value for `CON`
* `CBN(X)_vs_DMN(Y)`: `Y` is strongly correlated with every different `X` value for `DMN`
* `CBN(X)_vs_CBN(Y)`: `Y` is strongly correlated with every different `X` value for `CBN`

Correlations between target and CBN features are very weak which can be seen at the bottom and right end.

In [None]:
# Select every FNC column whose first network is CBN
cbn_pattern = r'CBN\(\d+\)_vs_[A-Z]+\(\d+\)'
cbn_features = list(filter(re.compile(cbn_pattern).match, fnc_features))

# Alphabetically sorted CBN features come first, the five targets are appended last
# so that they end up on the bottom rows and rightmost columns of the heatmap
cbn_target_features = [*sorted(cbn_features), 'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']

fig, ax = plt.subplots(figsize=(50, 50), dpi=100)

sns.heatmap(
    df[cbn_target_features].corr(),
    ax=ax,
    cmap='coolwarm',
    square=True,
    annot=False,
    xticklabels=False,
    yticklabels=False,
)

ax.set_title('Target and CBN Features Correlations', size=50, pad=50)
plt.show()

### **3.8. Inference**
FNC features from the same group are more likely to be correlated with each other. Correlations from different groups don't seem to be very strong. Even in the same groups, there are lots of weak correlations. FNC features from the same groups must be in the same network in order to have strong correlation. FNC features in the same network don't have strong correlations if they are not in the same group.

The entire heatmap of FNC features is rendered below but it is very hard to derive any meaning from it.

In [None]:
# Correlation heatmap of all FNC features.
# Only `fnc_features` are plotted here (no target columns), so the title no longer
# claims that targets are part of the figure.
fig, ax = plt.subplots(figsize=(60, 60), dpi=100)

sns.heatmap(df[fnc_features].corr(),
            annot=False,
            square=True,
            cmap='coolwarm',
            yticklabels=False,
            xticklabels=False,
            ax=ax)

ax.set_title('All FNC Features Correlations', size=60, pad=60)
plt.show()

## **4. Sites**

`reveal_ID_site2.csv` is a list of subject IDs whose data was collected with a different scanner than the train samples.

> Models are expected to generalize on data from a different scanner/site (site 2). All subjects from site 2 were assigned to the test set, so their scores are not available. While there are fewer site 2 subjects than site 1 subjects in the test set, the total number of subjects from site 2 will not be revealed until after the end of the competition. To make it more interesting, the IDs of some site 2 subjects have been revealed below. Use this to inform your models about site effects. Site effects are a form of bias. To generalize well, models should learn features that are not related to or driven by site effects.

This paragraph is taken from the data tab. It means data is collected from two different sources, site 1 and 2. This is the reason for the train and test set feature distribution discrepancies.

* All of the data in training set is collected from site 1.
* All of the data collected from site 2 are in test set but the entire test set is not collected from site 2.

In this case, there is no way to find out site 1 samples in test set perfectly, but they can be predicted with a classifier by looking at feature distributions.

In [None]:
# IDs listed in reveal_ID_site2.csv, flattened to a 1D array
site2_ids = pd.read_csv('../input/trends-assessment-prediction/reveal_ID_site2.csv').values.ravel()

# Site encoding: 1 for training rows, 0 for every other row ...
df['site'] = (df['is_train'] == 1).astype(np.uint8)
# ... and 2 for the rows whose Id is in the revealed site 2 list (the column stays uint8)
df.loc[df['Id'].isin(site2_ids), 'site'] = 2

del site2_ids

In [None]:
# Count the rows of every site once, ordered by site code (0, 1, 2) which is also
# the order in which the bars are labelled below.
site_counts = df['site'].value_counts().sort_index()
site_percentages = (site_counts / site_counts.sum() * 100).round(2)
site_names = {0: 'No Site', 1: 'Site 1', 2: 'Site 2'}

fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
sns.barplot(x=site_counts.index, y=site_counts.values, ax=ax)

# Labels are looked up by site code instead of by the count-ranked position of
# value_counts(), so they stay correct whatever the relative group sizes are.
ax.set_ylabel('')
ax.set_xticks(np.arange(len(site_counts)))
ax.set_xticklabels([f'{site_names[site]} (%{site_percentages.loc[site]})' for site in site_counts.index])
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.set_title('Site Counts', size=15, pad=15)

plt.show()

### **4.1. Loading Features Site Distributions**

It is hard to make an accurate analysis on feature distributions for different sites because sample sizes are different for site 1 and 2. There are **5877** site 1 samples (training set) and **510** verified site 2 samples. That's why train/test distribution of the feature should be plotted as well in order to measure the shift in distribution.

There are three types of shifts in the distributions plotted below:

* Mild shift in site 1/2 feature distribution but it can't be seen on train/test feature distribution (Mean shift close to ±0.001). Those shifts can be observed due to small sample size of site 2 subjects, but they are suspicious.
* Severe shift in site 1/2 feature distribution that it can also be seen on train/test feature distribution (Mean shift greater than ±0.002). Those features should be eliminated.
* No shift in both site 1/2 and train/test feature distributions. Those features can be trusted. 

In [None]:
def plot_site_distribution(feature):

    """
    Print summary statistics of a feature for site 1 and site 2, then plot its
    distribution by site (left) and by training/test split (right)

    Parameters
    ----------
    feature : string
    name of the column in the global `df` to analyze
    """

    site1_values = df.loc[df['site'] == 1, feature]
    site2_values = df.loc[df['site'] == 2, feature]
    train_values = df.loc[df['is_train'] == 1, feature]
    test_values = df.loc[df['is_train'] == 0, feature]
    site_samples = (('Site 1', site1_values), ('Site 2', site2_values))

    print(f'{feature} Site Distribution Analysis\n{"-" * 32}')

    # Central tendency and spread, one line per site
    for site_name, values in site_samples:
        print(f'{site_name} Mean: {values.mean():.4}  -  Median: {values.median():.4}  -  Std: {values.std():.4}')

    # Five-number summary, one line per site (the first line is preceded by a blank line)
    for line_start, (site_name, values) in zip(('\n', ''), site_samples):
        lowest, highest = values.min(), values.max()
        q25, q50, q75 = values.quantile(0.25), values.quantile(0.5), values.quantile(0.75)
        print(f'{line_start}{site_name} Min: {lowest:.4}  -  25%: {q25:.4}  -  50%: {q50:.4}  -  75%: {q75:.4}  -  Max: {highest:.4}')

    fig, (site_ax, split_ax) = plt.subplots(ncols=2, figsize=(20, 6), dpi=100)

    sns.distplot(site1_values, label='Site 1', ax=site_ax)
    sns.distplot(site2_values, label='Site 2', ax=site_ax)
    sns.distplot(train_values, label='Training', ax=split_ax)
    sns.distplot(test_values, label='Test', ax=split_ax)

    for ax in (site_ax, split_ax):
        ax.set_xlabel('')
        ax.tick_params(axis='x', labelsize=15)
        ax.tick_params(axis='y', labelsize=15)
        ax.legend()
    site_ax.set_title(f'{feature} Distribution in Site 1 and 2', size=18, pad=18)
    split_ax.set_title(f'{feature} Distribution in Training and Test Set', size=18, pad=18)

    plt.show()

# Analyze every loading feature in alphabetical order
for loading_feature in sorted(loading_features):
    plot_site_distribution(loading_feature)

### **4.2. FNC Features Site Distributions**

There are 1378 FNC features so they can't be analyzed one by one. Kolmogorov–Smirnov test calculated on site 1 and site 2 feature distribution can be used for feature selection. If the KS statistic is small or the p-value is high, then we cannot reject the hypothesis that the distributions of the two samples are the same. I used **0.125** for KS statistic threshold and selected features that are exceeding the threshold.

In [None]:
def get_distribution_difference(feature):
    """
    Run a two-sample Kolmogorov-Smirnov test between site 1 and site 2 values of a feature

    Parameters
    ----------
    feature : string
    name of the column in the global `df` to test

    Returns
    -------
    (feature, ks_result) : tuple
    the feature name and the result object returned by scipy.stats.ks_2samp
    """
    is_site1 = df['site'] == 1
    is_site2 = df['site'] == 2
    ks_result = stats.ks_2samp(df.loc[is_site1, feature].values, df.loc[is_site2, feature].values)
    return feature, ks_result

# Keep the FNC features whose KS statistic exceeds the threshold
ks_threshold = 0.125
shifted_features = [
    name
    for name, ks_result in map(get_distribution_difference, fnc_features)
    if ks_result.statistic > ks_threshold
]

The same things in loading features can be observed in FNC features as well. Slightly shifted train/test feature distributions are shifted even more in site 1/2. However, none of the feature shifts are as severe as `IC_20` in loading features.

**0.125** is an arbitrary number and distribution difference can be measured with other ways as well, but feature selection should definitely be based on site 1 and 2 feature distribution difference.

In [None]:
# Print the statistics and draw the site / train-test distributions of every
# FNC feature in `shifted_features`, in alphabetical order
for feature in sorted(shifted_features):
    plot_site_distribution(feature)

### **4.3. Site Classification**

Even though it is impossible to predict the sites of unlabeled samples perfectly, shifted features can be used in a classifier and predict the sites to some degree. Loading features should be used in this classifier along with shifted FNC features.

In [None]:
# Predictors of the site model: shifted FNC features followed by all loading features
site_predictors = [*shifted_features, *loading_features]

# Rows with a known site (1 or 2) form the training data, rows with site 0 are to be predicted
X_train = df.loc[df['site'].gt(0), site_predictors]
y_train = df.loc[df['site'].gt(0), 'site']
X_test = df.loc[df['site'].eq(0), site_predictors]

Expected values closer to **1** and **2** in `site_predicted` could possibly be added to the original `site` feature since they are confident predictions.

`IC_20`, `IC_18`, `IC_11`, `IC_21` and `IC_05` are at the top of feature importance in the site classifier model. It looks like FNC features don't contribute to the site classifier model as much as loading features. `IC_20` is the most dangerous feature without any doubt but other loading features should be selected very carefully in main models.

In [None]:
# Start from a float column: the out-of-fold and test predictions written below are
# floats, and assigning them into an integer column relies on silent upcasting that
# recent pandas versions deprecate.
df['site_predicted'] = 0.0

K = 2
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED)

oof_scores = []
# One gain-importance column per fold, indexed by predictor name
feature_importance = pd.DataFrame(np.zeros((X_train.shape[1], K)), columns=[f'Fold_{i}_Importance' for i in range(1, K + 1)], index=X_train.columns)

# The site label (1 or 2) is fitted as a regression target; predictions are rounded
# and clipped to [1, 2] only when they are scored.
site_model_parameters = {
    'num_iterations': 500,
    'early_stopping_round': 50,
    'num_leaves': 2 ** 5, 
    'learning_rate': 0.05,
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
    'feature_fraction': 0.9,
    'feature_fraction_bynode': 0.9,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'max_depth': -1,
    'objective': 'regression',
    'seed': SEED,
    'feature_fraction_seed': SEED,
    'bagging_seed': SEED,
    'drop_seed': SEED,
    'data_random_seed': SEED,
    'boosting_type': 'gbdt',
    'verbose': 1,
    'metric': 'rmse',
    'n_jobs': -1,   
}

print('Running LightGBM Site Classifier Model\n' + ('-' * 38) + '\n')

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):

    trn_data = lgb.Dataset(X_train.iloc[trn_idx, :], label=y_train.iloc[trn_idx])
    val_data = lgb.Dataset(X_train.iloc[val_idx, :], label=y_train.iloc[val_idx])                
    site_model = lgb.train(site_model_parameters, trn_data, valid_sets=[trn_data, val_data], verbose_eval=50)
    feature_importance.iloc[:, fold - 1] = site_model.feature_importance(importance_type='gain')

    # Out-of-fold predictions: every labelled row is predicted by the model that did not train on it
    site_oof_predictions = site_model.predict(X_train.iloc[val_idx, :], num_iteration=site_model.best_iteration)
    df.loc[X_train.iloc[val_idx, :].index, 'site_predicted'] = site_oof_predictions
    
    # Test predictions are averaged over the K fold models
    site_test_predictions = site_model.predict(X_test, num_iteration=site_model.best_iteration)
    df.loc[X_test.index, 'site_predicted'] += site_test_predictions / K

    # NOTE(review): f1_score is called with its default arguments, so presumably label 1
    # (site 1) is scored as the positive class - confirm that this is intended
    oof_score = f1_score(y_train.iloc[val_idx], np.clip(np.round(site_oof_predictions), 1, 2))
    oof_scores.append(oof_score)            
    print(f'\nFold {fold} - F1 Score {oof_score:.6}\n')
    
df['site_predicted'] = df['site_predicted'].astype(np.float32)
oof_f1_score = f1_score(df.loc[df['site'] > 0, 'site'], np.clip(np.round(df.loc[df['site'] > 0, 'site_predicted']), 1, 2))

print('-' * 38)
print(f'LightGBM Site Classifier Model Mean F1 Score {np.mean(oof_scores):.6} [STD:{np.std(oof_scores):.6}]')
print(f'LightGBM Site Classifier Model OOF F1 Score {oof_f1_score:.6}')

plt.figure(figsize=(20, 20))
# Mean of the per-fold gain importances, used to order the bars
feature_importance['Mean_Importance'] = feature_importance.sum(axis=1) / K
feature_importance = feature_importance.sort_values(by='Mean_Importance', ascending=False)
sns.barplot(x='Mean_Importance', y=feature_importance.index, data=feature_importance)

plt.xlabel('')
plt.tick_params(axis='x', labelsize=18)
plt.tick_params(axis='y', labelsize=18)
plt.title('LightGBM Site Classifier Model Feature Importance (Gain)', size=20, pad=20)

plt.show()

### **4.4. Private Leaderboard Simulation**
If the expected values predicted by the site classifier are rounded, there are **32** samples classified as site 2. This operation ceils the values greater than **.5** and floors the values less than **.5**. It yields very few samples for models to use as a validation set.

If this threshold is set to **.2**, a 0.9 train/test split can be achieved with samples which slightly look like they are from site 2. I tried to simulate the private leaderboard by:

* Training on **5342** samples which are classified as site 1 (`site_predicted` expected value < 1.2)
* Validate on **535** samples which slightly look like site 2 (`site_predicted` expected value >= 1.2)

In [None]:
# Class counts of the training rows when predictions are rounded to the nearest integer
print('Site Classifier expected values are rounded with .5 threshold\n')
print(df.loc[df['is_train'] == 1, 'site_predicted'].round(0).astype(np.uint8).value_counts())

# Every prediction above 1.2 is forced to 2, the remaining ones are rounded as before;
# `site_predicted` is overwritten with the resulting uint8 labels
print('\nSite Classifier expected values are rounded with .2 threshold\n')
df['site_predicted'] = df['site_predicted'].mask(df['site_predicted'] > 1.2, 2).round().astype(np.uint8)
print(df.loc[df['is_train'] == 1, 'site_predicted'].value_counts())

Ridge regression is used in this simulation and the results are listed below. The gap between training and validation NAE looks very realistic based on the train/test discrepancies, especially for `age`.

```
age Train NAE 0.129934 - Validation NAE 0.154733
domain1_var1 Train NAE 0.143931 - Validation NAE 0.147178
domain1_var2 Train NAE 0.150529 - Validation NAE 0.146966
domain2_var1 Train NAE 0.176281 - Validation NAE 0.189226
domain2_var2 Train NAE 0.171305 - Validation NAE 0.171517
```

Even though Ridge regression scores **0.159** on public leaderboard, it scored **0.161** on slightly different samples. This means the private leaderboard score will be even worse.

`Train Weighted NAE 0.151338 - Validation Weighted NAE 0.161025`

## **5. Component Spatial Maps**

The third set of features are the component spatial maps (SM). These are the subject-level 3D images of 53 spatial networks estimated from GIG-ICA of resting state functional MRI (fMRI). Those features are contained under the directories below:

* `fMRI_train` - a folder containing 53 3D spatial maps for train samples in .mat format
* `fMRI_test` - a folder containing 53 3D spatial maps for test samples in .mat format

Those explanations are written under the Data tab on the competition page, but they are not clear. `fMRI_train` and `fMRI_test` are directories in which there are **5877** samples. Each sample is a `.mat` file that contains **53** 3D spatial maps.

This means there are **311481** 3D spatial maps in both `fMRI_train` and `fMRI_test`, and this is the reason why competition data is **166.59 GB**.

In [None]:
# Count the entries of both fMRI directories
for split_label, folder_name in (('Train', 'fMRI_train'), ('Test', 'fMRI_test')):
    sample_files = os.listdir(f'../input/trends-assessment-prediction/{folder_name}')
    print(f'{split_label} fMRI Samples in {folder_name}: {len(sample_files)}')

### **5.1 MATLAB Files**

The `.mat` files in this competition can be read in Python using `h5py.File`. HDF5 files work generally like standard Python file objects. They support standard modes like r/w/a, and should be closed when they are no longer in use. For an example, the first file (`10001.mat`) inside the `fMRI_train` is read in the cell below.

In [None]:
# Open the first training sample read-only; the handle is kept open because
# the following cells read from `matlab_file`
matlab_file = h5py.File('../input/trends-assessment-prediction/fMRI_train/10001.mat', mode='r')
matlab_file

The `.mat` files are using dictionary interface so data inside the file can be accessed with `keys()` and `values()` methods. `.keys()` method shows us there is only one key in the file and it is `SM_feature`. That is where the data is stored.

Accessing the data using `matlab_file["SM_feature"]` shows us, each `.mat` file is a 4D HDF5 dataset with the shape of `(53, 52, 63, 53)`. Those are the **53** 3D spatial maps.

In [None]:
# Show the dictionary-style views of the file, then the dataset stored under SM_feature
for accessor in ('keys', 'values'):
    print(f'.{accessor}() -> {getattr(matlab_file, accessor)()}')

print('\nmatlab_file["SM_feature"] ->', matlab_file['SM_feature'])

### **5.2 Visualizing 3D Data on 2D Space**

Those **53** 3D spatial maps can be visualized with `nilearn` package. However, 3D spatial maps have to be drawn on a reference image and the reference image is `fMRI_mask.nii` file that is shared in the competition data. Niimg-like objects (`.nii` files) can be loaded from filenames or lists of filenames with `nilearn` as well.

Before the 3D spatial maps are loaded, it's necessary to reorient the axis, since h5py flips axis order. The axis of data is reoriented in reverse order.

Finally, 3D spatial maps can be loaded as a Niimg-like object with `nilearn.image.new_img_like` by using the `fMRI_mask.nii` as a reference image.

In [None]:
# Loading reference image
# Reference image; its affine and header are reused for the spatial maps
fmri_mask = nl.image.load_img('../input/trends-assessment-prediction/fMRI_mask.nii')

# Read the whole dataset into memory and reverse its axis order
spatial_maps = np.transpose(matlab_file['SM_feature'][()])

# Wrap the reoriented array as a Niimg-like object on top of the reference image
spatial_maps_niimg = nl.image.new_img_like(
    fmri_mask,
    spatial_maps,
    affine=fmri_mask.affine,
    copy_header=True,
)

After the 3D spatial maps are loaded as Niimg-like objects, they can be iterated with `nilearn.image.iter_img` and visualized one by one with `nilearn.plotting.plot_stat_map`. The **53** 3D spatial maps that are visualized below, belong to a single sample, `10001.mat`. There are **5877** samples in both `fMRI_train` and `fMRI_test`.

`nilearn` is very useful to visualize 3D spatial maps because `plot_stat_map` plots the data from three different angles which makes it easier to see the correlated parts of brain.

In [None]:
fig, axes = plt.subplots(nrows=53, figsize=(30, 300))

# Options shared by every stat map; only the image, title and axes change per row
stat_map_options = dict(
    bg_img='ch2better.nii',
    threshold=1,
    display_mode='ortho',
    annotate=False,
    draw_cross=True,
    colorbar=False,
)

# One row per 3D spatial map of the sample
for map_index, map_img in enumerate(nl.image.iter_img(spatial_maps_niimg)):
    nlplt.plot_stat_map(
        stat_map_img=map_img,
        title=f'10001.mat Spatial Map {map_index + 1} plot_stat_map',
        axes=axes[map_index],
        **stat_map_options,
    )

There are various ways to visualize 3D spatial maps other than `plot_stat_map`. 5 different plotting functions are used in the visualizations below, one in every row. Those plotting functions are `plot_glass_brain`, `plot_roi`, `plot_anat`, `plot_epi` and `plot_img`. Some of those visualizations are more detailed and some of them are faster.

`threshold` parameter is used for reducing the noise in visualizations. If `None` is given, the image is not thresholded. If a number is given, it is used to threshold the image: values below the threshold (in absolute value) are plotted as transparent.


In [None]:
# First 3D spatial map of the sample
img = next(iter(nl.image.iter_img(spatial_maps_niimg)))

fig, axes = plt.subplots(nrows=5, figsize=(30, 50))

# (name used in the title, plotting function, extra keyword arguments) for every row
plot_specs = (
    ('plot_glass_brain', nlplt.plot_glass_brain, {'threshold': 3, 'black_bg': True}),
    ('plot_roi', nlplt.plot_roi, {'threshold': 3, 'black_bg': True}),
    ('plot_anat', nlplt.plot_anat, {}),
    ('plot_epi', nlplt.plot_epi, {}),
    ('plot_img', nlplt.plot_img, {}),
)

for row_ax, (plot_name, plot_function, extra_options) in zip(axes, plot_specs):
    plot_function(img,
                  title=f'10001.mat Spatial Map 1 {plot_name}',
                  axes=row_ax,
                  **extra_options)

plt.show()

### **5.3 Visualizing 3D Data on 3D Space**
3D spatial maps can also be visualized on 3D space with `nilearn` package. `view_img_on_surf` is used as the plotting function and it renders the 3D data onto a 3D brain figure. 3D visualizations can be thresholded with `threshold` parameter as well.

In [None]:
# Render the spatial map held in `img` on a 3D surface view
view = nlplt.view_img_on_surf(
    img,
    threshold=1,
    black_bg=False,
    title='10001.mat Spatial Map 1 view_img_on_surf',
    title_fontsize=20,
)

# Open the view in a browser and also display it as the cell output
view.open_in_browser()
view

## **6. Feature Engineering**

### **6.1 ICN Numbers**

The last file shared in the competition data is `ICN_numbers.csv`. It has intrinsic connectivity network numbers for each fMRI spatial map. Those numbers can be mapped to **53** 3D spatial maps.

Values in `ICN_number` represent the number that comes after the network names in parentheses. After the network names are connected with their numbers, they can be mapped to 3D spatial maps directly because index is the order of the 3D spatial maps. It means the first 3D spatial map in every sample is `SCN(69)`, the second 3D spatial map in every sample is `SCN(53)` and so on.

In [None]:
df_icn = pd.read_csv('../input/trends-assessment-prediction/ICN_numbers.csv')
df_icn['ICN_number'] = df_icn['ICN_number'].astype(np.uint8)

# Every FNC feature name is split on "_"; the first and third parts are the two component names
unique_components = [feature.split('_')[0] for feature in fnc_features] + [feature.split('_')[2] for feature in fnc_features]

# Build the "number in parentheses" -> component name lookup once instead of scanning
# the whole (duplicate-heavy) component list for every row. The list is traversed in
# reverse so that the first occurrence of a number wins, as in a front-to-back scan.
component_by_number = {
    int(match.group(1)): match.string
    for match in map(re.compile(r'\((\d+)\)').search, reversed(unique_components))
    if match
}

# Fail with an explicit message if a number has no matching component name
missing_numbers = sorted(set(df_icn['ICN_number']) - set(component_by_number))
if missing_numbers:
    raise KeyError(f'No FNC component found for ICN numbers: {missing_numbers}')

df_icn['component'] = df_icn['ICN_number'].map(component_by_number)

# Row position in ICN_numbers.csv -> component name
icn_order = dict(enumerate(df_icn['component'].values))
icn_order

### **6.2 fMRI Comparison**

Feature engineering should be based on the differences of fMRIs between different target groups. In order to make the differences more clear, fMRIs of the lowest value and the highest value are selected for every target feature and plotted side by side.

* **Active/Inactive Regions:** Yellow and light blue areas
* **More Active/Inactive Regions:** Red and dark blue areas

Active/Inactive Regions and patterns vary a lot from component to component. They also vary a lot from target to target, so one feature could be useful for one target, but it could be useless for another target at the same time.

The youngest subject (`10721.mat`) and the oldest subject (`11685.mat`) are selected and plotted side by side. The differences of those fMRIs are listed below.

* The locations of the active/inactive regions are similar for both fMRIs, but they may shift, expand or shrink slightly. The locations of the more active/inactive regions are less similar for both fMRIs, and they may shift, expand or shrink more significantly. It looks like the changes in more active region is dependent to target, but changes in more inactive region is random. Active regions are more dense and inactive regions are more scattered around. Based on that observation, features that are capturing active/inactive regions and more active/inactive regions could be useful.

* Active/inactive intensity and balance is another phenomenon that has effects on targets. In almost every component, active/inactive intensity changes between different subjects. The change in active/inactive intensity may vary from component to component as well. Active/inactive balance also changes between different subjects due to intensity change. Based on that observation, features that are capturing active/inactive regions' distributions could be useful.

* Active/inactive spread and intensity in specific locations (region of interests) has major effects on targets. Featurizing the change in those particular locations could be useful.

Features should be created separately for every different component because patterns change a lot between different components. One newly created feature can have positive linear relationship in one component, but at the same time, it can have negative relationship in another component.

In [None]:
age_min_id = 10721
age_max_id = 11685

# Read each subject inside a context manager so the HDF5 handle is closed as soon as
# the data is in memory (`[()]` copies the dataset into a numpy array), then reverse
# the axis order of the array.
with h5py.File(f'../input/trends-assessment-prediction/fMRI_train/{age_min_id}.mat', 'r') as age_min_file:
    spatial_data_age_min = np.moveaxis(age_min_file['SM_feature'][()], [0, 1, 2, 3], [3, 2, 1, 0])
with h5py.File(f'../input/trends-assessment-prediction/fMRI_train/{age_max_id}.mat', 'r') as age_max_file:
    spatial_data_age_max = np.moveaxis(age_max_file['SM_feature'][()], [0, 1, 2, 3], [3, 2, 1, 0])

# Wrap both arrays as Niimg-like objects on top of the reference image
spatial_data_age_min_niimg = nl.image.new_img_like(ref_niimg=fmri_mask, data=spatial_data_age_min, affine=fmri_mask.affine, copy_header=True)
spatial_data_age_max_niimg = nl.image.new_img_like(ref_niimg=fmri_mask, data=spatial_data_age_max, affine=fmri_mask.affine, copy_header=True)

spatial_data_age_min_images = list(nl.image.iter_img(spatial_data_age_min_niimg))
spatial_data_age_max_images = list(nl.image.iter_img(spatial_data_age_max_niimg))

fig, axes = plt.subplots(nrows=53, ncols=2, figsize=(30, 300))

# One row per component: first subject on the left, second subject on the right
for i, (img_age_min, img_age_max) in enumerate(zip(spatial_data_age_min_images, spatial_data_age_max_images)):

    subject_panels = (
        (img_age_min, f'{age_min_id}.mat (Youngest Age) Component:{icn_order[i]}'),
        (img_age_max, f'{age_max_id}.mat (Oldest Age) Component:{icn_order[i]}'),
    )

    for panel_ax, (subject_img, panel_title) in zip(axes[i], subject_panels):
        nlplt.plot_stat_map(stat_map_img=subject_img,
                            bg_img='ch2better.nii',
                            title=panel_title,
                            axes=panel_ax,
                            threshold=1,
                            display_mode='ortho',
                            annotate=False,
                            draw_cross=True,
                            colorbar=False)

### **6.3 Noise Comparison**

There is a lot of noise in all components. Inactive and more inactive regions appear to be more noisy and scattered around. The noise can be removed with `image.smooth_img` function. It smooths images by applying a Gaussian filter and it takes two parameters; the image and smoothing factor (`fwhm`).

As the FWHM increases, the amount of noise decreases but spatial details are lost as well. There is a trade-off between noise and information. In general, the best amount of smoothing for a given analysis depends on the spatial extent of the effects that are expected. In this case, the best amount of smoothing should be between 0 and 5. FWHM greater than 5 loses too much information.

In [None]:
# Component whose map is smoothed with increasing FWHM values
component = 0
img = list(nl.image.iter_img(spatial_data_age_min_niimg))[component]

fwhm_values = np.arange(0, 30, 5)
fig, axes = plt.subplots(nrows=len(fwhm_values), ncols=1, figsize=(30, 50))

# One row per smoothing factor
for row_ax, fwhm in zip(axes, fwhm_values):

    smoothed_img = nl.image.smooth_img(img, fwhm)

    nlplt.plot_stat_map(
        stat_map_img=smoothed_img,
        bg_img='ch2better.nii',
        title=f'{age_min_id}.mat (Youngest Age) Component:{icn_order[component]} - Smoothing FWHM {fwhm}',
        axes=row_ax,
        threshold=1,
        display_mode='ortho',
        annotate=False,
        draw_cross=True,
        colorbar=False,
    )

### **6.4 Masker**

Data in `fMRI_train` and `fMRI_test` are represented in 4 dimensional space: 3 spatial dimensions and one component dimension. The 4th dimension is time in most cases but it is 53 different components in this dataset. It is convenient to apply a brain mask in order to convert the 4D data into a restructured 2D data representation, **voxel x time**, as seen below:

![masker](https://nilearn.github.io/_images/masking1.jpg)

Component dimension can be treated as time dimension, and thus 4D data can be transformed with `fMRI_mask.nii` into 2D **voxel x component** structure. Transformed data of every subject will have the shape of **53 timesteps (components) x 58869 voxels**, so the whole dataset's shape will be `(11754, 53, 58869)`. Based on that information, the data can be modeled with both 2D and 3D CNNs.

In [None]:
age_min_id = 19531
age_max_id = 17104

fmri_mask = nl.image.load_img('../input/trends-assessment-prediction/fMRI_mask.nii')
print(f'fMRI Mask Shape: {fmri_mask.shape}')

# Read each subject inside a context manager so the HDF5 handle is closed as soon as
# the data is in memory (`[()]` copies the dataset into a numpy array)
with h5py.File(f'../input/trends-assessment-prediction/fMRI_train/{age_min_id}.mat', 'r') as age_min_file:
    spatial_data_age_min = age_min_file['SM_feature'][()]
with h5py.File(f'../input/trends-assessment-prediction/fMRI_train/{age_max_id}.mat', 'r') as age_max_file:
    spatial_data_age_max = age_max_file['SM_feature'][()]
print(f'Spatial Data Initial Shape: {spatial_data_age_min.shape}')

# Keep the first axis in place and swap the second and fourth axes
spatial_data_age_min = np.moveaxis(spatial_data_age_min, [0, 1, 2, 3], [0, 3, 2, 1])
spatial_data_age_max = np.moveaxis(spatial_data_age_max, [0, 1, 2, 3], [0, 3, 2, 1])
print(f'Spatial Data Reoriented Axes Shape: {spatial_data_age_min.shape}')

# Boolean mask computed once with get_fdata() (get_data() is deprecated in nibabel);
# indexing with it flattens the three trailing axes to the voxels where the mask is 1
brain_voxels = fmri_mask.get_fdata() == 1
masked_spatial_data_age_min = spatial_data_age_min[:, brain_voxels].astype(np.float32)
masked_spatial_data_age_max = spatial_data_age_max[:, brain_voxels].astype(np.float32)
print(f'Spatial Data Masked Shape: {masked_spatial_data_age_min.shape}')

Extracted signals may vary from component to component and subject to subject. The youngest subject (`19531.mat`) and the oldest subject (`17104.mat`) are selected again and their extracted signals are drawn. There are both structural and random differences on those signals and they are listed below.

* Different subjects have their peaks aligned in all components but the height of the peaks are different. This shows the activation intensity difference between different subjects. The peak alignment means that the activation of certain locations are consistent between different subjects.
* Components' standard deviations are very similar to each other between different subjects, but rarely they can be different in some components.

In [None]:
fig, axes = plt.subplots(nrows=53, figsize=(30, 400), dpi=100)

# (masked data, legend label) of the two subjects drawn on every row
subject_signals = (
    (masked_spatial_data_age_min, f'{age_min_id}.mat (Youngest Age)'),
    (masked_spatial_data_age_max, f'{age_max_id}.mat (Oldest Age)'),
)

# One row per component, both subjects overlaid with transparency
for i, row_ax in enumerate(axes):

    for signals, legend_label in subject_signals:
        row_ax.plot(signals[i], label=legend_label, alpha=0.5)

    row_ax.tick_params(axis='x', labelsize=12)
    row_ax.tick_params(axis='y', labelsize=15)
    row_ax.legend(prop={'size': 15})
    row_ax.set_title(f'Component {i + 1} {icn_order[i]} Signal', size=20, pad=15)

plt.show()

### **6.5 Spatial Data Featurizer**

`SpatialDataFeaturizer` is a preprocessing and feature engineering pipeline. It executes two main steps for every .mat file in `fMRI_train` and `fMRI_test`. Those two main steps are:

1. Read file into memory and process it for feature engineering
  * Read the .mat file with `h5py`
  * Switch second and fourth axes
  * Smooth images by applying a Gaussian filter (This step is not used because smoothing loses too much information)
  * Reduce dimensions with fMRI mask (`fMRI_mask.nii`)
2. Create Features
  * Skew and kurtosis of the components
  * Skew of the components' difference
  * Shapes of active/inactive and more active/inactive regions
  * Skew of the active/inactive regions
  * Mean of active region / mean of inactive region
  * Std of active region / std of inactive region
  * Skew of the component - skew of the active/inactive regions
  * Means and shapes of active regions in 15 equally divided bounding boxes

In [None]:
class SpatialDataFeaturizer:

    """
    Preprocessing and feature engineering pipeline for the fMRI spatial maps

    For every .mat file in fMRI_train and fMRI_test the file is read, reduced with the
    binary fMRI mask and summarised into one row of statistical and bounding box features.
    Results are stored in `df_train_features` and `df_test_features`.

    The class relies on names defined earlier in the notebook: `icn_order` (component names),
    `df` and `target_features` (used by `run` to attach the targets) and `fmri_mask`
    (reference image, only used when smoothing is enabled).

    Parameters
    ----------
    fwhm : int or float
    smoothing factor of the Gaussian filter, smoothing is only applied when fwhm > 1

    visualize : bool
    whether `run` calls `visualize_features` after the features are created
    """

    def __init__(self, fwhm=0, visualize=False):

        self.binary_mask = nib.load('../input/trends-assessment-prediction/fMRI_mask.nii').get_fdata()
        self.n_components = 53
        self.networks = {
            'ADN': np.array([5, 6]),
            'CBN': np.array([49, 50, 51, 52]),
            'CON': np.array([25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41]),
            'DMN': np.array([42, 43, 44, 45, 46, 47, 48]),
            'SCN': np.array([0, 1, 2, 3, 4]),
            'SMN': np.array([7,  8,  9, 10, 11, 12, 13, 14, 15]),
            'VSN': np.array([16, 17, 18, 19, 20, 21, 22, 23, 24])
        }

        self.fwhm = fwhm
        self.visualize = visualize

        self.df_train_features = pd.DataFrame()
        self.df_test_features = pd.DataFrame()

    @staticmethod
    def _subject_id(file_path):

        """
        Extract the subject Id from the path of a .mat file

        The extension is removed with os.path.splitext instead of str.strip('.mat'),
        because strip removes any leading/trailing '.', 'm', 'a', 't' characters
        rather than the '.mat' suffix.

        Parameters
        ----------
        file_path : string
        path of the MATLAB file

        Returns
        -------
        Id : string
        file name without directory and extension
        """

        return os.path.splitext(os.path.basename(file_path))[0]

    def _read_file(self, file_path):

        """
        Read the given .mat file with h5py
        Switch second and fourth axes (53, 52, 63, 53) -> (53, 53, 63, 52)
        Smooth images by applying a Gaussian filter with a smoothing factor of given fwhm
        Reduce dimensions with binary mask (53, 53, 63, 52) -> (53, 58869)

        Parameters
        ----------
        file_path : string
        path of the MATLAB file

        Returns
        -------
        masked_spatial_data : numpy array, shape = (53, 58869)
        numpy array after reducing dimensions
        """

        # [()] copies the whole dataset into memory, so the file can be closed right away
        with h5py.File(file_path, 'r') as mat_file:
            spatial_data = mat_file['SM_feature'][()]
        spatial_data = np.moveaxis(spatial_data, [0, 1, 2, 3], [0, 3, 2, 1])

        if self.fwhm > 1:
            # NOTE(review): `fmri_mask` is a notebook-level image defined outside this class - confirm it is loaded before smoothing is enabled
            for i in range(self.n_components):
                img = nl.image.new_img_like(ref_niimg=fmri_mask, data=spatial_data[i], affine=fmri_mask.affine, copy_header=True)
                smoothed_img = nl.image.smooth_img(img, self.fwhm)
                # get_fdata replaces the deprecated get_data and matches the call used in __init__
                spatial_data[i] = smoothed_img.get_fdata()

        masked_spatial_data = spatial_data[:, self.binary_mask == 1].astype(np.float32)
        return masked_spatial_data

    def _create_features(self, Id, masked_spatial_data):

        """
        Create features from the spatial data

        Parameters
        ----------
        Id : string
        Id of the subject, stored in the `Id` column

        masked_spatial_data : numpy array, shape = (53, 58869)
        numpy array after reducing dimensions

        Returns
        -------
        features : pandas dataframe, shape = (1, n_features + 1)
        features created for a single sample
        """

        # One-element tuples make every entry a single-row column
        features = {'Id': (Id,)}
        box_count = 15

        for i in range(self.n_components):

            name = icn_order[i]
            component = masked_spatial_data[i]
            component_mean = component.mean()
            component_std = component.std()

            # Values more than one (two) standard deviations away from the mean are treated as (more) active/inactive
            active_threshold = component_mean + component_std
            active = component[component > active_threshold]
            inactive = component[component < component_mean - component_std]
            more_active = component[component > component_mean + (2 * component_std)]
            more_inactive = component[component < component_mean - (2 * component_std)]

            component_skew = stats.skew(component)
            component_kurtosis = stats.kurtosis(component)
            active_skew = stats.skew(active)
            inactive_skew = stats.skew(inactive)

            component_diff = np.diff(component)

            features[f'{name}_skew'] = (component_skew,)
            features[f'{name}_kurtosis'] = (component_kurtosis,)
            features[f'{name}_diff_skew'] = (stats.skew(component_diff),)
            features[f'{name}_active_spread'] = (active.shape[0],)
            features[f'{name}_active_skew'] = (active_skew,)
            features[f'{name}_inactive_spread'] = (inactive.shape[0],)
            features[f'{name}_inactive_skew'] = (inactive_skew,)
            features[f'{name}_active_inactive_mean_ratio'] = (active.mean() / inactive.mean(),)
            features[f'{name}_active_inactive_std_ratio'] = (active.std() / inactive.std(),)
            features[f'{name}_component_active_skew_difference'] = (component_skew - active_skew,)
            features[f'{name}_component_inactive_skew_difference'] = (component_skew - inactive_skew,)
            features[f'{name}_more_active_spread'] = (more_active.shape[0],)
            features[f'{name}_more_inactive_spread'] = (more_inactive.shape[0],)

            for b, box in enumerate(np.array_split(component, box_count)):
                box_active = box[box > active_threshold]
                # Kept as a plain scalar (broadcast by pandas) exactly like before, so the way this column is built does not change
                features[f'{name}_box{b}_mean'] = box.mean()
                features[f'{name}_box{b}_active_spread'] = (box_active.shape[0],)

        return pd.DataFrame(features)

    def visualize_features(self):

        """
        Plot distributions of newly created features in training and test set
        Plot scatter plots of newly created features vs every target features

        Target columns merged in by `run` are skipped, they are not new features
        and they are missing for every sample in the test set.
        """

        target_columns = ['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']
        new_features = [column for column in self.df_train_features.columns.tolist()[1:] if column not in target_columns]

        training_samples_count = self.df_train_features.shape[0]
        test_samples_count = self.df_test_features.shape[0]

        for feature in new_features:

            train_values = self.df_train_features[feature]
            test_values = self.df_test_features[feature]

            print(f'New Feature {feature} Statistical Analysis\n{"-" * 42}')

            print(f'Training Mean: {float(train_values.mean()):.4}  - Training Median: {float(train_values.median()):.4} - Training Std: {float(train_values.std()):.4}')
            print(f'Test Mean: {float(test_values.mean()):.4}  - Test Median: {float(test_values.median()):.4} - Test Std: {float(test_values.std()):.4}')
            print(f'Training Min: {float(train_values.min()):.4}  - Training Max: {float(train_values.max()):.4}')
            # The second value is the maximum of the test set, it used to be printed as "Training Max"
            print(f'Test Min: {float(test_values.min()):.4}  - Test Max: {float(test_values.max()):.4}')
            print(f'Training Skew: {float(train_values.skew()):.4}  - Training Kurtosis: {float(train_values.kurtosis()):.4}')
            print(f'Test Skew: {float(test_values.skew()):.4}  - Test Kurtosis: {float(test_values.kurtosis()):.4}')
            training_missing_values_count = int(train_values.isnull().sum())
            test_missing_values_count = int(test_values.isnull().sum())
            print(f'Training Missing Values: {training_missing_values_count}/{training_samples_count} ({training_missing_values_count * 100 / training_samples_count:.4}%)')
            print(f'Test Missing Values: {test_missing_values_count}/{test_samples_count} ({test_missing_values_count * 100 / test_samples_count:.4}%)')

            fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(25, 12), dpi=100, constrained_layout=True)
            title_size = 18
            label_size = 18

            # New Feature Training and Test Set Distribution
            distribution_ax = axes[0][0]
            sns.distplot(train_values, label='Training', ax=distribution_ax)
            sns.distplot(test_values, label='Test', ax=distribution_ax)
            distribution_ax.set_xlabel('')
            distribution_ax.tick_params(axis='x', labelsize=label_size)
            distribution_ax.tick_params(axis='y', labelsize=label_size)
            distribution_ax.legend()
            distribution_ax.set_title(f'{feature} Distribution in Training and Test Set', size=title_size, pad=title_size)

            # New Feature vs every target, the remaining five axes are filled row by row
            for ax, target in zip(axes.flat[1:], target_columns):
                # x and y are passed by keyword, newer seaborn versions do not accept them positionally
                sns.scatterplot(x=train_values, y=self.df_train_features[target], ax=ax)
                ax.set_title(f'{feature} vs {target}', size=title_size, pad=title_size)
                ax.set_xlabel('')
                ax.set_ylabel('')
                ax.tick_params(axis='x', labelsize=label_size)
                ax.tick_params(axis='y', labelsize=label_size)

            plt.show()

    def _featurize_files(self, file_paths):

        """
        Create the features of every given file and attach the target features

        Parameters
        ----------
        file_paths : list of strings
        paths of the MATLAB files

        Returns
        -------
        df_features : pandas dataframe, shape = (n_files, n_features + 1 + n_targets)
        one row of features per file, left-merged with the target features on Id

        Raises
        ------
        ValueError
        if file_paths is empty
        """

        # Rows are collected in a list and concatenated once, growing a dataframe inside the loop is quadratic
        rows = []
        for file_path in tqdm(file_paths):
            masked_spatial_data = self._read_file(file_path)
            rows.append(self._create_features(self._subject_id(file_path), masked_spatial_data))

        if not rows:
            raise ValueError('No fMRI files found, spatial features can not be created')

        df_features = pd.concat(rows, axis=0, ignore_index=True)
        df_features['Id'] = df_features['Id'].astype(np.uint32)
        # `df` and `target_features` are notebook-level names, targets of test samples stay missing after the left merge
        return df_features.merge(df[['Id'] + target_features], on='Id', how='left')

    def run(self):

        """
        Create spatial features for every file in fMRI_train and fMRI_test
        Store them in df_train_features and df_test_features
        Visualize the features if visualize is True

        Both dataframes are rebuilt from scratch, so calling run again does not duplicate rows.
        """

        train_directory = '../input/trends-assessment-prediction/fMRI_train'
        test_directory = '../input/trends-assessment-prediction/fMRI_test'

        fMRI_train = [f'{train_directory}/{file}' for file in sorted(os.listdir(train_directory))]
        fMRI_test = [f'{test_directory}/{file}' for file in sorted(os.listdir(test_directory))]

        print('Creating spatial features from fMRI_train')
        self.df_train_features = self._featurize_files(fMRI_train)

        print('Creating spatial features from fMRI_test')
        self.df_test_features = self._featurize_files(fMRI_test)

        if self.visualize:
            self.visualize_features()


In [None]:
# Create the spatial features of every subject in fMRI_train and fMRI_test
# fwhm=0 keeps Gaussian smoothing disabled and visualize=False skips the per-feature plots
featurizer = SpatialDataFeaturizer(fwhm=0, visualize=False)
featurizer.run()

In [None]:
# Stack the training and test spatial features once and reuse the result for both the log line and the merge
df_spatial_features = pd.concat([featurizer.df_train_features, featurizer.df_test_features], axis=0)

print(f'Loading and FNC features {df.shape} are merged with spatial features {df_spatial_features.shape}')
# Target columns already exist in df, so they are dropped from the spatial features before merging on Id
df = df.merge(df_spatial_features.drop(columns=target_features), on='Id', how='left')

## **7. Conclusion**

Columns in `fnc.csv`, `loading.csv` and `train_scores.csv` are merged on `Id`. Three features `is_train`, `site` and `site_predicted` are created for labeling different datasets and sites. Bounding box features and statistical features are extracted from `fMRI_train` and `fMRI_test`. Data is saved in pickle format for faster save and load time.

In [None]:
utility_features = ['is_train', 'site', 'site_predicted']

# Every column after Id that belongs to none of the known groups is a spatial feature
known_features = set(fnc_features) | set(loading_features) | set(target_features) | set(utility_features)
spatial_features = [column for column in df.columns.tolist()[1:] if column not in known_features]

# Spatial features are split by name: bounding box features contain 'box', the rest are statistical
bounding_box_features = []
statistical_features = []
for column in spatial_features:
    if 'box' in column:
        bounding_box_features.append(column)
    else:
        statistical_features.append(column)

feature_groups = [
    ('Target', target_features),
    ('Loading', loading_features),
    ('FNC', fnc_features),
    ('Utility', utility_features),
    ('Bounding Box', bounding_box_features),
    ('Statistical', statistical_features)
]
for group_name, group_features in feature_groups:
    print(f'Number of {group_name} Features: {len(group_features)}')

df.to_pickle('trends_tabular_data.pkl')

tabular_memory_usage = df.memory_usage().sum() / 1024 ** 2
print(f'TReNDS Tabular Data Shape = {df.shape}')
print(f'TReNDS Tabular Data Memory Usage = {tabular_memory_usage:.2f} MB')