In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import KFold
from sklearn.svm import SVR

# Reading and merging dataframes
* tabular data generated from brain scans
* **targets**: age and 4 anonymized domain variables
* **metric**: normalized mean absolute error
* **fnc_features**: cross-correlation values among 53 component timecourses from fMRI
* **loading_features**: independent component features from sMRI
* 5877 patients in the training set and 5877 patients in the test set

In [None]:
%%time


def metric(y_true, y_pred):
    """Return the normalized absolute error between targets and predictions.

    The absolute errors are summed along axis 0 and divided by the sum of
    ``y_true`` along the same axis; the resulting ratio(s) are then averaged.
    For 1-D inputs this is a single ratio, for 2-D inputs it is the mean of
    the per-column ratios.

    Parameters
    ----------
    y_true : array-like supporting element-wise subtraction
        Ground-truth values.
    y_pred : array-like with the same shape as ``y_true``
        Predicted values.

    Returns
    -------
    float
        Mean of ``sum(|y_true - y_pred|) / sum(y_true)`` over axis 0.
    """
    abs_error_total = np.sum(np.abs(y_true - y_pred), axis=0)
    target_total = np.sum(y_true, axis=0)
    return np.mean(abs_error_total / target_total)


# Per-target settings: "SVR_C" is the C value passed to the SVR fitted for that
# target, "metric_weight" is the weight its score gets in the overall score.
# "age" has its own settings; the four domain variables share one template.
_DOMAIN_TARGETS = ("domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2")

TARGETS = {"age": {"SVR_C": 100, "metric_weight": 0.3}}
TARGETS.update(
    {name: {"SVR_C": 10, "metric_weight": 0.175} for name in _DOMAIN_TARGETS}
)

# Load both feature tables. Every column after the first is treated as a
# feature (the first column is presumably the "Id" key used for the merges).
fnc_df = pd.read_csv("/kaggle/input/trends-assessment-prediction/fnc.csv")
loading_df = pd.read_csv("/kaggle/input/trends-assessment-prediction/loading.csv")
fnc_features = list(fnc_df.columns[1:])
loading_features = list(loading_df.columns[1:])

# Rows that have an entry in train_scores.csv are flagged as training rows.
labels_df = pd.read_csv("/kaggle/input/trends-assessment-prediction/train_scores.csv")
labels_df["is_train"] = True

# Inner-join the two feature tables, then left-join the labels so that rows
# without labels are kept (their "is_train" becomes NaN).
df = fnc_df.merge(loading_df, on="Id").merge(labels_df, on="Id", how="left")

# NaN == True evaluates to False, so unlabeled rows fall into the test split.
has_labels = df["is_train"] == True
test_df = df[~has_labels].copy()
df = df[has_labels].copy()

print(df.shape, test_df.shape)

In [None]:
# Peek at the first two training rows to sanity-check the merged columns.
df.head(n=2)

In [None]:
%%time

from sklearn.manifold import TSNE


def plot_tsne(features, target, data=None, random_state=0):
    """Scatter-plot a 2-D t-SNE embedding of ``features``, coloured by ``target``.

    Parameters
    ----------
    features : list of str
        Columns of ``data`` that are embedded with t-SNE.
    target : str
        Column of ``data`` whose values colour the points.
    data : pandas.DataFrame, optional
        Frame to plot. Defaults to the notebook-level ``df``.
    random_state : int, optional
        Seed passed to ``TSNE`` so the figure is reproducible across runs.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    if data is None:
        data = df  # fall back to the notebook's global training frame

    embedding = TSNE(n_components=2, random_state=random_state).fit_transform(data[features])

    fig, ax = plt.subplots(figsize=(8, 8), dpi=80)
    points = ax.scatter(embedding[:, 0], embedding[:, 1], c=data[target].values)
    # The points carry no labels, so a legend would be empty (and only emit a
    # warning); a colorbar is the appropriate key for the colour mapping.
    fig.colorbar(points, ax=ax, label=target)
    ax.set(title="t-SNE embedding coloured by {}".format(target),
           xlabel="t-SNE 1", ylabel="t-SNE 2")


plot_tsne(features=loading_features, target="age")

# Modelling and Cross-Validation

In [None]:
# Down-weight the FNC features relative to the loading features: they are
# high-dimensional and therefore easier to overfit.
# NOTE: the scaling is applied in place, so re-running this cell without
# re-running the loading cell shrinks the FNC columns a second time.
FNC_SCALE = 1/500

for frame in (df, test_df):
    frame[fnc_features] *= FNC_SCALE

In [None]:
%%time

# K-fold cross-validation: for every target, fit one SVR per fold, collect
# out-of-fold predictions for the training rows, and average the per-fold
# predictions for the test rows.
NUM_FOLDS = 7
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)

features = loading_features + fnc_features

overal_score = 0
for target, conf in TARGETS.items():
    pred_col = "pred_{}".format(target)
    y_oof = np.zeros(df.shape[0])                     # out-of-fold predictions
    y_test = np.zeros((test_df.shape[0], NUM_FOLDS))  # one column per fold

    for f, (train_ind, val_ind) in enumerate(kf.split(df)):
        train_df, val_df = df.iloc[train_ind], df.iloc[val_ind]
        # Rows with a missing value for this target cannot be used for fitting.
        train_df = train_df[train_df[target].notnull()]

        # SVR is imported explicitly from sklearn.svm; a bare `import sklearn`
        # does not guarantee that the `sklearn.svm` submodule is loaded.
        model = SVR(C=conf["SVR_C"], cache_size=3000.0)
        model.fit(train_df[features], train_df[target])

        y_oof[val_ind] = model.predict(val_df[features])
        y_test[:, f] = model.predict(test_df[features])

    df[pred_col] = y_oof
    test_df[target] = y_test.mean(axis=1)

    # Score only the rows that actually have a label for this target.
    labelled = df[target].notnull()
    score = metric(df.loc[labelled, target].values, df.loc[labelled, pred_col].values)
    overal_score += conf["metric_weight"]*score
    print(target, np.round(score, 4))
    print()

print("Overal score:", np.round(overal_score, 4))
print()

# Making submission

In [None]:
%%time

# Reshape the wide test predictions (one column per target) into the long
# submission layout: one row per (Id, target) pair, keyed as "<Id>_<target>".
sub_df = (
    test_df[["Id", "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"]]
    .melt(id_vars=["Id"], value_name="Predicted")
    .assign(Id=lambda long: long["Id"].astype("str") + "_" + long["variable"].astype("str"))
    .drop(columns="variable")
    .sort_values("Id")
)

# Five targets per test row must yield exactly five submission rows each.
assert sub_df.shape[0] == test_df.shape[0]*5
sub_df.head(10)

In [None]:
%%time

# Write the submission file to the working directory, without the row index.
sub_df.to_csv(path_or_buf="submission.csv", index=False)

# Feature Importance and Visualization

In [None]:
# TODO: Get most important features and plot TSNE with them