# [RAPIDS](http://rapids.ai): GPU Accelerated Data Science

![](https://rapids.ai/assets/images/RAPIDS-logo-white.svg)

In [None]:
# Unpack the RAPIDS 0.16.0 archive from the input directory and make the
# extracted environment importable from this kernel.
import sys
# Copy the archive into the conda envs directory and extract it there; the
# verbose tar listing is discarded by redirecting it to /dev/null.
!cp ../input/rapids/rapids.0.16.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Each assignment prepends one directory, so the directory added last
# ("/opt/conda/envs/rapids/lib") ends up first on the import path.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# NOTE(review): presumably this lets the base environment load the xgboost
# shared library that ships with the RAPIDS environment — confirm.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/
# Provides the `lofo` package imported in the feature-importance section.
!pip install lofo-importance

In [None]:
import cudf, cuml # RAPIDS
import cupy

import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

# Reading and merging dataframes
* tabular data generated from brain scans
* **targets**: age and 4 anonymized domain variables
* **metric**: normalized mean absolute error
* **fnc_features**: cross-correlation values among 53 component timecourses from fMRI
* **loading_features**: independent component features from sMRI
* 5877 patients in the training set and 5877 patients in the test set

In [None]:
%%time

def metric(y_true, y_pred):
    """Return the normalized absolute error, averaged over columns.

    For each column (axis 0 is reduced) the summed absolute difference
    between ``y_true`` and ``y_pred`` is divided by the sum of ``y_true``;
    the mean of those ratios is returned. For 1-D inputs this is a single
    ratio.
    """
    abs_error_total = cupy.sum(cupy.abs(y_true - y_pred), axis=0)
    true_total = cupy.sum(y_true, axis=0)
    return cupy.mean(abs_error_total / true_total)


# Per-target settings: "SVR_C" is passed as the C parameter of the SVR model
# and "metric_weight" is the weight of the target's score in the overall score.
TARGETS = {
    "age": {"SVR_C": 100, "metric_weight": 0.3},
    "domain1_var1": {"SVR_C": 10, "metric_weight": 0.175},
    "domain1_var2": {"SVR_C": 10, "metric_weight": 0.175},
    "domain2_var1": {"SVR_C": 10, "metric_weight": 0.175},
    "domain2_var2": {"SVR_C": 10, "metric_weight": 0.175},
}

# Load both feature tables. Feature names are every column after the first
# one (presumably "Id", which is the merge key below).
fnc_df = cudf.read_csv("/kaggle/input/trends-assessment-prediction/fnc.csv")
loading_df = cudf.read_csv("/kaggle/input/trends-assessment-prediction/loading.csv")
fnc_features = list(fnc_df.columns[1:])
loading_features = list(loading_df.columns[1:])

df = fnc_df.merge(loading_df, on="Id")

# Only rows present in train_scores.csv get is_train == True; the left merge
# keeps every other row, which then forms the test set.
labels_df = cudf.read_csv("/kaggle/input/trends-assessment-prediction/train_scores.csv")
labels_df["is_train"] = True
df = df.merge(labels_df, on="Id", how="left")

# The explicit comparisons with True are element-wise column comparisons and
# are kept as written.
is_train = df["is_train"]
test_df = df[is_train != True].copy()
df = df[is_train == True].copy()

print(df.shape, test_df.shape)

In [None]:
df.head(2)

In [None]:
%%time


def plot_tsne(features, target):
    """Scatter-plot a 2-D t-SNE embedding of ``df[features]`` coloured by ``target``.

    Reads the module-level ``df``; nothing is returned, the figure is drawn
    through ``matplotlib.pyplot``.

    Args:
        features: column names of ``df`` to embed.
        target: column name of ``df`` whose values colour the points.
    """
    tsne = cuml.manifold.TSNE(n_components=2)

    X = tsne.fit_transform(df[features])

    plt.figure(figsize=(8, 8), dpi=80)
    points = plt.scatter(X[0].to_array(), X[1].to_array(), c=df[target].to_array())
    # The points carry no labels, so a legend has nothing to show; a colorbar
    # is the meaningful key for the continuous colouring passed via ``c``.
    plt.colorbar(points, label=target)
    
# Embed the loading features in 2-D with t-SNE and colour each point by age.
plot_tsne(features=loading_features, target="age")

# Modelling and Cross-Validation

In [None]:
# FNC features are high-dimensional and therefore easier to overfit, so they
# are given less importance by shrinking them. Feature scales matter for SVM.
FNC_SCALE = 1/500

# Apply the same in-place scaling to the train and test frames.
for frame in (df, test_df):
    frame[fnc_features] *= FNC_SCALE

In [None]:
%%time

# K-fold cross-validation with one SVR per target and fold. Out-of-fold
# predictions are stored in df["pred_<target>"]; test predictions are averaged
# over the folds and written into test_df[<target>].
NUM_FOLDS = 7
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)


features = loading_features + fnc_features

overal_score = 0
for target, conf in TARGETS.items():
    pred_col = "pred_{}".format(target)
    y_oof = cupy.zeros(df.shape[0])
    y_test = cupy.zeros((test_df.shape[0], NUM_FOLDS))

    for f, (train_ind, val_ind) in enumerate(kf.split(df, df)):
        val_df = df.iloc[val_ind]
        train_df = df.iloc[train_ind]
        # Rows with a missing value for this target are dropped before fitting.
        train_df = train_df[train_df[target].notnull()]

        model = cuml.SVR(C=conf["SVR_C"], cache_size=3000.0)
        model.fit(train_df[features], train_df[target])

        # One column of y_test per fold; validation rows fill y_oof in place.
        y_oof[val_ind] = model.predict(val_df[features]).values
        y_test[:, f] = model.predict(test_df[features]).values

    df[pred_col] = y_oof
    test_df[target] = y_test.mean(axis=1)

    # Score only the rows that have a value for this target.
    labelled_df = df[df[target].notnull()]
    score = metric(labelled_df[target].values, labelled_df[pred_col].values)
    overal_score += conf["metric_weight"]*score
    print(target, cupy.around(score, 4))
    print()

print("Overal score:", cupy.around(overal_score, 4))
print()

# Making submission

In [None]:
%%time

# Reshape the wide per-target predictions into long format: one row per
# (Id, target) pair, with the row id built as "<Id>_<target>".
submission_columns = ["Id", "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"]
sub_df = cudf.melt(test_df[submission_columns], id_vars=["Id"], value_name="Predicted")
sub_df["Id"] = sub_df["Id"].astype("str") + "_" + sub_df["variable"].astype("str")

sub_df = sub_df.drop("variable", axis=1)
sub_df = sub_df.sort_values("Id")
# Sanity check: five prediction rows for every test row.
assert sub_df.shape[0] == test_df.shape[0]*5
sub_df.head(10)

In [None]:
%%time

# Write the submission frame to submission.csv, leaving out the index column.
sub_df.to_csv("submission.csv", index=False)

# Feature Importance and Visualization

In [None]:
from lofo import LOFOImportance, Dataset, plot_importance


def get_lofo_importance(df, target, n_folds=4):
    """Return the LOFO importance table for one target.

    The frame is converted to pandas and restricted to rows where ``target``
    is not null. Each loading feature is evaluated individually, while all
    FNC features are passed together as a single feature group named "fnc".

    Args:
        df: frame exposing ``to_pandas()`` that holds features and ``target``.
        target: name of the target column; also the key used to look up the
            SVR ``C`` value in ``TARGETS``.
        n_folds: value passed to ``LOFOImportance`` as ``cv``.

    Returns:
        The result of ``LOFOImportance.get_importance()``.
    """
    labelled = df.to_pandas()
    labelled = labelled[labelled[target].notnull()]

    dataset = Dataset(
        df=labelled,
        target=target,
        features=loading_features,
        feature_groups={"fnc": labelled[fnc_features].values},
    )

    svr = cuml.SVR(C=TARGETS[target]["SVR_C"], cache_size=3000.0)
    importance = LOFOImportance(dataset, cv=n_folds, scoring="neg_mean_absolute_error", model=svr)

    return importance.get_importance()

# Compute LOFO importances for the age target.
feature_imp = get_lofo_importance(df, target="age")

# Show the importances as a box plot.
plot_importance(feature_imp, figsize=(8, 8), kind="box")

In [None]:
%%time

# Repeat the t-SNE plot using only four loading features.
# NOTE(review): presumably these are the top-ranked features from the LOFO
# importance plot — confirm.
plot_tsne(features=["IC_22", "IC_12", "IC_04", "IC_15"], target="age")