In [None]:
#for rapids
import sys
!cp ../input/rapids/rapids.0.13.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.6/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.6"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
#for rapids
import cuml 
import cudf as cd

In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import ElasticNet, Ridge
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json 
import pickle


In [None]:
#Data Loader
# Read both feature tables and join them on the shared "Id" column.
fnc_df = pd.read_csv("../input/trends-assessment-prediction/fnc.csv")
loading_df = pd.read_csv("../input/trends-assessment-prediction/loading.csv")

# Everything after the first column (presumably "Id") is treated as a feature.
fnc_features = fnc_df.columns[1:].tolist()
loading_features = loading_df.columns[1:].tolist()
df = fnc_df.merge(loading_df, on="Id")

# Left-join the scores: only Ids present in train_scores.csv end up with
# is_train == True, every other row gets a missing value there.
labels_df = pd.read_csv("../input/trends-assessment-prediction/train_scores.csv")
labels_df["is_train"] = True
df = df.merge(labels_df, on="Id", how="left")

train_df = df.loc[df["is_train"] == True].copy()
test_df = df.loc[df["is_train"] != True].copy()

# Down-scale the FNC columns. This was introduced for SVR, which is sensitive to
# feature scale; it was initially not used when training ridge and enet, but it
# turned out to give a better CV for them too.
FNC_SCALE = 1/500
train_df[fnc_features] = train_df[fnc_features] * FNC_SCALE
test_df[fnc_features] = test_df[fnc_features] * FNC_SCALE

# 'IC_20' is excluded based on previous leave-one-out feature selection experiments.
loading_features.remove('IC_20')
features = fnc_features + loading_features

targets = [
    "age",
    "domain1_var1",
    "domain1_var2",
    "domain2_var1",
    "domain2_var2",
]

In [None]:
#function for quickly testing cv and checking loss
def cv_test(model, train_df=train_df, test_df=test_df, features=features, targets=targets, folds=5):
    """Cross-validate ``model`` on every target and average its test-set predictions.

    For each target, rows of ``train_df`` with a null target value are dropped and the
    remainder is split with an unshuffled ``KFold(n_splits=folds)``. The model is refit on
    every training split, scored on the validation split with the global ``metric``
    callable, and used to predict ``test_df[features]``. The per-fold test predictions
    are averaged per target.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        If fitting/predicting with pandas inputs raises and the estimator comes from
        ``cuml``, a fresh ``SVR(C=100 if target == 'age' else 10, cache_size=3000.0)``
        fed with cudf inputs is used for that fold instead. The hyper-parameters of the
        passed cuml estimator are therefore ignored on that path. Errors raised by any
        other estimator are propagated unchanged.
    train_df, test_df : pandas.DataFrame
        Training frame (features + target columns) and test frame (features).
    features : list of str
        Feature column names.
    targets : list of str
        Target column names, evaluated one after another.
    folds : int
        Number of CV folds.

    Returns
    -------
    (test_preds, losses) : tuple
        ``test_preds`` is a DataFrame with one column per target holding the fold-averaged
        test predictions; ``losses`` is the list of mean validation losses, in the order
        of ``targets``.

    Notes
    -----
    NOTE(review): ``metric`` is not defined in the visible part of this notebook. A
    callable ``metric(y_true, y_pred)`` must exist as a global before this function is
    called, otherwise the loss line raises ``NameError``.
    """
    kf = KFold(n_splits=folds)

    # Positional weights of the per-target losses in the printed overall score: the
    # first target counts 0.30 and the next four 0.175 each. The score is only
    # meaningful for the default five-target order.
    score_weights = (0.30, 0.175, 0.175, 0.175, 0.175)

    losses = []
    test_preds = pd.DataFrame()

    for target in targets:
        print(target)

        loss = 0
        fold_test_preds = []

        # Only rows with a known value for this target can be used.
        train = train_df.loc[train_df[target].notnull()]
        for fold, (train_ind, val_ind) in enumerate(kf.split(train)):
            X_train, X_val = train.iloc[train_ind][features], train.iloc[val_ind][features]
            y_train, y_val = train.iloc[train_ind][target], train.iloc[val_ind][target]

            try:
                model.fit(X_train, y_train)
                pred = model.predict(X_val)
                test_pred = model.predict(test_df[features])
            except Exception:
                # The cudf fallback exists for the cuml SVR (presumably because it does
                # not accept pandas inputs here). For any other estimator a failure is
                # a real error and must not be masked by silently training an SVR.
                if not type(model).__module__.startswith('cuml'):
                    raise
                c = 100 if target == 'age' else 10
                # Use a separate name so the caller's model is not replaced for the
                # remaining folds and targets.
                gpu_model = SVR(C=c, cache_size=3000.0)
                gpu_model.fit(cd.DataFrame(X_train), cd.Series(y_train))
                pred = np.asarray(gpu_model.predict(cd.DataFrame(X_val)))
                test_pred = np.asarray(gpu_model.predict(cd.DataFrame(test_df[features])))

            loss += metric(y_val, pred)/folds
            fold_test_preds.append(pd.Series(test_pred, name=f'{fold}'))

        # One column per fold, averaged row-wise.
        test_preds[target] = pd.concat(fold_test_preds, axis=1).mean(axis=1)

        losses.append(loss)
        print(loss, '\n\n')

    # zip() stops at the shorter sequence, so evaluating fewer than five targets no
    # longer fails with IndexError after all the cross-validation work is done.
    final_score = sum(weight*target_loss for weight, target_loss in zip(score_weights, losses))
    print(final_score)
    return(test_preds, losses)

In [None]:
from cuml import SVR
# Cross-validate the GPU SVR on every target; keeps the averaged test predictions
# and the per-target CV losses.
svr_test, svr_losses = cv_test(model=SVR())

In [None]:
from sklearn.linear_model import ElasticNet
# Cross-validate the elastic net on every target; keeps the averaged test predictions
# and the per-target CV losses.
enet_test_preds, enet_losses = cv_test(
    model=ElasticNet(
        alpha=0.002,
        l1_ratio=0.99,
        max_iter=10000,
        normalize=True,
        selection='random',
        tol=1e-5,
    )
)

In [None]:
from sklearn.linear_model import Ridge
# Cross-validate ridge regression on every target; keeps the averaged test predictions
# and the per-target CV losses.
ridge_test_preds, ridge_losses = cv_test(model=Ridge(alpha=0.001))

In [None]:
# Per-target CV losses produced by the three cv_test cells above, pasted in so the
# blending weights can be rebuilt without re-running the cross-validation.
# Order of the entries follows `targets`.
ridge_losses = [
    0.14432562046393557,
    0.1518621332334051,
    0.15199234197025407,
    0.182002015241784,
    0.1773679018606814,
]

enet_losses = [
    0.1466418276102861,
    0.15159310188684474,
    0.15164378498766481,
    0.1823087905152352,
    0.17763578983352862,
]

svr_losses = [
    0.14452686132265957,
    0.15536270536948454,
    0.15517225770751503,
    0.18654138255008823,
    0.18036466196946171,
]

# Weight of a model for a target = its inverse loss divided by the sum of the three
# models' inverse losses for that target, so the weights of one target add up to 1
# and the model with the lowest loss weighs most.
blend_weights = {}
for position in range(5):
    target_name = targets[position]
    inverse_total = pd.Series(
        [1/ridge_losses[position], 1/enet_losses[position], 1/svr_losses[position]]
    ).sum()
    for model_name, model_losses in (('ridge', ridge_losses),
                                     ('enet', enet_losses),
                                     ('svr', svr_losses)):
        blend_weights[f'{model_name}_{target_name}'] = (1/model_losses[position])/inverse_total

# Single-row frame with columns '<model>_<target>'. The row carries index label 2.
importance_mat = pd.DataFrame(blend_weights, index=[2])

In [None]:
# stacking train and pred for test set
#
# For every target and every CV fold:
#   1. fit ridge, elastic net and (GPU) SVR on the training split,
#   2. blend their validation predictions with the inverse-loss weights from
#      `importance_mat` into a single feature,
#   3. fit a LightGBM regressor on that feature against the validation targets,
#   4. apply the same blend + LightGBM model to the test set.
# Test predictions are averaged over the folds and stored per target.

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import cudf as cd    #rapids
from cuml import SVR #rapids

features = fnc_features + loading_features # excluded IC_20
train = train_df   #not validating meta model for now (no hold-out split is taken)
test = test_df[features]

folds = 5
kf = KFold(n_splits = folds, shuffle=True, random_state=0)#kfold cross val on train

sub = pd.DataFrame()            # meta-model predictions on the test set
ridge_test = pd.DataFrame()     # fold-averaged base-model predictions on the test set
enet_test = pd.DataFrame()
svr_test = pd.DataFrame()
meta_train_df = pd.DataFrame()  # fold-averaged blended feature on the test set

for target in targets:

    print('\n target : ', target)

    # Per-fold test-set outputs of this target, one Series per fold.
    ridge_fold_preds = []
    enet_fold_preds = []
    svr_fold_preds = []
    meta_fold_preds = []     # meta-model predictions, for submission
    blend_fold_inputs = []   # blended feature fed to the meta model on the test set

    # Drop rows with a null value for THIS target only. Filtering `train` itself
    # (as before) accumulated across targets: rows missing an earlier target were
    # also thrown away for every later target, unlike in cv_test.
    target_train = train.loc[train[target].notnull()]

    # Blend weights and the SVR penalty depend only on the target, not on the fold.
    r_w = importance_mat[f'ridge_{target}'].values  #weights
    e_w = importance_mat[f'enet_{target}'].values
    s_w = importance_mat[f'svr_{target}'].values
    c = 100 if target == 'age' else 10

    for fold, (train_ind, val_ind) in enumerate(kf.split(target_train)):
        print('fold : ', fold)

        base_train_X, base_val_X = target_train.iloc[train_ind][features], target_train.iloc[val_ind][features]
        base_train_y, base_val_y = target_train.iloc[train_ind][target], target_train.iloc[val_ind][target]

        # base models
        ridge = Ridge(alpha=0.001)
        enet = ElasticNet(alpha=0.002, l1_ratio=0.99, max_iter=10000,
                          normalize=True, selection='random', tol=1e-5)
        svr = SVR(C=c, cache_size=3000.0)

        #fit for base models (the cuml SVR is fed cudf objects)
        ridge.fit(base_train_X, base_train_y)
        enet.fit(base_train_X, base_train_y)
        svr.fit(cd.DataFrame(base_train_X), cd.Series(base_train_y))

        #predict on val for base models
        ridge_pred = ridge.predict(base_val_X)
        enet_pred = enet.predict(base_val_X)
        svr_pred = pd.Series(np.asarray(svr.predict(cd.DataFrame(base_val_X))))

        #predict on test for base models
        ridge_test_pred = ridge.predict(test)
        enet_test_pred = enet.predict(test)
        svr_test_pred = pd.Series(np.asarray(svr.predict(cd.DataFrame(test))))

        #fit for meta model on the blended validation predictions (one feature)
        meta_train_X = ridge_pred*r_w + enet_pred*e_w + svr_pred*s_w
        meta_train_X = np.array(meta_train_X).reshape(-1, 1)
        # 'l1' replaces the previous 'auc', which is a classification metric. No
        # validation set is passed to lgb.train, so the metric is not evaluated here.
        param = {'num_leaves':80, 'metric':'l1', 'objective':'regression'}
        label = base_val_y
        train_data = lgb.Dataset(meta_train_X, label=label)
        meta_model = lgb.train(param, train_data)

        #predict for test set with meta model
        meta_test_o = ridge_test_pred*r_w + enet_test_pred*e_w + svr_test_pred*s_w
        meta_test = np.array(meta_test_o).reshape(-1, 1)
        print('meta_shape', meta_test.shape)
        meta_test_pred = meta_model.predict(meta_test)

        #collect for taking mean later
        ridge_fold_preds.append(pd.Series(ridge_test_pred))
        enet_fold_preds.append(pd.Series(enet_test_pred))
        svr_fold_preds.append(pd.Series(svr_test_pred))
        meta_fold_preds.append(pd.Series(meta_test_pred))
        blend_fold_inputs.append(pd.Series(np.asarray(meta_test_o)))

    #taking target wise mean over the folds and saving test predictions to dfs
    sub[target] = pd.concat(meta_fold_preds, axis=1).mean(axis=1)
    ridge_test[target] = pd.concat(ridge_fold_preds, axis=1).mean(axis=1)
    enet_test[target] = pd.concat(enet_fold_preds, axis=1).mean(axis=1)
    svr_test[target] = pd.concat(svr_fold_preds, axis=1).mean(axis=1)
    meta_train_df[target] = pd.concat(blend_fold_inputs, axis=1).mean(axis=1)
        
       
        

        

In [None]:
#submit function for saving predictions to submit format csv
def submit(pred_df, test_df, path='sub.csv'):
    """Write predictions to a CSV file in long ``Id`` / ``Predicted`` format.

    Rows of ``pred_df`` are paired with rows of ``test_df`` by position (the indexes of
    both frames are ignored). Every prediction becomes one output row whose ``Id`` is
    ``"<test Id>_<target name>"``; the rows are sorted by that string.

    Neither ``pred_df`` nor ``test_df`` is modified.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Must contain the columns "age", "domain1_var1", "domain1_var2",
        "domain2_var1" and "domain2_var2".
    test_df : pandas.DataFrame
        Must contain an "Id" column convertible to int.
    path : str
        Destination of the CSV file; defaults to 'sub.csv' in the working directory.

    Raises
    ------
    AssertionError
        If the number of output rows is not five times the number of test rows.
    """
    target_cols = ["age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"]

    def to_sub(wide_df):
        # Wide (one column per target) -> long (one row per Id/target pair).
        sub_df = pd.melt(wide_df[["Id"] + target_cols], id_vars=['Id'], value_name='Predicted')

        sub_df["Id"] = sub_df["Id"].astype("str") + "_" +  sub_df["variable"].astype("str")

        sub_df = sub_df.drop("variable", axis=1).sort_values("Id")

        #assert here is for debugging: catches pred/test frames of different length
        assert sub_df.shape[0] == test_df.shape[0]*5

        return sub_df

    # reset_index() without inplace returns new objects, so the caller's frames keep
    # their index and pred_df does not silently gain an 'Id' column.
    ids = test_df['Id'].reset_index(drop=True).astype('int')
    wide_df = pred_df.reset_index(drop=True)
    wide_df['Id'] = ids

    sub_df = to_sub(wide_df)
    sub_df.to_csv(path, index=False)

In [None]:
# Write the stacked (meta-model) test predictions to the submission file.
submit(pred_df=sub, test_df=test_df)