In [None]:
import numpy as np
import pandas as pd
# from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Dataset

In [None]:
# Root directory of the competition input files. It ends with '/' because the
# CSV file names are appended to it with plain string concatenation.
DATA_PATH = '/kaggle/input/trends-assessment-prediction/'

In [None]:
# Read loading.csv from the input directory and preview its first rows.
loading = pd.read_csv(filepath_or_buffer=DATA_PATH + 'loading.csv')
loading.head(n=5)

In [None]:
# Read fnc.csv from the input directory and preview its first rows.
fnc = pd.read_csv(filepath_or_buffer=DATA_PATH + 'fnc.csv')
fnc.head(n=5)

In [None]:
# Combine both feature tables into a single frame, matching rows on 'Id'
# (pandas' default inner join).
dataset = pd.merge(loading, fnc, on='Id')
dataset.head(n=5)

In [None]:
# Read the training targets and report how many rows they contain.
y_train = pd.read_csv(filepath_or_buffer=DATA_PATH + 'train_scores.csv')
print('Number of training samples: {}'.format(y_train.shape[0]))
y_train.head(n=5)

In [None]:
# Fill each missing score with the mean of its own column
# (the alternative left commented out was interpolate(method='nearest')).
y_train = y_train.fillna(value=y_train.mean(axis=0))
y_train.head(n=5)

In [None]:
# Select the feature rows that have training scores.
# Merging on y_train's 'Id' column (rather than filtering `dataset` with isin)
# returns the rows in y_train's order, so row i of x_train describes the same
# Id as row i of y_train once both are converted to plain arrays. The isin
# filter kept `dataset`'s order and only lined up if both files happened to be
# sorted identically.
x_train = y_train[['Id']].merge(dataset, on='Id', how='inner')
# A missing or duplicated Id would shift rows against y_train; fail loudly instead.
assert len(x_train) == len(y_train), 'every scored Id must match exactly one feature row'
x_train.head()

In [None]:
# Every merged row whose Id has no entry in y_train is treated as a test row.
x_test = dataset[~dataset['Id'].isin(y_train['Id'])]
test_ids = x_test.loc[:, 'Id']  # Needed for submission
print('Number of test samples: {}'.format(x_test.shape[0]))
x_test.head(n=5)

In [None]:
# Strip the identifier column and keep only the underlying NumPy values.
# NOTE: this rebinds x_train, x_test and y_train from DataFrames to arrays, so
# the cells that read their 'Id' column cannot be re-run after this one
# without reloading the data first.
x_train = x_train.drop(columns='Id').values
x_test = x_test.drop(columns='Id').values
y_train = y_train.drop(columns='Id').values

# Model

In [None]:
# BayesianRidge's `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so passing it raises a TypeError on current releases.
# Feature scaling is therefore done by an explicit pipeline step, which is the
# replacement suggested by the deprecation notice. The scaler is fitted
# together with the regressor, so it only sees the rows passed to fit().
# Predictions may differ marginally from the old normalize=True behaviour.
# MultiOutputRegressor fits one copy of the pipeline per target column;
# n_jobs=-1 runs those fits in parallel on all available cores.
model = MultiOutputRegressor(
    make_pipeline(StandardScaler(with_mean=False), BayesianRidge()),
    n_jobs=-1,
)

# Training and Evaluation

In [None]:
def score(y_pred, y_true):
    """Return the weighted, normalized absolute error over the five targets.

    For every target column the absolute errors are summed over all samples
    and divided by the column sum of ``y_true``. The five per-column ratios
    are then combined with the weights 0.3, 0.175, 0.175, 0.175, 0.175
    (in column order). Lower is better; a perfect prediction scores 0.

    Parameters
    ----------
    y_pred : array of shape (n_samples, 5)
        Predicted values.
    y_true : array of shape (n_samples, 5)
        Reference values; also used as the normalizer.

    Returns
    -------
    float
        The weighted sum of the per-column normalized errors.
    """
    weights = [.3, .175, .175, .175, .175]
    column_abs_error = np.abs(y_true - y_pred).sum(axis=0)
    normalized_error = column_abs_error / np.sum(y_true, axis=0)
    # Accumulate left to right so the floating-point result is order-stable.
    return sum(weight * error for weight, error in zip(weights, normalized_error))

In [None]:
n = 7
# One slab of test predictions per fold: (test samples, 5 targets, folds).
y_test = np.zeros((len(x_test), 5, n))
scores = np.zeros(n)
# Shuffled K-fold split with a fixed seed so the folds are reproducible.
folds = KFold(n, shuffle=True, random_state=0).split(x_train)
for i, (train_indexes, valid_indexes) in enumerate(folds):
    print('Fold {} of {} ...'.format(i + 1, n))

    # Refit the model on this fold's training rows only.
    model.fit(x_train[train_indexes], y_train[train_indexes])

    # Score the held-out rows of the fold.
    y_pred = model.predict(x_train[valid_indexes])
    scores[i] = score(y_pred, y_train[valid_indexes])
    print('Score = {}'.format(scores[i]))

    # Keep this fold's test predictions; they are averaged afterwards.
    y_test[..., i] = model.predict(x_test)

In [None]:
# Mean validation score across all folds.
print('Average score = {}'.format(np.mean(scores)))

# Prediction

In [None]:
# Average the per-fold test predictions (last axis) into a single prediction
# per test sample and target.
# The ndim guard keeps this cell safe to re-run: after the first run y_test is
# already 2-D, and reducing over axis 2 again would raise.
if y_test.ndim == 3:
    y_test = y_test.mean(axis=2)
print(y_test)

# Submission

In [None]:
outputs = ['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']
# One submission row per (test id, output) pair. The outputs cycle fastest, so
# the ids line up with the row-major flattening of y_test on the next statement.
ids = [
    '{}_{}'.format(int(id_), output)
    for id_ in test_ids
    for output in outputs
]
predicted = y_test.reshape(y_test.shape[0] * 5)

# Sanity check against the hard-coded expected number of submission rows.
assert len(predicted) == 29385
submission = pd.DataFrame(dict(Id=ids, Predicted=predicted))
submission.head(10)

In [None]:
submission.to_csv('submission.csv', index = False)
!head submission.csv