In [None]:
# Imports
import json

import numpy as np
import pandas as pd

from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer

from xgboost import XGBRegressor

In [None]:
# Load the competition files: the two JSON-lines datasets first, then the
# submission template.
train_df = pd.read_json('../input/stanford-covid-vaccine/train.json', lines=True)
test_df = pd.read_json('../input/stanford-covid-vaccine/test.json', lines=True)
sample_sub_df = pd.read_csv('../input/stanford-covid-vaccine/sample_submission.csv')

In [None]:
# Report (rows, columns) for the train, test and sample-submission frames, in that order.
for frame in (train_df, test_df, sample_sub_df):
    print(frame.shape)

In [None]:
# Preview the first three training rows to see which columns are available.
train_df.head(3)

In [None]:
# Preview the first three test rows for comparison with the training frame.
test_df.head(3)

In [None]:
# Preview the first three rows of the submission template.
sample_sub_df.head(3)

In [None]:
# Replace every target cell with the mean of the values it holds, so each row
# ends up with a single number per target.
# NOTE(review): assumes each cell is a sequence of numbers — confirm against train.json.
for target_col in ('reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C'):
    train_df[target_col] = train_df[target_col].apply(lambda values: np.mean(values))

In [None]:
# Inspect the training frame after the target columns were replaced by their means.
train_df.head()

In [None]:
# Columns not used by this baseline: identifiers, the *_error columns, the
# signal-quality columns, and the two targets that are not modelled here.
unused_columns = [
    'id',
    'index',
    'reactivity_error',
    'deg_error_Mg_pH10',
    'deg_error_pH10',
    'deg_error_Mg_50C',
    'deg_error_50C',
    'SN_filter',
    'signal_to_noise',
    'deg_pH10',
    'deg_50C',
]
train_df = train_df.drop(columns=unused_columns)
train_df.head()

In [None]:
# Separate the three modelled targets (labels) from everything else (features).
target_columns = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']
Y_train = train_df[target_columns]
X_train = train_df.drop(columns=target_columns)

In [None]:
# Hold out 15% of the rows for validation. The seed is pinned (same value the
# XGBoost models use) so the split, and therefore every score reported below,
# is reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.15, random_state=28)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

In [None]:
def featurize(df):
    """
    Return a copy of ``df`` extended with character-count features.

    The input frame is left untouched: working on a copy avoids pandas'
    SettingWithCopyWarning when ``df`` is itself a slice of another frame and
    keeps the caller's data free of hidden state.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns ``'sequence'`` and ``'structure'``.

    Returns
    -------
    pandas.DataFrame
        All original columns followed by, in this order:
        ``total_A_count``, ``total_G_count``, ``total_U_count``,
        ``total_C_count`` (occurrences of each letter in ``sequence``) and
        ``total_dot_count``, ``total_ob_count``, ``total_cb_count``
        (occurrences of ``'.'``, ``'('`` and ``')'`` in ``structure``).
    """
    features = df.copy()

    # (source column, character to count, name of the new feature column).
    # The order here fixes the order of the appended columns.
    count_specs = (
        ('sequence', 'A', 'total_A_count'),
        ('sequence', 'G', 'total_G_count'),
        ('sequence', 'U', 'total_U_count'),
        ('sequence', 'C', 'total_C_count'),
        ('structure', '.', 'total_dot_count'),
        ('structure', '(', 'total_ob_count'),
        ('structure', ')', 'total_cb_count'),
    )
    for source_col, symbol, feature_name in count_specs:
        # Plain str.count (not the regex-based Series.str.count) so that
        # '.', '(' and ')' are matched literally. `symbol` is bound as a
        # default argument to freeze its value for this iteration.
        features[feature_name] = features[source_col].apply(
            lambda s, symbol=symbol: s.count(symbol)
        )

    return features

In [None]:
# Add the count features to both the training and the validation split.
X_train, X_test = featurize(X_train), featurize(X_test)

In [None]:
# The raw string columns have served their purpose once the counts exist;
# remove them from both splits.
raw_string_columns = ['sequence', 'structure', 'predicted_loop_type']
X_train = X_train.drop(columns=raw_string_columns)
X_test = X_test.drop(columns=raw_string_columns)

In [None]:
# Inspect the feature frame that remains after the raw string columns were dropped.
X_train.head()

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the validation data never influences the statistics.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def mcrmse_loss(y_true, y_pred, N=3):
    """
    Calculates competition eval metric (mean column-wise RMSE).

    For each target column the root-mean-squared error over the rows is
    computed; the column RMSEs are then summed and divided by ``N``.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_targets)
        Ground-truth and predicted values. Lists, NumPy arrays and pandas
        objects are accepted; values are compared strictly by position.
    N : int, default 3
        Divisor for the summed column RMSEs; should equal the number of
        target columns.

    Returns
    -------
    float
        The mean column-wise RMSE (lower is better).

    Raises
    ------
    AssertionError
        If ``y_true`` and ``y_pred`` have different lengths.
    """
    assert len(y_true) == len(y_pred)
    n = len(y_true)

    # Coerce to plain arrays before subtracting: this makes list inputs work
    # and prevents pandas from aligning two labelled objects on their index,
    # which would introduce NaNs that np.sum silently skips.
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)

    squared_error = (true_values - predicted_values) ** 2
    per_target_rmse = np.sqrt(np.sum(squared_error, axis=0) / n)
    return np.sum(per_target_rmse) / N

# Expose the loss as a scorer object usable by GridSearchCV;
# greater_is_better=False declares it as a quantity where lower values are better.
custom_scorer = make_scorer(mcrmse_loss, greater_is_better=False)

In [None]:
# Hyperparameter search for a multi-output XGBoost regressor.
# Settings that stay fixed during the search live on the base estimator.
xgb = XGBRegressor(
    random_state=28,
    subsample=0.8,
    colsample_bytree=0.75,
    reg_alpha=1,
    reg_lambda=2,
)

# MultiOutputRegressor wraps the base estimator so several target columns can be fitted.
reg = MultiOutputRegressor(xgb)

# The `estimator__` prefix routes each value to the XGBRegressor inside the wrapper.
params = dict(
    estimator__n_estimators=[800, 900, 1000],
    estimator__learning_rate=[0.1, 0.25, 0.30],
    estimator__max_depth=[3, 4, 5],
)

# 5-fold cross-validated grid search scored with the competition metric, using all cores.
gs = GridSearchCV(
    estimator=reg,
    param_grid=params,
    scoring=custom_scorer,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gs.fit(X_train, Y_train)

In [None]:
# Show the hyperparameter combination the grid search selected.
gs.best_params_

In [None]:
# Rebuild the model from the tuned values (plus the same fixed settings used
# during the search) and fit it on the full training split.
best_params = gs.best_params_

xgb = XGBRegressor(
    n_estimators=best_params['estimator__n_estimators'],
    learning_rate=best_params['estimator__learning_rate'],
    max_depth=best_params['estimator__max_depth'],
    random_state=28,
    subsample=0.8,
    colsample_bytree=0.75,
    reg_alpha=1,
    reg_lambda=2,
)

reg = MultiOutputRegressor(xgb)
reg.fit(X_train, Y_train)

In [None]:
# Train score
# Arguments follow the mcrmse_loss(y_true, y_pred) signature: ground truth first.
mcrmse_loss(np.array(Y_train), reg.predict(X_train))

In [None]:
# Validation score
# Arguments follow the mcrmse_loss(y_true, y_pred) signature: ground truth first.
mcrmse_loss(np.array(Y_test), reg.predict(X_test))

In [None]:
# Run the test frame through the same steps as the training features:
# drop identifiers, add count features, drop the raw strings, then scale
# with the scaler that was fitted on the training split.
test = (
    test_df
    .drop(columns=['index', 'id'])
    .pipe(featurize)
    .drop(columns=['sequence', 'structure', 'predicted_loop_type'])
)
test = scaler.transform(test)

In [None]:
# Predict
# The raw prediction array is wrapped in a DataFrame with default integer column
# labels (0, 1, 2); the submission cell renames them to the target names.
preds = pd.DataFrame(reg.predict(test))

In [None]:
# Create submission csv
# The model produces one prediction row per test sequence, so each row is
# repeated seq_length times to obtain one row per sequence position.
submission_df = preds.loc[preds.index.repeat(list(test_df['seq_length']))].reset_index(drop=True)
submission_df = submission_df.rename(columns={0: 'reactivity', 1: 'deg_Mg_pH10', 2: 'deg_Mg_50C'})

# id_seqpos is copied over by row index, so a length mismatch would silently
# leave NaN ids (or ignore template rows). Fail loudly instead.
# NOTE(review): this also assumes sample_submission.csv lists sequences in the
# same order as test.json — confirm.
assert len(submission_df) == len(sample_sub_df), (
    f"expanded predictions have {len(submission_df)} rows but the sample "
    f"submission has {len(sample_sub_df)}"
)
submission_df['id_seqpos'] = sample_sub_df['id_seqpos']

# The two targets that were not modelled are filled with a constant 0.0.
submission_df['deg_pH10'] = 0.0
submission_df['deg_50C'] = 0.0

# Put the columns into the order used for the submission file.
submission_df = submission_df[['id_seqpos', 'reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']]
submission_df.head()

In [None]:
# Save that CSV for submission
# index=False keeps the DataFrame's row index out of the written file.
submission_df.to_csv('submission.csv', index=False)