In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training table; the first CSV column is used as the row index.
train_csv_path = '/kaggle/input/tabular-playground-series-aug-2021/train.csv'
df = pd.read_csv(train_csv_path, index_col=0)
df

In [None]:
# Every column except the last is treated as a feature; the last column is the target.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values
X.shape, y.shape

In [None]:
# Fit the power transform on the raw feature columns and keep the transformed matrix as X.
# The input is taken from `df` rather than from `X` itself so that re-running this cell
# does not transform already-transformed data (the cell stays idempotent).
scaler = PowerTransformer()
X = scaler.fit_transform(df.iloc[:, :-1].values)

In [None]:
# Hold out one third of the rows. A fixed random_state makes the split reproducible
# across kernel restarts.
# NOTE(review): the cells visible in this notebook fit and score the models on the full
# X, y; this hold-out split is not used afterwards.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=1/3, random_state=42)
Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape

In [None]:
# Number of boosting rounds shared by every model configured below.
N_ESTIMATORS = 10_000

In [None]:
# CatBoost hyper-parameters.
# 'objective' is CatBoost's alias for 'loss_function', so only one of the two may be
# given; specifying both with different values is contradictory. The Poisson objective
# is kept (it mirrors the 'count:poisson' objective used for XGBoost below), while RMSE
# is still reported through 'eval_metric'.
catboost1_params={
    'objective': 'Poisson',
    'bootstrap_type': 'Poisson',
    'eval_metric': 'RMSE',
    'task_type': 'GPU',
    'max_depth': 8,
    'learning_rate': 5e-3,
    'n_estimators': N_ESTIMATORS,
    'max_bin': 280,
    'min_data_in_leaf': 64,
    'l2_leaf_reg': 0.01,
    'subsample': 0.8,
    'verbose': 0
}
# MLP configuration, currently disabled (kept commented out together with its entry
# in `models`).
# mlp_params=dict(
#     hidden_layer_sizes=5000,
#     max_iter=15000,
#     early_stopping=True
# )
# XGBoost hyper-parameters: Poisson count objective trained on the GPU.
xgb_params={
    'objective': 'count:poisson',
    'learning_rate': 5e-3,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'n_estimators': N_ESTIMATORS,
    'max_depth': 11,
    'alpha': 20,
    'lambda': 9,
    'min_child_weight': 256,
    'importance_type': 'total_gain',
    'tree_method': 'gpu_hist'
}
# Base learners of the blend; their order defines the order of the blending weights.
models = [
    CatBoostRegressor(**catboost1_params),
#     MLPRegressor(**mlp_params),
    XGBRegressor(**xgb_params)
]

In [None]:
%%time

for model in models:
    model.fit(X, y)

In [None]:
# Collect each fitted model's predictions on X; stacking them yields one row per model.
per_model_predictions = []
for estimator in models:
    per_model_predictions.append(estimator.predict(X))
predicted = np.array(per_model_predictions)
predicted

In [None]:
def cost(w, X, y):
    """Half mean squared error of the linear blend ``X.T @ w`` against ``y``.

    Parameters
    ----------
    w : array of blending weights, one per row of ``X``.
    X : 2-D array whose transpose is multiplied by ``w``; it has one column
        per entry of ``y``.
    y : 1-D array of targets.

    Returns
    -------
    float
        ``0.5 / len(y) * ||X.T @ w - y||_2 ** 2``.
    """
    n_samples = y.shape[0]
    residual = np.dot(X.T, w) - y
    return (.5 / n_samples) * np.linalg.norm(residual, 2) ** 2

def grad(w, X, y):
    """Gradient of ``cost`` with respect to the blending weights ``w``.

    Parameters
    ----------
    w : array of blending weights, one per row of ``X``.
    X : 2-D array whose transpose is multiplied by ``w``; it has one column
        per entry of ``y``.
    y : 1-D array of targets.

    Returns
    -------
    ndarray
        ``1 / len(y) * X @ (X.T @ w - y)``, with one entry per weight.
    """
    n_samples = y.shape[0]
    residual = np.dot(X.T, w) - y
    return (1 / n_samples) * np.dot(X, residual)

In [None]:
def predicting(coef, predicted):
    """Blend stacked predictions with the given weights.

    ``predicted`` holds one row per model; the result is the weighted sum of
    those rows, i.e. ``predicted.T @ coef``.
    """
    return np.dot(predicted.T, coef)

In [None]:
# Optimiser settings for the weight search: `lr` scales the gradient step and `gamma`
# scales the previous update carried into the next one. The starting weights give every
# model an equal share.
lr, gamma = 0.001, 0.01
coef = np.full(len(models), 1/len(models))

In [None]:
# Momentum gradient descent on the blending weights, starting from `coef`.
# `w` records every iterate; `w[-1]` is the final weight vector.
max_iters = 30000
tol = 1e-3
w = [coef]
v_old = np.zeros_like(coef)
# The gradient at the current iterate is computed once and reused for both the update
# and the convergence check.
g = grad(w[-1], predicted, y)
for it in range(max_iters):
    v_new = gamma*v_old + lr*g
    w_new = w[-1] - v_new
    # Record the new iterate BEFORE testing for convergence so that the converged
    # weights are the ones left in w[-1].
    w.append(w_new)
    v_old = v_new
    g = grad(w_new, predicted, y)
    if np.linalg.norm(g)/w_new.shape[0] < tol:
        print('Converged at {} steps'.format(it))
        break
else:
    # Reached the iteration cap without meeting the gradient tolerance.
    print('Did not converge within {} steps'.format(max_iters))
w = np.array(w)
w.shape

In [None]:
# Final blending weights: the last iterate recorded by the gradient-descent loop.
w[-1]

In [None]:
# Blend the stacked predictions with the final weights and report their RMSE against y.
# These are in-sample predictions: the models were fitted on the same X, y.
pred = predicting(w[-1], predicted)
# np.sqrt(MSE) is used instead of `squared=False`, which is deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y, pred))

In [None]:
def predict_new(w, data):
    """Predict with every model in ``models`` on ``data`` and blend with weights ``w``.

    Each model's predictions form one row of the stacked array that is passed,
    together with ``w``, to ``predicting``.
    """
    per_model = [estimator.predict(data) for estimator in models]
    stacked = np.array(per_model)
    return predicting(w, stacked)

In [None]:
# Blended in-sample predictions against the true targets, using the learned weights
# w[-1] rather than the initial equal weights in `coef`.
plt.scatter(predict_new(w[-1], X), y)
plt.xlabel('Blended prediction')
plt.ylabel('Actual target');

## Predict for the `test` file

In [None]:
# Load the test table; no index column is set here, so `id` stays a regular column.
test_csv_path = '/kaggle/input/tabular-playground-series-aug-2021/test.csv'
test = pd.read_csv(test_csv_path)

In [None]:
# Drop the first column (`id`) and apply the transform fitted on the training features.
# A plain ndarray is passed because the scaler was fitted on an ndarray; passing a
# DataFrame would trigger scikit-learn's feature-name mismatch warning. Doing it in one
# expression also keeps `testX` from being rebound from a DataFrame to an array.
testX = scaler.transform(test.iloc[:, 1:].values)

In [None]:
# Blend the models' predictions on the transformed test features with the final weights.
outcome = predict_new(w=w[-1], data=testX)
outcome.shape

In [None]:
# Pair each test id with its blended prediction and write the submission file.
submission = pd.DataFrame({'id': test['id'], 'loss': outcome})
submission.to_csv('aug_submission.csv', index=False)

In [None]:
# Read the just-written submission back from disk to inspect its contents.
pd.read_csv('aug_submission.csv')