In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
# import cuml
from sklearn.linear_model import LinearRegression
from xgboost import  XGBRegressor
import warnings

from catboost import CatBoostRegressor

In [None]:
# Read the raw competition tables once; later cells work on copies of these.
base_train, base_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
    )
)

In [None]:
# Work on copies so the frames loaded from disk stay untouched.
train_df, test_df = (raw_frame.copy() for raw_frame in (base_train, base_test))

In [None]:
# Model inputs: every test column whose name contains 'f', plus the
# 'n_missing' column that is added to both frames in the next cell.
features = [*filter(lambda name: 'f' in name, test_df.columns), 'n_missing']

In [None]:
# Per-row missing-value descriptors, computed identically for both frames.
for frame in (train_df, test_df):
    # Number of NaN cells across all columns of the row.
    frame['n_missing'] = frame.isna().sum(axis=1)
    # Standard deviation of the row's is-missing indicator over `features`.
    frame['num_missing_std'] = frame[features].isna().std(axis=1).astype('float')

In [None]:
# Impute remaining NaNs column-wise; each frame is filled with its own column means.
train_df = train_df.fillna(train_df.mean())
test_df = test_df.fillna(test_df.mean())

# Register the new column only once so that re-running this cell does not
# append a duplicate name (duplicates would make `frame[features]` return
# repeated columns in the cells that follow).
if 'num_missing_std' not in features:
    features.append('num_missing_std')

In [None]:
# Row-wise summary statistics over the current `features` columns.
# Maps output column name -> DataFrame reduction method; dict order fixes
# the order in which the new columns are appended.
row_reductions = {
    'min_row': 'min',
    'max_row': 'max',
    'mean_row': 'mean',
    'std_row': 'std',
    'median': 'median',
    'sem': 'sem',
}

for frame in (train_df, test_df):
    for column_name, reduction in row_reductions.items():
        # Re-select `frame[features]` each time, exactly as the per-line version did.
        frame[column_name] = getattr(frame[features], reduction)(axis=1)

In [None]:
# Add the row-statistic columns to the model inputs. Only names that are not
# already present are appended, so re-running this cell is a no-op instead of
# producing duplicate feature names.
row_stat_features = ['min_row', 'max_row', 'mean_row', 'std_row', 'sem', 'median']
features += [name for name in row_stat_features if name not in features]

In [None]:
# Log-odds ("weight of evidence") of the mean `claim` per `n_missing` group,
# computed on the training frame and mapped onto both frames.
# assumes `claim` is a 0/1 target so the group mean is a rate in [0, 1] — TODO confirm

# Rates are clipped away from exactly 0 and 1 before taking the log so that a
# pure group yields a large finite value instead of +/-inf (or a division by zero).
WOE_EPS = 1e-6

dataframe = pd.DataFrame(train_df.groupby(['n_missing'])['claim'].mean())
dataframe['non-claim'] = 1 - dataframe['claim']
clipped_rate = dataframe['claim'].clip(lower=WOE_EPS, upper=1 - WOE_EPS)
dataframe['ratio'] = np.log(clipped_rate / (1 - clipped_rate))
ratio_mapping = dataframe['ratio'].to_dict()

# Fallback for `n_missing` values that occur in test but not in train:
# the log-odds of the overall training claim rate.
overall_rate = min(max(train_df['claim'].mean(), WOE_EPS), 1 - WOE_EPS)
fallback_woe = np.log(overall_rate / (1 - overall_rate))

train_df['woe'] = train_df['n_missing'].map(ratio_mapping)
# `.map` leaves NaN for keys missing from the mapping; fill those with the fallback.
test_df['woe'] = test_df['n_missing'].map(ratio_mapping).fillna(fallback_woe)

In [None]:
# Register the encoded column once; the guard keeps this cell idempotent on re-run.
if 'woe' not in features:
    features.append('woe')

In [None]:
# Standardise every model input. The scaler learns its statistics from the
# training frame only and is then applied to both frames.
scaler = StandardScaler().fit(train_df[features])
for frame in (train_df, test_df):
    frame[features] = scaler.transform(frame[features])

In [None]:
# Quick look at the first rows of the test frame after feature engineering and scaling.
test_df.head()

#### This is the whole code (without the Optuna part). I boosted the result several times with this notebook to reach the final score.

In [None]:
# Starting labels for the test rows: the `claim` column of an earlier submission file.
y_test = pd.read_csv('../input/stolen-y/submission45.csv').claim
y_train = train_df.claim

# Build both matrices from ONE column list (every training column except the
# target and the row id). Selecting the test frame by the same list guarantees
# identical column order and raises a KeyError if a column is missing, instead
# of silently relying on the two frames happening to share their layout.
model_columns = train_df.columns.drop(['claim', 'id'])
X_train = train_df[model_columns].values
X_test = test_df[model_columns].values

# The starting labels are paired with test rows by position, so lengths must agree.
if len(y_test) != len(X_test):
    raise ValueError(
        f'y_test has {len(y_test)} rows but X_test has {len(X_test)} rows'
    )

In [None]:
# Show the (rows, columns) shape of the train and test matrices.
print(*(matrix.shape for matrix in (X_train, X_test)))

In [None]:
# Alternating residual boosting between the test set and the train set.
#
# Each round:
#   1. fit a gated tree/linear blend on the test rows against the current test
#      labels (`my_y`) and predict the training rows;
#   2. take the training residual `new_y = y_train - prediction`;
#   3. fit a second gated blend on the training rows against that residual and
#      predict the test rows;
#   4. add that prediction, scaled by `lr`, to `my_y`.
#
# NOTE(review): `tree_method='gpu_hist'`, `gpu_id` and `predictor` are the
# pre-2.0 XGBoost GPU switches; XGBoost >= 2.0 deprecates them in favour of
# `device='cuda'` — confirm against the installed version before changing.

# Rows [0, N) are used to fit the two base models of a stage; rows [N, end)
# are held out to train that stage's gate.
TEST_FIT_ROWS = 450_000
TRAIN_FIT_ROWS = 900_000
N_ROUNDS = 2

# Hyper-parameters of the tree model fitted on the test rows (stage 1).
xgb1_params = {
    'n_estimators': 20000,
    'learning_rate': 0.005,
    'min_child_weight': 167,
    'colsample_bytree': 0.3513017494226757,
    'subsample': 0.7786913835450154,
    'max_leaves': 178,
}

# Hyper-parameters of the tree model fitted on the training residual (stage 2).
params_loss_2 = {
    'n_estimators': 10000,
    'learning_rate': 0.001,
    'min_child_weight': 295,
    'colsample_bytree': 0.2915087392510538,
    'subsample': 0.8549961258824171,
    'max_leaves': 105,
}


def _gated_blend(X_fit, y_fit, X_gate, y_gate, X_apply, tree_params, stage):
    """Fit a tree model, a linear model and a gate, then blend predictions.

    Parameters
    ----------
    X_fit, y_fit : rows and targets used to fit both base models.
    X_gate, y_gate : held-out rows and targets; the gate is trained to predict
        whether the tree model's absolute error is smaller than the linear
        model's on these rows.
    X_apply : rows to produce the blended prediction for.
    tree_params : keyword arguments for the tree-based ``XGBRegressor``.
    stage : label appended to the progress messages (``tree<stage>``, ``lin<stage>``).

    Returns
    -------
    Array ``tree_pred * alpha + linear_pred * (1 - alpha)`` for ``X_apply``,
    where ``alpha`` is the gate output clipped to ``[0, 1]``.

    Raises
    ------
    ValueError
        If either the fit slice or the hold-out slice is empty.
    """
    if len(X_fit) == 0 or len(X_gate) == 0:
        raise ValueError(
            f'stage {stage}: fit rows={len(X_fit)}, hold-out rows={len(X_gate)}; '
            'both must be non-empty (check the *_FIT_ROWS split constants)'
        )

    tree_model = XGBRegressor(tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor', **tree_params)
    tree_model.fit(X_fit, y_fit)
    print(f'tree{stage}')

    linear_model = XGBRegressor(booster='gblinear')
    linear_model.fit(X_fit, y_fit)
    print(f'lin{stage}')

    # Gate target: 1 where the tree model is strictly closer to the held-out
    # target than the linear model, else 0.
    gate_target = np.asarray(y_gate)
    tree_abs_error = np.abs(gate_target - tree_model.predict(X_gate))
    linear_abs_error = np.abs(gate_target - linear_model.predict(X_gate))
    tree_wins = (tree_abs_error < linear_abs_error).astype('int')

    gate_model = XGBRegressor(tree_method='gpu_hist', gpu_id=0)
    gate_model.fit(X_gate, tree_wins)

    # The gate is a regressor trained on 0/1 targets, so its raw output is not
    # guaranteed to stay inside [0, 1]. Clip it so the blend below is always a
    # proper convex combination of the two base predictions.
    alpha = np.clip(gate_model.predict(X_apply), 0.0, 1.0)

    return tree_model.predict(X_apply) * alpha + linear_model.predict(X_apply) * (1 - alpha)


my_y = y_test

# Shrinkage applied to each correction that is added to the test labels.
lr = 0.3
for w in range(N_ROUNDS):
    # ----------------------------------------------
    # Stage 1: learn from the (currently labelled) test rows, predict the train rows.
    train_prediction = _gated_blend(
        X_test[:TEST_FIT_ROWS], my_y[:TEST_FIT_ROWS],
        X_test[TEST_FIT_ROWS:], my_y[TEST_FIT_ROWS:],
        X_train, xgb1_params, stage=1,
    )
    new_y = y_train - train_prediction
    # ----------------------------------------------
    # Stage 2: learn the training residual, predict it for the test rows.
    test_correction = _gated_blend(
        X_train[:TRAIN_FIT_ROWS], new_y[:TRAIN_FIT_ROWS],
        X_train[TRAIN_FIT_ROWS:], new_y[TRAIN_FIT_ROWS:],
        X_test, params_loss_2, stage=2,
    )

    my_y = my_y + test_correction * lr
    print(w)
    # ----------------------------------------------

In [None]:
# Alias for the boosted test-set values. Despite the name this is not a loss:
# it is what gets written to the `claim` column of the submission below.
final_loss = my_y

In [None]:
# Start from the competition's sample file, overwrite its `claim` column with
# the final values, and write the submission without the index column.
sample_submission = (
    pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')
    .assign(claim=final_loss)
)
sample_submission.to_csv('submission.csv', index=False)

In [None]:
# Sanity-check the first rows of the submission frame.
sample_submission.head()

In [None]:
# Read the written file back from disk to confirm it was saved; the notebook
# displays the resulting frame as the cell output.
pd.read_csv('./submission.csv')