In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Single seed reused by the RNG seeding and the train/validation split below.
random_state = 42

In [None]:
import random
# Seed both Python's and NumPy's global RNGs so repeated runs draw the same numbers.
random.seed(random_state)
np.random.seed(random_state)

In [None]:
# Load the training set, using the `id` column as the row index.
data = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/train.csv', index_col='id')

In [None]:
# Preview the first rows of the training data.
data.head()

In [None]:
import matplotlib.pyplot as plt
# Draw a 20-bin histogram for each of cont1..cont14 on a 7x2 grid of axes.
fig, axs = plt.subplots(7, 2, figsize=(15, 20))
# ravel() walks the grid row by row, which matches the (i // 2, i % 2) layout.
for position, axis in enumerate(axs.ravel(), start=1):
    feature = 'cont' + str(position)
    axis.hist(data[feature], bins=20)
    axis.set_title(feature)
fig.tight_layout(pad=3.0)

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

In [None]:
# Count the exact-zero entries in each column.
# Summing `data[data == 0]` would add up the zeros themselves and always
# report 0, so sum the boolean mask instead (True counts as 1).
(data == 0).sum()

In [None]:
import itertools

def add_interaction_features(df, part = 0):
    """Append pairwise arithmetic interaction features to ``df``.

    For every feature column (all columns except ``target``) this generates:

    * ``a+b``, ``a-b`` and ``a*b`` for each unordered pair of columns,
    * ``a/b`` for each ordered pair (both ``a/b`` and ``b/a``),
    * ``a*a`` for each single column.

    Args:
        df: Input frame; column labels must be strings because the new
            column names are built by string concatenation.
        part: Unused. It only selected a slice for a three-column
            interaction experiment that has been removed; it is kept so
            existing callers that pass it still work.

    Returns:
        A new frame made of ``df`` joined with the generated columns.
        ``df`` itself is not modified.

    Note:
        Division follows pandas semantics, so a zero denominator produces
        ``inf``/``NaN`` values; they are not cleaned up here.
    """
    cols = df.columns.tolist()
    if 'target' in cols:
        cols.remove('target')

    # Collect every new column first and build the frame once at the end.
    # Inserting hundreds of columns one by one fragments the DataFrame and
    # makes pandas emit a PerformanceWarning.
    features = {}

    # Commutative-style operators: one entry per unordered pair.
    for col1, col2 in itertools.combinations(cols, 2):
        left, right = df[col1], df[col2]
        features[col1 + '+' + col2] = left + right
        features[col1 + '-' + col2] = left - right
        features[col1 + '*' + col2] = left * right

    # Division depends on operand order, so visit both orderings.
    for col1, col2 in itertools.permutations(cols, 2):
        features[col1 + '/' + col2] = df[col1] / df[col2]

    # Squares of the individual columns.
    for col in cols:
        features[col + '*' + col] = df[col] * df[col]

    joint_df = pd.DataFrame(features, index=df.index)

    print('Totally new generated columns', len(joint_df.columns))
    return df.join(joint_df)

In [None]:
import math
def add_nonlinear_interaction(df):
    """Append element-wise nonlinear transforms of every feature column.

    Each column except ``target`` gets ten derived columns named
    ``<transform>(<column>)``: sqrt, sin, cos, tan, arctan, degrees,
    radians, tanh, exp and exp2. Missing results (for example the square
    root of a negative value) are replaced with 0 before joining.

    Args:
        df: Input frame with string column labels.

    Returns:
        A new frame consisting of ``df`` joined with the derived columns.
    """
    # (label, function) pairs, applied in this order for every column.
    transforms = (
        ('sqrt', lambda series: series ** 0.5),
        ('sin', np.sin),
        ('cos', np.cos),
        ('tan', np.tan),
        ('arctan', np.arctan),
        ('degrees', np.degrees),
        ('radians', np.radians),
        ('tanh', np.tanh),
        ('exp', np.exp),
        ('exp2', np.exp2),
    )

    feature_names = df.columns.tolist()
    try:
        feature_names.remove('target')
    except ValueError:
        # No target column present (e.g. the test set): nothing to exclude.
        pass

    derived = pd.DataFrame(index=df.index)
    for name in feature_names:
        source = df[name]
        for label, transform in transforms:
            derived[label + '(' + name + ')'] = transform(source)

    return df.join(derived.fillna(0))

In [None]:
# Build the engineered training frame from pairwise arithmetic interactions.
# Disabled alternative: element-wise nonlinear transforms instead.
# data_with_features = add_nonlinear_interaction(data)
data_with_features = add_interaction_features(data, part = 0)
data_with_features.head()

In [None]:
# Count missing values per column after feature generation and show only
# the columns that contain at least one.
data_with_features_nulls = data_with_features.isnull().sum()
data_with_features_nulls[data_with_features_nulls != 0]

In [None]:
from sklearn.model_selection import train_test_split
# Separate the label from the features, then hold out 20% of the rows for
# validation; the split is seeded so it is repeatable.
y = data_with_features['target']
X = data_with_features.drop(columns='target')
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

In [None]:
# Preview the first training targets.
y_train.head()

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

# The target is fitted with a regressor further down, so rank features with
# the univariate regression F-test. f_classif is an ANOVA test for class
# labels and would treat every distinct target value as its own class.
selector = SelectKBest(f_regression, k=15)
X_new = selector.fit_transform(X_train, y_train)

# Read the kept/dropped split directly from the selector's boolean mask.
# Inferring it from zero-variance columns of inverse_transform() output would
# also mark a selected but constant feature as dropped.
selected_mask = selector.get_support()
selected_columns = X_train.columns[selected_mask]
dropped_columns = X_train.columns[~selected_mask]

# Selected feature values as a labelled frame (selected columns only).
selected_features = pd.DataFrame(X_new, index=X_train.index, columns=selected_columns)

In [None]:
# Up to the first 20 column names flagged as dropped by the selection step.
dropped_columns[:20]

In [None]:
from xgboost import XGBRegressor
def get_xgb_regressor():
    """Create the unfitted XGBoost regressor used by this notebook.

    The configuration is a slow learning rate with many deep trees, L1/L2
    regularisation and 90% row subsampling. Seeding uses the notebook-wide
    ``random_state`` so the model agrees with the other seeded steps.

    Arguments that XGBoost's scikit-learn wrapper does not support are left
    out on purpose:

    * ``metric_period`` is a CatBoost option, not an XGBoost one.
    * ``silent`` was replaced by ``verbosity``.
    * ``early_stopping_rounds`` needs a validation set passed to ``fit``.
      The model is fitted without one, so keeping it would either be
      ignored or raise, depending on the XGBoost version.

    Returns:
        A configured ``XGBRegressor`` instance.
    """
    # Untuned candidates from earlier experiments:
    # colsample_bytree=0.9, gamma=0.0, min_child_weight=7.5
    return XGBRegressor(
        learning_rate=0.02,
        max_depth=10,
        n_estimators=7000,
        reg_alpha=0.9,
        reg_lambda=0.9,
        subsample=0.9,
        random_state=random_state,
        verbosity=0,
    )

In [None]:
# from sklearn.model_selection import GridSearchCV

# CV = GridSearchCV(xgbRegressor, cv_parameters, scoring = 'neg_mean_absolute_error', n_jobs = -1, 
#                   cv = 2, refit = True, return_train_score = True, verbose = 2)
# xgbRegressor = get_xgb_regressor()
# xgbRegressor.fit(X_train.drop(dropped_columns, axis=1), y_train)   

# print("best_score_ = {}".format(CV.best_score_))
# print("best_params_ = {}".format(CV.best_params_))
# print("cv_results_ = {}".format(CV.cv_results_))
# model = CV

In [None]:
# import eli5
# from eli5.sklearn import PermutationImportance

# perm = PermutationImportance(xgbRegressor, random_state=random_state).fit(X_valid.drop(dropped_columns, axis=1),\
#                                                                           y_valid)

# eli5.show_weights(perm, feature_names = X_valid.drop(dropped_columns, axis=1).columns.tolist())

In [None]:
# from math import sqrt
# from sklearn.metrics import mean_squared_error
# y_pred = xgbRegressor.predict(X_valid.drop(dropped_columns, axis=1))
# mse = mean_squared_error(y_valid, y_pred)
# print(sqrt(mse))
# # initial model with xgbRegressor - 0.6804986578618811 - with leakage
# # model with feature generation and select 25 best features - 0.710598138093024

In [None]:
# Fit a fresh model on all labelled rows (training and validation parts
# together), using only the columns that were not dropped by feature selection.
final_model = get_xgb_regressor()
final_model.fit(X.drop(dropped_columns, axis=1), y)

In [None]:
# Load the test set, using the `id` column as the row index.
X_test = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/test.csv', index_col = 'id')

In [None]:
# Summary statistics for each numeric column of the test set.
X_test.describe()

In [None]:
# Preview the first rows of the test set.
X_test.head()

In [None]:
# Number of missing values in each test-set column.
X_test.isnull().sum()

In [None]:
# Generate the same interaction features for the test set as were built for training.
X_test_with_features = add_interaction_features(X_test)

In [None]:
# Predict on the test set after removing the same dropped columns the final model was fitted without.
preds = final_model.predict(X_test_with_features.drop(dropped_columns, axis=1))

In [None]:
# Load the competition's sample submission file.
sample_submission = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv')

In [None]:
# Preview the first rows of the sample submission.
sample_submission.head()

In [None]:
# Pair each test-row id with its prediction and write the result to
# submission.csv without the frame's own positional index.
submission_columns = {
    'id': X_test_with_features.index,
    'target': preds,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

In [None]:
from IPython.display import FileLink
# Display a link to the submission file written above.
FileLink('submission.csv')