In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from sklearn.inspection import partial_dependence, plot_partial_dependence
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import f_regression
from sklearn.model_selection import KFold, train_test_split
import optuna
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Show the value of every top-level expression in a cell, not only the last one.
# Several cells below rely on this to display two results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#pd.options.plotting.backend = "plotly"
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Single place to change when the competition data lives somewhere else.
DATA_DIR = '../input/tabular-playground-series-jan-2021'

# Competition files: labelled training rows, unlabelled test rows and the
# submission template (read into `sub_df`).
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sub_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Shared random seed used by the cells below.
seed = 66

In [None]:
# First three rows of each frame; both are displayed because
# ast_node_interactivity is set to "all" in the setup cell.
train_df.head(3)
test_df.head(3)

In [None]:
# Summary statistics of the train and test columns, shown one after the other
# so their distributions can be compared.
train_df.describe()
test_df.describe()

In [None]:
# Total count of missing cells over all columns of each frame
# (a result of 0 means the frame contains no NaN values).
train_df.isna().sum().sum()
test_df.isna().sum().sum()

In [None]:
# Every column except the row identifier (features plus the target).
cols = train_df.columns.drop('id').tolist()

# Draw one kernel-density curve per column; successive Series plots are
# issued without an explicit `ax`, exactly as before.
for col in cols:
    train_df[col].plot.kde(figsize=(10, 7), legend=True)

plt.tight_layout();

In [None]:
# Compute the correlation matrix once; the original evaluated
# `train_df.corr()` three separate times for z, x and y.
train_corr = train_df.corr()

fig = go.Figure(data=go.Heatmap(
                    z=train_corr,
                    x=train_corr.columns,
                    y=train_corr.columns))
fig.update_layout(title='Train set: pairwise column correlation')
fig.show()

In [None]:
# Compute the correlation matrix once; the original evaluated
# `test_df.corr()` three separate times for z, x and y.
test_corr = test_df.corr()

fig = go.Figure(data=go.Heatmap(
                    z=test_corr,
                    x=test_corr.columns,
                    y=test_corr.columns))
fig.update_layout(title='Test set: pairwise column correlation')
fig.show()

In [None]:
# Feature matrix (everything except the identifier and the label) and label.
X = train_df.drop(['id', 'target'], axis=1)
y = train_df['target']

# Seed the tree: scikit-learn permutes the candidate features at every split,
# so an unseeded tree can differ between runs when several splits tie.
estim = DecisionTreeRegressor(random_state=seed)
estim.fit(X, y)

In [None]:
# Take the feature names from X rather than mutating `cols` in place:
# `cols.remove('target')` raised ValueError when this cell was run a second
# time. The resulting list is the same (all columns except 'id' and 'target').
cols = list(X.columns)

# One partial-dependence panel per feature.
# NOTE(review): the 2x7 grid assumes exactly 14 feature columns - confirm.
fig, ax = plt.subplots(2, 7, figsize=(13, 5))
plot_partial_dependence(estim, X, cols, target=y, ax=ax);
plt.tight_layout();

In [None]:
# Display the feature matrix and the target together (shown as a tuple).
X, y

In [None]:
# Two-way partial dependence for three hand-picked feature pairs,
# one pair per panel of a 1x3 figure.
fig, ax = plt.subplots(1, 3, figsize=(12, 5))
feature_pairs = [
    (cols[9], cols[10]),
    (cols[3], cols[-2]),
    (cols[6], cols[-1]),
]
for pair_ax, pair in zip(ax, feature_pairs):
    plot_partial_dependence(estim, X, [pair], target=y, ax=pair_ax)
plt.tight_layout();

In [None]:
# Univariate linear regression test of each feature against the target.
# f_regression returns the F-statistics and the matching p-values.
F_test = f_regression(X, y)
F_test

In [None]:
# Hold out 20% of the rows for validation; seeded so the split is repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = split

def opt_lgm(trial, rounds=10):
    """Optuna objective: fit a LightGBM regressor and score it on the hold-out split.

    Reads the notebook-level ``X_train``/``y_train`` (fit data),
    ``X_val``/``y_val`` (scoring data) and ``seed``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.
    rounds : int, default 10
        Not used by the body; kept so existing calls that pass it still work.

    Returns
    -------
    float
        Root mean squared error on the validation split (lower is better).
    """
    # `suggest_float` replaces the deprecated `suggest_loguniform` /
    # `suggest_uniform`; names and ranges are unchanged.
    param = {
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'rf', 'dart']),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        # The former `'importance_type': ['gain', 'split']` entry was dropped:
        # the estimator expects a single string there, and the option only
        # selects what `feature_importances_` reports.
        'n_estimators': 500,
        # Fixed seed so two trials with equal parameters give equal scores.
        'random_state': seed,
    }

    model = lgb.LGBMRegressor(**param)
    model.fit(X_train, y_train)

    valid_prediction = model.predict(X_val)
    # Take the square root explicitly instead of `squared=False`, which newer
    # scikit-learn releases no longer accept.
    rmse = float(np.sqrt(mean_squared_error(y_val, valid_prediction)))
    return rmse

# Seed the sampler (TPE is Optuna's default) so the same hyper-parameters are
# proposed on every run.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=seed),
)
# NOTE(review): only 2 trials are run - presumably a placeholder budget;
# raise `n_trials` for a meaningful search.
study.optimize(opt_lgm, n_trials=2)
params = study.best_params

In [None]:
# Best hyper-parameters found by the Optuna study.
params

In [None]:
params['n_estimators'] = 500

# Seeded so that bagging / feature sub-sampling is repeatable between runs.
model = lgb.LGBMRegressor(**{**params, 'random_state': seed})

kf = KFold(n_splits=5)

# Loop-invariant: the test features are the same for every fold.
test_features = test_df.drop(['id'], axis=1)

for fold, (train_index, val_index) in enumerate(kf.split(X)):
    # KFold yields positional indices, so select with .iloc; the previous
    # label-based .loc / [] lookups were only correct for a default RangeIndex.
    # Fold-specific names keep the global X_train / y_train (read by the
    # Optuna objective) from being overwritten.
    X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[val_index]
    y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[val_index]

    model.fit(X_fold_train, y_fold_train)
    fold_preds = model.predict(X_fold_val)

    # The reported metric is the *root* mean squared error; the explicit sqrt
    # avoids `squared=False`, which newer scikit-learn no longer accepts.
    fold_rmse = np.sqrt(mean_squared_error(y_fold_val, fold_preds))
    print('rmse:', fold_rmse)

    # One test-set prediction column per fold; they are averaged later.
    sub_df[f'pred_{fold}'] = model.predict(test_features)

In [None]:
# Inspect the submission frame, which now carries one `pred_<fold>` column per fold.
sub_df

In [None]:
# Average the per-fold test predictions into the final `target` column.
# The fold columns are selected by name: the previous "drop id and target"
# selection was empty on a second run (the columns had already been removed),
# which silently replaced `target` with NaN.
pred_cols = [c for c in sub_df.columns if str(c).startswith('pred_')]
if not pred_cols:
    raise RuntimeError(
        "sub_df has no 'pred_*' columns; run the K-fold cell before averaging"
    )

sub_df['target'] = sub_df[pred_cols].mean(axis=1)
# Keep only the columns the submission file needs.
sub_df = sub_df[['id', 'target']].copy()
sub_df.head(3)

In [None]:
# Write the submission file without the DataFrame index column.
sub_df.to_csv('submission.csv', index=False)