In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)
        
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import pandas_profiling
%matplotlib inline

import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

import sklearn.metrics as metrics

import warnings
warnings.filterwarnings("ignore")

# Data

In [None]:
# Both competition CSVs are read from the same Kaggle input directory.
train, test = (
    pd.read_csv(f'/kaggle/input/tabular-playground-series-feb-2021/{split}.csv')
    for split in ('train', 'test')
)

In [None]:
# Show the first five rows of the training frame.
train.head(n=5)

# Continuous

In [None]:
# Continuous features are the columns whose name contains "cont".
cont = list(filter(lambda name: "cont" in name, train.columns))
train_cont = train.loc[:, cont]

In [None]:
# Histogram of every continuous feature on a 3-column grid.
# The row count is derived from the number of features so the grid cannot
# overflow when more columns are present than a fixed 5x3 layout can hold.
n_cols = 3
n_rows = max(1, -(-train_cont.shape[1] // n_cols))  # ceiling division
fig = plt.figure(figsize=(18,16))

for index, col in enumerate(train_cont.columns):
    ax = fig.add_subplot(n_rows, n_cols, index + 1)
    # sns.distplot is deprecated; histplot is the replacement for plain
    # (kde-free) histograms. bins=50 keeps a resolution comparable to what
    # distplot produced on a large sample (NOTE(review): confirm visually).
    sns.histplot(train_cont[col], bins=50, kde=False, ax=ax)
fig.tight_layout(pad=1.0)

# Categorical

In [None]:
# Categorical features are the columns whose name contains "cat".
cat = [name for name in train.columns.tolist() if "cat" in name]
train_cat = train.loc[:, cat]

In [None]:
fig = plt.figure(figsize=(18,16))

# One horizontal count plot per categorical column on a fixed 5x3 grid.
for position, col in enumerate(train_cat.columns, start=1):
    ax = fig.add_subplot(5, 3, position)
    sns.countplot(y=col, data=train, ax=ax)
fig.tight_layout(pad=1.0)

## Outlier search

In [None]:
# For each continuous feature, draw three views of the same column side by
# side: a box plot, a violin plot and a strip plot.
panels = (
    ('Box Plot', sns.boxplot, {}),
    ('Violin Plot', sns.violinplot, {}),
    ('Strip Plot', sns.stripplot, {'size': 4, 'color': ".3", 'linewidth': 0}),
)

for c in train_cont.columns:
    fig, axs = plt.subplots(1, 3, figsize=(16, 5))
    for ax, (_title, plotter, extra) in zip(axs, panels):
        plotter(y=c, data=train_cont, ax=ax, **extra)

    # Figure-level title first, then the per-panel titles.
    fig.suptitle(c, fontsize=15, y=1.1)
    for ax, (title, _plotter, _extra) in zip(axs, panels):
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

# Scaling 

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply that same fitted
# scaler to both the training and the test continuous columns.
s = StandardScaler().fit(train[cont])
train_scaled = s.transform(train[cont])
test_scaled = s.transform(test[cont])

In [None]:
# Wrap the scaled arrays back into DataFrames labelled with the continuous
# column names.
X_scaled_df, X_scaled_test_df = (
    pd.DataFrame(values, columns=cont) for values in (train_scaled, test_scaled)
)

# Gaussian Mixture (Feature Engineering)

In [None]:
from sklearn.mixture import GaussianMixture

def k_selection(data, feat, max_k=8):
    """Plot BIC and AIC of 1-D Gaussian mixtures fitted to a single feature.

    For every ``k`` in ``1..max_k`` a ``GaussianMixture`` with ``k`` components
    is fitted to ``data``. The resulting information criteria are plotted
    against ``k``, saved to ``aic_bic_vs_k_plot_<feat>.png`` and displayed.

    Parameters
    ----------
    data : pandas.Series
        Values of one feature (any object exposing ``.values``).
    feat : object
        Feature label; its string form is printed and used in the file name.
    max_k : int, default 8
        Largest number of mixture components to evaluate.
    """
    name = str(feat)
    print(name)

    # Reshape once into the (n_samples, 1) layout passed to fit/bic/aic.
    X = data.values.reshape(-1, 1)
    ks = range(1, max_k + 1)

    gms_per_k = [GaussianMixture(n_components=k, n_init=7, random_state=42).fit(X)
                 for k in ks]

    bics = [model.bic(X) for model in gms_per_k]
    aics = [model.aic(X) for model in gms_per_k]

    # Size the y-axis from BOTH curves; limits taken from AIC alone can push
    # the BIC line outside the visible area.
    y_low = min(np.min(bics), np.min(aics)) - 50
    y_high = max(np.max(bics), np.max(aics)) + 50

    plt.figure(figsize=(8, 3))
    plt.plot(ks, bics, "bo-", label="BIC")
    plt.plot(ks, aics, "go--", label="AIC")
    plt.xlabel("$k$", fontsize=14)
    plt.ylabel("Information Criterion", fontsize=14)
    plt.axis([1, max_k + 1.5, y_low, y_high])

    plt.legend()
    plt.savefig(f"aic_bic_vs_k_plot_{name}.png")
    plt.show()

In [None]:
#%%time
#for col in cont:
#    k_selection(X_scaled_df[col],col)

In [None]:
from sklearn.mixture import GaussianMixture
def get_gmm_class_feature(feat, n):
    """Add a ``<feat>_class`` column holding Gaussian-mixture component labels.

    A ``GaussianMixture`` with ``n`` components is fitted on the scaled
    training values of ``feat`` (read from ``X_scaled_df``). The predicted
    component index is then written into the global ``train`` and ``test``
    frames, using ``X_scaled_test_df`` for the test values.
    """
    class_col = f'{str(feat)}_class'
    train_values = X_scaled_df[feat].values.reshape(-1, 1)

    gmm = GaussianMixture(n_components=n, random_state=42).fit(train_values)

    train[class_col] = gmm.predict(train_values)
    test_values = X_scaled_test_df[feat].values.reshape(-1, 1)
    test[class_col] = gmm.predict(test_values)

# Build an 8-component mixture feature for every continuous column.
for col in cont:
    get_gmm_class_feature(col, 8)

# Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column: the encoder is fitted on the
# training values and then applied to train first, test second.
for col in train_cat.columns:
    le = LabelEncoder().fit(train[col])
    for frame in (train, test):
        frame[col] = le.transform(frame[col])

In [None]:
# Target vector, and a feature matrix made of every column except the
# identifier and the target.
y_train = train['target']
X_train = train.drop(columns=['id', 'target'])

# Tuning

In [None]:
import optuna
from sklearn.model_selection  import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb

def objective(trial):
    """Optuna objective: mean 5-fold validation RMSE of an XGBoost regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Average RMSE over the validation folds (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 1000),
        'max_depth': trial.suggest_int('max_depth', 6, 13),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.001, 0.10),
        'subsample': trial.suggest_uniform('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.50, 1),
        # gamma is a continuous parameter: an integer suggestion over
        # [0, 0.05] can only ever produce 0, so it was never actually tuned.
        'gamma': trial.suggest_uniform('gamma', 0, 0.05),
        'objective':'reg:squarederror',
        'eval_metric' : 'rmse',
        'tree_method':'gpu_hist',
       }

    clf = xgb.XGBRegressor(**params)
    rmse_scores = []
    X_train_k = X_train.values
    y_train_k = y_train.values
    kf = KFold(n_splits=5,shuffle=True, random_state=2001)
    for train_idx, valid_idx in kf.split(X_train_k,y_train_k):
        X_fit, y_fit = X_train_k[train_idx, :], y_train_k[train_idx]
        X_val, y_val = X_train_k[valid_idx, :], y_train_k[valid_idx]

        # The validation fold is listed last in eval_set, after the training fold.
        clf.fit(X_fit, y_fit, eval_set=[(X_fit, y_fit), (X_val, y_val)], eval_metric='rmse',
            verbose=100, early_stopping_rounds=100)
        pred = clf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val,pred))
        rmse_scores.append(rmse)
    # The per-fold values are RMSE scores, not accuracies.
    print(f'Trial done: RMSE values on folds: {rmse_scores}')
    return np.average(rmse_scores)
# Only a few trials are configured to keep the runtime short; increase this
# value for a more thorough search.
n_trials = 10

# The study below only runs when this flag is set to True.
FIT_XGB = False

if FIT_XGB:
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    print(f"Number of finished trials: {len(study.trials)}")

    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

In [None]:
# Hard-coded hyper-parameter sets (presumably copied from earlier tuning
# runs - TODO confirm). Only the second one includes the fixed XGBoost
# settings for objective, metric and tree method.
best_param_1 = dict(
    n_estimators=674,
    max_depth=6,
    learning_rate=0.0834557625148096,
    subsample=0.7835176160094526,
    colsample_bytree=0.889939455423845,
    gamma=0,
)
best_param_2 = dict(
    n_estimators=442,
    max_depth=6,
    learning_rate=0.09530143944404941,
    subsample=0.8592488449341857,
    colsample_bytree=0.7800448158708977,
    gamma=0,
    objective='reg:squarederror',
    eval_metric='rmse',
    tree_method='gpu_hist',
)

# Training

In [None]:
import time

# Aliases for the training data, the feature column labels, and the empty
# containers that the cross-validation loop fills.
X, y = X_train, y_train
columns = X_train.columns
models, scores = [], []
feature_importance = pd.DataFrame()

In [None]:
# 10-fold cross-validated training: one XGBoost model is fitted per fold and
# its validation RMSE and feature importances are recorded.
folds = KFold(n_splits=10, shuffle=True, random_state=2001)
fold_importances = []
for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print(f'Fold {fold_n} started at {time.ctime()}')
    # Fold-local names: rebinding X_train / y_train here would silently
    # replace the full training set for any cell that is (re-)run afterwards.
    X_tr, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_tr, y_valid = y.iloc[train_index], y.iloc[valid_index]
    model = xgb.XGBRegressor(**best_param_2, n_jobs = -1)
    model.fit(X_tr, y_tr,
            eval_set=[(X_tr, y_tr), (X_valid, y_valid)], eval_metric='rmse',
            verbose=100, early_stopping_rounds=200)
    pred = model.predict(X_valid)
    score = np.sqrt(mean_squared_error(y_valid,pred))

    models.append(model)
    scores.append(score)

    fold_importances.append(pd.DataFrame({
        "feature": list(columns),
        "importance": model.feature_importances_,
        "fold": fold_n + 1,
    }))

# Concatenate once after the loop instead of growing the frame on every fold.
feature_importance = pd.concat([feature_importance, *fold_importances], axis=0)

# Report the collected validation scores, which were otherwise never shown.
print(f'CV RMSE: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')

In [None]:
# Keep the (at most) 50 features with the highest mean importance over folds.
cols = (
    feature_importance[["feature", "importance"]]
    .groupby("feature")
    .mean()
    .sort_values(by="importance", ascending=False)[:50]
    .index
)

best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

plt.figure(figsize=(16, 12));
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
# The importances are taken from the XGBoost fold models, so the chart is
# labelled accordingly.
plt.title('XGBoost Features (avg over folds)');

# LGBM Updating

In [None]:
# LightGBM settings: regression objective, RMSE metric, silent logging, a
# small learning rate, and GPU platform/device 0.
lgb_params = dict(
    objective="regression",
    metric="root_mean_squared_error",
    verbosity=-1,
    learning_rate=0.001,
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
)

In [None]:
#import optuna.integration.lightgbm as lgb

#lgb_data = lgb.Dataset(X, y)

#folds = KFold(n_splits=10, shuffle=True, random_state=2001)
#LGB_train = False

#if LGB_train:
#    tuner_cv = lgb.LightGBMTunerCV(lgb_params,lgb_data, num_boost_round=1000, early_stopping_rounds=100, folds=folds, verbose_eval=100)
#    tuner_cv.run()

# Submission

In [None]:
# Start from an all-zero 'target' column with one row per test sample.
prediction = pd.DataFrame({'target': np.zeros(len(test))}, columns=['target'])

In [None]:
# Average the test predictions of the selected fold models.
# NOTE(review): as before, only the last nine models are blended - confirm
# whether leaving out the first fold model is intentional.
selected_models = models[-9:]
if not selected_models:
    raise ValueError("no trained models available for prediction")

# Build the test feature matrix once, in the column order stored in `columns`.
X_test = test[columns]

# Taking the mean over the models actually used keeps the divisor correct for
# any number of folds. Assigning the result (instead of accumulating into the
# column) makes the cell safe to re-run.
prediction['target'] = np.mean(
    [m.predict(X_test) for m in selected_models], axis=0, dtype=np.float64
)

In [None]:
# Put the test ids and the averaged predictions side by side and write them
# to disk without the index column.
submission = pd.concat([test["id"], prediction['target']], axis=1)
submission.to_csv('my_submission.csv', index=False)