In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import scipy as sp

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, GridSearchCV

import lightgbm as lgbm
from xgboost import XGBRegressor

In [None]:
sample_sub = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv")
train = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/test.csv")

In [None]:
# Save the id column for the submission file, then remove it from both frames.
# Guarded and non-mutating so the cell can be re-run: once "id" has been
# removed, an unguarded test['id'] or drop("id") would raise KeyError.
if "id" in test.columns:
    id_col = test["id"].copy()
train = train.drop(columns="id", errors="ignore")
test = test.drop(columns="id", errors="ignore")

In [None]:
train.head()

In [None]:
# Split the column names by prefix: 'co...' -> continuous, 'ca...' -> categorical.
num_feat = list(filter(lambda name: name.startswith('co'), train.columns))
cat_feat = list(filter(lambda name: name.startswith('ca'), train.columns))

## Numerical Features

In [None]:
def create_num_subplots(df, feat):
    """Draw one histogram per column of ``feat`` on a 3-column grid.

    Parameters
    ----------
    df : DataFrame
        Frame holding the columns named in ``feat``.
    feat : sequence of str
        Column names to plot, one subplot each.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    n_cols = 3
    # Keep the 12-row grid for short feature lists, but grow it when there
    # are more than 36 features so the subplot index never runs out of range.
    n_rows = max(12, -(-len(feat) // n_cols))

    fig = plt.figure(figsize=(14, 20))
    for i, col in enumerate(feat):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        sns.histplot(x=col, data=df, ax=ax)

    # The layout only needs to be solved once, after every axes exists.
    fig.tight_layout()
    plt.show()

In [None]:
create_num_subplots(train, num_feat)

As we can see, the distributions of our numeric features are not Gaussian.

In [None]:
#x = np.cumsum(train['cont0'])
# Raw values of the first continuous feature, plotted in row order.
x = train['cont0']
plt.plot(x);  # trailing ';' hides the Line2D repr in the cell output

In [None]:
# Rank-based Gaussianisation of `x`: ranks -> open interval (-1, 1) -> arctanh.
ranks = sp.stats.rankdata(x)
scaled_ranks = ranks / (len(x) + 1) * 2 - 1
x1 = np.arctanh(scaled_ranks)

In [None]:
# Compare cont0 before and after the rank/arctanh transform:
# top row = values in row order, bottom row = 40-bin histograms.
fig, ax = plt.subplots(2, 2, figsize=(12, 8))

panels = [(train['cont0'], "Original Data"), (x1, "Transformed Data")]
for column, (values, panel_title) in enumerate(panels):
    ax[0, column].plot(values)
    ax[0, column].set_title(panel_title)

    ax[1, column].hist(values, bins=40)
    ax[1, column].set_title(panel_title)

plt.show()

In [None]:
# Monotonic but non-linear relation: both the ranking step and arctanh preserve
# the ordering of the original values, so the curve never decreases.
plt.plot(x, x1, 's')
plt.xlabel("orginal")
plt.ylabel("transformed");

In [None]:
def gaussian_transformer(df):
    """Map every column of ``df`` to an approximately Gaussian shape.

    Each column is replaced by ``arctanh(2 * rank / (n + 1) - 1)``, where
    ``rank`` is the average rank of the value within its column
    (``scipy.stats.rankdata``) and ``n`` is the number of rows of ``df``.

    Parameters
    ----------
    df : DataFrame
        Numeric columns to transform. The frame is not modified.

    Returns
    -------
    DataFrame
        Same column names and index as ``df``, holding the transformed values.
    """
    # Local import: `sp.stats` only resolves once the scipy.stats submodule
    # has been loaded somewhere, which a bare `import scipy as sp` does not
    # guarantee.
    from scipy.stats import rankdata

    # Normalise by the size of the frame being transformed. Relying on a
    # notebook-level variable here would scale a test frame by the train size.
    n_rows = len(df)

    transformed = {}
    for col in df.columns:
        # Dividing by n + 1 keeps the scaled ranks strictly inside (-1, 1),
        # so arctanh never returns +/-inf.
        scaled = rankdata(df[col]) / (n_rows + 1) * 2 - 1
        transformed[col] = np.arctanh(scaled)

    return pd.DataFrame(transformed, index=df.index, columns=df.columns)

In [None]:
gaussian_num_df = gaussian_transformer(train[num_feat])

In [None]:
gaussian_num_df

In [None]:
create_num_subplots(gaussian_num_df, num_feat)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale every transformed column to [-1, 1]. The scaler is fitted here on the
# transformed training features and overwrites them in place.
sc = MinMaxScaler(feature_range=(-1,1))
gaussian_num_df[num_feat] = sc.fit_transform(gaussian_num_df[num_feat])

In [None]:
create_num_subplots(gaussian_num_df, num_feat)

**Test set**

In [None]:
# Apply the rank/arctanh transform to the test features (ranks are computed
# within the test frame itself), then reuse the already-fitted scaler `sc`
# via transform() so no statistics are re-estimated on the test set.
gausian_test = gaussian_transformer(test[num_feat])
gausian_test[num_feat] = sc.transform(gausian_test[num_feat])

## Categorical features

In [None]:
# Compare category levels per column in BOTH directions and name the column.
# Levels that occur only in the test set are the ones an encoder fitted on
# train has never seen, so they matter at least as much as train-only levels.
for col in cat_feat:
    train_levels = set(train[col])
    test_levels = set(test[col])
    print(f"{col}: only in train = {train_levels - test_levels}, "
          f"only in test = {test_levels - train_levels}")

In [None]:
def create_cat_subplots(df, feat):
    """Draw one count plot per column of ``feat`` on a 3-column grid.

    Parameters
    ----------
    df : DataFrame
        Frame holding the columns named in ``feat``.
    feat : sequence of str
        Column names to plot, one subplot each.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    n_cols = 3
    # Keep the 10-row grid for short feature lists, but grow it when there
    # are more than 30 features so the subplot index never runs out of range.
    n_rows = max(10, -(-len(feat) // n_cols))

    fig = plt.figure(figsize=(14, 20))
    for i, col in enumerate(feat):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        sns.countplot(x=df[col], ax=ax)

    # The layout only needs to be solved once, after every axes exists.
    fig.tight_layout()
    plt.show()

In [None]:
create_cat_subplots(train, cat_feat)

In [None]:
# Number of distinct levels in each categorical column, one row per column.
level_counts = train[cat_feat].nunique()
cord_df = level_counts.to_frame(name='cartinality')

In [None]:
cord_df

In [None]:
train_cat_df = train[cat_feat].copy()
test_cat = test[cat_feat].copy()

In [None]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [None]:
lb = LabelBinarizer()

In [None]:
# Integer-encode every categorical column, learning the levels from train only.
# LabelBinarizer is a one-vs-all encoder: for a column with more than two
# levels it returns an (n_samples, n_levels) indicator matrix, which cannot be
# stored in a single DataFrame column. A per-column LabelEncoder yields exactly
# one integer code per row (identical to the binarizer for two-level columns).
# Note: LabelEncoder.transform raises ValueError for a level unseen in train.
for col in cat_feat:
    encoder = LabelEncoder()
    train_cat_df[col] = encoder.fit_transform(train_cat_df[col])
    test_cat[col] = encoder.transform(test_cat[col])

In [None]:
train_cat_df

In [None]:
create_cat_subplots(train_cat_df, cat_feat)

## Modeling

In [None]:
# First create new dataframe with transformed data
new_train = pd.concat([train_cat_df, gaussian_num_df], axis=1)
new_train.head()

In [None]:
X = new_train
y = train['target']

In [None]:
# Pairwise correlations of all model inputs plus the target.
new_X = pd.concat([X, y], axis=1)
corr_map = new_X.corr()

# Hide the redundant upper triangle (diagonal included).
mask = np.triu(np.ones_like(corr_map, dtype=bool))

# Diverging palette so positive and negative correlations get distinct hues.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(11, 9))

heatmap_options = dict(mask=mask, cmap=cmap, vmax=.3, center=0,
                       square=True, linewidths=.5, cbar_kws={"shrink": .5})
sns.heatmap(corr_map, **heatmap_options)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
rmse_score = []

# Settings shared by both training stages. They are defined once so the two
# stages cannot drift apart: a misspelt key such as "labmda_l2" is not a
# LightGBM parameter, which leaves lambda_l2 at its default of 0.
base_params = {"objective": "regression",
               "metric": "rmse",
               "verbosity": -1,
               "boosting_type": "gbdt",
               "feature_fraction": 0.5,
               "max_depth": 10,
               "lambda_l1": 2,
               "lambda_l2": 2,
               "min_child_samples": 50,
               "bagging_fraction": 0.7,
               "bagging_freq": 1,
               "max_bin": 80}

# Stage 1: coarse fit with moderate tree size and learning rate.
params = {**base_params, "num_leaves": 60, "learning_rate": 0.01}
lgb_train = lgbm.Dataset(X_train, y_train)
lgb_val = lgbm.Dataset(X_val, y_val)
gbm = lgbm.train(params,
                 lgb_train,
                 valid_sets=[lgb_train, lgb_val],
                 num_boost_round=10000,
                 verbose_eval=100,
                 early_stopping_rounds=100,
                 )

# Stage 2 (extra boosting): continue from the stage-1 booster with larger
# trees and a smaller learning rate, on freshly built Dataset objects.
params = {**base_params, "num_leaves": 200, "learning_rate": 0.003}
lgb_train = lgbm.Dataset(X_train, y_train)
lgb_val = lgbm.Dataset(X_val, y_val)
gbm = lgbm.train(params,
                 lgb_train,
                 valid_sets=[lgb_train, lgb_val],
                 verbose_eval=100,
                 num_boost_round=10000,
                 early_stopping_rounds=100,
                 init_model=gbm)

# Hold-out RMSE of the final booster. The same split also drove early
# stopping, so this estimate is on the optimistic side.
y_pred = gbm.predict(X_val)
rmse_score.append(np.sqrt(mean_squared_error(y_val, y_pred)))

In [None]:
rmse_score

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=1)
oof = np.zeros(len(X))
score_list = []

# Settings shared by both training stages; only the learning rate, the round
# budget and the seed vary. "is_unbalance" is intentionally absent: LightGBM
# uses it only for binary / multiclassova objectives, so it has no effect on
# a regression objective.
base_params = {"objective": "regression",
               "metric": "rmse",
               "verbosity": -1,
               "boosting_type": "gbdt",
               "feature_fraction": 0.5,
               "max_depth": 10,
               "num_leaves": 120,
               "lambda_l1": 2,
               "lambda_l2": 2,
               "min_child_samples": 50,
               "max_bin": 80}

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    # One prediction vector per seed; they are averaged below.
    y_pred_list = []
    for seed in [1]:
        print(seed)

        # Stage 1: main fit, stopped early on the validation fold.
        params = {**base_params, "learning_rate": 0.01, "seed": seed}
        dtrain = lgbm.Dataset(X_train, y_train)
        dvalid = lgbm.Dataset(X_val, y_val)
        model = lgbm.train(params,
                           dtrain,
                           valid_sets=[dtrain, dvalid],
                           verbose_eval=100,
                           num_boost_round=100000,
                           early_stopping_rounds=100)

        # Stage 2 (extra boosting): continue from the stage-1 booster on
        # freshly built Dataset objects with a different learning rate.
        params = {**base_params, "learning_rate": 0.03, "seed": seed}
        dtrain = lgbm.Dataset(X_train, y_train)
        dvalid = lgbm.Dataset(X_val, y_val)
        model = lgbm.train(params,
                           dtrain,
                           valid_sets=[dtrain, dvalid],
                           verbose_eval=100,
                           num_boost_round=1000,
                           early_stopping_rounds=100,
                           init_model=model)

        y_pred_list.append(model.predict(X_val))

    # Out-of-fold predictions for this fold = mean over seeds.
    oof[test_index] = np.mean(y_pred_list, axis=0)
    score = np.sqrt(mean_squared_error(y_val, oof[test_index]))
    score_list.append(score)
    print(f"RMSE Fold-{fold} : {score}")

np.mean(score_list)

In [None]:
print(score_list)
print(np.mean(score_list))

## Optuna to the rescue

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: hold-out RMSE of a LightGBM regressor.

    Samples one set of LightGBM hyper-parameters from ``trial``, trains a
    booster on 90% of the notebook-level ``X``/``y`` (fixed split,
    ``random_state=45``) and scores it on the remaining 10%.

    Parameters
    ----------
    trial : optuna trial
        Used only through its ``suggest_*`` methods.

    Returns
    -------
    float
        Root mean squared error on the hold-out part (lower is better).
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=45)
    dtrain = lgbm.Dataset(X_train, label=y_train)

    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int("max_depth", 1, 100),
        # LightGBM requires max_bin > 1, so the search starts at 2.
        'max_bin': trial.suggest_int('max_bin', 2, 255)
    }

    gbm = lgbm.train(param, dtrain)
    preds = gbm.predict(X_val)

    # Score the raw predictions. Rounding them to integers first (a step that
    # belongs to classification) would distort the error of a continuous
    # target and make trials hard to tell apart.
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    return rmse

# Run 100 trials, minimising the value returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

finished_trials = len(study.trials)
best_params = study.best_trial.params
print("Number of finished trials: ", finished_trials)
print("Best trial: ", best_params)

In [None]:
study.best_trial.params

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=45)
oof = np.zeros(len(X))
score_list = []
# Left empty in this cell: the test matrix `new_test` is only built in a
# later cell, so test predictions cannot be collected here.
test_preds = []

# Tuned settings shared by both training stages; only the learning rate, the
# round budget and the seed vary. "is_unbalance" is intentionally absent:
# LightGBM uses it only for binary / multiclassova objectives, so it has no
# effect on a regression objective.
base_params = {"objective": "regression",
               "metric": "rmse",
               "verbosity": -1,
               "boosting_type": "gbdt",
               "feature_fraction": 0.44071200607037225,
               "max_depth": 90,
               "num_leaves": 192,
               "lambda_l1": 1.264462581934323,
               "lambda_l2": 0.024398254447942604,
               "min_child_samples": 91,
               "bagging_fraction": 0.9326330031125017,
               "bagging_freq": 3,
               "max_bin": 145}

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    # One prediction vector per seed; they are averaged below.
    y_pred_list = []
    for seed in [1]:
        print(seed)

        # Stage 1: main fit, stopped early on the validation fold.
        params = {**base_params, "learning_rate": 0.01, "seed": seed}
        dtrain = lgbm.Dataset(X_train, y_train)
        dvalid = lgbm.Dataset(X_val, y_val)
        model = lgbm.train(params,
                           dtrain,
                           valid_sets=[dtrain, dvalid],
                           verbose_eval=100,
                           num_boost_round=2500,
                           early_stopping_rounds=100)

        # Stage 2 (extra boosting): continue from the stage-1 booster on
        # freshly built Dataset objects with a much smaller learning rate.
        params = {**base_params, "learning_rate": 0.0001, "seed": seed}
        dtrain = lgbm.Dataset(X_train, y_train)
        dvalid = lgbm.Dataset(X_val, y_val)
        model = lgbm.train(params,
                           dtrain,
                           valid_sets=[dtrain, dvalid],
                           verbose_eval=100,
                           num_boost_round=1500,
                           early_stopping_rounds=100,
                           init_model=model)

        y_pred_list.append(model.predict(X_val))

    # Out-of-fold predictions for this fold = mean over seeds.
    oof[test_index] = np.mean(y_pred_list, axis=0)
    score = np.sqrt(mean_squared_error(y_val, oof[test_index]))
    score_list.append(score)
    print(f"RMSE Fold-{fold} : {score}")

print(score_list)
print(np.mean(score_list))

In [None]:
print(score_list)
print(np.mean(score_list))

## Submission

In [None]:
new_test = pd.concat([test_cat, gausian_test], axis=1)

In [None]:
# `model` is whatever booster the last training cell left bound (the booster
# of the final cross-validation fold), not an average over folds.
preds = model.predict(new_test)

# Two-column submission frame: the saved ids next to the predictions.
sub = id_col.to_frame(name="id").assign(target=preds)

sub.to_csv("sub3_with_optuna.csv", index=False)

In [None]:
import joblib

In [None]:
joblib.dump(model, "lgbm_optuna_model.joblib")

In [None]:
loaded_model = joblib.load("./lgbm_optuna_model.joblib")

## GridSearchCV

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
from sklearn.model_selection import GridSearchCV

def model_gridsearchCV(algo, param, name, scoring="r2", cv=3):
    """Grid-search ``algo`` over ``param`` and score the result on the hold-out split.

    The search is fitted on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on the notebook-level ``X_val`` / ``y_val``.

    Parameters
    ----------
    algo : callable
        Estimator class (or zero-argument factory); called as ``algo()``.
    param : dict
        Parameter grid handed to ``GridSearchCV``.
    name : str
        Label used in the printed report.
    scoring : str, default "r2"
        Metric ``GridSearchCV`` uses to rank candidates.
    cv : int, default 3
        Number of cross-validation folds used inside the search.

    Returns
    -------
    tuple
        ``(mae, rmse, r2score, y_pred, model_grid)``: the three hold-out
        metrics, the hold-out predictions and the fitted ``GridSearchCV``.
    """
    # Instantiate the base model with its default settings.
    model = algo()

    # Exhaustive search over the grid, using every available core.
    model_grid = GridSearchCV(model,
                              param,
                              scoring=scoring,
                              verbose=2,
                              n_jobs=-1,
                              cv=cv)
    model_grid.fit(X_train, y_train)

    # predict() on the search object delegates to the refitted best estimator.
    y_pred = model_grid.predict(X_val)

    # Hold-out metrics.
    mae = mean_absolute_error(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2score = r2_score(y_val, y_pred)

    print(f"**{name} with GridSearchCV**")
    print(f"MAE: {mae}")
    print(f"RMSE: {rmse}")
    # R-squared is a plain ratio, not a percentage, so no '%' suffix.
    print(f"R-squared: {r2score:.2f}")

    return mae, rmse, r2score, y_pred, model_grid

In [None]:
# Grid for GradientBoostingRegressor: 3 losses x 1 learning rate x 3 subsample
# ratios x 3 depths = 27 candidates.
# NOTE(review): "ls" is the least-squares loss name in older scikit-learn
# releases; newer ones call it "squared_error" - confirm against the installed version.
param_grid = {"loss":["ls","huber","quantile"],
              "learning_rate": [ 0.01],
              "subsample": [0.5, 0.2, 0.1],
              "max_depth": [3,6,8]}

# The hold-out predictions (4th return value) are discarded with `_`.
gbr_grid_mae, gbr_grid_rmse, gbr_grid_r2, _ , gbr_grid = model_gridsearchCV(GradientBoostingRegressor, 
                                                                            param_grid,
                                                                            "GradientBoostingRegressor")

In [None]:
joblib.dump(gbr_grid, "GradientBoostingRegressor_model.joblib")

In [None]:
from sklearn.svm import SVR

In [None]:
# Grid for SVR: 2 kernels x 2 gamma settings x 3 C values x 2 epsilon values
# = 24 candidates.
# NOTE(review): kernel SVR is presumably very slow on a training split of this
# size - confirm the run time is acceptable before launching.
param_grid = {"kernel":["linear","rbf",],
              "gamma": ["scale","auto"],
              "C": [0.1, 0.5, 10],
              "epsilon": [0.1, 0.01]}

svr_grid_mae, svr_grid_rmse, svr_grid_r2, svr_grid_y_pred, svr_grid_model = model_gridsearchCV(SVR,
                                                                                param_grid,
                                                                               "SVR")

In [None]:
# Persist the SVR search result (not the gradient-boosting one) under the SVR filename.
joblib.dump(svr_grid_model, "SVR_model.joblib")

In [None]:
# Grid for XGBRegressor: 1 learning rate x 3 depths x 3 min_child_weight values
# x 3 column-sampling ratios = 27 candidates.
param_grid = {"learning_rate":[0.01],
              "max_depth":[3,4,8],
              "min_child_weight":[3,5,7],
              "colsample_bytree":[0.3, 0.5, 0.7]}

# The hold-out predictions (4th return value) are discarded with `_`.
xboost_gr_mae, xboost_gr_rmse, xboost_gr_r2, _ , xboost_gr_model = model_gridsearchCV(XGBRegressor,
                                                                                      param_grid,
                                                                                      "XGBoost")

In [None]:
xboost_gr_model.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.01,
 'max_depth': 8,
 'min_child_weight': 7}

In [None]:
joblib.dump(xboost_gr_model, "XGBoost_model.joblib")

## Keras

In [None]:
X = new_train.values
y = train['target'].values

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import LeakyReLU
from keras.optimizers import Adam, RMSprop, SGD

In [None]:
X.shape, y.shape

In [None]:
def evaluate_model(model):
    """Print the RMSE of ``model`` on the notebook-level validation split.

    Reads the globals ``X_val`` and ``y_val``; prints the score and returns
    nothing.
    """
    predictions = model.predict(X_val)
    mse = mean_squared_error(y_val, predictions)
    print(f"RMSE: {np.sqrt(mse)}")

In [None]:
def create_ann():
    """Build and compile a small fully connected regression network.

    Architecture: Dense(14, relu) -> Dense(24, relu) -> Dropout(0.1)
    -> Dense(14, relu) -> Dense(1). The input width is inferred on the first
    call to ``fit``/``predict``.

    Returns
    -------
    Sequential
        A freshly initialised model compiled with Adam (learning rate 0.001)
        and mean-squared-error loss, reporting ``mse`` as a metric.
    """
    model = Sequential()
    # Hidden layers
    model.add(Dense(14, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(14, activation='relu'))
    # Single linear output unit for regression
    model.add(Dense(1))

    # The loss is given by its string name, so the function does not depend on
    # a top-level `tensorflow` name (only submodules are imported in this
    # notebook). `learning_rate` is the current keyword; `lr` is deprecated.
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='mean_squared_error',
                  metrics=['mse'])

    return model


In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=45)
oof = np.zeros(len(X))
score_list = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    X_train, X_val = X[train_idx], X[test_idx]
    y_train, y_val = y[train_idx], y[test_idx]

    # Stop once the validation loss has not improved for 3 epochs.
    early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

    # A fresh network per fold so no weights leak between folds.
    ann_model = create_ann()
    ann_model.fit(X_train,
                  y_train,
                  validation_data=(X_val, y_val),
                  batch_size=32, epochs=30,
                  callbacks=[early_stop])

    # Use this fold's predictions only. Accumulating predictions across folds
    # and averaging them would mix values that belong to different rows.
    # predict() returns shape (n_val, 1); flatten it to fit the 1-D oof slice.
    oof[test_idx] = ann_model.predict(X_val).reshape(-1)
    score = np.sqrt(mean_squared_error(y_val, oof[test_idx]))
    score_list.append(score)
    print(f"RMSE fold -{fold} : {score}")

print(f"RMSE mean 5 folds: {np.mean(score_list)}")

In [None]:
ann_model.save("ANN_model.h5")