In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder #Encode Categorical Features
import lightgbm as lgb #Gradient Boosting Machine
import matplotlib.pyplot as plt #Visualization
import seaborn as sns #Visualization
from sklearn.model_selection import KFold #N-Fold Validation
from sklearn.metrics import mean_squared_error #Evaluation Metric
import optuna #hyperparams Tuning

In [None]:
# Load the competition's training and test tables from the Kaggle input folder
trainSet, testSet = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-feb-2021/train.csv',
        '../input/tabular-playground-series-feb-2021/test.csv',
    )
)

# Data Preprocessing

In [None]:
# Peek at the first five rows of the raw training table
trainSet.head(n=5)

In [None]:
# Histogram with a KDE overlay of the `target` column of the training set
sns.displot(trainSet, x="target", kde=True)

In [None]:
# Label-encode the categorical columns cat0 ... cat9, one encoder per column
cat_feat = ["cat" + str(idx) for idx in range(10)]

# The encoders are kept in a list (same order as cat_feat) so later cells can reuse them
labelEnc = [LabelEncoder() for _ in cat_feat]

# Fit every encoder on its training column and replace the column by the integer codes
for col, enc in zip(cat_feat, labelEnc):
    trainSet[col] = enc.fit_transform(trainSet[col])

In [None]:
# Apply the encoders that were fitted on the training columns to the test columns.
# NOTE(review): LabelEncoder.transform raises on categories it has not seen during
# fit — confirm the test set contains no category that is absent from train.
for col, enc in zip(cat_feat, labelEnc):
    testSet[col] = enc.transform(testSet[col])

In [None]:
# Natural-log transform of the continuous columns cont0 ... cont13 in both tables.
# NOTE(review): np.log yields NaN for negative inputs and -inf for zero — confirm
# that the continuous features are strictly positive, or that NaN/-inf is acceptable.
cont_var = [f"cont{idx}" for idx in range(14)]
for frame in (trainSet, testSet):
    frame[cont_var] = np.log(frame[cont_var])

The correlation matrix shows that no single feature is highly correlated with the target, so this notebook uses all of the features.

In [None]:
# Separate the target from the features; the 'id' column is not used as a feature
y = trainSet['target']
X = trainSet.drop(columns=['target', 'id'])
X_test = testSet.drop(columns='id')

In [None]:
# First five rows of the training feature matrix
X.head(n=5)

In [None]:
# First five rows of the test feature matrix
X_test.head(n=5)

# Create pseudo Label

In [None]:
# To save time the hyperparameter tuning is not re-run here; these are the best
# hyperparameters found by the Optuna tuner.
best_params = dict(
    num_iterations=979,
    learning_rate=0.04867910597290001,
    min_data_in_leaf=109,
    num_leaves=15,
    lambda_l1=19.336354545776132,
    lambda_l2=22.70600360390991,
    bagging_freq=1,
    cat_smooth=18.499097172037967,
    boosting_type='gbdt',
    metric='rmse',
    verbose=-1,
)

In [None]:
# Cross-validation bookkeeping for the first training round
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS)

lgbm_models = list()                             # one trained booster per fold
eval_results = [dict() for _ in range(N_FOLDS)]  # per-fold metric history, filled by lgb.train
rmse_score = 0                                   # running mean of the per-fold validation RMSE

In [None]:
#Train one LGBM per fold with the best parameters and predict the test set with each

import warnings
warnings.filterwarnings("ignore")

# Reset the accumulators here so that re-running this cell on its own does not
# stack scores/models on top of a previous run.
rmse_score = 0
lgbm_models = []
test_preds = []  # one Series of test-set predictions per fold

for folds, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"folds: {folds}")
    # Dedicated names for the LightGBM datasets: the raw `trainSet` DataFrame is
    # left untouched, so earlier cells stay re-runnable.
    lgb_train = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    lgb_val = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

    # NOTE(review): `evals_result` / `verbose_eval` are keyword arguments of
    # lgb.train in LightGBM 3.x; newer releases expect the equivalent callbacks
    # instead — confirm against the installed LightGBM version.
    model = lgb.train(best_params, lgb_train, valid_sets=[lgb_train, lgb_val], evals_result=eval_results[folds], verbose_eval=100)
    lgbm_models.append(model)
    y_pred = model.predict(X.iloc[val_idx])

    # RMSE = sqrt(MSE); computed once and without the `squared=` keyword, which
    # is deprecated in recent scikit-learn releases.
    fold_rmse = np.sqrt(mean_squared_error(y.iloc[val_idx], y_pred))
    rmse_score += fold_rmse / N_FOLDS

    print(fold_rmse)

    test_preds.append(pd.Series(model.predict(X_test)))

# One column of test predictions per fold, built with a single concat
y_test = pd.concat(test_preds, axis=1)

In [None]:
# Average the per-fold test predictions row-wise; the result serves as the pseudo label
y_test = y_test.mean(axis="columns")

In [None]:
# Stack the pseudo-labelled test rows underneath the real training rows.
# The original row labels are kept (no ignore_index), so rows of the combined
# data are selected by position (.iloc) in the training loop.
X_concat = pd.concat((X, X_test), axis=0)
y_concat = pd.concat((y, y_test), axis=0)

# Train using Pseudo Label

In [None]:
# Bookkeeping for the second training round (real + pseudo-labelled rows)
N_FOLDS = 5
rmse_score = 0                                   # running mean of the per-fold validation RMSE
lgbm_models = list()                             # one trained booster per fold
eval_results = [dict() for _ in range(N_FOLDS)]  # per-fold metric history, filled by lgb.train

# NOTE(review): kf2 makes 10 shuffled splits while N_FOLDS is 5 — confirm which
# fold count is intended for this round.
kf2 = KFold(n_splits=10, random_state=5473, shuffle=True)

In [None]:
#Train our LGBM on the real + pseudo-labelled rows using the best parameter

import warnings
warnings.filterwarnings("ignore")

# Use the shuffled splitter defined for this section (kf2). The first-round
# splitter `kf` makes contiguous, unshuffled folds; on the stacked [train; test]
# data that leaves some folds validating on pseudo-labelled rows only.
# All per-fold bookkeeping is sized from the splitter itself so the fold count
# cannot drift out of sync, and it is reset here so the cell is safe to re-run.
n_splits = kf2.get_n_splits()
rmse_score = 0
lgbm_models = []
eval_results = [{} for _ in range(n_splits)]
test_preds = []  # one Series of test-set predictions per fold

for folds, (train_idx, val_idx) in enumerate(kf2.split(X_concat, y_concat)):
    print(f"folds: {folds}")
    # Dedicated names for the LightGBM datasets: the raw `trainSet` DataFrame is
    # left untouched, so earlier cells stay re-runnable.
    lgb_train = lgb.Dataset(X_concat.iloc[train_idx], y_concat.iloc[train_idx])
    lgb_val = lgb.Dataset(X_concat.iloc[val_idx], y_concat.iloc[val_idx])

    # NOTE(review): `evals_result` / `verbose_eval` are keyword arguments of
    # lgb.train in LightGBM 3.x; newer releases expect the equivalent callbacks
    # instead — confirm against the installed LightGBM version.
    model = lgb.train(best_params, lgb_train, valid_sets=[lgb_train, lgb_val], evals_result=eval_results[folds], verbose_eval=100)
    lgbm_models.append(model)
    y_pred = model.predict(X_concat.iloc[val_idx])

    # RMSE = sqrt(MSE); computed once and without the `squared=` keyword, which
    # is deprecated in recent scikit-learn releases.
    # The validation rows include pseudo-labelled test rows, so this score is
    # partly measured against model-generated targets.
    fold_rmse = np.sqrt(mean_squared_error(y_concat.iloc[val_idx], y_pred))
    rmse_score += fold_rmse / n_splits

    print(fold_rmse)

    test_preds.append(pd.Series(model.predict(X_test)))

# One column of test predictions per fold, built with a single concat
y_test = pd.concat(test_preds, axis=1)

In [None]:
# Average the per-fold test predictions row-wise into one prediction per test row
y_test = y_test.mean(axis="columns")

In [None]:
# Plot the recorded metric per boosting iteration for the 5th fold (index 4)
lgb.plot_metric(booster=eval_results[4])

In [None]:
# Feature importance of the model trained on the 5th fold (index 4)
lgb.plot_importance(booster=lgbm_models[4])

# Predict the Test Set

In [None]:
# Keep the test ids for the submission file and remove the column from testSet.
# The guard makes the cell safe to re-run: once 'id' has been dropped, the ids
# saved by the first run are kept instead of failing on the missing column.
# (`id` shadows the Python builtin; the name is kept because the submission
# cell reads it.)
if 'id' in testSet.columns:
    id = testSet['id']
    testSet = testSet.drop(columns='id')

# Create Submission File as in sample_submission.csv

In [None]:
# Put the ids next to the averaged predictions; `keys` names the two columns
submFile = pd.concat([id, y_test], axis=1, keys=['id', 'target'])

In [None]:
# Write the submission table to disk without the DataFrame index
submFile.to_csv(path_or_buf='submFile.csv', index=False)