In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory holding the competition files; change this single constant to run
# the notebook against a different location.
DATA_DIR = "/kaggle/input/tabular-playground-series-feb-2021"

# Training data, test data and the sample submission template.
df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")
df_sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
!pip install catboost
!pip install shap
!pip install featuretools

In [None]:
import numpy as np
import pandas as pd
import random
# Seed Python's and NumPy's global random generators so runs are repeatable.
seed = 44  
random.seed(seed)
np.random.seed(seed)

# Import the libraries
import numpy as np 
import pandas as pd 
import re
import sys, gc, os
from IPython.display import display

from scipy import stats

import shap
shap.initjs()
import featuretools as ft

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, Normalizer, MaxAbsScaler
from sklearn.preprocessing import (StandardScaler, PowerTransformer, QuantileTransformer ,LabelEncoder, 
                                   OneHotEncoder, OrdinalEncoder)
import catboost as cb
import lightgbm as lgb
from xgboost import XGBRegressor

In [None]:
# Keep the test-set ids as a plain Python list.
test_ids = df_test["id"].tolist()

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Preview the first rows of the sample submission.
df_sub.head()

In [None]:
# (rows, columns) of the train, test and sample-submission frames, in that order.
tuple(frame.shape for frame in (df_train, df_test, df_sub))

In [None]:
# Total number of missing cells in the training frame.
# (isnull().value_counts().sum() would just add up the row counts, i.e. return len(df_train).)
df_train.isnull().sum().sum()

In [None]:
# Total number of missing cells in the test frame.
# (isnull().value_counts().sum() would just add up the row counts, i.e. return len(df_test).)
df_test.isnull().sum().sum()

# Checking for categorical features

In [None]:
# Names of the columns whose dtype is "object".
is_object_column = (df_train.dtypes == "object").to_numpy()
categorical_feats = df_train.columns[is_object_column]
categorical_feats

# Checking for numerical features

In [None]:
# Names of the columns whose dtype is anything other than "object".
is_non_object_column = (df_train.dtypes != "object").to_numpy()
numerical_feats = df_train.columns[is_non_object_column]
numerical_feats

# Exploring categorical features

In [None]:
# Remove the label column from the training frame and keep it as its own Series.
target = df_train.pop('target')

In [None]:
# Object-dtype (categorical) column names as a plain list.
CAT = list(df_train.select_dtypes(include='object').columns)

In [None]:
# One horizontal bar chart of value frequencies per categorical feature,
# laid out on a 5x2 grid that shares the x axis.
f, axes = plt.subplots(5, 2, sharex=True, figsize=(12,14))
plt.suptitle('Categorical features distribution', size=16, y=(0.94))

# Pair each axis with a feature name (row-major order). zip stops at the
# shorter sequence, so fewer than ten categorical columns no longer raises an
# IndexError the way a manual counter over fixed 5x2 loops would.
for ax, feature in zip(axes.flat, CAT):
    counts = df_train[feature].value_counts()
    sns.barplot(x=counts.values, y=counts.index, palette='deep', ax=ax)
    ax.set_title(feature)

# Exploring continuous features

In [None]:
# float64 (continuous) column names as a plain list.
NUM = list(df_train.select_dtypes('float64').columns)

In [None]:
# Violin plot of every continuous feature on a single wide axis.
fig, ax = plt.subplots(figsize=(16, 5))
sns.violinplot(data=df_train[NUM], color='slategray', ax=ax)
ax.set_title('Continuous features distribution');


In [None]:
# Correlation matrix of the continuous features together with the target.
fig, ax = plt.subplots(figsize=(10, 10))
corr_matrix = df_train[NUM].join(target).corr()
sns.heatmap(corr_matrix, square=True, linewidths=0.7, cmap="bone_r", ax=ax);

# Exploring target distribution

In [None]:
# Histogram of the target values.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(target, color='slategray', stat='frequency', ax=ax);

Let's see how many target values fall at or below 4.3.

In [None]:
# Number of rows whose target is at or below 4.3.
int((target <= 4.3).sum())

**We have about 89 outliers.**

In [None]:
# Index labels of the rows whose target is at or below 4.3.
low_target_mask = (target <= 4.3).to_numpy()
to_drop = target.index[low_target_mask]

# Remove those rows, in place, from both the target and the feature frame.
for frame in (target, df_train):
    frame.drop(to_drop, inplace=True)

# Re-plot the target histogram after the removal.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(target, color='slategray', stat='frequency', ax=ax);

Looking much better now.

# Categorical features encoding

In [None]:
# Every categorical column except 'cat6'.
# A list comprehension keeps the original column order; going through set()
# made the order (and so the order of the dummy columns) vary between runs.
CAT_01 = [name for name in CAT if name != 'cat6']
CAT_01

# Simple one-hot encoding

In [None]:
# One-hot encode the selected categorical columns, separately for train and test.
dummies_train, dummies_test = (
    pd.get_dummies(frame[CAT_01]) for frame in (df_train, df_test)
)

# Continuous columns followed by the dummy columns.
train, test = df_train[NUM].join(dummies_train), df_test[NUM].join(dummies_test)

In [None]:
# Columns present in the encoded train frame but missing from the encoded test frame.
set(train.columns) - set(test.columns)

In [None]:
#train.shape[1], test.shape[1]

In [None]:
# Preview the one-hot encoded training frame.
train.head()

In [None]:
# Start again from the raw frames (this replaces the one-hot version built
# above), leaving out the 'id' column; drop() returns new frames, so
# df_train / df_test stay untouched.
train = df_train.drop(columns=['id'])
test = df_test.drop(columns=['id'])

tuple(frame.shape for frame in (train, test))

# Feature engineering

In [None]:
# Random state passed to the QuantileTransformer and to train_test_split below.
SEED = 44

In [None]:
# Encode the categorical columns as ordinal integers and pass the continuous
# columns through a quantile transform.
# Columns are selected by name rather than by positional slices, so the result
# no longer depends on the categorical columns happening to come first, and the
# previous slice bound that overshot the frame width (len(CAT) + len(NUM) + 5)
# is gone.
# The ColumnTransformer emits its outputs in transformer order: CAT, then NUM.
cols = CAT + NUM

ct = ColumnTransformer([('ordinal', OrdinalEncoder(), CAT),
                        ('quantile', QuantileTransformer(random_state=SEED, n_quantiles=1500), NUM)])

# Fit on train only; test is transformed with the statistics learned from train.
# NOTE(review): OrdinalEncoder raises on categories unseen during fit - confirm
# the test set holds no categories missing from the filtered train set.
train = pd.DataFrame(ct.fit_transform(train), columns=cols)
test = pd.DataFrame(ct.transform(test), columns=cols)

# Scale the ordinal codes down by a factor of 10.
train[CAT] = train[CAT] / 10
test[CAT] = test[CAT] / 10

# Manual feature engineering

In [None]:
def feat_eng(df):
    """Add seven hand-picked product features to ``df`` and return it.

    Each new column ``cont001`` .. ``cont007`` is the element-wise product of
    two existing ``cont*`` columns. The frame is modified in place and the same
    object is returned.
    """
    # (new column, left factor, right factor)
    interactions = (
        ('cont001', 'cont8', 'cont0'),
        ('cont002', 'cont9', 'cont0'),
        ('cont003', 'cont9', 'cont5'),
        ('cont004', 'cont8', 'cont5'),
        ('cont005', 'cont2', 'cont4'),
        ('cont006', 'cont1', 'cont3'),
        ('cont007', 'cont13', 'cont1'),
    )
    for new_col, left, right in interactions:
        df[new_col] = df[left] * df[right]

    return df

In [None]:
# Add the manual product features to both frames.
train, test = feat_eng(train), feat_eng(test)

tuple(frame.shape for frame in (train, test))

# FeatureTools

In [None]:
# Continuous columns that feat_eng_01 hands to featuretools.
to_transform = ['cont0', 'cont1', 'cont4', 'cont5', 'cont8', 'cont9', 'cont12']

In [None]:
def feat_eng_01(df, cols=None):
    """Generate automated features from a subset of columns with featuretools.

    Runs deep feature synthesis with the ``multiply_numeric`` transform
    primitive over the selected columns and returns only the generated columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to combine.
    cols : list of str, optional
        Columns handed to featuretools. Defaults to the module-level
        ``to_transform`` list.

    Returns
    -------
    pandas.DataFrame
        The generated feature columns on a fresh 0..n-1 index; the input
        columns themselves are dropped from the result.
    """
    original_cols = list(to_transform if cols is None else cols)

    es = ft.EntitySet(id = 'data')

    # The selected columns do not include an 'id' column, so request its
    # creation explicitly with make_index=True rather than naming an index that
    # is not in the frame.
    es = es.entity_from_dataframe(entity_id = 'data',
                                  dataframe = df[original_cols],
                                  index = 'id',
                                  make_index = True,
                                  time_index = None)

    # ft.dfs returns (feature matrix, feature definitions); only the matrix is used.
    new_features, _ = ft.dfs(entityset = es, target_entity = 'data',
                             trans_primitives = ['multiply_numeric'])

    # Discard the featuretools index and keep only the newly created columns.
    new_features.reset_index(drop=True, inplace=True)
    new_features.drop(original_cols, axis=1, inplace=True)

    return new_features

In [None]:
# Generate the featuretools columns for both frames.
train_fe, test_fe = feat_eng_01(train), feat_eng_01(test)

# Give the generated frames the index of their source frame so the join lines up row by row.
train_fe.index, test_fe.index = train.index, test.index

train, test = train.join(train_fe), test.join(test_fe)

tuple(frame.shape for frame in (train, test))

# Splitting train and validation sets

In [None]:
# Hold out 15% of the training rows for validation.
holdout_parts = train_test_split(train, target, test_size=0.15, random_state=SEED)
X_train, X_valid, y_train, y_valid = holdout_parts

(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

# MODEL: LightGBM regressor

In [None]:
# Wrap the hold-out split in LightGBM datasets.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_valid, label=y_valid)

# Booster settings; the cross-validation loop further down reuses this dict.
param = dict(
    objective='regression',
    boosting='gbdt',
    metric='rmse',
    learning_rate=0.05,
    num_iterations=7500,
    max_depth=-1,
    min_data_in_leaf=15,
    bagging_fraction=0.8,
    bagging_freq=1,
    feature_fraction=0.8,
)

# Train with early stopping monitored on the validation dataset,
# logging the metric every 100 rounds.
clf = lgb.train(params=param,
                train_set=train_data,
                valid_sets=[test_data],
                early_stopping_rounds=100,
                verbose_eval=100)

# Predictions for the validation rows.
y_pred = clf.predict(X_valid)

In [None]:
# Show the validation predictions.
y_pred

In [None]:
# Root mean squared error on the validation rows.
valid_mse = mean_squared_error(y_valid, y_pred)
np.sqrt(valid_mse)

# Feature Importance:

In [None]:
# Pair each importance value with its column name, largest first.
importance_pairs = sorted(zip(clf.feature_importance(), train.columns), reverse=True)
feature_imp = pd.DataFrame(importance_pairs, columns=['Value','Feature'])
ranked_importance = feature_imp.sort_values(by="Value", ascending=False)

# Horizontal bar chart, one bar per feature.
fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(x="Value", y="Feature", data=ranked_importance, ax=ax)
ax.set_title('LightGBM Features')
fig.tight_layout()
plt.show()

# Predict on Test Set

In [None]:
# Alias the transformed test frame under the name the cross-validation loop uses.
Xtest = test

Xtest.head()

In [None]:
# Alias the target Series under the name the cross-validation loop uses.
y = target

In [None]:
# 10-fold cross-validation: train one booster per fold, record its validation
# RMSE and collect its predictions for the test frame.
errlgb = []          # validation RMSE of each fold
y_pred_totlgb = []   # test-set predictions of each fold

fold = KFold(n_splits= 10, shuffle=True, random_state=42)

for train_index, test_index in fold.split(train):

    # KFold yields positional indices, so select rows with .iloc on both the
    # features and the target (previously the features were selected by label
    # with .loc, which only lines up while `train` has a default 0..n-1 index).
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test)

    # NOTE(review): the same fold drives early stopping and is then scored,
    # so the printed RMSE is likely a little optimistic.
    clf = lgb.train(params=param,
                    early_stopping_rounds=200,
                    verbose_eval=100,
                    train_set=train_data,
                    valid_sets=[test_data])

    y_pred = clf.predict(X_test)

    # Compute the fold RMSE once and reuse it for printing and bookkeeping.
    fold_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("RMSE: ", fold_rmse)
    errlgb.append(fold_rmse)

    y_pred_totlgb.append(clf.predict(Xtest))

In [None]:
# Average the per-fold test predictions, row by row.
np.mean(y_pred_totlgb, axis=0)

# Submission:

In [None]:
# Final prediction: mean over the folds for every test row.
y_pred = np.mean(y_pred_totlgb, axis=0)

In [None]:
# Put the averaged predictions into the submission template and preview it.
df_sub = df_sub.assign(target=y_pred)
df_sub.head()

In [None]:
# Write the submission file without the DataFrame index column.
df_sub.to_csv('lgb_sub1_v4.csv', index=False)  # scored 0.84389 on the leaderboard