# Introduction

This is the first EDA notebook for the Tabular Playground Series - Nov 2021.<BR>
In this notebook, I mainly look through the feature distributions.<BR>
<BR>
* Since the data size is large, it is necessary to use the memory effectively.
* There are 100 features. All features are Continuous. There are no NaN.
* target is <font color="red"><B>balanced</B></font>
* It seems <font color="red"><B>'f34','f55','f43','f71','f91' features </B></font> are important.

So, shall we start to dive into data?<BR>
<BR>
![](https://images.unsplash.com/photo-1427751840561-9852520f8ce8?ixid=MnwxMjA3fDB8MHxzZWFyY2h8MTR8fGFuYWx5c2lzfGVufDB8fDB8fA%3D%3D&ixlib=rb-1.2.1&auto=format&fit=crop&w=500&q=60)

In [None]:
#import libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import preprocessing

import warnings
# Suppress all warnings for the rest of the session to keep cell output short.
warnings.filterwarnings('ignore')

# Remove pandas' row/column display limits so frames are shown in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Show floats in fixed-point notation ('{:f}' -> 6 decimals) rather than scientific.
pd.set_option('float_format', '{:f}'.format)

# Load Data and Memory reduction


In [None]:
%%time
# Read the competition files from the Kaggle input directory:
# the labelled training table, the test table, and the sample submission.
train = pd.read_csv("../input/tabular-playground-series-nov-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-nov-2021/test.csv")
sample_submission = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")

In [None]:
# Stack train on top of test, renumber the rows 0..n-1, and drop the 'id'
# column so that only the features (and the target) remain.
traintest = (
    pd.concat([train, test], axis=0)
    .reset_index(drop=True)
    .drop(['id'], axis=1)
)
print("traintest.shape:",traintest.shape)
# Last expression of the cell: preview of the first rows.
traintest.head()

In [None]:
# Memory reduction: halve the footprint of every 64-bit column by casting it
# to the 32-bit type of the same kind. Other dtypes are left untouched.
narrower_dtype = {"float64": np.float32, "int64": np.int32}
for column_name, column_dtype in traintest.dtypes.items():
    replacement = narrower_dtype.get(str(column_dtype))
    if replacement is not None:
        # Cast one column at a time rather than the whole frame at once.
        traintest[column_name] = traintest[column_name].astype(replacement)

In [None]:
# Report the (rows, columns) of both tables.
for shape_label, table in (("train.shape:", train), ("test.shape:", test)):
    print(shape_label, table.shape)

In [None]:
# Standard scaling: centre each feature to zero mean and scale it to unit
# variance. Statistics are taken over train and test rows together, and the
# 'target' column is excluded.
columns_to_scale = traintest.columns.drop('target')
for feature_name in columns_to_scale:
    traintest[feature_name] = preprocessing.scale(traintest[feature_name])

In [None]:
# Cut the processed frame back into its two parts: the first rows (as many
# as the original train table had) are train, everything after is test.
n_train_rows = train.shape[0]
train = traintest.iloc[:n_train_rows]
test = traintest.iloc[n_train_rows:]

In [None]:
# Name of the label column and the list of every other column (the features).
target_col = "target"
# Compare full names with `!=`. `col not in target_col` would be a substring
# test against the string "target" and would silently drop any column whose
# name happens to be a piece of it (e.g. "get" or "t").
feature_cols = [col for col in train.columns if col != target_col]

# RandomForest Feature importances 


It seems features f34, f55, f43, f71 and f91 are the most important according to RandomForest.

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Fit a default random forest (fixed seed) on all labelled rows and use its
# impurity-based importances to rank the features.
RFC = RandomForestClassifier(random_state=42)

X = train[feature_cols]
y = train[target_col].replace({False:0,True:1})

# The forest is fitted on the full X / y. The earlier train_test_split call
# was removed because its outputs were never used: it only copied the data
# and suggested a hold-out evaluation that does not take place.
RFC.fit(X,y)
importances = RFC.feature_importances_

# Ten most important features, highest first.
df_fi_top10 = (
    pd.DataFrame({"feature_cols":feature_cols,"importances":importances})
    .sort_values(by="importances",ascending=False)
    .reset_index(drop=True)
    .head(10)
)

plt.bar(df_fi_top10.feature_cols,df_fi_top10.importances,align='center')
plt.title('Feature Importance')
plt.show()

# Distribution train vs test

I learned from [this](https://www.kaggle.com/vishwas21/tps-oct-21-eda-modeling) last month.<BR>
Thank you very much<BR>
<BR>
Let's see the feature distributions between train and test.<BR>
I assume there is no big difference.
<BR>
It looks like there are 2 types of distributions among the features: one has two peaks ("mountains"), and the other is concentrated around 0.

In [None]:
%%time
df = train

# Count distinct values once per feature, then split on the 25-value
# threshold: fewer than 25 -> treated as categorical, otherwise continuous.
n_unique = {col: df[col].nunique() for col in feature_cols}
cat_features = [col for col in feature_cols if n_unique[col] < 25]
cont_features = [col for col in feature_cols if n_unique[col] >= 25]

print("--- Distribution train vs test ---")
ncols = 5
# Ceiling division over the columns that are actually plotted (cont_features,
# not feature_cols). At least one row so plt.subplots() stays valid.
nrows = max(1, int(np.ceil(len(cont_features) / ncols)))

# 7.5 inches per row reproduces the original 18x150 canvas for a 20-row grid.
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(nrows, ncols, figsize=(18, 7.5 * nrows), facecolor='#FFFFFF', squeeze=False)
panels = axes.ravel()

# One panel per continuous feature, train and test densities overlaid.
# zip() stops at the shorter sequence, so a partially filled last row cannot
# index past the end of cont_features.
for ax, col in zip(panels, cont_features):
    sns.kdeplot(x=train[col], ax=ax, color='#00ffff', label='Train data')
    sns.kdeplot(x=test[col], ax=ax, color='#ffa64d', label='Test data')
    ax.set_ylabel('')
    ax.set_xlabel(col, fontsize=8, fontweight='bold')
    ax.tick_params(labelsize=5, width=0.5)
    ax.xaxis.offsetText.set_fontsize(4)
    ax.yaxis.offsetText.set_fontsize(4)
    # The curves carry labels; draw the legend so the colours can be told apart.
    ax.legend(fontsize=5)

# Hide grid cells that have no feature assigned to them.
for ax in panels[len(cont_features):]:
    ax.set_visible(False)
plt.show()

# Distribution between True and False

Let's look at the <font color="red"><B>'f34','f55','f43','f71','f91' features </B></font> to see whether we can find differences between True and False.<BR>

In [None]:
%%time
df = train

# Count distinct values once per feature, then split on the 25-value
# threshold: fewer than 25 -> treated as categorical, otherwise continuous.
n_unique = {col: df[col].nunique() for col in feature_cols}
cat_features = [col for col in feature_cols if n_unique[col] < 25]
cont_features = [col for col in feature_cols if n_unique[col] >= 25]

print("--- Train distribution target:True vs target:False ---")
ncols = 5
# Ceiling division over the columns that are actually plotted (cont_features,
# not feature_cols). At least one row so plt.subplots() stays valid.
nrows = max(1, int(np.ceil(len(cont_features) / ncols)))

# 7.5 inches per row reproduces the original 18x150 canvas for a 20-row grid.
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(nrows, ncols, figsize=(18, 7.5 * nrows), facecolor='#FFFFFF', squeeze=False)
panels = axes.ravel()

# Row masks for the two classes, built once instead of once per feature.
# The element-wise `== True` / `== False` comparisons are kept unchanged so
# the selection does not depend on the dtype the target column ended up with.
is_true = train['target'] == True
is_false = train['target'] == False

# One panel per continuous feature, the two class densities overlaid.
# zip() stops at the shorter sequence, so a partially filled last row cannot
# index past the end of cont_features.
for ax, col in zip(panels, cont_features):
    sns.kdeplot(x=train.loc[is_true, col], ax=ax, color='#00ffff', label='Train:True')
    sns.kdeplot(x=train.loc[is_false, col], ax=ax, color='#ffa64d', label='Train:False')
    ax.set_ylabel('')
    ax.set_xlabel(col, fontsize=8, fontweight='bold')
    ax.tick_params(labelsize=5, width=0.5)
    ax.xaxis.offsetText.set_fontsize(4)
    ax.yaxis.offsetText.set_fontsize(4)
    # The curves carry labels; draw the legend so the colours can be told apart.
    ax.legend(fontsize=5)

# Hide grid cells that have no feature assigned to them.
for ax in panels[len(cont_features):]:
    ax.set_visible(False)
plt.show()

I will update more later.<BR>
Thank you very much.

# Create Model

In [None]:
%%time
# Install pycaret and llvmlite into the running kernel. --ignore-installed
# makes pip install them even if a version is already present
# (presumably to avoid a failed uninstall of the preinstalled llvmlite -- TODO confirm).
!pip install pycaret --ignore-installed llvmlite

In [None]:
%%time
# Install LightGBM with GPU support, step 1: prepare the sources.
# Procedure taken from "Running LightGBM on GPU":
# https://www.kaggle.com/abhishek/running-lightgbm-on-gpu
# Remove the preinstalled package so the self-built one is used instead.
!pip uninstall -y lightgbm
# Boost development libraries (presumably a build dependency of the GPU build).
!apt-get install -y libboost-all-dev
# Fetch the LightGBM sources including their git submodules.
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
%%bash
# Build LightGBM from the cloned sources with GPU (OpenCL) support enabled.
# Stop at the first failing command: without this, a failed `cd LightGBM`
# (e.g. the clone did not succeed) would let the following commands delete
# and create `build` in whatever directory the cell happens to run in.
set -e
cd LightGBM
# -f: a fresh clone has no build/ directory yet, which is not an error.
rm -rf build
mkdir build
cd build
# Point CMake at the OpenCL library and headers shipped with CUDA.
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
# Compile with one job per available CPU core.
make -j$(nproc)

In [None]:
!cd LightGBM/python-package/;python setup.py install --precompile

In [None]:
# Write an OpenCL vendor (ICD) file that names the NVIDIA OpenCL library
# (presumably so the OpenCL loader can locate the GPU driver -- TODO confirm).
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# Delete the source checkout now that the package has been installed.
!rm -r LightGBM

In [None]:
# Columns used for modelling: the five selected features plus the label.
target_col = "target"
feature_cols = ['f34','f55','f43','f71','f91']
exc_cols = feature_cols + [target_col]

# Every other column of train is handed to pycaret as "to be ignored".
columns_to_keep = set(exc_cols)
ignore_features = [name for name in train.columns if name not in columns_to_keep]

In [None]:
%%time
# Initialise the pycaret classification experiment.
# The wildcard import is what makes setup(), models(), compare_models() etc.
# available to this and the following cells.
from pycaret.classification import *

exp_name = setup(
    data=train,
    target='target',
    # Only these five columns are used, all treated as numeric ...
    numeric_features=['f34','f55','f43','f71','f91'],
    # ... every remaining column is excluded from the experiment.
    ignore_features=ignore_features,
    use_gpu=True,
)

In [None]:
# Check which models can use the GPU: show the name and the 'GPU Enabled'
# column from pycaret's internal model table (last expression, so it is displayed).
models(internal=True)[['Name', 'GPU Enabled']]

In [None]:
from sklearn.metrics import log_loss

# Add log loss to the metrics pycaret reports (lower is better).
# Log loss has to be scored on predicted probabilities, so the metric is
# registered with target='pred_proba'. With add_metric's default
# (target='pred') it would be computed from hard class labels instead.
add_metric('logloss', 'LogLoss', log_loss, greater_is_better=False, target='pred_proba')

# Check all metrics used for model evaluation. This is the last expression
# of the cell so the table, now including LogLoss, is actually displayed.
get_metrics()

In [None]:
%%time
# Compare baseline models with 10-fold cross-validation, ranked by AUC.
# The result of the comparison is kept in `best`.
best = compare_models(fold=10,sort='AUC')

In [None]:
%%time
# Create the model registered under the id 'lr' (presumably logistic
# regression), then search 500 hyper-parameter candidates optimising AUC.
lr = create_model('lr')
tuned_lr = tune_model(lr, n_iter = 500, optimize = 'AUC')
# Print the tuned estimator to see the chosen hyper-parameters.
print(tuned_lr)

In [None]:
%%time
# Show pycaret's evaluation output for the tuned 'lr' model.
evaluate_model(tuned_lr)

In [None]:
%%time
# Create the model registered under the id 'nb' (presumably naive Bayes),
# then search 500 hyper-parameter candidates optimising AUC.
nb = create_model('nb')
tuned_nb = tune_model(nb, n_iter = 500, optimize = 'AUC')
# Print the tuned estimator to see the chosen hyper-parameters.
print(tuned_nb)

In [None]:
%%time
# Show pycaret's evaluation output for the tuned 'nb' model.
evaluate_model(tuned_nb)