# EDA + Basic Model by LightGBM

## Import Libraries

In [None]:
import time
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import lightgbm as lgb

## EDA

- First, let's check the outline of the data.
- All of the feature columns, cat0 - cat9 are categorical, and the feature columns cont0 - cont13 are continuous.

In [None]:
# Read the competition's training split into a DataFrame.
train_df = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-feb-2021/train.csv',
)
# Bare name as the last expression so the notebook renders the table.
train_df

- You should also check for missing values. You can see that there are no missing values.

In [None]:
# Count missing entries per column (`isna` is the pandas alias of `isnull`).
train_df.isna().sum()

- Across the categorical features, 15 distinct letters of the alphabet (A to O) appear, and the number of distinct letters used differs from feature to feature.

In [None]:
# Categorical features: collect every value that occurs in cat0..cat9 and
# draw one frequency bar chart per column.
unique_cats = set()

fig = plt.figure(figsize=(20, 15))
for position, column_name in enumerate((f'cat{k}' for k in range(10)), start=1):
    column = train_df[column_name]
    # Fold this column's distinct values into the running set.
    unique_cats.update(column.unique())
    # Charts are laid out on a 4x3 grid; subplot positions are 1-based.
    ax = plt.subplot(4, 3, position)
    ax.set_title(column_name)
    column.value_counts().plot.bar(ax=ax)

# From here on unique_cats is a sorted list rather than a set.
unique_cats = sorted(unique_cats)
print(f'unique_categories : {unique_cats}  len : {len(unique_cats)}')

- The continuous variables seem to be distributed in the range 0-1.

In [None]:
# Continuous features: one 10-bin histogram per column (cont0..cont13),
# laid out on a 4x4 grid.
fig = plt.figure(figsize=(20, 20))
cont_columns = [f'cont{k}' for k in range(14)]
for slot, column_name in enumerate(cont_columns, start=1):
    ax = plt.subplot(4, 4, slot)
    ax.set_title(column_name)
    train_df[column_name].hist(bins=10, ax=ax)

In [None]:
# Side-by-side box plots of the columns at positions 11..24 on a single axes.
# NOTE(review): presumably these are cont0..cont13 - confirm against the CSV layout.
fig, ax = plt.subplots(figsize=(15, 6))
train_df.iloc[:, 11:25].plot.box(ax=ax)

- The median value of the target variable is around 7, but you can see that there are a certain number of outliers.

In [None]:
# Box plot of the target column to show its spread and outliers.
train_df['target'].plot(kind='box')

## Preprocessing
- Preprocess the data to put it in the model.
- Normally, categorical variables cannot be fed directly into the model.
They must first be converted to numerical values; since there is no order among the categories, they are converted with 'One Hot Encoding'.
- When OneHotEncoding is performed, each feature expands into one column per category.
- In this data, the values of every categorical variable are limited to A-O, so by summing the one-hot encodings of all the columns, the features cat0-cat9 are converted into 15 features (one count per letter).
- Finally, scale these counts to the 0-1 range so that they match the range of the continuous variables.

In [None]:
# Feature matrix: the columns at positions 1..24 as a NumPy array;
# the target column is kept as a separate vector.
X = train_df.iloc[:, 1:25].to_numpy()
y = train_df['target'].to_numpy()
# Show both shapes as the cell output.
(X.shape, y.shape)

In [None]:
# Collapse the first ten columns of X into one count vector per row: after the
# loop, column j of X_onehot holds how many of those ten columns equal the
# j-th entry of unique_cats.
# The accumulator is sized from X itself instead of a hard-coded row count,
# so the cell keeps working if the input has a different number of rows.
X_onehot = np.zeros((X.shape[0], len(unique_cats)))

# Fit on the shared alphabet so every column is encoded against the same
# category list (and therefore the same output column order).
onehot_enc = OneHotEncoder().fit(np.array(unique_cats).reshape(-1, 1))

for i in range(10):
    X_onehot += onehot_enc.transform(X[:, i].reshape(-1, 1)).toarray()

# Rescale every count column to the [0, 1] range.
X_onehot_minmax = MinMaxScaler().fit_transform(X_onehot)
X_onehot_minmax

In [None]:
# Append the remaining columns of X (position 10 onward) to the scaled
# category counts, column-wise.
X_trans = np.concatenate((X_onehot_minmax, X[:, 10:]), axis=1)
X_trans.shape

## Model Training
- Divide the data into training data and validation data, and evaluate the model on the validation data.
- The model used is LightGBM. [documentation](https://lightgbm.readthedocs.io/en/latest/Python-Intro.html)

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
split = train_test_split(X_trans, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = split

In [None]:
# Wrap the two splits in LightGBM Dataset objects.
train_data = lgb.Dataset(X_train, label=y_train)
# A validation Dataset should name the training Dataset as its reference so
# that it is constructed consistently with the training data.
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [None]:
# Track RMSE on the validation set; every other LightGBM parameter is left
# at its default.
params = {'metric': 'rmse'}

# perf_counter is a monotonic clock, so the elapsed time cannot be distorted
# by system clock adjustments.
start = time.perf_counter()
# Up to 500 boosting rounds, stopping once the validation metric has not
# improved for 10 consecutive rounds. Early stopping is passed as a callback
# because the `early_stopping_rounds` keyword of `lgb.train` was removed in
# LightGBM 4.0; the callback form works on older releases as well.
bst = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[val_data],
    num_boost_round=500,
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)
print(f'elapsed time : {time.perf_counter() - start}')

## Prediction and Submission

In [None]:
# Read the competition's test split into a DataFrame.
test_df = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-feb-2021/test.csv',
)
# Bare name as the last expression so the notebook renders the table.
test_df

In [None]:
# Apply to the test rows the same preprocessing that was applied to the
# training rows.
X_test = test_df.iloc[:, 1:25].values

# Per-row category counts, accumulated over the first ten columns with the
# encoder that was fitted on the shared alphabet. The accumulator is sized
# from X_test rather than a hard-coded row count.
X_test_onehot = np.zeros((X_test.shape[0], len(unique_cats)))
for i in range(10):
    X_test_onehot += onehot_enc.transform(X_test[:, i].reshape(-1, 1)).toarray()

# Scale with the min/max of the TRAINING counts (X_onehot). Fitting a fresh
# scaler on the test rows would map the same raw count to a different scaled
# value than the model saw during training whenever the per-column extremes
# differ between the two sets.
X_test_onehot_minmax = MinMaxScaler().fit(X_onehot).transform(X_test_onehot)

X_test_trans = np.hstack([X_test_onehot_minmax, X_test[:, 10:]])

In [None]:
# Predict on the transformed test matrix, limited to the booster's recorded
# best iteration.
y_pred = bst.predict(data=X_test_trans, num_iteration=bst.best_iteration)
# Show the predictions together with their shape.
(y_pred, y_pred.shape)

In [None]:
# Submission table: the test ids paired with the predicted targets.
submit_df = test_df[['id']].assign(target=y_pred)
submit_df

In [None]:
# Prefix the output file name with today's date (YYYYMMDD).
run_date = datetime.date.today()
today = run_date.strftime('%Y%m%d')
# index=False keeps the DataFrame index out of the CSV.
submit_df.to_csv(path_or_buf=f'{today}_submit.csv', index=False)

- Public Score is 0.84988.

## Simple OneHotEncoding
- So far I summed the encodings so as not to increase the number of features; here I instead apply OneHotEncoding to each categorical variable separately and put all of the resulting columns into the model.

In [None]:
# Stack the training rows on top of the test rows so both are encoded together.
X_all = np.concatenate((X, X_test), axis=0)
X_all.shape

In [None]:
# One-hot encode each of the first ten columns on its own and convert the
# encoder's output to a dense array.
cat_encoder = OneHotEncoder()
X_all_onehot = cat_encoder.fit_transform(X_all[:, :10]).toarray()
X_all_onehot.shape

In [None]:
# Re-attach the remaining columns (position 10 onward) to the one-hot block.
X_all_trans = np.hstack([X_all_onehot, X_all[:, 10:]])
# Split back into train and test at the number of training rows, taken from X
# (the top part of the stack) rather than a hard-coded constant.
n_train_rows = X.shape[0]
X_trans2, X_test_trans2 = X_all_trans[:n_train_rows, :], X_all_trans[n_train_rows:, :]
X_trans2.shape, X_test_trans2.shape

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train2, X_val2, y_train2, y_val2 = train_test_split(
    X_trans2, y, test_size=0.3, random_state=42,
)

train_data2 = lgb.Dataset(X_train2, label=y_train2)
# A validation Dataset should name the training Dataset as its reference so
# that it is constructed consistently with the training data.
val_data2 = lgb.Dataset(X_val2, label=y_val2, reference=train_data2)

# Track RMSE on the validation set; every other LightGBM parameter is left
# at its default.
params = {'metric': 'rmse'}

# perf_counter is a monotonic clock, so the elapsed time cannot be distorted
# by system clock adjustments.
start = time.perf_counter()
# Up to 500 boosting rounds, stopping once the validation metric has not
# improved for 10 consecutive rounds. Early stopping is passed as a callback
# because the `early_stopping_rounds` keyword of `lgb.train` was removed in
# LightGBM 4.0; the callback form works on older releases as well.
bst2 = lgb.train(
    params=params,
    train_set=train_data2,
    valid_sets=[val_data2],
    num_boost_round=500,
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)
print(f'elapsed time : {time.perf_counter() - start}')

In [None]:
# Predict with the second booster, limited to its recorded best iteration.
y_pred2 = bst2.predict(data=X_test_trans2, num_iteration=bst2.best_iteration)

# Submission table: the test ids paired with the predicted targets.
submit_df2 = test_df[['id']].assign(target=y_pred2)

# Prefix the output file name with today's date (YYYYMMDD).
run_date = datetime.date.today()
today = run_date.strftime('%Y%m%d')
submit_df2.to_csv(path_or_buf=f'{today}_2_submit.csv', index=False)

- Public Score is 0.84669. This is an improvement of about 0.003.
- However, the processing time increases.

## Categorical features
- The [LightGBM documentation](https://lightgbm.readthedocs.io/en/latest/Python-Intro.html) states the following:
- LightGBM can use categorical features as input directly. It doesn’t need to convert to one-hot coding, and is much faster than one-hot coding (about 8x speed-up).
- So, let's try it.

In [None]:
# Work on copies so the frames loaded earlier stay untouched.
train_df2 = train_df.copy()
test_df2 = test_df.copy()

# Kept from the original cell: rebinds the notebook-level `y` to the target
# Series. It is not read again within this cell.
y = train_df2['target']

# Stack train on top of test so each LabelEncoder sees every value that
# occurs in either split.
concat_df = pd.concat([train_df2, test_df2], axis=0)
# Columns at positions 1..10 are treated as the categorical features.
cat_columns = train_df2.columns[1: 11]

# Replace each categorical column by integer codes.
for column in cat_columns:
    concat_df[column] = LabelEncoder().fit_transform(concat_df[column])

# Split the stack back by position at the number of training rows, taken from
# the training frame itself rather than a hard-coded constant. `.iloc` makes
# the positional intent explicit, since the concatenated index repeats.
n_train_rows = len(train_df2)
train_enc_df2 = concat_df.iloc[:n_train_rows]
X3 = train_enc_df2.drop(['id', 'target'], axis=1)
y3 = train_enc_df2['target']

test_enc_df2 = concat_df.iloc[n_train_rows:].drop(['id', 'target'], axis=1)
test_enc_df2

In [None]:
# Hold out 30% of the label-encoded rows for validation; the fixed seed keeps
# the split reproducible between runs.
split3 = train_test_split(X3, y3, test_size=0.3, random_state=42)
X_train3, X_val3, y_train3, y_val3 = split3

In [None]:
# Column labels of the feature frame as a plain Python list.
feature_name = X3.columns.tolist()
feature_name

In [None]:
# Training Dataset: the first ten feature names are declared categorical so
# that LightGBM handles them as categories.
categorical_names = feature_name[:10]
train_data3 = lgb.Dataset(
    data=X_train3,
    label=y_train3,
    feature_name=feature_name,
    categorical_feature=categorical_names,
)
# The validation Dataset references the training Dataset.
val_data3 = lgb.Dataset(data=X_val3, label=y_val3, reference=train_data3)

In [None]:
# Track RMSE on the validation set; every other LightGBM parameter is left
# at its default.
params = {'metric': 'rmse'}

# perf_counter is a monotonic clock, so the elapsed time cannot be distorted
# by system clock adjustments.
start = time.perf_counter()
# Up to 500 boosting rounds, stopping once the validation metric has not
# improved for 10 consecutive rounds. Early stopping is passed as a callback
# because the `early_stopping_rounds` keyword of `lgb.train` was removed in
# LightGBM 4.0; the callback form works on older releases as well.
bst3 = lgb.train(
    params=params,
    train_set=train_data3,
    valid_sets=[val_data3],
    num_boost_round=500,
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)
print(f'elapsed time : {time.perf_counter() - start}')

In [None]:
# Predict with the third booster, limited to its recorded best iteration.
y_pred3 = bst3.predict(data=test_enc_df2, num_iteration=bst3.best_iteration)

# Submission table: the test ids paired with the predicted targets.
submit_df3 = test_df[['id']].assign(target=y_pred3)

# Prefix the output file name with today's date (YYYYMMDD).
run_date = datetime.date.today()
today = run_date.strftime('%Y%m%d')
submit_df3.to_csv(path_or_buf=f'{today}_3_submit.csv', index=False)

- Public Score is 0.84608. This is a slight improvement.
- The processing time also decreases.