## <p style="background-color:GreenYellow; font-family:newtimeroman; font-size:120%; text-align:center">Table of Contents</p>

* [1. Read the Data Files](#1)
* [2. Data Exploration](#2)
    * [2.1 Target Variable](#2.1)
    * [2.2 Categorical Columns](#2.2)
    * [2.3 Numerical Columns](#2.3)
    * [2.4 Correlation Check](#2.4)
* [3. Encode Features](#3)
* [4. Model Building](#4)
    * [4.1 Logistic Regression](#4.1)
    * [4.2 Ridge Classifier](#4.2)
    * [4.3 Random Forest](#4.3)
    * [4.4 XGBoost](#4.4)
    * [4.5 LightGBM](#4.5)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from pathlib import Path

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier

import warnings
# Silence library warnings (mostly from sklearn and seaborn) through the
# supported filter API. Monkey-patching `warnings.warn` would replace the
# function for the whole kernel and cannot be undone with `warnings.resetwarnings()`.
warnings.filterwarnings('ignore')

%matplotlib inline
        
input_path = Path('/kaggle/input/tabular-playground-series-mar-2021/')

<a id='1'></a>
# <p style="background-color: #4285f4; color: #FFFFFF ;font-family:newtimeroman; font-size:120%">1. Read the Data Files</p>

In [None]:
train = pd.read_csv(input_path / 'train.csv', index_col='id')
display(train.head())

In [None]:
test = pd.read_csv(input_path / 'test.csv', index_col='id')
display(test.head())

In [None]:
submission = pd.read_csv(input_path / 'sample_submission.csv', index_col='id')
display(submission.head())

<a id='2'></a>
# <p style="background-color: #ea4335; color: #FFFFFF ;font-family:newtimeroman; font-size:120%">2. Data Exploration</p>

In [None]:
train.info()

In [None]:
train.shape

In [None]:
test.info()

There are no null values in either the train or the test data.

**Train data:** 300k rows, 31 columns
* 12 numerical columns
* 19 categorical columns                

**Test data:** 200k rows, 30 columns
 * 11 numerical columns (since target not included)
 * 19 categorical columns   

In [None]:
# All columns
cols = train.columns
# Numerical columns, selected through the public dtype API
# (`_get_numeric_data` is a private pandas method and may change without notice).
num_cols = train.select_dtypes(include='number').columns
len(num_cols)

In [None]:
# Categorical columns: every column that is not numerical.
# The comprehension keeps the DataFrame's column order; a set difference
# yields an arbitrary order that can differ from one kernel run to the next,
# which in turn reorders the plots and the encoded feature list.
categorical = [col for col in cols if col not in num_cols]
len(categorical)

In [None]:
train.describe()

In [None]:
profile = ProfileReport(train)
profile

<a id='2.1'></a>
## <p style="background-color: #ea4335; color: #FFFFFF ; font-family:newtimeroman; font-size:120%">2.1 Target Variable</p>

In [None]:
train['target'].value_counts()

In [None]:
# Class balance of the binary target.
# The column is passed as `x=` because newer seaborn releases no longer
# accept it as the first positional argument.
sns.countplot(x='target', data=train)

Here we can see that our data is imbalanced: only 26.5% of the rows have target = 1; the rest have target = 0.

<a id='2.2'></a>
## <p style="background-color: #ea4335; color: #FFFFFF ; font-family:newtimeroman; font-size:120%">2.2 Categorical Columns</p>

In [None]:
# Plot categorical columns
def pltCountplot(cat, df):
    """Draw one count plot per column on a grid that is four panels wide.

    Every bar is annotated with its share of all rows of ``df`` in percent.

    Parameters
    ----------
    cat : sequence of str
        Names of the columns of ``df`` to plot.
    df : pandas.DataFrame
        Data containing the columns listed in ``cat``.
    """
    n_cols = 4
    # Ceiling division: no extra empty row when len(cat) is a multiple of 4.
    # At least one row keeps plt.subplots valid for an empty list.
    n_rows = max(1, -(-len(cat) // n_cols))

    # squeeze=False always returns a 2-D array of axes, even for a single row.
    fig, axis = plt.subplots(n_rows, n_cols, figsize=(30,16), squeeze=False)
    panels = axis.ravel()

    sns.set()
    for ax, column in zip(panels, cat):
        # Pass the column as `x=`; newer seaborn releases no longer accept
        # it as the first positional argument.
        sns.countplot(x=column, data=df, ax=ax)

        for p in ax.patches:
            height = p.get_height()
            ax.text(p.get_x()+p.get_width(), height + 3, '{:1.2f}%'.format(height/len(df)*100), ha="center", fontsize=14)

    # Hide the panels left over when len(cat) does not fill the grid.
    for ax in panels[len(cat):]:
        ax.set_visible(False)

    plt.subplots_adjust(wspace=0.3, hspace=0.4)

In [None]:
# Plot categorical columns
pltCountplot(categorical, train);

In [None]:
# Plot categorical columns with different y using matplotlib library
def pltCrosstab(cat, df, target='target'):
    """Draw grouped bar charts of each column's counts split by the target.

    One panel per column on a grid that is four panels wide. Every bar is
    annotated with its share of all rows of ``df`` in percent.

    Parameters
    ----------
    cat : sequence of str
        Names of the columns of ``df`` to plot.
    df : pandas.DataFrame
        Data containing the columns in ``cat`` and the ``target`` column.
    target : str, default 'target'
        Name of the column used to split the counts.
    """
    n_cols = 4
    # Ceiling division: no extra empty row when len(cat) is a multiple of 4.
    # At least one row keeps plt.subplots valid for an empty list.
    n_rows = max(1, -(-len(cat) // n_cols))

    # squeeze=False always returns a 2-D array of axes, even for a single row.
    fig, axis = plt.subplots(n_rows, n_cols, figsize=(30,16), squeeze=False)
    fig.tight_layout()
    panels = axis.ravel()

    for ax, column in zip(panels, cat):
        pd.crosstab(df[column], df[target]).plot(kind='bar', ax=ax)

        for item in ax.get_xticklabels():
            item.set_rotation(90)

        for p in ax.patches:
            height = p.get_height()
            ax.text(p.get_x()+p.get_width(), height + 3, '{:1.2f}%'.format(height/len(df)*100), ha="center", fontsize=8)

    # The grid usually has more panels than columns to plot; hide the unused ones.
    for ax in panels[len(cat):]:
        ax.set_visible(False)

    plt.subplots_adjust(wspace=0.1, hspace=0.4)

In [None]:
# Plot categorical columns split by target
pltCrosstab(categorical, train);

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# Plot categorical columns by y with Plotly bar plots
def pltStackedBarsPlots(train0, train1, columns=None):
    """Show stacked Plotly bar charts of category counts for both target classes.

    For every column, the counts in ``train0`` (traces named 'Label: 0') and
    in ``train1`` (traces named 'Label: 1') are stacked in one subplot.

    Parameters
    ----------
    train0 : pandas.DataFrame
        Rows belonging to target class 0.
    train1 : pandas.DataFrame
        Rows belonging to target class 1.
    columns : sequence of str, optional
        Columns to plot. Defaults to the notebook-level ``categorical`` list.
    """
    if columns is None:
        columns = categorical
    columns = list(columns)

    grid_cols = 4
    # Enough rows for all columns (ceiling division), at least one.
    grid_rows = max(1, -(-len(columns) // grid_cols))
    fig = make_subplots(rows=grid_rows, cols=grid_cols)

    for index, column in enumerate(columns):
        # Plotly subplot coordinates are 1-based.
        i, j = (index // grid_cols) + 1, (index % grid_cols) + 1

        # Use the frames passed in as arguments rather than notebook globals.
        for label, subset in ((0, train0), (1, train1)):
            counts = subset.groupby(column)[column].count().sort_values(ascending=False)
            fig.add_trace(go.Bar(
                x = counts.index,
                y = counts.values,
                name='Label: {}'.format(label),
            ), row=i, col=j)

        fig.update_xaxes(title=column, row=i, col=j)

    # Layout is figure-wide, so it is set once after all traces are added.
    fig.update_layout(
        barmode='stack',
        autosize=False,
        width=1400,
        height=1400,
        showlegend=False,
    )
    fig.show()


Here is another way to compare categorical features by their target values:

Using stacked bar plots, we can easily compare the ratio of target 1 to target 0 within each category. The Plotly library also gives us interactive plots.

In [None]:
train_0 = train.loc[train['target'] == 0]
train_1 = train.loc[train['target'] == 1]
    
# Plot stacked categorical columns by y with Plotly
pltStackedBarsPlots(train_0, train_1)

cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9 and cat10 have so many categories that their plots are hard to read when all the categorical columns are drawn on one grid.

The more categories you have, the more columns you get when using one-hot encoding, which can create huge tables but it's more difficult to handle and look through the data. Some tree-based algorithms also use subsets of columns to prevent overfitting, which can cause problems when using a lot of one-hot-encoded columns.

Let's check those features separately.

In [None]:
def countPlot(column, target, df):
    """Show a grouped bar chart of ``column`` counts split by ``target``.

    Every bar is annotated with its share of all rows of ``df`` in percent,
    and the figure is resized to 20 x 8 inches before being shown.
    """
    counts = pd.crosstab(df[column], df[target])
    ax = counts.plot(kind='bar')
    total_rows = len(df)

    for bar in ax.patches:
        bar_height = bar.get_height()
        label = '{:1.2f}%'.format(bar_height/total_rows*100)
        ax.text(bar.get_x()+bar.get_width(), bar_height + 3, label, ha="center", fontsize=8)

    # The crosstab plot created the current figure, so resize that one.
    plt.gcf().set_size_inches(20, 8)
    plt.tight_layout()
    plt.show()

In [None]:
countPlot('cat1', 'target', train)

In [None]:
countPlot('cat2', 'target', train)

In [None]:
countPlot('cat3', 'target', train)

In [None]:
countPlot('cat4', 'target', train)

In [None]:
countPlot('cat6', 'target', train)

In [None]:
countPlot('cat9', 'target', train)

In [None]:
countPlot('cat7', 'target', train)

In [None]:
countPlot('cat8', 'target', train)

In [None]:
countPlot('cat5', 'target', train)

In [None]:
countPlot('cat10', 'target', train)

cat5, cat7, cat8 and cat10 have high cardinality (a high number of distinct values):
* cat5 has 84 distinct categories
* cat7 has 51 distinct categories
* cat8 has 61 distinct categories
* cat10 has 299 distinct categories

<a id='2.3'></a>
## <p style="background-color: #ea4335; color: #FFFFFF ; font-family:newtimeroman; font-size:120%">2.3 Numerical Columns </p>

In [None]:
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(30, 16))
fig.suptitle('Distribution of Features by Target', fontsize=16)

for index, column in enumerate(num_cols):
    i,j = (index // 3, index % 3)
    sns.kdeplot(data=train, x=column, hue="target", multiple="stack",ax=axes[i,j])
    #sns.kdeplot(train.loc[train['target'] == 0, column], color="m", shade=True, ax=axes[i,j])
    #sns.kdeplot(train.loc[train['target'] == 1, column], color="b", shade=True, ax=axes[i,j])

fig.delaxes(axes[3, 2])
plt.tight_layout()
plt.show()

<a id='2.4'></a>
## <p style="background-color: #ea4335; color: #FFFFFF ; font-family:newtimeroman; font-size:120%">2.4 Correlation Check </p>

In [None]:
corr = train[num_cols].corr().abs()

fig, ax = plt.subplots(figsize=(20, 12))

# Mask the upper triangle and the diagonal so each pair is shown once.
# The builtin `bool` replaces the `np.bool` alias, which was deprecated in
# NumPy 1.20 and removed in later releases.
upper_triangle = np.triu(np.ones_like(corr, dtype=bool))

# plot heatmap
sns.heatmap(corr, mask=upper_triangle, annot=True, fmt=".2f", cmap='coolwarm',
            cbar_kws={"shrink": .8}, vmin=0, vmax=1, ax=ax)
# yticks
plt.yticks(rotation=0) #Don't rotate the features
plt.show()

#### Top Absolute Correlations

In [None]:
# Check corralated features with eachother
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of the correlation matrix.

    Only numeric columns of ``df`` are considered. The result is the set of
    ``(row_label, column_label)`` tuples with the row at or after the column,
    i.e. the entries that duplicate the upper triangle or are self-correlations.
    '''
    cols = df.select_dtypes(include=[np.number]).columns
    # Iterate over the numeric columns only: using df.shape[1] here would
    # index past the end of `cols` whenever df has non-numeric columns.
    return {(cols[i], cols[j]) for i in range(len(cols)) for j in range(i + 1)}

def get_top_abs_correlations(df, n=5):
    '''Return the ``n`` largest absolute pairwise correlations between numeric columns.

    The result is a Series indexed by (column, column) pairs, sorted in
    descending order, with self-correlations and mirrored pairs removed.
    '''
    numeric_df = df.select_dtypes(include=[np.number])
    pairwise = numeric_df.corr().abs().unstack()
    # Diagonal entries (always 1) and lower-triangle duplicates carry no information.
    redundant = get_redundant_pairs(numeric_df)
    ranked = pairwise.drop(labels=redundant).sort_values(ascending=False)
    return ranked.iloc[:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(train, 10))

Some features seem to be correlated with each other.

* **cont1** has the highest correlation with **cont2** with a correlation of 0.861623.
* **cont10** has a high correlation of 0.807896 and 0.775453 with **cont0** and **cont7**.
* Continuous features that have a correlation > 0.7:
 **cont0** and **cont7**, 
 **cont8** with **cont1**, 
 **cont8** with **cont2**

#### Top Correlated Features with Target

In [None]:
# Examine the correlations between the features and the target.
print("Top Correlated Features with Target")
corr = train.select_dtypes(include=[np.number]).corr()
print (corr['target'].sort_values(ascending=False)[:5], '\n')
print (corr['target'].sort_values(ascending=False)[-5:])

It seems that no numeric feature is highly correlated with the target.

<a id='3'></a>
# <p style="background-color: #fbbc05; color: #FFFFFF ;font-family:newtimeroman; font-size:120%">3. Encode Features</p>

[This source](https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159 ) provides good explanation of types to encode categorical data and which encoding technique should be used when. 

Since **we have features with high cardinality** (a high number of distinct values) such as **cat5** (84 distinct categories), **cat7** (51 distinct categories), **cat8** (61 distinct categories) and **cat10** (299 distinct categories), Bayesian encoders are a good fit.

The Bayesian encoders use information from the dependent variable in their encodings. They output one column and can work well with high cardinality data.

In [None]:
from category_encoders import LeaveOneOutEncoder

def loo_encode(train_df, test_df, column):
    """Add a leave-one-out target encoding of ``column`` to both frames.

    A new column named ``"<column>_loo"`` is written in place to ``train_df``
    and ``test_df``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; must contain ``column`` and a ``"target"`` column.
    test_df : pandas.DataFrame
        Test data; must contain ``column``.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    str
        Name of the newly created feature column.
    """
    loo = LeaveOneOutEncoder()
    new_feature = "{}_loo".format(column)
    # Training rows must be encoded together with their target so the encoder
    # can leave each row's own label out of its category mean. Calling
    # transform() without y would give every training row the plain category
    # mean, which includes its own label and leaks the target.
    train_df[new_feature] = loo.fit_transform(train_df[[column]], train_df["target"])[column]
    # Test rows have no target; they get the category mean learned on the training data.
    test_df[new_feature] = loo.transform(test_df[[column]])[column]
    return new_feature

loo_features = []
for feature in categorical:
    loo_features.append(loo_encode(train, test, feature))

In [None]:
numerical = num_cols.drop('target').tolist()
features = numerical + loo_features

In [None]:
# Check features that are in the train DF but not in test DF
features_not_in_test = set(train.drop('target', axis=1).columns) - set(test.columns)
features_not_in_test

In [None]:
# Check features that are in the train DF but not in test DF
features_not_in_train = set(test.columns) - set(train.drop('target', axis=1).columns)
features_not_in_train

In [None]:
print("Top Absolute Correlations")
print(get_top_abs_correlations(train, 10))

In [None]:
# Examine the correlations between the features and the target.
print("Top Correlated Features with Target")
corr = train.select_dtypes(include=[np.number]).corr()
print (corr['target'].sort_values(ascending=False)[:5], '\n')
print (corr['target'].sort_values(ascending=False)[-5:])

<a id='4'></a>
# <p style="background-color: #34a853; color: #FFFFFF ;font-family:newtimeroman; font-size:120%">4. Model Building</p>

<a id='4.1'></a>
## <p style="background-color: #34a853; color: #FFFFFF ; font-family:newtimeroman; font-size:120%">4.1 Logistic Regression </p>

## Pull out the target, and make a validation split

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# set x and y
X = train[features]
y = train['target']

# train test split
# stratify=y preserve the proportion of target as in orginal dataset in the train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99, stratify=y) #train_size=0.60

# scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# set the model
logreg = LogisticRegression()

# fit model
logreg.fit(X_train, y_train)

In [None]:
# Baseline accuracy = proportion of the majority class
print('Baseline Accuracy: ',1. - y_train.mean())
print('Train Accuracy :',logreg.score(X_train, y_train))
print('Test Accuracy: ',logreg.score(X_test, y_test))

In [None]:
print('ROC AUC Score = {}'.format(roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])))

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
auc = []
for n in range(5,10):
    
    pipe = make_pipeline(StandardScaler(), LogisticRegression())

    # Note the results will vary each run since we take a different subset of the data each time (since shuffle=True)
    scores = np.mean(cross_val_score(pipe, X, y, cv=StratifiedKFold(n, random_state=10, shuffle=True), scoring='roc_auc'))
    auc.append(scores)
    print(str(n), ' folds: ', 'ROC AUC Score: ', scores)

# plot to see clearly
plt.plot(range(5,10), auc)
plt.xlabel('n split')
plt.ylabel('Mean ROC AUC Score for all folds')
plt.show();

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('classifier' , LogisticRegression())])

# Create param grid.
param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']}]

clf = GridSearchCV(pipe, 
                   param_grid = param_grid, 
                   cv = StratifiedKFold(8, random_state=10, shuffle=True), 
                   scoring='roc_auc',
                   return_train_score=True,
                   n_jobs=-1,
                   verbose=True)
clf.fit(X, y)  
clf.best_estimator_

## Submission

In [None]:
# Refit on the full training set with the same standardisation that was used
# during validation. The pipeline also scales the test features, so the
# submitted model matches the one that was evaluated.
logreg = make_pipeline(StandardScaler(), LogisticRegression())
logreg.fit(X, y)
submission['target'] = logreg.predict_proba(test[features])[:, 1]
submission.to_csv('logistic_regression.csv')

<a id='4.2'></a>
## <p style="background-color: #34a853; color: #FFFFFF ; font-family:newtimeroman; font-size:120%">4.2 Ridge Classifier </p>

In [None]:
# RidgeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import StandardScaler

# set x and y
X = train[features]
y = train['target']

# train test split
# stratify=y preserve the proportion of target as in orginal dataset in the train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99, stratify=y) #train_size=0.60

# scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# set the model
ridge = RidgeClassifier()

# fit model
ridge.fit(X_train, y_train)

In [None]:
# Baseline accuracy = proportion of the majority class
print('Baseline Accuracy: ',1. - y_train.mean())
print('Train Accuracy :',ridge.score(X_train, y_train))
print('Test Accuracy: ',ridge.score(X_test, y_test))

In [None]:
#print('ROC AUC Score = {}'.format(roc_auc_score(y_test, ridge.predict_proba(X_test)[:, 1])))

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
auc = []
for n in range(5,10):
    
    pipe = make_pipeline(StandardScaler(), RidgeClassifier())

    # Note the results will vary each run since we take a different subset of the data each time (since shuffle=True)
    scores = np.mean(cross_val_score(pipe, X, y, cv=StratifiedKFold(n, random_state=10, shuffle=True), scoring='roc_auc'))
    auc.append(scores)
    print(str(n), ' folds: ', 'ROC AUC Score: ', scores)

# plot to see clearly
plt.plot(range(5,10), auc)
plt.xlabel('n split')
plt.ylabel('Mean ROC AUC Score for all folds')
plt.show();

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold

pipe=make_pipeline(StandardScaler(),RidgeClassifier()) 

grid_search = GridSearchCV(
    pipe, 
    {'ridgeclassifier__alpha': [0, 0.001, 0.01, 0.1, 0.5, 1, 1.5, 100]},  # Tried [0, 0.001, 0.01, 0.1, 0.5, 1, 1.5, 100] as well range(0,100)
    cv=StratifiedKFold(8, random_state=10, shuffle=True),
    scoring='roc_auc',
    return_train_score=True,
    n_jobs=-1,
    verbose=3)

#X_sc = scaler.transform(X)  # can use this if I don't use pipe
grid_search.fit(X, y)
grid_search.best_estimator_

## Submission

In [None]:
#ridge = RidgeClassifier()
#ridge.fit(X, y)
#submission['target'] = ridge.predict_proba(test[features])[:, 1]
#submission.to_csv('ridge_classifier.csv')

In [None]:
# Refit on the full training set with the same standardisation that was used
# during validation.
ridge = make_pipeline(StandardScaler(), RidgeClassifier())
ridge.fit(X, y)
# RidgeClassifier has no predict_proba. The notebook scores models by ROC AUC,
# which only needs a ranking, so the continuous decision scores are submitted
# instead of hard 0/1 labels.
# NOTE(review): confirm the competition accepts scores outside [0, 1].
predictions = pd.DataFrame({'target': ridge.decision_function(test[features])}, index=test.index)
predictions.to_csv('ridge_classifier.csv')

<a id='4.3'></a>
## <p style="background-color: #34a853; color: #FFFFFF ; font-family:newtimeroman; font-size:120%">4.3 Random Forest </p>

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# set x and y
X = train[features]
y = train['target']

# train test split
# stratify=y preserves the proportion of the target classes in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99, stratify=y) #train_size=0.60

# set the model
# random_state makes the forest (and the scores reported below) reproducible
clf = RandomForestClassifier(n_estimators=200, max_depth=7, n_jobs=-1, random_state=99)

# fit model
clf.fit(X_train, y_train)

In [None]:
# Baseline accuracy = proportion of the majority class
print('Baseline Accuracy: ',1. - y_train.mean())
print('Train Accuracy :',clf.score(X_train, y_train))
print('Test Accuracy: ',clf.score(X_test, y_test))

In [None]:
y_pred = clf.predict_proba(X_test)[:, 1] # This grabs the positive class prediction
# score = roc_auc_score(y_test, y_pred)
# print(f'{score:0.5f}')
print('ROC AUC Score = {}'.format(roc_auc_score(y_test, y_pred)))

## Let's take a look at how the model predicted the various classes

The graph below shows that the model does well with most of the negative observations, but struggles with many of the positive observations.

In [None]:
plt.figure(figsize=(8,4))
plt.hist(y_pred[np.where(y_test == 0)], bins=100, alpha=0.75, label='neg class')
plt.hist(y_pred[np.where(y_test == 1)], bins=100, alpha=0.75, label='pos class')
plt.legend()
plt.show()

## Submission

In [None]:
clf = RandomForestClassifier(n_estimators=200, max_depth=7, n_jobs=-1)
clf.fit(X, y)
submission['target'] = clf.predict_proba(test[features])[:, 1]
submission.to_csv('random_forest.csv')

<a id='4.4'></a>
## <p style="background-color: #34a853; color: #FFFFFF ; font-family:newtimeroman; font-size:120%">4.4 XGBoost </p>

In [None]:
import xgboost as xgb

# set x and y
X = train[features]
y = train['target']

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99, stratify=y)

# Create xgboost model
m_xgb = xgb.XGBClassifier(n_estimators=200, max_depth=2, random_state=42)

#Train the model using the training sets
m_xgb.fit(X_train,y_train)

In [None]:
# Baseline accuracy = proportion of the majority class
print('Baseline Accuracy: ',1. - y_train.mean())
print('Train Accuracy :',m_xgb.score(X_train, y_train))
print('Test Accuracy: ',m_xgb.score(X_test, y_test))

In [None]:
y_pred = m_xgb.predict_proba(X_test)[:, 1] # This grabs the positive class prediction
# score = roc_auc_score(y_test, y_pred)
# print(f'{score:0.5f}')
print('ROC AUC Score = {}'.format(roc_auc_score(y_test, y_pred)))

In [None]:
plt.figure(figsize=(8,4))
plt.hist(y_pred[np.where(y_test == 0)], bins=100, alpha=0.75, label='neg class')
plt.hist(y_pred[np.where(y_test == 1)], bins=100, alpha=0.75, label='pos class')
plt.legend()
plt.show()

## Optuna

## Tuned XGBoost Model

In [None]:
# set x and y
X = train[features]
y = train['target']

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99, stratify=y)

# Create xgboost model
m_xgb = xgb.XGBClassifier(seed=2021,
                        n_estimators=10000,
                        verbosity=1,
                        eval_metric="auc",
                        alpha=7.105038963844129,
                        colsample_bytree=0.25505629740052566,
                        gamma=0.4999381950212869,
                        reg_lambda=1.7256912198205319,
                        learning_rate=0.011823142071967673,
                        max_bin=338,
                        max_depth=8,
                        min_child_weight=2.286836198630466,
                        subsample=0.618417952155855,
                        use_label_encoder=False)

#Train the model using the training sets
m_xgb.fit(X_train,y_train)

In [None]:
y_pred = m_xgb.predict_proba(X_test)[:, 1] # This grabs the positive class prediction
# score = roc_auc_score(y_test, y_pred)
# print(f'{score:0.5f}')
print('ROC AUC Score = {}'.format(roc_auc_score(y_test, y_pred)))

## Submission

In [None]:
m_xgb = xgb.XGBClassifier(seed=2021,
                        n_estimators=10000,
                        verbosity=1,
                        eval_metric="auc",
                        alpha=7.105038963844129,
                        colsample_bytree=0.25505629740052566,
                        gamma=0.4999381950212869,
                        reg_lambda=1.7256912198205319,
                        learning_rate=0.011823142071967673,
                        max_bin=338,
                        max_depth=8,
                        min_child_weight=2.286836198630466,
                        subsample=0.618417952155855,
                        use_label_encoder=False)
m_xgb.fit(X, y)
submission['target'] = m_xgb.predict_proba(test[features])[:, 1]
submission.to_csv('tuned_xgboost.csv')

<a id='4.5'></a>
## <p style="background-color: #34a853; color: #FFFFFF ; font-family:newtimeroman; font-size:120%">4.5 LightGBM </p>

In [None]:
import lightgbm as lbm

# set x and y
X = train[features]
y = train['target']

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99, stratify=y) #train_size=0.60

# Create LightGBM model
lgbm = lbm.LGBMClassifier()

#Train the model using the training sets
lgbm.fit(X_train,y_train)

In [None]:
# Baseline accuracy = proportion of the majority class
print('Baseline Accuracy: ',1. - y_train.mean())
print('Train Accuracy :',lgbm.score(X_train, y_train))
print('Test Accuracy: ',lgbm.score(X_test, y_test))

In [None]:
y_pred = lgbm.predict_proba(X_test)[:, 1] # This grabs the positive class prediction
# score = roc_auc_score(y_test, y_pred)
# print(f'{score:0.5f}')
print('ROC AUC Score = {}'.format(roc_auc_score(y_test, y_pred)))

In [None]:
plt.figure(figsize=(8,4))
plt.hist(y_pred[np.where(y_test == 0)], bins=100, alpha=0.75, label='neg class')
plt.hist(y_pred[np.where(y_test == 1)], bins=100, alpha=0.75, label='pos class')
plt.legend()
plt.show()

## Optuna

In [None]:
import optuna
from optuna import Trial, visualization
import lightgbm as lbm
def objective(trial, X=train[features], y=train['target']):
    """Optuna objective: validation ROC AUC of a LightGBM classifier.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample the hyper-parameters.
    X : pandas.DataFrame
        Feature matrix. The default is bound once, when this function is
        defined, so later changes to ``train`` need the cell to be re-run.
    y : pandas.Series
        Binary target aligned with ``X``.

    Returns
    -------
    float
        ROC AUC on the held-out split. The same split is also used for early
        stopping, so the value is slightly optimistic.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99, stratify=y)

    # No categorical-feature indices are passed: every column of `features` is
    # numeric (continuous columns plus leave-one-out encodings). Marking the
    # first len(categorical) columns as categorical would make LightGBM treat
    # float columns as category codes.
    lgb_params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2),
        'max_depth': trial.suggest_int('max_depth', 6, 200),
        'num_leaves': trial.suggest_int('num_leaves', 31, 120),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
        'random_state': 2021,
        'metric': 'auc',
        'n_estimators': trial.suggest_int('n_estimators', 6, 300000),
        'n_jobs': 12,
        'bagging_seed': 2021,
        'feature_fraction_seed': 2021,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 500),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.3, 0.9),
        'max_bin': trial.suggest_int('max_bin', 128, 1024),
        # The three parameters below only affect categorical features; they are
        # kept so the study's parameter names stay the same.
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 350),
        'cat_smooth': trial.suggest_int('cat_smooth', 10, 250),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20)
    }

    lgb = lbm.LGBMClassifier(**lgb_params)
    # Early stopping goes through the callback API, which works on both older
    # and newer LightGBM releases; the `early_stopping_rounds` and `verbose`
    # fit arguments were removed in LightGBM 4.
    lgb.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='auc',
        callbacks=[lbm.early_stopping(stopping_rounds=100, verbose=False)]
    )
    predictions = lgb.predict_proba(X_test)[:, 1]

    return roc_auc_score(y_test, predictions)

In [None]:
opt = optuna.create_study(direction='maximize')
opt.optimize(objective, timeout=3600*7, n_trials=15)

In [None]:
opt.best_params

## Tuned LightGBM Model

In [None]:
# set x and y
X = train[features]
y = train['target']

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99, stratify=y) #train_size=0.60

# Create LightGBM model
lgbm = lbm.LGBMClassifier(
    learning_rate= 0.00630790267395036,
     max_depth= 197,
     num_leaves= 55,
     reg_alpha= 3.1353823119798956,
     reg_lambda= 6.232626693293953,
     n_estimators= 297391,
     colsample_bytree= 0.6074065972411667,
     min_child_samples= 299,
     subsample_freq= 4,
     subsample= 0.8950482914463721,
     max_bin= 137,
     min_data_per_group= 51,
     cat_smooth= 111,
     cat_l2= 20)

# Train with the same early stopping that the Optuna objective used.
# n_estimators is only an upper bound found under early stopping; fitting all
# 297391 trees would take very long and would not match the tuned setup.
lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[lbm.early_stopping(stopping_rounds=100, verbose=False)])

In [None]:
y_pred = lgbm.predict_proba(X_test)[:, 1] # This grabs the positive class prediction
# score = roc_auc_score(y_test, y_pred)
# print(f'{score:0.5f}')
print('ROC AUC Score = {}'.format(roc_auc_score(y_test, y_pred)))

In [None]:
# Refit the tuned LightGBM model on the full training set and write the submission.
# NOTE(review): n_estimators was tuned under early stopping, and no validation
# set is available here, so all 297391 trees are fitted. Consider replacing it
# with the best iteration found on the validation split.
lgbm = lbm.LGBMClassifier(
    learning_rate= 0.00630790267395036,
     max_depth= 197,
     num_leaves= 55,
     reg_alpha= 3.1353823119798956,
     reg_lambda= 6.232626693293953,
     n_estimators= 297391,
     colsample_bytree= 0.6074065972411667,
     min_child_samples= 299,
     subsample_freq= 4,
     subsample= 0.8950482914463721,
     max_bin= 137,
     # was `min_data_per_group== 51`, a comparison expression that made the
     # whole call a SyntaxError
     min_data_per_group= 51,
     cat_smooth= 111,
     cat_l2= 20)
lgbm.fit(X, y)
submission['target'] = lgbm.predict_proba(test[features])[:, 1]
submission.to_csv('tuned2_LightGBM.csv')

## Tuned LightGBM Model

In [None]:
# set x and y
X = train[features]
y = train['target']

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99, stratify=y) #train_size=0.60

# Create LightGBM model
lgbm = lbm.LGBMClassifier(learning_rate= 0.00605886703283976,
    max_depth= 42,
    num_leaves= 108,
    reg_alpha= 0.9140720355379223,
    reg_lambda= 9.97396811596188,
    colsample_bytree= 0.2629101393563821,
    min_child_samples= 61,
    subsample_freq= 2,
    subsample= 0.8329687190743886,
    max_bin= 899,
    min_data_per_group= 73,
    cat_smooth= 21,
    cat_l2= 11,
    random_state= 2021,
    metric= 'auc',
    n_estimators= 20000,
    n_jobs= -1,
    bagging_seed= 2021,
    feature_fraction_seed= 2021)

#Train the model using the training sets
lgbm.fit(X_train,y_train)

In [None]:
y_pred = lgbm.predict_proba(X_test)[:, 1] # This grabs the positive class prediction
# score = roc_auc_score(y_test, y_pred)
# print(f'{score:0.5f}')
print('ROC AUC Score = {}'.format(roc_auc_score(y_test, y_pred)))

## Submission

In [None]:
lgbm = lbm.LGBMClassifier(learning_rate= 0.00605886703283976,
    max_depth= 42,
    num_leaves= 108,
    reg_alpha= 0.9140720355379223,
    reg_lambda= 9.97396811596188,
    colsample_bytree= 0.2629101393563821,
    min_child_samples= 61,
    subsample_freq= 2,
    subsample= 0.8329687190743886,
    max_bin= 899,
    min_data_per_group= 73,
    cat_smooth= 21,
    cat_l2= 11,
    random_state= 2021,
    metric= 'auc',
    n_estimators= 20000,
    n_jobs= -1,
    bagging_seed= 2021,
    feature_fraction_seed= 2021)
lgbm.fit(X, y)
submission['target'] = lgbm.predict_proba(test[features])[:, 1]
submission.to_csv('tuned_LightGBM.csv')