In [24]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE, SMOTENC, BorderlineSMOTE,ADASYN,SVMSMOTE
from collections import Counter
from boruta import BorutaPy
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from skopt import gbrt_minimize, gp_minimize 
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args

warnings.filterwarnings('ignore')

In [2]:
# Read the given train data
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


In [3]:
# Find the Number of counts of Target
df['Is_Lead'].value_counts()

0    187437
1     58288
Name: Is_Lead, dtype: int64

In [4]:
# Splitting the data in train test split with stratify
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Is_Lead'],axis=1), 
                                                    df['Is_Lead'], 
                                                    test_size=0.2,
                                                    stratify = df['Is_Lead'],
                                                    random_state=42)

In [5]:
X_train.shape, X_test.shape

((196580, 10), (49145, 10))

In [6]:
# Checking for any missing data
X_train.isna().sum(), X_test.isna().sum()

(ID                         0
 Gender                     0
 Age                        0
 Region_Code                0
 Occupation                 0
 Channel_Code               0
 Vintage                    0
 Credit_Product         23422
 Avg_Account_Balance        0
 Is_Active                  0
 dtype: int64, ID                        0
 Gender                    0
 Age                       0
 Region_Code               0
 Occupation                0
 Channel_Code              0
 Vintage                   0
 Credit_Product         5903
 Avg_Account_Balance       0
 Is_Active                 0
 dtype: int64)

In [7]:
# Using Missing value 'NA' as a category
X_train['Credit_Product'] = X_train['Credit_Product'].fillna('NA')
X_test['Credit_Product'] = X_test['Credit_Product'].fillna('NA')

In [8]:
# Checking for any missing data
X_train.isna().sum(), X_test.isna().sum()

(ID                     0
 Gender                 0
 Age                    0
 Region_Code            0
 Occupation             0
 Channel_Code           0
 Vintage                0
 Credit_Product         0
 Avg_Account_Balance    0
 Is_Active              0
 dtype: int64, ID                     0
 Gender                 0
 Age                    0
 Region_Code            0
 Occupation             0
 Channel_Code           0
 Vintage                0
 Credit_Product         0
 Avg_Account_Balance    0
 Is_Active              0
 dtype: int64)

In [9]:
# Observe the train data
X_train.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
75880,2QOUTFDT,Male,39,RG268,Other,X1,69,No,2466394,No
3536,6I9US9ZJ,Male,30,RG254,Salaried,X1,13,No,982141,Yes
26300,GMGQOTQ7,Female,59,RG277,Self_Employed,X3,129,Yes,802762,No
228409,KEZJFBM9,Male,44,RG283,Self_Employed,X3,21,,1920664,No
725,NPG6I5WU,Male,52,RG254,Self_Employed,X1,15,No,529194,Yes


In [10]:
# Dropping ID feature and resetting the index
X_train = X_train.drop(['ID'], axis=1).reset_index(drop=True)
X_test = X_test.drop(['ID'], axis=1).reset_index(drop=True)

In [11]:
X_train.head()

Unnamed: 0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
0,Male,39,RG268,Other,X1,69,No,2466394,No
1,Male,30,RG254,Salaried,X1,13,No,982141,Yes
2,Female,59,RG277,Self_Employed,X3,129,Yes,802762,No
3,Male,44,RG283,Self_Employed,X3,21,,1920664,No
4,Male,52,RG254,Self_Employed,X1,15,No,529194,Yes


In [12]:
# Custom function to perform one hot encoding
def one_hot_enocode(X_train, X_test):
    
    # Encode the categorical columns using One hot encoding
    cat_col = X_train.select_dtypes('O').columns

    for col in cat_col:

        # Initialise OHE
        ohe = OneHotEncoder(sparse=False)
        ohe.fit(X_train[col].values.reshape(-1,1))

        # One Hot Encode columns in Train Data
        X1 = pd.DataFrame(ohe.transform(X_train[col].values.reshape(-1,1)) )   
        X1.columns = [str(col) + '_' + str(i) for i in ohe.categories_[0]]   
        X_train = X_train.drop([col], axis =1,)
        X_train = pd.concat([X_train, X1], axis =1,)


        # One Hot Encode columns in Test Data
        X2 = pd.DataFrame(ohe.transform(X_test[col].values.reshape(-1,1)) )   
        X2.columns = [str(col) + '_' + str(i) for i in ohe.categories_[0]]   
        X_test = X_test.drop([col], axis =1,)
        X_test = pd.concat([X_test, X2], axis =1,)
    
    
    return X_train, X_test


In [13]:
# One Hot Encode all the categorical Variable
X_train, X_test = one_hot_enocode(X_train, X_test)

In [14]:
X_train

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Gender_Female,Gender_Male,Region_Code_RG250,Region_Code_RG251,Region_Code_RG252,Region_Code_RG253,Region_Code_RG254,...,Occupation_Self_Employed,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Credit_Product_NA,Credit_Product_No,Credit_Product_Yes,Is_Active_No,Is_Active_Yes
0,39,69,2466394,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,30,13,982141,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,59,129,802762,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,44,21,1920664,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,52,15,529194,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196575,27,20,929596,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
196576,49,50,466484,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
196577,44,103,1515732,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
196578,25,19,1005824,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [15]:
X_test

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Gender_Female,Gender_Male,Region_Code_RG250,Region_Code_RG251,Region_Code_RG252,Region_Code_RG253,Region_Code_RG254,...,Occupation_Self_Employed,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Credit_Product_NA,Credit_Product_No,Credit_Product_Yes,Is_Active_No,Is_Active_Yes
0,59,51,844547,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,27,14,884535,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,49,109,3390285,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,62,69,1171774,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,32,32,1965556,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49140,37,75,1366995,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
49141,28,15,1039769,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
49142,26,13,791672,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
49143,26,14,656544,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


### Since data is imbalance we need to handle the same by oversampling the minority data. We would be using ADASYN oversampling method

In [16]:
# Target Data before Oversampling
Counter(y_train)

Counter({0: 149950, 1: 46630})

In [17]:
# Now target data is imbalanced we need to perform upsampling of the data
smot = ADASYN(random_state = 101, n_neighbors=5)  # => BEST
X_train, y_train =  smot.fit_resample(X_train, y_train)

In [18]:
# Target Data After Oversampling
Counter(y_train)

Counter({0: 149950, 1: 143502})

In [19]:
X_train

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Gender_Female,Gender_Male,Region_Code_RG250,Region_Code_RG251,Region_Code_RG252,Region_Code_RG253,Region_Code_RG254,...,Occupation_Self_Employed,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Credit_Product_NA,Credit_Product_No,Credit_Product_Yes,Is_Active_No,Is_Active_Yes
0,39,69,2466394,0.000000,1.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,1.0,0.000000,0.000000,0.0,0.000000,1.000000,0.000000,1.00000,0.00000
1,30,13,982141,0.000000,1.000000,0.0,0.000000,0.0,0.0,1.0,...,0.000000,1.0,0.000000,0.000000,0.0,0.000000,1.000000,0.000000,0.00000,1.00000
2,59,129,802762,1.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,1.000000,0.0,0.000000,1.000000,0.0,0.000000,0.000000,1.000000,1.00000,0.00000
3,44,21,1920664,0.000000,1.000000,0.0,0.000000,0.0,0.0,0.0,...,1.000000,0.0,0.000000,1.000000,0.0,1.000000,0.000000,0.000000,1.00000,0.00000
4,52,15,529194,0.000000,1.000000,0.0,0.000000,0.0,0.0,1.0,...,1.000000,1.0,0.000000,0.000000,0.0,0.000000,1.000000,0.000000,0.00000,1.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293447,38,28,695207,0.845298,0.154702,0.0,0.845298,0.0,0.0,0.0,...,1.000000,0.0,0.845298,0.154702,0.0,0.845298,0.000000,0.154702,1.00000,0.00000
293448,43,75,695229,1.000000,0.000000,0.0,0.248830,0.0,0.0,0.0,...,1.000000,0.0,0.248830,0.751170,0.0,0.248830,0.000000,0.751170,0.24883,0.75117
293449,53,73,989142,1.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.345297,0.0,0.345297,0.654703,0.0,0.000000,0.000000,1.000000,1.00000,0.00000
293450,50,69,989076,1.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.134810,0.0,0.134810,0.865190,0.0,0.865190,0.000000,0.134810,0.13481,0.86519


In [20]:
# Dropping the either of columns Gender of female or Gender of male as they both are complimentary to each other
# Similar is the case for Is active 
X_train.drop(['Is_Active_No', 'Gender_Female'], axis=1, inplace=True)
X_test.drop(['Is_Active_No', 'Gender_Female'], axis=1, inplace=True)

In [22]:
# Defining Hyperparameter
param_grid = {'n_estimators' : np.random.randint(10,500,5), 
              'random_state' : np.random.randint(10,50000,5),
              'learning_rate' :[0.001, 0.01, 0.01, 0.1, 1, 10 ]
             }
param_grid

{'n_estimators': array([361, 372, 174, 203, 373]),
 'random_state': array([10049,  1082, 49195, 17360, 40890]),
 'learning_rate': [0.001, 0.01, 0.01, 0.1, 1, 10]}

In [None]:
# Initialise Decision Tree Model and Grid
gbdt = lgb.LGBMClassifier()
grid_search = GridSearchCV(estimator=gbdt, param_grid=param_grid, scoring='roc_auc',
                           return_train_score=True, cv = 3)
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.cv_results_

In [28]:
# Fitting the model on best parameter
clf = lgb.LGBMClassifier(random_state=8848) # random state 8848

clf.fit(X_train, y_train)

LGBMClassifier(random_state=8848)

In [29]:
# Obtaining the ROC metric
y_pred = clf.predict(X_test)

roc_auc_score(y_test, y_pred)

0.7542995427755608

## Now that we have Obtained the Best parameter lets the train the model again based on the optimise parameter set on the complete data

In [30]:
# Using complete data for training
X_train = X_train.append(X_test, ignore_index = True)
y_train = y_train.append(y_test, ignore_index =True)

In [31]:
# Fitting the model on best parameter on entire data
clf_final = lgb.LGBMClassifier(random_state = 8848)

clf_final.fit(X_train, y_train)

LGBMClassifier(random_state=8848)

## Testing the result of the model on the unseen data

In [32]:
# Read the Unseen Data
df_unseen = pd.read_csv('test.csv')
df_unseen

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
0,VBENBARO,Male,29,RG254,Other,X1,25,Yes,742366,No
1,CCMEWNKY,Male,43,RG268,Other,X2,49,,925537,No
2,VK3KGA9M,Male,31,RG270,Salaried,X1,14,No,215949,No
3,TT8RPZVC,Male,29,RG272,Other,X1,33,No,868070,No
4,SHQZEYTZ,Female,29,RG270,Other,X1,19,No,657087,No
...,...,...,...,...,...,...,...,...,...,...
105307,DBENJOYI,Male,52,RG268,Salaried,X2,86,Yes,4242558,Yes
105308,CWQ72DWS,Male,55,RG277,Other,X2,86,Yes,1159153,No
105309,HDESC8GU,Male,35,RG254,Salaried,X4,15,No,1703727,No
105310,2PW4SFCA,Male,53,RG254,Other,X3,93,No,737178,Yes


In [33]:
df_unseen.isna().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         12522
Avg_Account_Balance        0
Is_Active                  0
dtype: int64

In [34]:
# Filling the missing column with NA category
df_unseen['Credit_Product'] = df_unseen['Credit_Product'].fillna('NA')

In [35]:
# Storing the id column seperately as it would be needed later
id_df = df_unseen['ID']
df_unseen.drop(['ID'], axis=1, inplace=True)

In [36]:
# Encode the categorical columns using One hot encoding
cat_col = df_unseen.select_dtypes('O').columns

for col in cat_col:

    # Initialise OHE
    ohe = OneHotEncoder(sparse=False)
    ohe.fit(df_unseen[col].values.reshape(-1,1))

    # One Hot Encode columns in Train Data
    X1 = pd.DataFrame(ohe.transform(df_unseen[col].values.reshape(-1,1)) )   
    X1.columns = [str(col) + '_' + str(i) for i in ohe.categories_[0]]   
    df_unseen = df_unseen.drop([col], axis =1,)
    df_unseen = pd.concat([df_unseen, X1], axis =1,)


In [37]:
df_unseen

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Gender_Female,Gender_Male,Region_Code_RG250,Region_Code_RG251,Region_Code_RG252,Region_Code_RG253,Region_Code_RG254,...,Occupation_Self_Employed,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Credit_Product_NA,Credit_Product_No,Credit_Product_Yes,Is_Active_No,Is_Active_Yes
0,29,25,742366,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,43,49,925537,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,31,14,215949,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,29,33,868070,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,29,19,657087,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105307,52,86,4242558,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
105308,55,86,1159153,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
105309,35,15,1703727,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
105310,53,93,737178,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [38]:
# Dropping the either of columns Gender of female or Gender of male as they both are complimentary to each other
# Similar is the case for Is active 
df_unseen.drop(['Is_Active_No', 'Gender_Female'], axis=1, inplace=True)

In [39]:
# Observing the final data
df_unseen

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Gender_Male,Region_Code_RG250,Region_Code_RG251,Region_Code_RG252,Region_Code_RG253,Region_Code_RG254,Region_Code_RG255,...,Occupation_Salaried,Occupation_Self_Employed,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Credit_Product_NA,Credit_Product_No,Credit_Product_Yes,Is_Active_Yes
0,29,25,742366,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,43,49,925537,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,31,14,215949,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,29,33,868070,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,29,19,657087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105307,52,86,4242558,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
105308,55,86,1159153,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
105309,35,15,1703727,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
105310,53,93,737178,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [40]:
# Performing the model prediction on the unseen data
pred_unseen = clf_final.predict_proba(df_unseen)[:,1]
pred_unseen

array([0.04813787, 0.86529421, 0.05421379, ..., 0.07284868, 0.22198347,
       0.05531019])

In [41]:
# Generating the final submission dataframe
df_sub = pd.DataFrame(pred_unseen, columns=['Is_Lead'])
df_sub['ID'] = id_df.values
df_sub

Unnamed: 0,Is_Lead,ID
0,0.048138,VBENBARO
1,0.865294,CCMEWNKY
2,0.054214,VK3KGA9M
3,0.023892,TT8RPZVC
4,0.025202,SHQZEYTZ
...,...,...
105307,0.991219,DBENJOYI
105308,0.582564,CWQ72DWS
105309,0.072849,HDESC8GU
105310,0.221983,2PW4SFCA


In [42]:
# Saving the final dataframe
df_sub.to_csv('LGBM_NEW_Classifier.csv', index=False)