## Importing Required Packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn import metrics   
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 
import lightgbm as lgb
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
import time
import sys
from sklearn.metrics import roc_auc_score, roc_curve
import shap

## Reading Data

In [None]:
# Load the training file into a DataFrame (path is relative to the notebook's working directory).
data = pd.read_csv('train_s3TEQDk.csv')

In [None]:
# Preview the first rows of the training frame.
data.head()

## Basic Statistics

In [None]:
# Summary statistics for the training frame.
data.describe()

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only.
# The frame still contains the columns that are label-encoded further down
# (presumably strings); recent pandas versions raise on such columns in
# corr() instead of silently dropping them, so select the dtypes explicitly.
data.select_dtypes(include=['number', 'bool']).corr()

## Checking for Missing Values

In [None]:
# Count missing values per column.
data.isnull().sum()

**We Found 29325 Missing Values in the column Credit_Product**

### So let's fill those missing values using the KNN Imputer and then apply the Standard Scaler

In [None]:
# KNN imputer configured to use 3 neighbours; it is fitted later on the encoded array.
imputer = KNNImputer(n_neighbors=3)
# Scaler that is fitted on the selected feature columns after imputation.
SC = StandardScaler()

In [None]:
# Remove the 'ID' column before modelling; `data` itself is left untouched.
new = data.drop(columns=['ID'])
new.head()

### Encoding all Categorical Attributes using Label Encoder

In [None]:
# Pull the frame out as a plain array so that individual columns can be
# overwritten in place with their integer codes.
x = new.values
label_encoder = LabelEncoder()

# Column positions that are label-encoded. The single encoder is re-fitted
# for every column, so afterwards it only remembers the last one (-2).
# NOTE(review): column 6 is encoded before the imputation step. Depending on
# the scikit-learn version, LabelEncoder either raises on NaN mixed with
# strings or encodes NaN as its own class, in which case the KNN imputer
# below has nothing left to fill — confirm which behaviour is intended.
for col in (0, 2, 3, 4, 6, -2):
    x[:, col] = label_encoder.fit_transform(x[:, col])

x[0:5]
print('After Encoding',x.shape)

In [None]:
# Impute the feature columns only. The last column of `x` is the target (it
# becomes `Y` below); letting it take part in the neighbour search would leak
# the label into the imputed features, and that cannot be reproduced on
# unlabeled data at prediction time.
features = imputer.fit_transform(x[:, :-1])
# Re-attach the untouched target so `x` keeps its original shape and float dtype.
x = np.column_stack([features, x[:, -1].astype(float)])
print('After Imputing',x.shape)

# Standardise the model features.
# NOTE(review): column 7 is left out of the feature set — confirm this is intended.
X = SC.fit_transform(x[:,[0,1,2,3,4,5,6,8]])
print('After Standard Scaling',X.shape)
Y = x[:,-1]

In [None]:
# Show the first scaled feature row.
X[0]

In [None]:
# Show the first ten target values.
Y[:10]

## Splitting Data for Train, Test and Validation

In [None]:
# First carve off 10% of the rows as a hold-out pool, then cut that pool in
# half to obtain the validation and test sets (same seed for both splits).
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.10, random_state=43
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=43
)

print ('Train set     :', X_train.shape,  y_train.shape)
print ('Validation set:', X_val.shape,  y_val.shape)
print ('Test set      :', X_test.shape,  y_test.shape)

In [None]:
%%time

def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=3, random_seed=6,n_estimators=10000, output_process=False):
    """Tune LightGBM hyper-parameters with Bayesian optimisation.

    Every candidate parameter set is scored with ``lgb.cv`` and the best mean
    AUC over the boosting rounds is used as the optimisation target.

    Parameters
    ----------
    X, y
        Feature matrix and labels handed to ``lgb.Dataset``.
    init_round
        Number of random exploration points evaluated first.
    opt_round
        Number of Bayesian optimisation steps performed afterwards.
    n_folds
        Number of stratified folds used by ``lgb.cv``.
    random_seed
        Seed passed to ``lgb.cv``.
    n_estimators, output_process
        Accepted for backward compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(best_target, best_params)`` for the best evaluated point.
        ``best_params`` holds the raw values sampled by the optimiser, so
        integer-valued parameters still need rounding by the caller.
    """
    # Raw data is kept (free_raw_data=False); presumably so that dataset-level
    # parameters such as max_bin can change between evaluations.
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)

    def lgb_eval(learning_rate,num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, min_data_in_leaf,min_sum_hessian_in_leaf,subsample):
        """Return the best mean CV AUC for one candidate parameter set."""
        # NOTE(review): per the LightGBM docs 'subsample' is an alias of
        # 'bagging_fraction', and bagging only takes effect when bagging_freq
        # is non-zero — confirm both entries are really wanted.
        params = {
            'application': 'binary',
            'metric': 'auc',
            # Fractions are clamped to [0, 1].
            'learning_rate': max(min(learning_rate, 1), 0),
            'feature_fraction': max(min(feature_fraction, 1), 0),
            'bagging_fraction': max(min(bagging_fraction, 1), 0),
            'subsample': max(min(subsample, 1), 0),
            # The optimiser samples floats; these parameters must be integers.
            'num_leaves': int(round(num_leaves)),
            'max_depth': int(round(max_depth)),
            # Fixed: this used to be derived from max_depth, so the max_bin
            # dimension of the search space was silently ignored.
            'max_bin': int(round(max_bin)),
            'min_data_in_leaf': int(round(min_data_in_leaf)),
            'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
        }

        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
        return max(cv_result['auc-mean'])

    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.01, 1.0),
                                            'num_leaves': (24, 80),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 30),
                                            'max_bin':(20,90),
                                            'min_data_in_leaf': (20, 80),
                                            'min_sum_hessian_in_leaf':(0,100),
                                            'subsample': (0.01, 1.0)}, random_state=200)

    # init_points: number of random exploration steps, which diversify the
    #              points the optimiser starts from.
    # n_iter:      number of Bayesian optimisation steps; more steps make it
    #              more likely that a good maximum is found.
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # Pick the evaluated point with the highest target (first one on ties).
    best = max(lgbBO.res, key=lambda result: result['target'])
    return best['target'], best['params']



In [None]:
%%time

# Run the search on the full (X, Y) arrays: 10 random points, 10 optimisation
# steps, 5-fold CV. The result is a (best_target, best_params) tuple.
opt_params = bayes_parameter_opt_lgb(X, Y, init_round=10, opt_round=10, n_folds=5, random_seed=42,n_estimators=10000)

In [None]:
# Keep only the parameter dict from the (target, params) tuple.
# This cell is not re-runnable: afterwards `opt_params` is the dict itself.
opt_params = opt_params[1]

# The optimiser returns floats; these parameters must be integers.
for key in ("num_leaves", 'max_depth', 'min_data_in_leaf', 'max_bin'):
    opt_params[key] = int(round(opt_params[key]))

# Fixed settings added on top of the tuned values.
opt_params.update(
    objective='binary',
    metric='auc',
    is_unbalance=True,
    boost_from_average=False,
)
opt_params

In [None]:
from sklearn.model_selection import cross_val_predict 
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import KFold 

In [None]:
# Collects one score per evaluated fold in the manual cross-validation loop.
my_scores = [] 

In [None]:
# 30 consecutive folds, taken in row order (no shuffling).
cv = KFold(n_splits=30, shuffle=False) 

In [None]:
# Hard-coded LightGBM settings (presumably copied from an earlier
# optimisation run) used for the manual cross-validation.
lgbm_cv_settings = {
    'bagging_fraction': 0.8015970276069816,
    'feature_fraction': 0.7006594439332303,
    'learning_rate': 0.14145368629183372,
    'max_bin': 86,
    'max_depth': 29,
    'min_data_in_leaf': 65,
    'min_sum_hessian_in_leaf': 61.044672381045665,
    'num_leaves': 25,
    'subsample': 0.5856822903390296,
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': True,
    'boost_from_average': False,
}
model = lgb.LGBMClassifier(**lgbm_cv_settings)

In [None]:
# Manual K-fold cross-validation: fit on each training fold and record the
# model's score on the matching held-out fold.
# Fold-local names are used so the hold-out X_train / X_test / y_train /
# y_test from the earlier train/validation/test split are not overwritten;
# later cells still rely on them. Each fold is fitted and scored once.
for train_index, test_index in cv.split(X): 
    print("Training data index: ", train_index, "\n") 
    print("Test data index: ", test_index) 
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = Y[train_index], Y[test_index]
    model.fit(X_fold_train, y_fold_train) 
    my_scores.append(model.score(X_fold_test, y_fold_test)) 

In [None]:
# Report the mean of the scores collected by the manual K-fold loop.
print("The mean value is" ) 
print(np.mean(my_scores)) 
# Alternative: let scikit-learn run a 10-fold cross-validation.
# NOTE(review): this result is neither assigned nor printed, and it is not the
# last expression of the cell, so a notebook will not display it.
cross_val_score(model, X, Y, cv=10) 
# Alternative: out-of-fold class probabilities for every row (10 folds).
cross_val_predict(model, X, Y, cv=10, method='predict_proba')

In [None]:
# Second hard-coded LightGBM configuration; it replaces the previous `model`
# and serves as the base estimator for the grid search.
lgbm_grid_base_settings = {
    'bagging_fraction': 0.9280786928822388,
    'feature_fraction': 0.5869250335390612,
    'learning_rate': 0.11437976620936573,
    'max_bin': 88,
    'max_depth': 24,
    'min_data_in_leaf': 61,
    'min_sum_hessian_in_leaf': 32.93481038105413,
    'num_leaves': 25,
    'subsample': 0.805623782185316,
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': True,
    'boost_from_average': False,
}
model = lgb.LGBMClassifier(**lgbm_grid_base_settings)

In [None]:
# Exhaustive grid over three tree-shape parameters, scored by ROC AUC with
# 5-fold cross-validation.
# NOTE(review): the grid has 40 * 48 * 45 = 86,400 combinations, i.e. 432,000
# model fits with cv=5 — consider a coarser grid or a randomised search.
param_test1 = {'max_bin': range(20,100,2),
 'max_depth': range(5,100,2),
 'min_data_in_leaf': range(10,100,2)}
gsearch1 = GridSearchCV(estimator =model, param_grid = param_test1,scoring='roc_auc',n_jobs=4,cv=5)
# Fit on the scaled feature matrix X. The lower-case `x` used before is the
# full imputed array whose last column is the target itself, so searching on
# it leaks the label into the features.
gsearch1.fit(X,Y)
gsearch1.best_params_, gsearch1.best_score_

## LGBM Classifier

In [None]:
# Final LightGBM model with the hard-coded tuned settings.
# Fit on the training split rather than on all of (X, Y): the cells below
# score this model on X_val and X_test, which are subsets of X, and the
# RandomForest and XGBoost models are fitted on X_train as well.
# Refit on (X, Y) before predicting the submission file if the extra rows are
# wanted for the final model.
LGBM=lgb.LGBMClassifier(bagging_fraction = 0.9280786928822388,
 feature_fraction= 0.5869250335390612,
 learning_rate= 0.11437976620936573,
 max_bin= 88,
 max_depth= 24,
 min_data_in_leaf= 61,
 min_sum_hessian_in_leaf= 32.93481038105413,
 num_leaves= 25,
 subsample= 0.805623782185316,
 objective= 'binary',
 metric= 'auc',
 is_unbalance= True,
 boost_from_average= False).fit(X_train,y_train)

In [None]:
# Class probabilities for the validation rows; column 1 is used for scoring below.
lgbm_pred=LGBM.predict_proba(X_val)

In [None]:
# roc_auc_score returns ROC AUC, not accuracy, so the labels say so.
print('Training ROC AUC: ',roc_auc_score(y_train, LGBM.predict_proba(X_train)[:,1]))
print('Validation ROC AUC: ',roc_auc_score(y_val, lgbm_pred[:,1]))
print('Total Data ROC AUC: ',roc_auc_score(Y, LGBM.predict_proba(X)[:,1]))

In [None]:
# roc_auc_score returns ROC AUC, not accuracy, so the label says so.
print('Test Data ROC AUC: ',roc_auc_score(y_test, LGBM.predict_proba(X_test)[:,1]))


## Random Forest Classifier

In [None]:
# Random forest with default settings, fitted on the training split.
# A fixed random_state makes the scores reproducible between runs, in line
# with the seeded data split and the seeded XGBoost model.
RFC = RandomForestClassifier(random_state=42).fit(X_train,y_train)

In [None]:
# Class probabilities for the validation rows; column 1 is used for scoring below.
rfc_pred = RFC.predict_proba(X_val)

In [None]:
# roc_auc_score returns ROC AUC, not accuracy, so the labels say so.
print('Training ROC AUC: ',roc_auc_score(y_train, RFC.predict_proba(X_train)[:,1]))
print('Validation ROC AUC: ',roc_auc_score(y_val, rfc_pred[:,1]))
print('Total Data ROC AUC: ',roc_auc_score(Y, RFC.predict_proba(X)[:,1]))

## XGBoost

In [None]:
# XGBoost classifier with fixed, hand-picked settings and a fixed seed.
xgb_settings = dict(
    learning_rate=0.3,
    n_estimators=115,
    max_depth=7,
    min_child_weight=6,
    gamma=0.1,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=42,
)
XGB = XGBClassifier(**xgb_settings)

In [None]:
# Train the XGBoost model on the training split.
XGB.fit(X_train,y_train)

In [None]:
# Class probabilities for the validation rows; column 1 is used for scoring below.
xgb_pred = XGB.predict_proba(X_val)

In [None]:
# roc_auc_score returns ROC AUC, not accuracy, so the labels say so.
print('Training ROC AUC: ',roc_auc_score(y_train, XGB.predict_proba(X_train)[:,1]))
print('Validation ROC AUC: ',roc_auc_score(y_val, xgb_pred[:,1]))
print('Total Data ROC AUC: ',roc_auc_score(Y, XGB.predict_proba(X)[:,1]))

## Predicting the Test File

In [None]:
# Load the file to predict on (path is relative to the notebook's working directory).
test = pd.read_csv('test_mSzZ8RL.csv')

In [None]:
# Remove the 'ID' column from the features; `test` keeps it for the output file.
test_data = test.drop(columns=['ID'])
test_data.head()

In [None]:
# Apply the training-time preprocessing steps to the test rows.
xt = test_data.iloc[:,:].values
label_encoder = LabelEncoder()
# NOTE(review): the encoder is re-fitted on the test file, so the integer
# codes match the training codes only if every encoded column contains the
# same set of categories in both files — confirm.
xt[:,0] = label_encoder.fit_transform(xt[:,0])

xt[:,2] = label_encoder.fit_transform(xt[:,2])
xt[:,3] = label_encoder.fit_transform(xt[:,3])
xt[:,4] = label_encoder.fit_transform(xt[:,4])
xt[:,6] = label_encoder.fit_transform(xt[:,6])
xt[:,-1] = label_encoder.fit_transform(xt[:,-1])

xt[0:5]
# The imputer is re-fitted on the test rows, as before.
xt_filled =imputer.fit_transform(xt)
# Fixed: scale the imputed array (the imputed result used to be discarded and
# the un-imputed `xt` was scaled instead), and reuse the scaler statistics
# learned on the training features rather than re-fitting on the test rows,
# so the test features are on the scale the models were trained on.
xt = SC.transform(xt_filled[:,[0,1,2,3,4,5,6,8]])

In [None]:
# Class probabilities from the LightGBM model for the preprocessed test rows.
yhat = LGBM.predict_proba(xt)

In [None]:
# Second probability column — the one written to the output file below.
yhat[:,1]

In [None]:
# Start the output frame from the test IDs only.
file = test[['ID']]

In [None]:
# Preview the output frame before the prediction column is added.
file.head()

In [None]:
# Add the predicted probabilities as the second column, 'Is_Lead'.
# NOTE(review): re-running this cell without recreating `file` will likely
# fail because the column already exists — confirm.
file.insert(1, 'Is_Lead', yhat[:,1])

In [None]:
# Preview the output frame with the prediction column in place.
file.head()

In [None]:
# Write the output frame to CSV without the index column.
file.to_csv('Trail24.csv',index=False)