**The following problem is based on a hackathon conducted by Analytics Vidhya, where the objective was to predict which customers are interested in taking a credit card. In the code you will find the following:**

* EDA of the continuous and categorical variables
* Weight of Evidence (WOE) and Information Value (IV) for continuous and categorical features
* Modelling using boosting techniques and WOE-transformed features


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import string
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split, StratifiedKFold

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    paths = [os.path.join(folder, name) for name in names]
    if paths:
        print('\n'.join(paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'/kaggle/input/jobathon-may-2021-credit-card-lead-prediction/train.csv',
        r'/kaggle/input/jobathon-may-2021-credit-card-lead-prediction/test.csv',
    )
)

**EDA**

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Count plots for a subset of the categorical features, laid out on a 3x3 grid.
cat_col = ['Occupation','Channel_Code','Credit_Product']
plt.figure(figsize=(14, 12), dpi=100)
for slot, feature in enumerate(cat_col, start=1):
    plt.subplot(3, 3, slot)
    sns.countplot(data=train, x=feature)

sns.despine()

In [None]:
# Distribution of Avg_Account_Balance: highly positively skewed, so the variable
# is WOE-transformed later instead of being used raw.
# histplot(kde=True, stat="density") is the replacement seaborn documents for the
# deprecated distplot (density-normalised histogram with a KDE overlay).
sns.histplot(train["Avg_Account_Balance"], kde=True, stat="density")

In [None]:
# Count plots of every categorical feature, split by the Is_Lead target,
# laid out on a 3x3 grid.
cat_col = ['Gender','Region_Code','Occupation','Channel_Code','Credit_Product','Is_Active']
plt.figure(figsize=(14, 12), dpi=100)
for slot, feature in enumerate(cat_col, start=1):
    plt.subplot(3, 3, slot)
    sns.countplot(data=train, x=feature, hue='Is_Lead')

sns.despine()

**Weight of Evidence(WOE) and Information value (IV)**

The weight of evidence (WOE) measures the predictive power of an independent variable in relation to the dependent variable. Since it evolved from the credit-scoring world, it is generally described as a measure of the separation between good and bad customers.

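                         WOE  = ln(Distribution of Goods / Distribution of Bads)
                         IV   = Σ over all groups of (Distribution of Goods - Distribution of Bads) * WOE
                         
                         Distribution of Goods : % of all good customers that fall in a particular group
                         Distribution of Bads  : % of all bad customers that fall in a particular group
                         ln : Natural log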
                         
                         

In [None]:
max_bin = 20
force_bin = 3

# Binning function for continuous variables
def mono_bin(Y, X, n = max_bin):
    """Bin a continuous feature and tabulate WOE / IV per bin.

    X is cut into quantile buckets with ``pd.qcut``. Starting from ``n``
    buckets, the bucket count is reduced one at a time until the Spearman
    correlation between the per-bucket means of X and Y reaches an absolute
    value of 1 (or is NaN). If no usable grouping is found, or only a single
    bucket is left, the data is cut on ``force_bin`` quantile edges instead.
    Rows with missing X are reported in a separate final row.

    Parameters
    ----------
    Y : array-like
        Binary target (1 = event, 0 = non-event), aligned with ``X``.
    X : array-like
        Numeric feature values; may contain missing values.
    n : int
        Bucket count to start from (defaults to ``max_bin``).

    Returns
    -------
    pandas.DataFrame
        One row per bin with the columns VAR_NAME (always the placeholder
        "VAR"), MIN_VALUE, MAX_VALUE, COUNT, EVENT, EVENT_RATE, NONEVENT,
        NON_EVENT_RATE, DIST_EVENT, DIST_NON_EVENT, WOE and IV. Infinite
        values are replaced by 0 and IV holds the variable's total
        information value, repeated on every row.

    Raises
    ------
    ValueError
        If X has fewer than two distinct non-missing values, so no bins can
        be formed.
    """
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]

    r = 0
    d2 = None
    # Bounded by n >= 1 so that data which can never be quantile-cut (for
    # example a constant X) cannot keep the loop spinning forever.
    while np.abs(r) < 1 and n >= 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            # observed=False pins the historical pandas default for categorical keys.
            d2 = d1.groupby('Bucket', as_index=True, observed=False)
            bucket_means = d2.mean()
            r, _ = stats.spearmanr(bucket_means.X, bucket_means.Y)
        except ValueError:
            # qcut rejects duplicate quantile edges on heavily tied data;
            # retry with one bucket fewer.
            d2 = None
        n = n - 1

    if d2 is None or len(d2) == 1:
        if notmiss.X.nunique() < 2:
            raise ValueError(
                "mono_bin needs at least two distinct non-missing X values to form bins"
            )
        n = force_bin
        # Series.quantile interpolates linearly between order statistics.
        bins = notmiss.X.quantile(np.linspace(0, 1, n)).to_numpy()
        if len(np.unique(bins)) == 2:
            # Only two distinct edges: prepend an extra edge and halve the
            # original lowest edge so that pd.cut receives three edges.
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)})
        d2 = d1.groupby('Bucket', as_index=True, observed=False)

    x_by_bucket = d2["X"]
    y_by_bucket = d2["Y"]
    d3 = pd.DataFrame({
        "MIN_VALUE": x_by_bucket.min(),
        "MAX_VALUE": x_by_bucket.max(),
        "COUNT": y_by_bucket.count(),
        "EVENT": y_by_bucket.sum(),
    })
    d3["NONEVENT"] = d3["COUNT"] - d3["EVENT"]
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        # Missing X values form their own bin, appended as the last row.
        missing_count = justmiss.Y.count()
        missing_events = justmiss.Y.sum()
        d4 = pd.DataFrame(
            {
                "MIN_VALUE": np.nan,
                "MAX_VALUE": np.nan,
                "COUNT": missing_count,
                "EVENT": missing_events,
                "NONEVENT": missing_count - missing_events,
            },
            index=[0],
        )
        d3 = pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.EVENT.sum()
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.NONEVENT.sum()
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    # A bin with no events or no non-events gives an infinite WOE/IV; zero it out.
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3["IV"] = d3["IV"].sum()

    return d3

# Binning Function for Categorical variables
def char_bin(Y, X):
    """Tabulate WOE / IV for a categorical feature, one bin per category.

    Parameters
    ----------
    Y : array-like
        Binary target (1 = event, 0 = non-event), aligned with ``X``.
    X : array-like
        Category values; missing values are collected in a separate final row.

    Returns
    -------
    pandas.DataFrame
        One row per category with the columns VAR_NAME (always the
        placeholder "VAR"), MIN_VALUE, MAX_VALUE (both hold the category
        value), COUNT, EVENT, EVENT_RATE, NONEVENT, NON_EVENT_RATE,
        DIST_EVENT, DIST_NON_EVENT, WOE and IV. Infinite values are replaced
        by 0 and IV holds the variable's total information value, repeated
        on every row.
    """
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    y_by_level = notmiss.groupby('X', as_index=True)["Y"]

    counts = y_by_level.count()
    events = y_by_level.sum()
    d3 = pd.DataFrame({"COUNT": counts})
    d3["MIN_VALUE"] = counts.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = events
    d3["NONEVENT"] = counts - events

    if len(justmiss.index) > 0:
        # Missing X values form their own bin, appended as the last row.
        missing_count = justmiss.Y.count()
        missing_events = justmiss.Y.sum()
        d4 = pd.DataFrame(
            {
                "MIN_VALUE": np.nan,
                "MAX_VALUE": np.nan,
                "COUNT": missing_count,
                "EVENT": missing_events,
                "NONEVENT": missing_count - missing_events,
            },
            index=[0],
        )
        d3 = pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.EVENT.sum()
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.NONEVENT.sum()
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    # A category with no events or no non-events gives an infinite WOE/IV; zero it out.
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3["IV"] = d3["IV"].sum()
    d3 = d3.reset_index(drop=True)

    return d3

In [None]:

def comb_category(woe,threshold):
    """Group categories whose WOE lies just above another category's WOE.

    Categories are visited in the row order of ``woe``. Each category that
    has not yet been absorbed becomes an anchor and absorbs every remaining
    category whose WOE exceeds the anchor's by more than 0 and less than
    ``threshold``. Categories with exactly equal WOE are not merged.

    Parameters
    ----------
    woe : pandas.DataFrame
        Table with at least the columns ``MIN_VALUE`` (category value) and
        ``WOE``. For a repeated category the first row's WOE is used.
    threshold : float
        Exclusive upper bound on the WOE gap for two categories to be merged.

    Returns
    -------
    tuple
        ``(similar_col, count)``: ``similar_col`` maps each anchor category
        to the list of categories it absorbed (possibly empty) and ``count``
        is the number of anchors that absorbed at least one category.

    Notes
    -----
    Rows whose ``MIN_VALUE`` is missing (the missing-value bin) are skipped:
    a NaN never compares equal to itself, so it cannot be looked up or used
    as a merge target.
    """
    # First WOE seen per category, in order of first appearance.
    woe_by_cat = {}
    for cat, value in zip(woe['MIN_VALUE'], woe['WOE']):
        if pd.isna(cat):
            continue
        woe_by_cat.setdefault(cat, float(value))

    consumed = set()  # categories already used as an anchor or absorbed
    similar_col = {}
    count = 0

    for anchor, anchor_woe in woe_by_cat.items():
        if anchor in consumed:
            continue

        absorbed = []
        for cat, cat_woe in woe_by_cat.items():
            if cat == anchor or cat in consumed:
                continue
            if 0.0 < (cat_woe - anchor_woe) < threshold:
                absorbed.append(cat)
                consumed.add(cat)
        consumed.add(anchor)

        similar_col[anchor] = absorbed
        if absorbed:
            count += 1

    return similar_col, count


In [None]:

def data_vars(df1,target,test,cat_threshold):
    """Compute WOE / IV tables for every column and merge similar categories.

    Numeric columns with more than two distinct values are binned with
    ``mono_bin``. Every other column is treated as categorical: it is
    tabulated with ``char_bin``, and categories that ``comb_category`` groups
    together are collapsed into one value, repeatedly, until no further merge
    is found.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Training features. Category merges are written into it in place.
    target : pandas.Series
        Binary target aligned with ``df1``.
    test : pandas.DataFrame
        Test features. The same category merges are written into it in place
        for every merged column it contains.
    cat_threshold : float
        WOE gap passed to ``comb_category``.

    Returns
    -------
    tuple
        ``(iv_df, iv, df1, test, replace)``: the per-bin WOE table for all
        variables, the per-variable IV summary (columns VAR_NAME and IV), the
        two frames after merging, and for each categorical column the list of
        grouping dictionaries produced by ``comb_category``, one per pass.

    Raises
    ------
    ValueError
        If ``df1`` has no columns.
    """
    tables = []
    replace = {}

    for column in df1.columns:
        values = df1[column]
        # Check the column's dtype (not the Series itself); non-numpy dtypes
        # are handled as categorical.
        is_number = isinstance(values.dtype, np.dtype) and np.issubdtype(values.dtype, np.number)

        if is_number and len(Series.unique(values)) > 2:
            conv = mono_bin(target, values)
        else:
            conv = char_bin(target, values).sort_values('WOE')
            similar_col, var_count = comb_category(conv, cat_threshold)
            cat_replace = [similar_col]

            while var_count > 0:
                for keep, absorbed in similar_col.items():
                    df1.loc[df1[column].isin(absorbed), column] = keep
                    # The test frame may lack some training columns (e.g. the target).
                    if column in test.columns:
                        test.loc[test[column].isin(absorbed), column] = keep
                conv = char_bin(target, df1[column]).sort_values('WOE')
                similar_col, var_count = comb_category(conv, cat_threshold)
                cat_replace.append(similar_col)
            replace[column] = cat_replace

        conv["VAR_NAME"] = column
        tables.append(conv)

    if not tables:
        raise ValueError("data_vars needs a DataFrame with at least one column")

    iv_df = pd.concat(tables, ignore_index=True)

    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return iv_df, iv, df1, test, replace

In [None]:
# Work on copies without the ID column so the raw frames stay untouched.
train_copy = train.drop(columns='ID')
test_copy = test.drop(columns='ID')

# Missing Credit_Product becomes its own 'NA' category. The column is assigned
# back explicitly: an inplace fillna on a selected column does not update the
# frame when pandas Copy-on-Write is active.
train_copy['Credit_Product'] = train_copy['Credit_Product'].fillna('NA')
test_copy['Credit_Product'] = test_copy['Credit_Product'].fillna('NA')



In [None]:
# Calculating WOE and IV values.
# data_vars returns the per-bin WOE table, the per-variable IV summary, the
# train and test frames after category merging, and the record of merges.
final_iv, IV,new_train,new_test,cat_replace = data_vars(train_copy,train_copy.Is_Lead,test_copy,cat_threshold = 0.1)

In [None]:
# Preview the first rows of the per-bin WOE table.
final_iv.head()

**Replacing features with WOE values**

In [None]:
def woe_replacement(train,transform_vars_list,transform_prefix,iv_table=None):
    """Add WOE-encoded copies of the given columns to ``train``.

    For each variable the bins are read from ``iv_table`` (rows whose
    VAR_NAME equals the variable) as a MAX_VALUE -> WOE mapping:

    * numeric column with numeric bin edges: a value receives the WOE of the
      first bin, in ascending order, whose MAX_VALUE is >= the value; values
      above the last edge receive 0;
    * any other column: a value receives the WOE of the bin whose MAX_VALUE
      equals it; unknown categories receive 0;
    * missing values receive the WOE of the missing bin (the row whose
      MAX_VALUE is NaN) when there is one, otherwise 0.

    Variables without any row in ``iv_table`` are left untouched. The lookup
    is done directly instead of building and ``eval``-ing an expression
    string for every row.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to encode; the new columns are added to it in place.
    transform_vars_list : iterable of str
        Names of the columns to encode.
    transform_prefix : str
        Prefix for the names of the encoded columns.
    iv_table : pandas.DataFrame, optional
        WOE table with the columns VAR_NAME, MAX_VALUE and WOE. Defaults to
        the module-level ``final_iv``.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object, with one ``transform_prefix + var`` column
        added per encoded variable.
    """
    if iv_table is None:
        iv_table = final_iv

    for var in transform_vars_list:
        print(var)

        var_bins = iv_table[iv_table['VAR_NAME'] == var]
        transform_dict = dict(zip(var_bins.MAX_VALUE, var_bins.WOE))
        if not transform_dict:
            continue

        # Separate the missing-value bin from the regular bins.
        missing_woe = 0
        known = {}
        for edge, bin_woe in transform_dict.items():
            if pd.isna(edge):
                missing_woe = bin_woe
            else:
                known[edge] = bin_woe

        column = train[var]
        numeric_edges = all(isinstance(edge, (int, float, np.number)) for edge in known)

        if pd.api.types.is_numeric_dtype(column) and numeric_edges:
            edges = sorted(known)
            # Trailing 0 is the WOE for values above the highest bin edge.
            woes = np.array([known[edge] for edge in edges] + [0.0], dtype=float)
            raw = column.to_numpy(dtype=float, na_value=np.nan)
            # side='left' returns the first edge that is >= the value.
            position = np.searchsorted(np.array(edges, dtype=float), raw, side='left')
            encoded = np.where(np.isnan(raw), missing_woe, woes[position])
        else:
            encoded = column.map(
                lambda value: missing_woe if pd.isna(value) else known.get(value, 0)
            )

        train[transform_prefix + var] = encoded

    return train

In [None]:
# WOE-encode every training column except the target; the encoded columns
# are added under the 'new_' prefix.
transform_prefix = 'new_'
transform_vars_list = new_train.columns.difference(['Is_Lead'])

new_train = woe_replacement(
    train=new_train,
    transform_vars_list=transform_vars_list,
    transform_prefix=transform_prefix,
)

In [None]:
# WOE-encode every column of the test frame with the same prefix.
transform_vars_list = new_test.columns

new_test = woe_replacement(
    train=new_test,
    transform_vars_list=transform_vars_list,
    transform_prefix=transform_prefix,
)

In [None]:
# Preview the training frame with the WOE-encoded columns added.
new_train.head()

**Modelling (Boosting Algorithm)**

In [None]:
#Function for running cross validation
def boosting(clf, fit_params, train, test, features, target_col=None, n_splits=10):
    """Run stratified K-fold cross-validation and average the test predictions.

    For every fold the classifier is fitted on the training part with the
    validation part passed as ``eval_set``, then asked for the positive-class
    probability on the validation part and on the whole test set.

    Parameters
    ----------
    clf : classifier
        Must support ``fit(X, y, eval_set=[(X_val, y_val)], **fit_params)``
        and ``predict_proba``.
    fit_params : dict
        Extra keyword arguments forwarded to ``clf.fit``.
    train : pandas.DataFrame
        Training data containing ``features`` and the target column.
    test : pandas.DataFrame
        Test data containing ``features``.
    features : list of str
        Names of the feature columns.
    target_col : str, optional
        Name of the target column. Defaults to the module-level ``TARGET_COL``.
    n_splits : int
        Number of folds.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oofs, preds)``: the out-of-fold probabilities for every training
        row and the test probabilities averaged over all folds.
    """
    if target_col is None:
        target_col = TARGET_COL

    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))

    y_all = train[target_col]
    # The test features are the same for every fold, so slice them once.
    X_test = test[features]

    folds = StratifiedKFold(n_splits = n_splits)

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, y_all)):
        print(f'\n------------- Fold {fold_ + 1} -------------')

        ### Training Set
        X_trn, y_trn = train[features].iloc[trn_idx], y_all.iloc[trn_idx]

        ### Validation Set
        X_val, y_val = train[features].iloc[val_idx], y_all.iloc[val_idx]

        _ = clf.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], **fit_params)

        ### Instead of directly predicting the classes we obtain the probability of the positive class.
        preds_val = clf.predict_proba(X_val)[:, 1]
        preds_test = clf.predict_proba(X_test)[:, 1]

        roc_score = roc_auc_score(y_val,preds_val)
        print("ROC for validation set is {}".format(roc_score))

        oofs[val_idx] = preds_val
        preds += preds_test / n_splits

    # ROC AUC is a ranking metric, so it is scored on the probabilities;
    # rounding them to hard 0/1 labels first would discard the ranking.
    oofs_score = roc_auc_score(y_all, oofs)
    print('ROC score for oofs is {}'.format(oofs_score))

    return oofs, preds

In [None]:
# CatBoost model training on the WOE-encoded features.
TARGET_COL = 'Is_Lead'

features = ['new_Age', 'new_Avg_Account_Balance', 'new_Channel_Code',
       'new_Credit_Product', 'new_Gender', 'new_Is_Active', 'new_Occupation',
       'new_Region_Code', 'new_Vintage']

clf = CatBoostClassifier(
    n_estimators=3000,
    learning_rate=0.02,
    rsm=0.4,  # Analogous to colsample_bytree
    random_state=2054,
)

fit_params = {'verbose': 200, 'early_stopping_rounds': 300}

cb_oofs, cb_preds = boosting(clf, fit_params, new_train, new_test, features)

# ROC AUC of the out-of-fold probabilities over the whole training set.
optimized_roc = roc_auc_score(new_train[TARGET_COL], cb_oofs)
print(f'Optimized ROC is {optimized_roc}')

In [None]:
# LightGBM model training with the same features and cross-validation routine.
clf = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    colsample_bytree=0.5,
)
fit_params = {'verbose': 100, 'early_stopping_rounds': 100}

lgb_oofs, lgb_preds = boosting(clf, fit_params, new_train, new_test, features)

# ROC AUC of the out-of-fold probabilities over the whole training set.
optimized_roc = roc_auc_score(new_train[TARGET_COL], lgb_oofs)
print(f'Optimized ROC is {optimized_roc}')

In [None]:
# XGBoost classifier training with the same features and cross-validation routine.
clf = XGBClassifier(n_estimators = 1000,
                    max_depth = 6,
                    learning_rate = 0.05,
                    colsample_bytree = 0.5,
                    random_state=1452,
                    )

fit_params = {'verbose': 200, 'early_stopping_rounds': 200}

xgb_oofs, xgb_preds = boosting(clf, fit_params,new_train,new_test,features)


# The score is the ROC AUC of the out-of-fold probabilities, not an F1 score,
# so it is named and reported as ROC like the other model cells.
optimized_roc = roc_auc_score(new_train[TARGET_COL], xgb_oofs)
optimized_f1 = optimized_roc  # previous variable name, kept for any later use
print(f'Optimized ROC is {optimized_roc}')