In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score


In [3]:
data=pd.read_csv(r"C:\Users\SAKET NANDAN\Documents\current_excelr_lms\logistic_regression_assignment\bank-full.csv")


data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
#some of the encoder can not able to create realtion ship with categorical type target , so here i m using custom fuction to make the 
#the target as integer 
#target is labeled as integer then no need to do this transformation 

def find_category_mappings(df, variable):
    return {k: i for i, k in enumerate(df[variable].unique(), 0)}

def integer_encode(data,  variable, ordinal_mapping):

    data[variable] = data[variable].map(ordinal_mapping)


#(if target is not categorical then no need to execute )

In [5]:
for variable in ['y']:
    mappings = find_category_mappings(data, variable)
    integer_encode(data, variable, mappings)
    
#(if target is not categorical then no need execute this) 

In [6]:
# let's check for missing data

data.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [7]:
# Let's divide into train and test set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels='y', axis=1),  # predictors
    data['y'],  # target
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((31647, 16), (13564, 16))

In [8]:
# Let's replace null values in numerical variables by the mean


def impute_na(df, variable, value):
    df[variable].fillna(value, inplace=True)


#impute_na(X_test, 'age', X_train['age'].mean())
#impute_na(X_train, 'age',  X_train['age'].mean())
# note how I impute first the test set, this way the value of
# the median used will be the same for both train and test

In [9]:
X_train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
24951,58,management,married,primary,no,1021,no,no,unknown,18,nov,205,1,-1,0,unknown
22129,55,blue-collar,married,unknown,no,196,no,no,cellular,21,aug,168,2,-1,0,unknown
986,50,admin.,married,secondary,no,159,yes,no,unknown,7,may,216,2,-1,0,unknown
14999,43,management,married,primary,no,1880,yes,no,cellular,17,jul,63,6,-1,0,unknown
4027,53,technician,divorced,tertiary,no,647,yes,no,unknown,16,may,512,6,-1,0,unknown


In [10]:
# let's check that we have no missing data after NA imputation

X_train.isnull().sum(), X_test.isnull().sum()

(age          0
 job          0
 marital      0
 education    0
 default      0
 balance      0
 housing      0
 loan         0
 contact      0
 day          0
 month        0
 duration     0
 campaign     0
 pdays        0
 previous     0
 poutcome     0
 dtype: int64,
 age          0
 job          0
 marital      0
 education    0
 default      0
 balance      0
 housing      0
 loan         0
 contact      0
 day          0
 month        0
 duration     0
 campaign     0
 pdays        0
 previous     0
 poutcome     0
 dtype: int64)

### one hot encoder  

In [11]:
from feature_engine.categorical_encoders import OneHotCategoricalEncoder

In [12]:
ohe_enc = OneHotCategoricalEncoder(
    top_categories=None,
    variables=['job', 'marital', 'education','default','housing','loan','contact','month','poutcome'], # we can select which variables to encode
    drop_last=True) # to return k-1, false to return k


ohe_enc.fit(X_train)

OneHotCategoricalEncoder(drop_last=True, top_categories=None,
                         variables=['job', 'marital', 'education', 'default',
                                    'housing', 'loan', 'contact', 'month',
                                    'poutcome'])

In [14]:
#ohe_enc.variables

In [16]:
X_train_OHE = ohe_enc.transform(X_train)
X_test_OHE = ohe_enc.transform(X_test)

#X_train_OHE.head()

### count encoding  

In [18]:
from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder

In [19]:
count_enc = CountFrequencyCategoricalEncoder(
    encoding_method='count', # to do frequency ==> encoding_method='frequency'
    variables=['job', 'marital', 'education','default','housing','loan','contact','month','poutcome'])

count_enc.fit(X_train)

CountFrequencyCategoricalEncoder(encoding_method='count',
                                 variables=['job', 'marital', 'education',
                                            'default', 'housing', 'loan',
                                            'contact', 'month', 'poutcome'])

In [20]:
X_train_count = count_enc.transform(X_train)
X_test_count = count_enc.transform(X_test)

# let's explore the result
#X_train_count.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
24951,58,6676,18970,4788,31080,1021,14039,26561,9082,18,2828,205,1,-1,0,25872
22129,55,6809,18970,1267,31080,196,14039,26561,20507,21,4370,168,2,-1,0,25872
986,50,3584,18970,16264,31080,159,17608,26561,9082,7,9659,216,2,-1,0,25872
14999,43,6676,18970,4788,31080,1880,17608,26561,20507,17,4843,63,6,-1,0,25872
4027,53,5348,3681,9328,31080,647,17608,26561,9082,16,9659,512,6,-1,0,25872


### ordered integer encoding  

In [21]:
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder

In [22]:
ordinal_enc = OrdinalCategoricalEncoder(
    # NOTE that we indicate ordered in the encoding_method, otherwise it assings numbers arbitrarily
    encoding_method='ordered',
    variables=['job', 'marital', 'education','default','housing','loan','contact','month','poutcome'])

In [23]:
ordinal_enc.fit(X_train, y_train)

OrdinalCategoricalEncoder(encoding_method='ordered',
                          variables=['job', 'marital', 'education', 'default',
                                     'housing', 'loan', 'contact', 'month',
                                     'poutcome'])

In [24]:
X_train_ordered = ordinal_enc.transform(X_train)
X_test_ordered = ordinal_enc.transform(X_test)


#### mean ancoding  

In [25]:
from feature_engine.categorical_encoders import MeanCategoricalEncoder

In [26]:
mean_enc = MeanCategoricalEncoder(
    variables=['job', 'marital', 'education','default','housing','loan','contact','month','poutcome'])

In [27]:
# when fitting the transformer, we need to pass the target as well
# just like with any Scikit-learn predictor class

mean_enc.fit(X_train, y_train)

MeanCategoricalEncoder(variables=['job', 'marital', 'education', 'default',
                                  'housing', 'loan', 'contact', 'month',
                                  'poutcome'])

In [29]:
X_train_mean = mean_enc.transform(X_train)
X_test_mean = mean_enc.transform(X_test)

# let's explore the result
#X_train_mean.head()

#### probabilty ratio encoding  

In [30]:
from feature_engine.categorical_encoders import WoERatioCategoricalEncoder

In [34]:
ratio_enc = WoERatioCategoricalEncoder(
    encoding_method = 'ratio',
    variables=['job', 'marital', 'education','default','housing','loan','contact','month','poutcome'])

In [35]:
# when fitting the transformer, we need to pass the target as well
# just like with any Scikit-learn predictor class

ratio_enc.fit(X_train, y_train)

WoERatioCategoricalEncoder(encoding_method='ratio',
                           variables=['job', 'marital', 'education', 'default',
                                      'housing', 'loan', 'contact', 'month',
                                      'poutcome'])

In [37]:
X_train_prob = ratio_enc.transform(X_train)
X_test_prob = ratio_enc.transform(X_test)

# let's explore the result
#X_train_prob.head()

#### weight of evidence encoding  

In [39]:
from feature_engine.categorical_encoders import WoERatioCategoricalEncoder

In [40]:
woe_enc = WoERatioCategoricalEncoder(
    encoding_method = 'woe',
    variables=['job', 'marital', 'education','default','housing','loan','contact','month','poutcome'])

In [41]:
# when fitting the transformer, we need to pass the target as well
# just like with any Scikit-learn predictor class

woe_enc.fit(X_train, y_train)

WoERatioCategoricalEncoder(encoding_method='woe',
                           variables=['job', 'marital', 'education', 'default',
                                      'housing', 'loan', 'contact', 'month',
                                      'poutcome'])

In [43]:
X_train_woe = woe_enc.transform(X_train)
X_test_woe = woe_enc.transform(X_test)

# let's explore the result
#X_train_woe.head()

## Random Forest Performance

In [44]:
# create a function to build random forests and compare performance in train and test set


def run_randomForests(X_train, X_test, y_train, y_test):

    rf = RandomForestClassifier(n_estimators=50, random_state=39, max_depth=3)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict_proba(X_train)
    print(
        'Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:, 1])))

    print('Test set')
    pred = rf.predict_proba(X_test)
    print(
        'Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:, 1])))

In [45]:
# OHE
run_randomForests(X_train_OHE, X_test_OHE, y_train, y_test)

Train set
Random Forests roc-auc: 0.8918504214021095
Test set
Random Forests roc-auc: 0.8853752025543522


In [46]:
# counts
run_randomForests(X_train_count, X_test_count, y_train, y_test)

Train set
Random Forests roc-auc: 0.8925055304469132
Test set
Random Forests roc-auc: 0.8837407817635474


In [47]:
# ordered labels
run_randomForests(X_train_ordered, X_test_ordered, y_train, y_test)

Train set
Random Forests roc-auc: 0.8951472627181007
Test set
Random Forests roc-auc: 0.8863546135772377


In [48]:
# mean encoding
run_randomForests(X_train_mean, X_test_mean, y_train, y_test)

Train set
Random Forests roc-auc: 0.8951472917714323
Test set
Random Forests roc-auc: 0.8863548754868573


In [49]:
# woe
run_randomForests(X_train_woe, X_test_woe, y_train, y_test)

Train set
Random Forests roc-auc: 0.8951472917714323
Test set
Random Forests roc-auc: 0.8863548754868573


so above 3 are creted monotonic relationship with target variable so all 3 have same roc-auc value 

## logistic reggression  

In [51]:
def run_logistic(X_train, X_test, y_train, y_test):

    # function to train and test the performance of logistic regression
    logit = LogisticRegression(random_state=44, C=0.01)
    logit.fit(X_train, y_train)

    print('Train set')
    pred = logit.predict_proba(X_train)
    print(
        'Logistic Regression roc-auc: {}'.format(roc_auc_score(y_train, pred[:, 1])))

    print('Test set')
    pred = logit.predict_proba(X_test)
    print(
        'Logistic Regression roc-auc: {}'.format(roc_auc_score(y_test, pred[:, 1])))

In [52]:
# OHE
run_logistic(X_train_OHE, X_test_OHE, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.8482198665479767
Test set
Logistic Regression roc-auc: 0.8321149385127882


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [53]:
# counts
run_logistic(X_train_count, X_test_count, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.8802969912892883
Test set
Logistic Regression roc-auc: 0.8695327087138118


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [54]:
# ordered labels
run_logistic(X_train_ordered, X_test_ordered, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.8142765591659183
Test set
Logistic Regression roc-auc: 0.7981329510849736


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [55]:
# mean encoding
run_logistic(X_train_mean, X_test_mean, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.7692141350810631
Test set
Logistic Regression roc-auc: 0.7519537802855916


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [56]:
# woe
run_logistic(X_train_woe, X_test_woe, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.8926791337869308
Test set
Logistic Regression roc-auc: 0.8853987744201255


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [57]:
#so here we can conclude that for logistic regression woe is giving best roc-auc values 