In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import sys
sys.setrecursionlimit(1500)

In [None]:
# Load adult.csv from the working directory.
# NOTE(review): read_csv treats the first row as a header by default and the next cell
# overwrites the column names; if adult.csv has no header row, its first record is lost — confirm.
adult_ds = pd.read_csv("adult.csv")

In [None]:
# Give the 15 columns readable names, in file order.
adult_ds.columns = [
    'Age',
    'Workclass',
    'Fnlwgt',
    'Education',
    'EducationNum',
    'MaritalStatus',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'CapitalGain',
    'CapitalLoss',
    'HoursPerWeek',
    'NativeCountry',
    'Income',
]

---

##### Exploratory Data Analysis

In [None]:
# Display the first 5 rows to sanity-check the renamed columns
adult_ds.head()

In [None]:
# Column dtypes and non-null counts per column
adult_ds.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles/median, max); categorical columns are excluded by default
adult_ds.describe()

In [None]:
# Count null values per column; if any are found they are either imputed (mean/median) or the rows are dropped.
# NOTE(review): this only counts real NaN values — placeholder strings for missing data would not
# show up here; confirm how adult.csv marks missing values.
adult_ds.isnull().sum()

In [None]:
# Correlation heatmap of the numeric feature columns.
# Only numeric columns are passed to corr(): DataFrame.corr() raises on string columns in
# pandas >= 2.0, and older versions dropped them silently. Income is still a string label at
# this point (it is encoded to 0/1 further down), so the target is not part of this matrix.
sns.heatmap(adult_ds.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Income class counts (<=50K vs >50K), split by Workclass category
(
    sns.countplot(data=adult_ds, x='Income', hue='Workclass')
    .set(xlabel='Income Type', ylabel='Count')
)
# Move the legend outside the axes so it does not cover the bars
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)

In [None]:
# Income class counts (<=50K vs >50K), split by Education category
(
    sns.countplot(data=adult_ds, x='Income', hue='Education')
    .set(xlabel='Income Type', ylabel='Count')
)
# Move the legend outside the axes so it does not cover the bars
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)

In [None]:
# Income class counts (<=50K vs >50K), split by MaritalStatus category
(
    sns.countplot(data=adult_ds, x='Income', hue='MaritalStatus')
    .set(xlabel='Income Type', ylabel='Count')
)
# Move the legend outside the axes so it does not cover the bars
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)

In [None]:
# Income class counts (<=50K vs >50K), split by Occupation category
(
    sns.countplot(data=adult_ds, x='Income', hue='Occupation')
    .set(xlabel='Income Type', ylabel='Count')
)
# Move the legend outside the axes so it does not cover the bars
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)

In [None]:
# Income class counts (<=50K vs >50K), split by Relationship category
(
    sns.countplot(data=adult_ds, x='Income', hue='Relationship')
    .set(xlabel='Income Type', ylabel='Count')
)
# Move the legend outside the axes so it does not cover the bars
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)

In [None]:
# Income class counts (<=50K vs >50K), split by Race category
(
    sns.countplot(data=adult_ds, x='Income', hue='Race')
    .set(xlabel='Income Type', ylabel='Count')
)
# Move the legend outside the axes so it does not cover the bars
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)

In [None]:
# Income class counts (<=50K vs >50K), split by Sex category
(
    sns.countplot(data=adult_ds, x='Income', hue='Sex')
    .set(xlabel='Income Type', ylabel='Count')
)
# Move the legend outside the axes so it does not cover the bars
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)

In [None]:
# Class balance of the Income labels before they are encoded to 0/1
adult_ds['Income'].value_counts()

In [None]:
# Transform <=50K to 0 and every other label (>50K) to 1 in the Income column.
# The labels are stripped before comparing so the result does not depend on the exact leading
# blank in the raw value, and the encoding is skipped when Income is already numeric so that
# re-running this cell does not turn every row into 1.
if not pd.api.types.is_numeric_dtype(adult_ds['Income']):
    adult_ds['Income'] = np.where(adult_ds['Income'].str.strip() == '<=50K', 0, 1)
adult_ds.head()

In [None]:
# Class balance after encoding; the two counts should match the raw label counts shown before the encoding
adult_ds['Income'].value_counts()

Checking if Age, Fnlwgt, CapitalGain, CapitalLoss, HoursPerWeek are linearly related to Log-Odds of Income 

In [None]:
# Age
# Logistic fit of the 0/1 Income target against Age. Axis labels match the plotted data:
# the feature is on the x-axis and the outcome / fitted probability is on the y-axis.
sns.regplot(data=adult_ds, x='Age', y='Income', logistic=True).set(xlabel='Age', ylabel='P(Income > 50K)')

In [None]:
# Fnlwgt
# Logistic fit of the 0/1 Income target against Fnlwgt. Axis labels match the plotted data:
# the feature is on the x-axis and the outcome / fitted probability is on the y-axis.
sns.regplot(data=adult_ds, x='Fnlwgt', y='Income', logistic=True).set(xlabel='Fnlwgt', ylabel='P(Income > 50K)')

In [None]:
# CapitalGain
# Logistic fit of the 0/1 Income target against CapitalGain. Axis labels match the plotted data:
# the feature is on the x-axis and the outcome / fitted probability is on the y-axis.
sns.regplot(data=adult_ds, x='CapitalGain', y='Income', logistic=True).set(xlabel='CapitalGain', ylabel='P(Income > 50K)')

In [None]:
# CapitalLoss
# Logistic fit of the 0/1 Income target against CapitalLoss. Axis labels match the plotted data:
# the feature is on the x-axis and the outcome / fitted probability is on the y-axis.
sns.regplot(data=adult_ds, x='CapitalLoss', y='Income', logistic=True).set(xlabel='CapitalLoss', ylabel='P(Income > 50K)')

In [None]:
# HoursPerWeek
# Logistic fit of the 0/1 Income target against HoursPerWeek. Axis labels match the plotted data:
# the feature is on the x-axis and the outcome / fitted probability is on the y-axis.
sns.regplot(data=adult_ds, x='HoursPerWeek', y='Income', logistic=True).set(xlabel='HoursPerWeek', ylabel='P(Income > 50K)')

In [None]:
# Outlier Analysis for Age, Fnlwgt, CapitalGain, CapitalLoss, HoursPerWeek
# One box plot per continuous column, drawn in a loop instead of five copy-pasted calls.
numeric_features = ['Age', 'Fnlwgt', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek']

figure, axes = plt.subplots(3, 2, sharex=True, figsize=(16,8))
panels = axes.ravel()

for ax, col in zip(panels, numeric_features):
    sns.boxplot(data=adult_ds[[col]], y=col, ax=ax).set(ylabel=col)

# The 3x2 grid has six panels but only five columns are plotted; hide the unused ones.
for ax in panels[len(numeric_features):]:
    ax.set_visible(False)

In [None]:
# Outlier treatment, replacing outliers with Median value
import statistics

def outlier_treatment(cols, df=None):
    """Replace IQR outliers in the given columns with the column median, in place.

    For every column a value is an outlier when it lies more than 1.5 * IQR below the
    25th percentile or above the 75th percentile. The median is computed once, on the
    untreated column, so low and high outliers are replaced by the same value.

    Parameters
    ----------
    cols : iterable of str
        Names of the numeric columns to treat.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``adult_ds``.

    Returns
    -------
    None
        The frame is modified in place; the IQR of each column is printed.
    """
    frame = adult_ds if df is None else df
    for i in cols:
        print('Column: ', i)
        values = frame[i]
        q25, q75 = np.percentile(values, [25, 75])
        iqr = q75 - q25
        print('Interquartile Range: ', iqr)
        cut_off = iqr * 1.5
        lower = q25 - cut_off
        upper = q75 + cut_off
        # Median of the original data; taking it after a partial replacement would bias it.
        median = statistics.median(values)
        is_outlier = (values < lower) | (values > upper)
        frame[i] = np.where(is_outlier, median, values)
        print('-----------------')

In [None]:
# Apply the IQR-based median replacement to the continuous columns (adult_ds is modified in place)
cols = ['Age', 'Fnlwgt', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek']
outlier_treatment(cols)

In [None]:
# Checking outliers again
# Same box plots as before the treatment, drawn in a loop instead of five copy-pasted calls.
numeric_features = ['Age', 'Fnlwgt', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek']

figure, axes = plt.subplots(3, 2, sharex=True, figsize=(16,8))
panels = axes.ravel()

for ax, col in zip(panels, numeric_features):
    sns.boxplot(data=adult_ds[[col]], y=col, ax=ax).set(ylabel=col)

# The 3x2 grid has six panels but only five columns are plotted; hide the unused ones.
for ax in panels[len(numeric_features):]:
    ax.set_visible(False)

---

##### WOE and IV

In [None]:
# Feature frame for the WOE/IV computation: the 14 predictor columns, without Income
adult_woe = adult_ds.loc[:, [
    'Age', 'Workclass', 'Fnlwgt', 'Education', 'EducationNum',
    'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex',
    'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry',
]]

adult_woe.head()

In [None]:
# Target series that is passed to the WOE/IV computation below
adult_ds.Income

In [None]:
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

max_bin = 20    # number of quantile bins the monotonic search starts from
force_bin = 3   # number of quantile edges used when the search collapses to one bin

# define a binning function
def mono_bin(Y, X, n = max_bin):
    """Bin a numeric variable and compute WOE / IV per bin.

    Rows with a missing X are set aside. The remaining X values are quantile-binned with
    ``n`` bins, then ``n - 1``, ... until the Spearman correlation between the per-bin
    mean of X and the per-bin mean of Y is +/-1 (or undefined). If that search ends with
    a single bin, the variable is re-cut on ``force_bin`` quantile edges instead.

    Parameters
    ----------
    Y : array-like
        Target values; summed per bin as the event count.
    X : array-like
        Numeric variable to bin.
    n : int, optional
        Number of bins to start the search from (defaults to ``max_bin``).

    Returns
    -------
    pandas.DataFrame
        One row per bin (plus one row with NaN bounds when X has missing values) with the
        columns VAR_NAME, MIN_VALUE, MAX_VALUE, COUNT, EVENT, EVENT_RATE, NONEVENT,
        NON_EVENT_RATE, DIST_EVENT, DIST_NON_EVENT, WOE and IV. Infinite values are
        replaced by 0 and IV holds the variable's total IV on every row.

    Raises
    ------
    ValueError
        If no binning of the non-missing values succeeds (e.g. X is entirely missing).
    """
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]

    r = 0
    d2 = None
    # Stop at n < 1 as well: qcut cannot succeed with fewer than one bin, so without
    # this bound a variable that can never be binned would loop forever.
    while np.abs(r) < 1 and n >= 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            bucket_means = d2.mean()
            r, _ = stats.spearmanr(bucket_means.X, bucket_means.Y)
        except Exception:
            # Best effort: this bin count is not usable (e.g. duplicate quantile edges);
            # fall through and retry with one bin fewer.
            pass
        n = n - 1

    if d2 is None:
        raise ValueError("mono_bin: could not bin X; it has no usable non-missing values")

    if len(d2) == 1:
        n = force_bin
        # Linear-interpolation quantiles of the non-missing values.
        bins = np.quantile(np.asarray(notmiss.X, dtype=float), np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            # Only two distinct edges: add a third so pd.cut still yields two intervals.
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)})
        d2 = d1.groupby('Bucket', as_index=True)

    d3 = pd.DataFrame({
        "MIN_VALUE": d2.min().X,
        "MAX_VALUE": d2.max().X,
        "COUNT": d2.count().Y,
        "EVENT": d2.sum().Y,
    })
    d3["NONEVENT"] = d3["COUNT"] - d3["EVENT"]
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        # Extra bin for the rows where X is missing; its bounds are NaN.
        miss_count = justmiss.Y.count()
        miss_event = justmiss.Y.sum()
        d4 = pd.DataFrame({
            "MIN_VALUE": np.nan,
            "MAX_VALUE": np.nan,
            "COUNT": miss_count,
            "EVENT": miss_event,
            "NONEVENT": miss_count - miss_event,
        }, index=[0])
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        d3 = pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    # Bins with no events or no non-events give +/-inf; treat them as contributing 0.
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3["IV"] = d3["IV"].sum()

    return(d3)

def char_bin(Y, X):
        
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]    
    df2 = notmiss.groupby('X',as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]      
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    
    return(d3)

def data_vars(df1, target):
    """Compute the WOE / IV tables for every column of ``df1`` against ``target``.

    Numeric columns with more than two distinct values go through ``mono_bin``; all
    other columns go through ``char_bin``. A column whose name equals the target's own
    name (case-insensitive) is skipped, so the target is never scored against itself.
    The name is read from ``target.name`` rather than by parsing the caller's source
    line, which only worked for single-line calls with source text available.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Candidate feature columns.
    target : pandas.Series or array-like
        Target values aligned with ``df1``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``iv_df`` with one row per variable bin, and ``iv`` with one row per variable
        holding its IV (columns VAR_NAME and IV).

    Raises
    ------
    ValueError
        If ``df1`` contains no feature column to evaluate.
    """
    target_name = getattr(target, "name", None)
    target_key = None if target_name is None else str(target_name).upper()

    per_variable = []
    for i in df1.columns:
        if target_key is not None and str(i).upper() == target_key:
            continue
        column = df1[i]
        is_continuous = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
            and column.nunique(dropna=False) > 2
        )
        if is_continuous:
            conv = mono_bin(target, column)
        else:
            conv = char_bin(target, column)
        conv["VAR_NAME"] = i
        per_variable.append(conv)

    if not per_variable:
        raise ValueError("data_vars: df1 has no feature columns to evaluate")

    # Concatenate once instead of growing the frame with the removed DataFrame.append.
    iv_df = pd.concat(per_variable, ignore_index=True)

    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv)

In [None]:
# Compute the per-bin WOE table (final_iv) and the per-variable IV summary (IV)
final_iv, IV = data_vars(adult_woe, adult_ds.Income)

In [None]:
# Per-bin WOE / IV table for every variable
final_iv

In [None]:
# Variables ranked by information value, weakest first
IV.sort_values('IV')

In [1]:
# IV         Predictive Power
# <0.02      Useless
# 0.02-0.1   Weak
# 0.1-0.3    Medium
# 0.3-0.5    Strong
# >0.5       Suspiciously too good

In [None]:
# Remove Fnlwgt, CapitalGain and CapitalLoss to begin with, then preview the reduced frame
adult_ds = adult_ds.drop(columns=['Fnlwgt', 'CapitalGain', 'CapitalLoss'])
adult_ds.head()

Function to use WOE values for categorical variables

In [None]:
def useWOE_categorical(cols, df=None, iv_table=None):
    """Replace the categories of the given columns by their WOE values, in place.

    For each column the bins with a matching VAR_NAME are looked up in the IV table and
    every category (the bin's MIN_VALUE) is mapped to that bin's WOE in a single pass.
    Missing values are mapped to the WOE of the bin whose MIN_VALUE is null, when the
    table has one. Values with no matching bin are left unchanged, so calling the
    function again on already encoded columns does not alter them.

    Parameters
    ----------
    cols : iterable of str
        Names of the categorical columns to encode.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``adult_ds``.
    iv_table : pandas.DataFrame, optional
        Per-bin table with VAR_NAME, MIN_VALUE and WOE columns. Defaults to the
        notebook-level ``final_iv``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    frame = adult_ds if df is None else df
    table = final_iv if iv_table is None else iv_table
    for i in cols:
        col_finalIV = table.loc[table['VAR_NAME'] == i]
        has_category = col_finalIV['MIN_VALUE'].notnull()
        # First row wins when a category appears more than once.
        known = col_finalIV.loc[has_category].drop_duplicates(subset='MIN_VALUE')
        woe_by_category = dict(zip(known['MIN_VALUE'], known['WOE']))
        missing_woe = col_finalIV.loc[~has_category, 'WOE']

        def encode(value, lookup=woe_by_category, missing=missing_woe):
            # NaN never compares equal to itself, so the missing bin needs its own branch.
            if pd.isnull(value):
                return missing.iloc[0] if len(missing) else value
            return lookup.get(value, value)

        frame[i] = frame[i].map(encode)
            

In [None]:
# Replace the labels of each categorical column with the WOE of the matching bin
cols = ['Race', 'NativeCountry', 'Workclass', 'Sex', 'Education', 'Occupation', 'MaritalStatus', 'Relationship']
useWOE_categorical(cols)

In [None]:
# Preview the frame after the categorical WOE encoding
adult_ds.head()

Function to use WOE values for numerical variables

In [None]:
def useWOE_numerical(cols, df=None, iv_table=None):
    """Add a ``woe<column>`` column holding the WOE of the bin each value falls into.

    For each column the bins with a matching VAR_NAME are looked up in the IV table; a
    value belongs to a bin when MIN_VALUE <= value <= MAX_VALUE. Each row takes the WOE
    of the first bin it matches. Missing values take the WOE of the bin whose bounds are
    null, when the table has one. Rows that match no bin get NaN, so the new column
    always has one entry per row.

    Parameters
    ----------
    cols : iterable of str
        Names of the numeric columns to encode.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``adult_ds``.
    iv_table : pandas.DataFrame, optional
        Per-bin table with VAR_NAME, MIN_VALUE, MAX_VALUE and WOE columns. Defaults to
        the notebook-level ``final_iv``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    frame = adult_ds if df is None else df
    table = final_iv if iv_table is None else iv_table
    for i in cols:
        woe_col = 'woe' + i
        col_finalIV = table.loc[table['VAR_NAME'] == i]
        values = frame[i]
        encoded = pd.Series(np.nan, index=frame.index, dtype=float)
        unassigned = pd.Series(True, index=frame.index)
        # One vectorised comparison per bin instead of a Python loop over rows x bins.
        for low, high, woe_val in zip(col_finalIV['MIN_VALUE'], col_finalIV['MAX_VALUE'], col_finalIV['WOE']):
            if pd.isnull(low) or pd.isnull(high):
                in_bin = values.isnull() & unassigned
            else:
                in_bin = (values >= low) & (values <= high) & unassigned
            encoded.loc[in_bin] = woe_val
            unassigned = unassigned & ~in_bin
        frame[woe_col] = encoded

In [None]:
# Add woeAge, woeHoursPerWeek and woeEducationNum columns holding the WOE of each row's bin
cols = ['Age', 'HoursPerWeek', 'EducationNum']
useWOE_numerical(cols)

In [None]:
# Preview the frame with the added woe* columns
adult_ds.head()

---

##### Model Development

In [None]:
# Separate the target from the features.
# working_dt is a new frame without Income rather than an alias of adult_ds, so adult_ds keeps
# its Income column and this cell can be re-run without a KeyError.
y = adult_ds['Income']
working_dt = adult_ds.drop(columns=['Income'])
X = working_dt

In [None]:
# Train-Test Split: hold out 20% of the rows for testing; random_state=42 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Peek at the held-out target values
y_test.head()

In [None]:
import statsmodels.api as sm

Function to build Logistic Regression model with columns as an input 

In [None]:
def build_logReg_model(cols):
    """Fit a statsmodels Logit on the training split using only the columns in `cols`.

    Prints the fit summary and returns the fitted results object.
    NOTE(review): no constant column is added here, so the model has no intercept
    unless `cols` contains one — confirm that this is intended.
    """
    train_features = X_train[cols]
    fitted = sm.Logit(y_train, train_features).fit()
    print(fitted.summary())
    return fitted

Confusion matrix and accuracy

In [None]:
from sklearn.metrics import (confusion_matrix, accuracy_score)

def confusionMatrix(y_pred):
    """Print the confusion matrix and the accuracy of `y_pred` against the notebook-level y_test."""
    matrix = confusion_matrix(y_test, y_pred)
    print ("Confusion Matrix : \n", matrix)
    # share of test rows whose predicted class equals the true class
    accuracy = accuracy_score(y_test, y_pred)
    print('Test accuracy = ', accuracy)

Plotting ROC curve

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

def plotROC(y_test, y_pred):
    """Plot the ROC curve of `y_pred` against `y_test`, with the AUC shown in the legend."""
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, y_pred)
    area = roc_auc_score(y_test, y_pred)
    plt.plot(false_pos_rate, true_pos_rate, label="AUC: %0.2f" % area)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # loc=4 places the legend in the lower-right corner
    plt.legend(loc=4)
    plt.show()

---

##### Model 1: With original values of continuous variables i.e Age, HoursPerWeek and EducationNum

In [None]:
# Model 1 features: WOE-encoded categoricals plus the original continuous columns
cols = [
    'Race', 'NativeCountry', 'Workclass', 'Sex',
    'Education', 'Occupation', 'MaritalStatus', 'Relationship',
    'Age', 'HoursPerWeek', 'EducationNum',
]
log_reg1 = build_logReg_model(cols)

Test Accuracy

In [None]:
# Hard 0/1 predictions on the test split: each predicted probability rounded to the nearest class
ytest_pred1 = [round(prob) for prob in log_reg1.predict(X_test[cols])]
confusionMatrix(ytest_pred1)

ROC curve

In [None]:
# ROC curve for Model 1.
# The curve is built from the predicted probabilities: the rounded 0/1 labels collapse it to a
# single operating point and understate the AUC. The feature columns are taken from the
# fitted model so the cell does not depend on the current value of `cols`.
plotROC(y_test, log_reg1.predict(X_test[log_reg1.model.exog_names]))

---

##### Model 2: With WOE values of continuous variables i.e woeAge, woeHoursPerWeek and woeEducationNum

In [None]:
# Model 2 features: WOE-encoded categoricals plus the WOE-encoded continuous columns
cols = [
    'Race', 'NativeCountry', 'Workclass', 'Sex',
    'Education', 'Occupation', 'MaritalStatus', 'Relationship',
    'woeAge', 'woeHoursPerWeek', 'woeEducationNum',
]
log_reg2 = build_logReg_model(cols)

In [None]:
# Test accuracy
# Hard 0/1 predictions on the test split: each predicted probability rounded to the nearest class
ytest_pred2 = [round(prob) for prob in log_reg2.predict(X_test[cols])]
confusionMatrix(ytest_pred2)

In [None]:
# ROC curve for Model 2.
# The curve is built from the predicted probabilities: the rounded 0/1 labels collapse it to a
# single operating point and understate the AUC. The feature columns are taken from the
# fitted model so the cell does not depend on the current value of `cols`.
plotROC(y_test, log_reg2.predict(X_test[log_reg2.model.exog_names]))

---

##### Model 3: Dropping variables with IV score < 0.1 (Weak predictors); with original values of continuous variables

In [None]:
# Model 3 features: Race and NativeCountry left out; continuous columns keep their original values
cols = [
    'HoursPerWeek', 'Workclass', 'Sex',
    'Age', 'EducationNum', 'Education',
    'Occupation', 'MaritalStatus', 'Relationship',
]
log_reg3 = build_logReg_model(cols)

In [None]:
# Test accuracy
# Hard 0/1 predictions on the test split: each predicted probability rounded to the nearest class
ytest_pred3 = [round(prob) for prob in log_reg3.predict(X_test[cols])]
confusionMatrix(ytest_pred3)

In [None]:
# ROC curve for Model 3.
# The curve is built from the predicted probabilities: the rounded 0/1 labels collapse it to a
# single operating point and understate the AUC. The feature columns are taken from the
# fitted model so the cell does not depend on the current value of `cols`.
plotROC(y_test, log_reg3.predict(X_test[log_reg3.model.exog_names]))

---

##### Model 4: Dropping variables with IV score < 0.1 (Weak predictors); with woe values of continuous variables

In [None]:
# Model 4 features: Race and NativeCountry left out; continuous columns use their WOE values
cols = [
    'woeHoursPerWeek', 'Workclass', 'Sex',
    'woeAge', 'woeEducationNum', 'Education',
    'Occupation', 'MaritalStatus', 'Relationship',
]
log_reg4 = build_logReg_model(cols)

In [None]:
# Test accuracy
# Hard 0/1 predictions on the test split: each predicted probability rounded to the nearest class
ytest_pred4 = [round(prob) for prob in log_reg4.predict(X_test[cols])]
confusionMatrix(ytest_pred4)

In [None]:
# ROC curve for Model 4.
# The curve is built from the predicted probabilities: the rounded 0/1 labels collapse it to a
# single operating point and understate the AUC. The feature columns are taken from the
# fitted model so the cell does not depend on the current value of `cols`.
plotROC(y_test, log_reg4.predict(X_test[log_reg4.model.exog_names]))