In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Render every float in DataFrame output with four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)

# **Import Data**

In [None]:
# Location of the mushroom dataset inside the Kaggle input mount.
mushroom_csv_path = "/kaggle/input/mushroom-classification/mushrooms.csv"
data = pd.read_csv(mushroom_csv_path)

In [None]:
# Number of missing (NaN) entries in each column
data.isna().sum()

In [None]:
# Encode the target as integers: e (edible) -> 1, anything else (p, poisonous) -> 0.
# Guarded so that re-running the cell is a no-op: once the column is numeric,
# comparing it with 'e' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(data['class']):
    data['class'] = np.where(data['class'] == 'e', 1, 0)

# **Information Value**

In [None]:
# Weight of Evidence (WOE) and Information Value (IV) for every feature.
#   %Target     = bin's share of all class == 1 rows
#   %Non-Target = bin's share of all class == 0 rows
#   WOE         = ln(%Target / %Non-Target)
#   IV          = (%Target - %Non-Target) * WOE   (per bin)
#   Total_IV    = sum of the bin IVs of one feature


def _woe_iv_table(frame, feature, n_target, n_non_target, target='class'):
    """Build the per-bin WOE/IV table for one categorical feature.

    Bins that contain only one class would have an infinite WOE. All such
    "pure" bins of a feature are merged into a single bin whose ``Input`` is
    the comma-joined list of the original categories. If the merged bin is
    still pure (including the case of a single pure bin), its WOE and IV are
    set to 0 so that no infinite value reaches the summary.

    Parameters
    ----------
    frame : pd.DataFrame
        Data holding ``feature`` and the 0/1 ``target`` column, without NaN in
        ``feature``.
    feature : str
        Name of the column to bin (one bin per distinct value).
    n_target, n_non_target : int
        Number of rows with target == 1 / target == 0 in the whole frame.
    target : str, default 'class'
        Name of the binary target column.

    Returns
    -------
    pd.DataFrame
        Columns ``Input, Total, Target, Non-Target, %Target, %Non-Target,
        WOE, IV`` with a fresh 0..n-1 index; the merged bin (if any) is last.
    """
    table_columns = ['Input', 'Total', 'Target', 'Non-Target',
                     '%Target', '%Non-Target', 'WOE', 'IV']

    bins = (
        frame.groupby(feature)[target]
        .agg(['count', 'sum'])
        .rename(columns={'count': 'Total', 'sum': 'Target'})
        .reset_index()
        .rename(columns={feature: 'Input'})
    )
    if bins.empty:
        return pd.DataFrame(columns=table_columns)

    bins['Non-Target'] = bins['Total'] - bins['Target']
    bins['%Target'] = bins['Target'] / n_target
    bins['%Non-Target'] = bins['Non-Target'] / n_non_target

    # A bin is "pure" when one of the two classes is absent -> ln(0) or ln(x/0).
    is_pure = (bins['Target'] == 0) | (bins['Non-Target'] == 0)

    finite_bins = bins[~is_pure].copy()
    finite_bins['WOE'] = np.log(finite_bins['%Target'] / finite_bins['%Non-Target'])
    # Per-bin IV uses the bin's own difference, not the sum over all bins.
    finite_bins['IV'] = (finite_bins['%Target'] - finite_bins['%Non-Target']) * finite_bins['WOE']

    parts = [finite_bins] if not finite_bins.empty else []

    pure_bins = bins[is_pure]
    if not pure_bins.empty:
        merged_target = pure_bins['Target'].sum()
        merged_non_target = pure_bins['Non-Target'].sum()
        pct_target = merged_target / n_target
        pct_non_target = merged_non_target / n_non_target
        if merged_target > 0 and merged_non_target > 0:
            merged_woe = float(np.log(pct_target / pct_non_target))
            merged_iv = float((pct_target - pct_non_target) * merged_woe)
        else:
            # Still single-class after merging: WOE is undefined, fall back to 0.
            merged_woe = 0.0
            merged_iv = 0.0
        parts.append(pd.DataFrame([{
            'Input': ','.join(pure_bins['Input'].astype(str)),
            'Total': pure_bins['Total'].sum(),
            'Target': merged_target,
            'Non-Target': merged_non_target,
            '%Target': pct_target,
            '%Non-Target': pct_non_target,
            'WOE': merged_woe,
            'IV': merged_iv,
        }]))

    return pd.concat(parts, ignore_index=True)[table_columns]


tmp_data = data.fillna('NULL')
summary_columns = ['Input','Total','Target','Non-Target','%Target','%Non-Target','WOE','IV','Features','Total_IV']

# Class totals do not depend on the feature, so compute them once.
n_target = int((tmp_data['class'] == 1).sum())
n_non_target = int((tmp_data['class'] == 0).sum())

feature_tables = []
for tmp_features in tmp_data.columns.drop('class'):
    tmp_info = _woe_iv_table(tmp_data, tmp_features, n_target, n_non_target)
    if tmp_info.empty:
        continue
    tmp_info['Features'] = tmp_features
    tmp_info['Total_IV'] = tmp_info['IV'].sum()
    feature_tables.append(tmp_info[summary_columns])

# The index deliberately restarts at 0 for every feature (no ignore_index):
# later code looks bins up by their position within a feature.
if feature_tables:
    summary = pd.concat(feature_tables)
else:
    summary = pd.DataFrame(columns=summary_columns)

In [None]:
# Display the accumulated WOE/IV summary table as the cell output.
summary

In [None]:
# One row per feature with its Total_IV, ordered from highest to lowest IV.
# The index is not reset after sorting, so row labels still follow the
# alphabetical group order produced by groupby.
important_feat = (
    summary[['Features', 'Total_IV']]
    .groupby('Features')
    .max()
    .reset_index()
    .sort_values('Total_IV', ascending=False)
)
important_feat

# **Apply WOE to data**

In [None]:
# Build a long lookup table (Features, Input, WOE) with one row per original
# category. Merged bins store their categories as a comma-joined string in
# 'Input'; they are expanded back so that every category maps to the WOE of
# the bin it belongs to.
# NOTE: a category value that itself contains a comma would be split as well.
woe_records = []
for feature_name in important_feat['Features']:
    feature_bins = summary.loc[summary['Features'] == feature_name, ['Input', 'WOE']]
    for bin_label, bin_woe in feature_bins.itertuples(index=False, name=None):
        for category in bin_label.split(','):
            woe_records.append(
                {'Features': feature_name, 'Input': category, 'WOE': bin_woe}
            )

# Create the frame once from the collected records instead of appending row by
# row (DataFrame.append no longer exists in pandas 2.x).
tmp_woe = pd.DataFrame(woe_records, columns=['Features', 'Input', 'WOE'])

In [None]:
# Preview: WOE value attached to each row for the feature stored under row label 1
# of important_feat.
preview_feature = important_feat['Features'][1]
preview_lookup = (
    tmp_woe.loc[tmp_woe['Features'] == preview_feature, ['Input', 'WOE']]
    .rename(columns={'Input': preview_feature})
)
data[[preview_feature]].merge(preview_lookup, how='left', on=preview_feature)['WOE']

In [None]:
# Show the (rows, columns) shape of important_feat as the cell output.
important_feat.shape

In [None]:
# Replace every categorical feature by the WOE of the bin its value falls in.
# Result: 'class' first, then one '<feature>_woe' column per feature.
# .copy() makes data_woe an independent frame, so adding columns does not write
# into a slice of `data`.
data_woe = data[['class']].copy()
for feature_name in important_feat['Features']:
    feature_lookup = tmp_woe.loc[tmp_woe['Features'] == feature_name]
    woe_by_category = dict(zip(feature_lookup['Input'], feature_lookup['WOE']))
    # Name the column after the feature it actually encodes. Missing values are
    # filled with 'NULL' because the bins were computed on data.fillna('NULL').
    data_woe[feature_name + '_woe'] = (
        data[feature_name].fillna('NULL').map(woe_by_category).astype(float)
    )

In [None]:
# Display the WOE-encoded frame as the cell output.
data_woe

# Traditional Logistic Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from matplotlib import pyplot

In [None]:
# The first column of data_woe is the target; every remaining column is a predictor.
# y stays a single-column DataFrame (slice :1), X is everything after it.
y, X = data_woe.iloc[:, :1], data_woe.iloc[:, 1:]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_options = {'test_size': 0.2, 'random_state': 1234}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Fit the logistic regression on the training partition.
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target, so it
# is flattened here to avoid the column-vector DataConversionWarning.
lr = LogisticRegression(random_state=0).fit(X_train, np.ravel(y_train))

In [None]:
# Constant "no skill" score of 0 for every row of y, plus the model's predicted
# probability for class 1 (second predict_proba column) on each partition.
ns_probs = [0] * len(y)
lr_train_probs, lr_test_probs = (
    lr.predict_proba(partition)[:, 1] for partition in (X_train, X_test)
)

In [None]:
# ROC AUC for the constant baseline (all rows) and for the model on train / test.
ns_auc, lr_train_auc, lr_test_auc = (
    roc_auc_score(actual, scores)
    for actual, scores in (
        (y, ns_probs),
        (y_train, lr_train_probs),
        (y_test, lr_test_probs),
    )
)

In [None]:
# Summarize scores: ROC AUC of the constant baseline and of the model on each split.
print(f'No Skill: ROC AUC={ns_auc:.3f}')
print(f'Training Logistic: ROC AUC={lr_train_auc:.3f}')
print(f'Testing Logistic: ROC AUC={lr_test_auc:.3f}')