In [190]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold

In [199]:
import scorecardpy as sc
dat=sc.germancredit()

In [200]:
dat['creditability']=dat['creditability'].map({'good':0,'bad':1})

In [221]:
def univariate(data,target):  ##OK
    var_start_list = pd.DataFrame(data.dtypes, index=None)
    uniquecnt = data.apply(pd.Series.nunique)
    desc = data.describe().transpose()
    cor = data.select_dtypes(include=["int64", "float64"]).apply(lambda x: x.corr(data[str(target)]))  # watch out for other numeric data types
    zeros = data.apply(lambda x: (x[x == 0].shape[0]/x.shape[0]))
    null = data.apply(lambda x: (x[x.isnull()].shape[0]/x.shape[0]))
    var_start_list = var_start_list.merge(pd.DataFrame(uniquecnt), how="left", left_index=True, right_index=True)
    var_start_list.rename(columns={"0_x": "type", "0_y": "var_vals"}, inplace=True)
    var_start_list = var_start_list.merge(desc[["min", "max", "mean", "50%"]], how="left", left_index=True, right_index=True)
    var_start_list = var_start_list.merge(pd.DataFrame(cor), how="left", left_index=True,right_index=True)
    var_start_list = var_start_list.merge(pd.DataFrame(zeros), how="left", left_index=True, right_index=True)
    var_start_list = var_start_list.merge(pd.DataFrame(null), how="left", left_index=True, right_index=True)
    var_start_list.rename(columns={0: "percentNull", "0_x": "CorrelationWithTarget", "0_y": "percentZeros", "min": "var_min","max": "var_max", "50%": "var_median", "mean": "var_mean"}, inplace=True)
    return var_start_list


# Calculate information value
def calc_iv(df, feature, target, pr=False):
    """
    Set pr=True to enable printing of output.
    
    Output: 
      * iv: float,
      * data: pandas.DataFrame
    """

    lst = []

    df[feature] = df[feature].fillna("NULL")

    for i in range(df[feature].nunique()):
        val = list(df[feature].unique())[i]
        lst.append([feature,                                                        # Variable
                    val,                                                            # Value
                    df[df[feature] == val].count()[feature],                        # All
                    df[(df[feature] == val) & (df[target] == 0)].count()[feature],  # Good 
                    df[(df[feature] == val) & (df[target] == 1)].count()[feature]]) # Bad

    data = pd.DataFrame(lst, columns=['Variable', 'Value', 'All', 'Good', 'Bad'])

    data['Share'] = data['All'] / data['All'].sum()
    data['Bad Rate'] = data['Bad'] / data['All']
    data['Distribution Good'] = (data['All'] - data['Bad']) / (data['All'].sum() - data['Bad'].sum())
    data['Distribution Bad'] = data['Bad'] / data['Bad'].sum()
    data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])

    data = data.replace({'WoE': {np.inf: 0, -np.inf: 0}})

    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True])
    data.index = range(len(data.index))

    if pr:
        print(data)
        print('IV = ', data['IV'].sum())


    iv = data['IV'].sum()
    # print(iv)

    return iv, data




def variablefilter(data,predictors,target,num_var_threshold=0.95,cat_threshold_perc=0.005,missing_limit_perc=0.7,iv_limit=0.002):
    
    ##missing limit
    nan_rate = lambda a: a[a.isnull()].size/a.size
    na_perc = data[predictors].apply(nan_rate).reset_index(name='missing_rate').rename(columns={'index':'variable'})
     
    ##identical limit
    idt_rate = lambda a: a.value_counts().max() / a.size
    identical_perc = data[columns].apply(idt_rate).reset_index(name='identical_rate').rename(columns={'index':'variable'})    
    
    ###variance threshold drop columns
    num_columns=[i for i in data.columns if data[i].dtype in [np.int64,np.float64]]
    sel=VarianceThreshold(num_var_threshold)
    sel.fit(data[num_columns])
    a=sel.get_support() 
    i,=np.where(a==True)
    num_columns_new=[]
    for a in i:
        num_columns_new.append(num_columns[a])
    data=data[num_columns]
    
    ###categorical threshold rename columns
    
    
    
    ##identical limit
    idt_rate = lambda a: a.value_counts().max() / a.size
    identical_perc = data[columns].apply(idt_rate).reset_index(name='identical_rate').rename(columns={'index':'variable'})
    
    
    
    ###IV
    iv_list = calc_iv(dt, columns, 'creditability', pr=False)
    

In [95]:
univariate(dat,'creditability')

Unnamed: 0,type,var_vals,var_min,var_max,var_mean,var_median,CorrelationWithTarget,percentZeros,percentNull
status.of.existing.checking.account,category,4,,,,,,0.0,0.0
duration.in.month,int64,33,4.0,72.0,20.903,18.0,0.214927,0.0,0.0
credit.history,category,5,,,,,,0.0,0.0
purpose,object,10,,,,,,0.0,0.0
credit.amount,int64,921,250.0,18424.0,3271.258,2319.5,0.154739,0.0,0.0
savings.account.and.bonds,category,5,,,,,,0.0,0.0
present.employment.since,category,5,,,,,,0.0,0.0
installment.rate.in.percentage.of.disposable.income,int64,4,1.0,4.0,2.973,3.0,0.072404,0.0,0.0
personal.status.and.sex,category,3,,,,,,0.0,0.0
other.debtors.or.guarantors,category,3,,,,,,0.0,0.0


In [233]:
calc_iv(dat,'creditability','status.of.existing.checking.account')

TypeError: 'list' object is not callable

In [232]:
iv_list=[]
for i in dat.columns:
    iv_list.append(calc_iv(dat,'creditability',str(i)))

TypeError: 'list' object is not callable

In [230]:
a=['status.of.existing.checking.account', 'duration.in.month',
        'credit.history', 'purpose', 'credit.amount',
        'savings.account.and.bonds', 'present.employment.since',
        'installment.rate.in.percentage.of.disposable.income',
        'personal.status.and.sex', 'other.debtors.or.guarantors',
        'present.residence.since', 'property', 'age.in.years',
        'other.installment.plans', 'housing',
        'number.of.existing.credits.at.this.bank', 'job',
        'number.of.people.being.liable.to.provide.maintenance.for', 'telephone',
        'foreign.worker', 'creditability']

In [192]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectPercentile, chi2

In [209]:
cat_columns=[i for i in dat.columns if dat[i].dtype not in [np.int64,np.float64]]

In [217]:
columns=dat.columns

In [218]:
idt_rate = lambda a: a.value_counts().max() / a.size
identical_perc = data[columns].apply(idt_rate).reset_index(name='identical_rate').rename(columns={'index':'variable'})

In [219]:
identical_perc

Unnamed: 0,variable,identical_rate
0,status.of.existing.checking.account,0.394
1,duration.in.month,0.184
2,credit.history,0.53
3,purpose,0.28
4,credit.amount,0.003
5,savings.account.and.bonds,0.603
6,present.employment.since,0.339
7,installment.rate.in.percentage.of.disposable.i...,0.476
8,personal.status.and.sex,0.548
9,other.debtors.or.guarantors,0.907


In [220]:
dat['foreign.worker'].value_counts()

yes    963
no      37
Name: foreign.worker, dtype: int64