In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import auc
from sklearn.model_selection import cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
df = pd.read_csv('accepts.csv',encoding='utf-8')

In [3]:
df.shape

(5845, 25)

### Answer (i)

In [4]:
df = df[['fico_score','bankruptcy_ind','tot_derog','age_oldest_tr','ltv','bad_ind']]

In [5]:
# Imbalanced Data
df['bad_ind'].value_counts()

0    4648
1    1197
Name: bad_ind, dtype: int64

In [6]:
df.isnull().sum()

fico_score        314
bankruptcy_ind    217
tot_derog         213
age_oldest_tr     216
ltv                 1
bad_ind             0
dtype: int64

In [7]:
df = df.dropna()
df.head()

Unnamed: 0,fico_score,bankruptcy_ind,tot_derog,age_oldest_tr,ltv,bad_ind
0,650.0,N,7.0,64.0,99.0,1
1,649.0,N,0.0,240.0,99.0,0
2,613.0,N,7.0,60.0,92.0,1
3,603.0,N,3.0,35.0,118.0,1
4,764.0,N,0.0,104.0,122.0,0


### Answer (ii)

In [8]:
df.dtypes

fico_score        float64
bankruptcy_ind     object
tot_derog         float64
age_oldest_tr     float64
ltv               float64
bad_ind             int64
dtype: object

In [9]:
df['bankruptcy_ind'].unique()

array(['N', 'Y'], dtype=object)

In [10]:
df['bankruptcy_ind'] = df['bankruptcy_ind'].apply(lambda x : 1 if x == 'Y' else 0)

In [11]:
df.head()

Unnamed: 0,fico_score,bankruptcy_ind,tot_derog,age_oldest_tr,ltv,bad_ind
0,650.0,0,7.0,64.0,99.0,1
1,649.0,0,0.0,240.0,99.0,0
2,613.0,0,7.0,60.0,92.0,1
3,603.0,0,3.0,35.0,118.0,1
4,764.0,0,0.0,104.0,122.0,0


In [12]:
x,y = df[df.columns[:-1]].values,df['bad_ind'].values

In [13]:
sc_x = StandardScaler()
x_std = sc_x.fit_transform(x)

In [14]:
x_train,x_test,y_train,y_test = train_test_split(x_std,y,random_state=1)

### Answer (iii)

In [15]:
log_reg = LogisticRegression()
log_reg.fit(x_train,y_train)
print(log_reg.coef_)

[[-0.85913747 -0.10882728  0.09715537 -0.25018623  0.45182299]]


### Answer (iv)

In [16]:
vif = pd.DataFrame()
vif['VIF_Factor'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1]-1)]
print(vif)

   VIF_Factor
0   31.112037
1    1.266256
2    1.611147
3    3.876495
4   27.512701


In [17]:
# fico_score / ltv

### Answer (v)

In [18]:
pred_proba_df = pd.DataFrame(log_reg.predict_proba(x_test),columns=['0','1'])
pred_proba_df['result'] = pred_proba_df['1'].apply(lambda x: 1 if x>0.2 else 0)
pred_proba_df.head()

Unnamed: 0,0,1,result
0,0.778444,0.221556,1
1,0.729439,0.270561,1
2,0.80181,0.19819,0
3,0.975869,0.024131,0
4,0.951507,0.048493,0


In [19]:
pred_proba_df['result'].value_counts()

0    799
1    569
Name: result, dtype: int64

### Answer (vi)

In [20]:
conf_metrix = confusion_matrix(y_test,log_reg.predict(x_test).tolist())

In [21]:
tp = conf_metrix[0][0]
fp = conf_metrix[1][0]
fn = conf_metrix[0][1]
tn = conf_metrix[1][1]

print('Accuracy: ',(tp+tn)/(tp+tn+fn+fp))
print('Error Rate: ',(fp+fn)/(tp+tn+fn+fp))
print('Sensitivity: ',tp/(tp+fn))
print('Specificity: ',tn/(tn+fp))

Accuracy:  0.7887426900584795
Error Rate:  0.21125730994152048
Sensitivity:  0.9738072965388214
Specificity:  0.12709030100334448


### Answer (vii)

In [22]:
def calculation(thresold:float):
    temp_df = pd.DataFrame(log_reg.predict_proba(x_test),columns=['0','1'])
    temp_df['result'] = temp_df['1'].apply(lambda x: 1 if x>thresold else 0)
    return temp_df['result'].values.tolist()

In [23]:
bound_rate = 0.1
conf_ = []

for _ in range(9):
    y_pred = calculation(round(bound_rate,1))
    temp_conf = confusion_matrix(y_test,y_pred)
    tp = temp_conf[0][0]
    fp = temp_conf[1][0]
    fn = temp_conf[0][1]
    tn = temp_conf[1][1]
    conf_.append({
        'bound_rate' : bound_rate,
        'conf_' : temp_conf,
        'accuracy' : (tp+tn)/(tp+tn+fn+fp),
        'error_rate' : (fp+fn)/(tp+tn+fn+fp),
        'sensitivity' : tp/(tp+fn),
        'specificity' : tn/(tn+fp)
    })
    bound_rate+=0.1

In [24]:
min(conf_,key=lambda x : x['conf_'][0][1]+x['conf_'][1][0])

{'bound_rate': 0.5,
 'conf_': array([[1041,   28],
        [ 261,   38]], dtype=int64),
 'accuracy': 0.7887426900584795,
 'error_rate': 0.21125730994152048,
 'sensitivity': 0.9738072965388214,
 'specificity': 0.12709030100334448}

In [25]:
conf_

[{'bound_rate': 0.1,
  'conf_': array([[447, 622],
         [ 15, 284]], dtype=int64),
  'accuracy': 0.5343567251461988,
  'error_rate': 0.4656432748538012,
  'sensitivity': 0.41814780168381666,
  'specificity': 0.9498327759197325},
 {'bound_rate': 0.2,
  'conf_': array([[734, 335],
         [ 65, 234]], dtype=int64),
  'accuracy': 0.7076023391812866,
  'error_rate': 0.29239766081871343,
  'sensitivity': 0.686623012160898,
  'specificity': 0.782608695652174},
 {'bound_rate': 0.30000000000000004,
  'conf_': array([[920, 149],
         [141, 158]], dtype=int64),
  'accuracy': 0.7880116959064327,
  'error_rate': 0.21198830409356725,
  'sensitivity': 0.8606173994387278,
  'specificity': 0.5284280936454849},
 {'bound_rate': 0.4,
  'conf_': array([[989,  80],
         [210,  89]], dtype=int64),
  'accuracy': 0.7880116959064327,
  'error_rate': 0.21198830409356725,
  'sensitivity': 0.9251637043966323,
  'specificity': 0.2976588628762542},
 {'bound_rate': 0.5,
  'conf_': array([[1041,   28],
 

### Answer (viii)

In [26]:
y_pred = log_reg.predict(x_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test,y_pred)
metrics.auc(fpr, tpr)

0.5504487987710829

### Answer (ix)

In [27]:
log_model = LogisticRegression()
cv_results = cross_val_score(log_model,x_std,y,cv=5,scoring=('roc_auc'))

In [28]:
cv_results

array([0.75056883, 0.77176295, 0.77351725, 0.76548131, 0.78485932])