In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Read the German Credit dataset from the Kaggle input directory and preview the first rows.
credit_csv_path = '../input/german-credit-imts/German_Credit_data.csv'
gc = pd.read_csv(credit_csv_path)
gc.head()

In [None]:
gc.shape  # (number of rows, number of columns)

In [None]:
gc.isnull().sum()  # missing values per column; author's note: there are no nulls in the data set

In [None]:
gc.info() # dtypes and non-null counts; author's note: there is no 'object' column, all are 'int'

In [None]:
gc['Creditability'].value_counts() # rows per target class; author's note: the data set is imbalanced between good and bad credit

# Let's Explore the Dataset

In [None]:
# Draw one histogram per column to inspect each feature's distribution.
hist_size = (16, 15)
gc.hist(figsize=hist_size)
plt.show()

In [None]:
# Correlation heat map that shows only strong pairs (r > 0.5 or r < -0.5);
# every other cell is blanked out as NaN.
# Author's observation: 'Duration of Credit Month' and 'Credit Amount' look highly correlated.
corr = gc.corr()
strong_corr = corr.where((corr > .5) | (corr < -.5))
plt.figure(figsize=(16, 10))
sns.heatmap(strong_corr, annot=True, vmin=-1, vmax=1)

In [None]:
# Number of records per 'Property' category, split by number of dependents.
# The columns are named by keyword: seaborn deprecated positional data vectors in 0.11,
# and from 0.12 on the first positional argument of countplot is `data`, so a Series
# passed positionally is no longer interpreted as the x variable.
# Author's observation: people with more dependents tend to have less property.
sns.countplot(x='Property', hue='No_of_dependents', data=gc)

### Let's do some Feature Engineering

In [None]:
# Bucket age into three ordered groups: (0, 25] Youth, (25, 40] Adult, above 40 Senior.
# The last bin is open-ended (the upper edge used to be 77) so that any age above 77
# still lands in 'Senior' instead of silently becoming NaN.
gc['Age'] = pd.cut(gc.Age_in_years, bins=[0, 25, 40, np.inf], labels=['Youth', 'Adult', 'Senior'])

In [None]:
gc.drop('Age_in_years',axis=1,inplace=True)

In [None]:
gc.head()

In [None]:
# Number of records per age group, split by 'Property' category.
# The columns are named by keyword: seaborn deprecated positional data vectors in 0.11,
# and from 0.12 on the first positional argument of countplot is `data`, so a Series
# passed positionally is no longer interpreted as the x variable.
sns.countplot(x='Age', hue='Property', data=gc)

In [None]:
gc['Per_day_income'] = gc['Credit_Amount']/gc['Duration_of_Credit_month']

In [None]:
gc.drop(['Credit_Amount','Duration_of_Credit_month'],axis = 1,inplace= True)

### Heat map after handling correlation

In [None]:
# Recompute the strong-correlation heat map after the feature engineering.
# 'Age' is a categorical column with string labels at this point. pandas >= 2.0 no longer
# drops non-numeric columns silently in DataFrame.corr() and raises instead, so the frame
# is restricted to numeric columns explicitly (same result as the old silent drop).
corr1 = gc.select_dtypes(include='number').corr()
plt.figure(figsize=(16,10))
sns.heatmap(corr1[(corr1>.5)|(corr1<-.5)],annot=True,vmin=-1,vmax=1)

In [None]:
# Encode the ordered age groups as integers so the models can consume them.
# The mapped column is assigned back instead of calling replace(..., inplace=True) on a
# column selection: under pandas Copy-on-Write that form does not update `gc`, and
# `replace` on a categorical column is deprecated.
age_codes = {'Youth': 0, 'Adult': 1, 'Senior': 2}
gc['Age'] = gc['Age'].map(age_codes).astype('int64')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,roc_auc_score,f1_score,classification_report

In [None]:
# Hold out 20% of the rows for testing.
# random_state makes the split, and therefore every metric computed from it, reproducible
# across runs. stratify keeps the class ratio of 'Creditability' the same in both parts,
# which matters because the target classes are imbalanced.
gc_train, gc_test = train_test_split(
    gc, test_size=.2, random_state=42, stratify=gc['Creditability'])

# Model Without Oversampling

In [None]:
gc_x_train = gc_train.drop('Creditability',axis = 1)

In [None]:
gc_y_train = gc_train['Creditability']
gc_x_test = gc_test.drop('Creditability',axis=1)
gc_y_test = gc_test['Creditability']

In [None]:
dt = DecisionTreeClassifier()

In [None]:
dt.fit(gc_x_train,gc_y_train)

In [None]:
dt_pred = dt.predict(gc_x_test)

In [None]:
dt_conf = confusion_matrix(dt_pred,gc_y_test)
dt_conf

In [None]:
dt_acc = accuracy_score(dt_pred,gc_y_test)
dt_precision = precision_score(dt_pred,gc_y_test)
dt_roc = roc_auc_score(dt_pred,gc_y_test)
print('Accuracy for model dt is ---',dt_acc)
print('Precision for model dt is ---',dt_precision)
print('AUROC score for model dt is ---',dt_roc)
print('           --------------------------------------------')
print(classification_report(dt_pred,gc_y_test))

In [None]:
imp_features_dt = pd.DataFrame({'Features':gc_x_train.columns,'Values':dt.feature_importances_}).sort_values(by='Values',ascending=False)

In [None]:
imp_features_dt 

### Note that 'Foreign_Worker' has zero importance in our model

# Model after oversampling 

In [None]:
over = gc_train[gc_train['Creditability']==0]

In [None]:
gco_train = pd.concat([over,gc_train])

In [None]:
gco_train['Creditability'].value_counts()

In [None]:
# Split the oversampled training frame into target and features. The test data is not
# oversampled: independent copies of the original test split are used.
gco_y_train = gco_train['Creditability']
gco_x_train = gco_train.drop(columns='Creditability')
gco_x_test, gco_y_test = gc_x_test.copy(), gc_y_test.copy()

In [None]:
dto = DecisionTreeClassifier(criterion='entropy')

In [None]:
dto.fit(gco_x_train,gco_y_train)

In [None]:
dto_pred = dto.predict(gco_x_test)

In [None]:
dto_conf = confusion_matrix(dto_pred,gco_y_test)
dto_conf

In [None]:
dto_acc = accuracy_score(dto_pred,gco_y_test)
dto_precision = precision_score(dto_pred,gco_y_test)
dto_roc = roc_auc_score(dto_pred,gco_y_test)
print('Accuracy for model dt is ---',dto_acc)
print('Precision for model dt is ---',dto_precision)
print('AUROC score for model dt is ---',dto_roc)
print('           --------------------------------------------')
print(classification_report(dto_pred,gco_y_test))

In [None]:
imp_features_dto = pd.DataFrame({'Features':gco_x_train.columns,'Values':dto.feature_importances_}).sort_values(by='Values',ascending=False)

In [None]:
imp_features_dto

# Model After removing 'Foreign Worker' feature

In [None]:
gcf_train = gco_train.drop('Foreign_Worker',axis = 1)

In [None]:
gcf_x_train = gcf_train.drop('Creditability',axis=1)
gcf_y_train = gcf_train['Creditability']
gcf_x_test = gc_x_test.drop('Foreign_Worker',axis = 1)
gcf_y_test = gc_y_test.copy()

In [None]:
dtf = DecisionTreeClassifier()

In [None]:
dtf.fit(gcf_x_train,gcf_y_train)

In [None]:
dtf_pred = dtf.predict(gcf_x_test)

In [None]:
dtf_conf = confusion_matrix(dtf_pred,gcf_y_test)
dtf_conf

In [None]:
dtf_acc = accuracy_score(dtf_pred,gcf_y_test)
dtf_precision = precision_score(dtf_pred,gcf_y_test)
dtf_roc = roc_auc_score(dtf_pred,gcf_y_test)
print('Accuracy for model dt is ---',dtf_acc)
print('Precision for model dt is ---',dtf_precision)
print('AUROC score for model dt is ---',dtf_roc)
print('           --------------------------------------------')
print(classification_report(dtf_pred,gcf_y_test))

##### Removing the feature is not a good idea

# Log model without over sampling

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(gc_x_train,gc_y_train)

In [None]:
lr_pred = lr.predict(gc_x_test)

In [None]:
lr_conf = confusion_matrix(lr_pred,gc_y_test)
lr_conf

In [None]:
lr_acc = accuracy_score(lr_pred,gc_y_test)
lr_precision = precision_score(lr_pred,gc_y_test)
lr_roc = roc_auc_score(lr_pred,gc_y_test)
print('Accuracy for model dt is ---',lr_acc)
print('Precision for model dt is ---',lr_precision)
print('AUROC score for model dt is ---',lr_roc)
print('           --------------------------------------------')
print(classification_report(lr_pred,gc_y_test))

# Log model with Over Sampling

In [None]:
lro = LogisticRegression()

In [None]:
lro.fit(gco_x_train,gco_y_train)

In [None]:
lro_pred = lro.predict(gco_x_test)

In [None]:
lro_conf = confusion_matrix(gco_y_test,lro_pred)
lro_conf

In [None]:
lro_acc = accuracy_score(gco_y_test,lro_pred)
lro_precision = precision_score(gco_y_test,lro_pred)
lro_roc = roc_auc_score(gco_y_test,lro_pred)
print('Accuracy for model dt is ---',lro_acc)
print('Precision for model dt is ---',lro_precision)
print('AUROC score for model dt is ---',lro_roc)
print('           --------------------------------------------')
print(classification_report(lro_pred,gco_y_test))

# Log Model without 'Foreign Worker' Feature

In [None]:
lrf = LogisticRegression()

In [None]:
lrf.fit(gcf_x_train,gcf_y_train)

In [None]:
lrf_pred = lrf.predict(gcf_x_test)

In [None]:
lrf_conf = confusion_matrix(gcf_y_test,lrf_pred)
lrf_conf

In [None]:
lrf_acc = accuracy_score(gcf_y_test,lrf_pred)
lrf_precision = precision_score(gcf_y_test,lrf_pred)
lrf_roc = roc_auc_score(gcf_y_test,lrf_pred)
print('Accuracy for model dt is ---',lrf_acc)
print('Precision for model dt is ---',lrf_precision)
print('AUROC score for model dt is ---',lrf_roc)
print('           --------------------------------------------')
print(classification_report(gcf_y_test,lrf_pred))

# Model Comparison

In [None]:
model_comp = pd.DataFrame({'Comparison Measures':['Accuracy','Precision','AUROC Score'],
                          'DT without OS':[dt_acc,dt_precision,dt_roc],
                          'DT with OS':[dto_acc,dto_precision,dto_roc],
                          'DT removing 1 feature':[dtf_acc,dtf_precision,dtf_roc],
                          'LR without OS':[lr_acc,lr_precision,lr_roc],
                          'LR with OS':[lro_acc,lro_precision,lro_roc],
                          'LR removing 1 feature':[lrf_acc,lrf_precision,lrf_roc]})

In [None]:
model_comp

The logistic regression model trained without the 'Foreign Worker' feature performs best among the models compared: it gives a precision of 66% for class 0 and
82% for class 1, which is far better than any other model. For us, it is more important to predict class 0 precisely than class 1.