In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Given a dataset about different types of loans in Germany, we need to predict the risk of each loan (High Risk or Low Risk)

In [None]:
# Reading the data 

data=pd.read_csv('../input/german-credit-data-with-risk/german_credit_data.csv')

In [None]:
data.head()

### Exploring the data 

In [None]:
data.info()

#### We have 6 columns with the type object, so we would need to perform encoding for these columns 

In [None]:
data.isnull().sum()

#### We can see that only two columns have missing values 

### Data Preprocessing 

#### 1. Dropping the duplicate ID column 

In [None]:
# Reassign instead of mutating in place, and tolerate the column already being
# gone, so re-running this cell does not raise a KeyError.
data=data.drop(columns='Unnamed: 0',errors='ignore')

In [None]:
data.head()

#### 2. Dealing with the missing values 

#### Missing values in 2 columns: `Saving accounts` and `Checking account`

In [None]:
data['Saving accounts'].unique()

In [None]:
data['Checking account'].unique()

We will replace the missing values with the most frequently occurring value (the mode) of each column.

In [None]:
#Filling missing values using mode 

# Assign the filled column back rather than calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object and is not
# guaranteed to update `data` (pandas emits a chained-assignment warning, and
# under Copy-on-Write the frame is left unchanged).
for col in ['Saving accounts','Checking account']:
    data[col]=data[col].fillna(data[col].mode()[0])

In [None]:
print(data['Saving accounts'].unique())


In [None]:
print(data['Checking account'].unique())

In [None]:
data.isnull().sum()

We have no more missing values

### Performing graphical EDA

UNIVARIATE ANALYSIS

In [None]:
#List of Categorical Columns 

cols=data.select_dtypes('object').columns
cat_cols=cols.tolist()

In [None]:
cat_cols=['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose', 'Risk']


In [None]:
# One count plot per categorical column, laid out on a 2x3 grid.
# (seaborn and matplotlib are already imported in the first cell.)
fig,ax=plt.subplots(2,3,figsize=(20,10))

for var, plot in zip(cat_cols,ax.flatten()):

    sns.countplot(x=data[var],ax=plot)

    # Rotate the category labels so long names do not overlap.
    for label in plot.get_xticklabels():
        label.set_rotation(90)

# Lay out after drawing so the padding accounts for the rotated tick labels.
fig.tight_layout(pad=5.5)
    
    

#### INSIGHTS:- 

1. We have almost 2 times more Males than Females
2. Most people who apply for a loan have their own house 
3. Most people have little amount in their accounts 
4. Car loans are the most common loan purpose
5. Most of the loans are low risk loans

In [None]:
#List of Numerical Values

# The dtype-based selection on the next line is immediately replaced by the
# explicit list below, which is the one the later cells use. The explicit list
# leaves out 'Job' (plotted separately with a count plot) -- presumably because
# it is an integer category code; confirm against data.info().
num_cols=data.select_dtypes('int64').columns.tolist()
num_cols=['Age', 'Credit amount', 'Duration']


In [None]:
#Histogram

data[num_cols].hist(bins=100,figsize=(20,6),layout=(1,3))
plt.show()

In [None]:
# Box plots of the three numeric features, one panel each.

fig, ax = plt.subplots(1,3,figsize=(20,5))
plt.suptitle('BOX PLOTS')

# (column, colour) per panel; None keeps seaborn's default colour.
panels = [('Credit amount', None), ('Duration', 'salmon'), ('Age', 'darkviolet')]
for axis, (feature, shade) in zip(ax, panels):
    sns.boxplot(x=data[feature], ax=axis, color=shade)

plt.show()

In [None]:
sns.countplot(x='Job',data=data)
plt.show()

**INSIGHTS:-**

1. People around the age of 30 apply for more loans 
2. Credit amount for most loans is around 2000
3. Average Duration: 18-20 months
4. Most people have type 2 Job 

### BI-VARIATE ANALYSIS 

#### RISK ANALYSIS 

#### Based on Categorical Data

In [None]:
# Count plots of each categorical feature, with bars split by the Risk label.
catcols=['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose','Job']
fig,ax=plt.subplots(2,3,figsize=(20,10))
plt.suptitle('RISK PLOTS')
fig.tight_layout(pad=5.5)

for var, plot in zip(catcols,ax.ravel()):
    sns.countplot(x=data[var],hue='Risk',data=data,ax=plot)
    # Turn every x tick label vertical in one call.
    plt.setp(plot.get_xticklabels(),rotation=90)

##### INSIGHTS:- 

1. There are more males than females. Loans given to males are less likely to be risky. 
2. Most people who take loans own a house, and loans given to them are less likely to be risky than loans given to people who rent or live for free. 
3. Car Loans are the most popular. Education,Domestic Appliances, repairs, vacations seem to have a higher risk.
4. People with type 2 job take the most amount of loans. 


#### Based on Numerical Data

In [None]:
# Split the loans by outcome so the two distributions can be overlaid.
Risky=data[data['Risk']=='bad']
NotRisky=data[data['Risk']=='good']

fig, ax = plt.subplots(1,3,figsize=(25,5))

# stat='probability' normalises each histplot call on its own, so the unequal
# group sizes do not distort the comparison.
for axis, feature in zip(ax, ['Credit amount','Age','Duration']):
    sns.histplot(Risky[feature],label='High Risk',color='red',kde=True,ax=axis,stat='probability')
    sns.histplot(NotRisky[feature],label='Low Risk',color='Green',kde=True,ax=axis,stat='probability')
    axis.set_title(feature)
    # A single plt.legend() would only annotate the last subplot, so each
    # panel gets its own legend.
    axis.legend()

plt.show()

##### INSIGHTS:-
1. For huge credit amounts the risk is a little higher 
2. The risk is higher for younger age groups:- 25-35
3. It seems like the less duration loans have more risk

In [None]:
sns.pairplot(data)

In [None]:
#Feature correlation Map

# Correlate the numeric columns only: on pandas >= 2.0, DataFrame.corr() raises
# on the string-valued columns instead of silently dropping them.
cor = data.select_dtypes('number').corr()
sns.heatmap(cor, annot=True).set_title("Correlation Graph of Data Set",fontsize=15);
plt.show()

There is a correlation between Credit amount and Duration. This makes sense, since long-term credit tends to be associated with higher amounts.  

### Performing Encoding for Categorical Data 

In [None]:
data

In [None]:
data.select_dtypes('object')

In [None]:
# Getting the unique values in each column by creating a dictionary 

{ column:list(data[column].unique()) for column in data.select_dtypes('object').columns}



In [None]:
{ column:len(data[column].unique()) for column in data.select_dtypes('object').columns}


In [None]:

#Writing a function that would do binary encoding

def binary_encode(df,column_and_positive_val): #List of tuples
    """Return a copy of ``df`` with the given columns encoded as 0/1.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    column_and_positive_val : iterable of (column, positive_value) tuples
        For each pair, cells equal to ``positive_value`` become 1 and every
        other cell (including missing values) becomes 0.

    Returns
    -------
    pandas.DataFrame
        The encoded copy; columns not listed are left unchanged.
    """
    df=df.copy()
    for column, positive in column_and_positive_val:
        # Vectorised comparison instead of a per-row Python lambda.
        df[column]=(df[column]==positive).astype('int64')

    return df
    

In [None]:
data['Risk'].value_counts()

Since we want to detect high-risk (`bad`) loans, we will encode `bad` as the positive value (1).

In [None]:
dataset=binary_encode(data,column_and_positive_val=[('Sex','male'),('Risk','bad')])

Making a new copy instead of altering the original dataset

In [None]:
dataset

Done with Binary encoding. 

NEXT: Ordinal Encoding. 
Used when we have more than 2 values but the values in the columns take a specific order. 

In [None]:
#Writing a function for ordinal encoding 

def ordinal_encode(df,ordered_column_list): #List of tuples
    """Return a copy of ``df`` with the given columns replaced by ordinal ranks.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    ordered_column_list : iterable of (column, order) tuples
        ``order`` lists the column's categories from lowest to highest; each
        cell is replaced by the position of its value in ``order``.

    Returns
    -------
    pandas.DataFrame
        The encoded copy.

    Raises
    ------
    ValueError
        If a column holds a value (including a missing value) that does not
        appear in its ``order``.
    """
    df=df.copy()
    for column,order in ordered_column_list:
        # Build the value -> rank lookup once per column; setdefault keeps the
        # first position if a category is listed twice, like list.index does.
        ranks={}
        for position,value in enumerate(order):
            ranks.setdefault(value,position)

        encoded=df[column].map(ranks)

        # Anything that failed to map is not part of the declared order.
        unknown=df.loc[encoded.isna(),column]
        if not unknown.empty:
            raise ValueError(
                "Column {!r} contains values not present in its order {!r}: {!r}".format(
                    column,list(order),unknown.unique().tolist()))

        df[column]=encoded.astype('int64')
    return df
    

In [None]:
#Ordinal Encoding for Saving accounts and Checking account columns

# Categories must be listed from lowest to highest balance for the ranks to be
# meaningful. Per the dataset's attribute description, 'quite rich' is the
# bracket below 'rich', so it comes first.
data_set=ordinal_encode(
dataset,
[
    ('Saving accounts',['little','moderate','quite rich','rich']),
    ('Checking account',['little', 'moderate','rich'])
])


In [None]:
data_set

NEXT: More than 2 values, but there is no clear ordering between the values.
We will use one-hot encoding 

In [None]:
pd.get_dummies(data_set['Housing'],prefix='H') #To see exactly what the get_dummies does 

In [None]:
# Writing a function to do one-hot encoding 

def onehot_encoding(df,columns_and_prefixes): #Dataframe, List of Tuples
    """Return a copy of ``df`` in which each listed column is one-hot encoded.

    ``columns_and_prefixes`` is an iterable of (column, prefix) tuples. For each
    pair the column's dummy/indicator columns (named ``<prefix>_<value>``) are
    appended on the right and the source column is removed.
    """
    result=df.copy()
    for source_column,label in columns_and_prefixes:
        indicators=pd.get_dummies(result[source_column],prefix=label)
        # Append the indicators side by side, then remove the encoded column.
        result=pd.concat([result,indicators],axis=1).drop(columns=source_column)
    return result

In [None]:
#Performing one-hot encoding on columns:- Housing, Purpose

X=onehot_encoding(data_set,[
    ('Housing','H'), ('Purpose','P')
])



In [None]:
X

#### Now, we have taken care of all the categorical data. All the data is in numerical form. 

### NEXT: Splitting and Scaling the dataset

In [None]:
#Splitting the data set into features(X) and the value that we want to predict (y)

y=X['Risk'].copy()
X=X.drop('Risk',axis=1).copy()

In [None]:
y  #here, 1 means that it is high risk loan

In [None]:
X

In [None]:
#Train-Test Split

from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.7,random_state=42)

In [None]:
X_test


In [None]:
#Scaling the data 

from sklearn.preprocessing import StandardScaler

In [None]:
#Standard Scaler would give each column a mean of 0 and variance 1

# Fit on the training split only, then apply the same transform to both splits.
scaler=StandardScaler()
scaler.fit(X_train)

# scaler.transform returns a bare array, so rebuild each frame with its own
# column labels and row index; without index=..., the frames would get a fresh
# 0..n-1 index and no longer line up with y_train / y_test.
X_train=pd.DataFrame(scaler.transform(X_train),columns=X_train.columns,index=X_train.index)
X_test=pd.DataFrame(scaler.transform(X_test),columns=X_test.columns,index=X_test.index)

The scaler is fitted on the training set only and then applied to the test set: it is good practice to treat the test set as unseen data, so none of its statistics should influence the preprocessing.

In [None]:
X_train.mean()

Mean of each column is very close to 0 as expected. 

In [None]:
X_train.var()

We will not scale y_train because we want it to remain categorical ( 0 and 1). 

In [None]:
y_train

### Training the Model ( Logistic Regression )

In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
y_train.value_counts()

In [None]:
model = LogisticRegression()
model.fit(X_train,y_train)

In [None]:
model.score(X_test,y_test) #Accuracy 

In [None]:
model.predict(X_test)

In [None]:
model.predict_proba(X_test)[:5] #By Default, threshold=0.5

In [None]:
# We may want to change the decision threshold, since the classes are imbalanced.
# Lowering the threshold makes a positive prediction more likely
# (i.e. predictions become more sensitive towards high risk).

# Column 1 of predict_proba is the probability of class 1 (high risk).
# The builtin int replaces np.int, which was removed in NumPy 1.24.
preds=list((model.predict_proba(X_test)[:,1]>=0.3).astype(int))

In [None]:
list_models=['Logistic Regression','SVM']
list_accuracy=[0,0]


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Evaluating the Model 

def evaluate_model(model, X_test, y_test, thresh=0.5):
    """Print test accuracy, plot the confusion matrix and print the classification report.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    X_test : features to score
    y_test : true binary labels (reported as 'Not Risky' / 'Risky')
    thresh : float, default 0.5
        A sample is predicted positive when its class-1 probability is
        greater than or equal to ``thresh``.
    """
    # Threshold the class-1 probability (column 1 of predict_proba). The builtin
    # int replaces np.int, which was removed in NumPy 1.24.
    y_pred=(model.predict_proba(X_test)[:,1]>=thresh).astype(int)

    print("Test Accuracy: {:.2f}%".format(accuracy_score(y_test,y_pred)*100))

    #Confusion Matrix & Classification Report ( Since our data is a little skewed/imbalanced)

    cm=confusion_matrix(y_test,y_pred)
    clr=classification_report(y_test,y_pred,target_names=['Not Risky','Risky'])


    plt.figure(figsize=(8,8))
    sns.heatmap(cm,annot=True,fmt='g',vmin=0,cbar=False,cmap='Blues')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    # Heatmap cells are centred on half-integers, hence the +0.5 tick offset.
    plt.xticks(np.arange(2)+0.5,['Not Risky','Risky'])
    plt.yticks(np.arange(2)+0.5,['Not Risky','Risky'])

    plt.show()

    print('CLASSIFICATION REPORT\n----------------------------------------------\n',clr)
   

Confusion Matrix (row = actual class, column = predicted class): 
* 0,0: True Negatives
* 1,0: False Negatives
* 1,1: True Positives
* 0,1: False Positives


In [None]:
def evaluate(model, X_test, y_test, thresh=0.5):
    """Print test accuracy and the classification report at a given threshold.

    Text-only counterpart of ``evaluate_model`` (no confusion-matrix plot),
    convenient for comparing several thresholds in a loop.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    X_test : features to score
    y_test : true binary labels (reported as 'Not Risky' / 'Risky')
    thresh : float, default 0.5
        A sample is predicted positive when its class-1 probability is
        greater than or equal to ``thresh``.
    """
    # Threshold the class-1 probability (column 1 of predict_proba). The builtin
    # int replaces np.int, which was removed in NumPy 1.24.
    y_pred=(model.predict_proba(X_test)[:,1]>=thresh).astype(int)

    print("Test Accuracy: {:.2f}%".format(accuracy_score(y_test,y_pred)*100))

    #Classification Report ( Since our data is a little skewed/imbalanced)

    clr=classification_report(y_test,y_pred,target_names=['Not Risky','Risky'])


    print('CLASSIFICATION REPORT:-\n',clr)
    print('\n-----------------------------------------------------------------------\n')

In [None]:
evaluate_model(model, X_test, y_test, thresh=0.3)   #Choose thresh through cross validation set. By lowering threshold we are predicting risky better

We can see that our model performs better on the not-risky class. This is because we have more samples for that class. 

In [None]:
from sklearn.svm import SVC
# probability=True fits an internal cross-validated calibration that shuffles
# the data; fixing random_state makes predict_proba reproducible across runs.
svm=SVC(probability=True,random_state=42)
svm.fit(X_train,y_train)


In [None]:
svm.score(X_test,y_test) #Accuracy 

In [None]:
svm.predict(X_test)

In [None]:
svm.predict_proba(X_test)[:5] #By Default, threshold=0.5

In [None]:
evaluate_model(svm, X_test, y_test, thresh=0.3) 

Both the models seem to give a similar F-score for threshold=0.3. Neither is a very good model for our data. 

In [None]:
thresholds=[0.2,0.3,0.4,0.5,0.6,0.7,0.8]

print("\nLOGISTIC REGRESSION\n")
for t in thresholds:
    print("For Threshold= ",t)
    evaluate(model, X_test, y_test, thresh=t)

In [None]:
thresholds=[0.2,0.3,0.4,0.5,0.6,0.7,0.8]

print("\nSVM\n")
for t in thresholds:
    print("For Threshold= ",t)
    evaluate(svm, X_test, y_test, thresh=t)

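Therefore, a threshold of 0.3 seems to be the best for both models. Note that this threshold was selected on the test set, so the scores reported for it are optimistic; ideally it should be chosen on a separate validation set. 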



In [None]:
t=0.3
print("\nLOGISTIC REGRESSION\n")
print("For Threshold= ",t)
evaluate(model, X_test, y_test, thresh=t)
print("\nSVM\n")
print("For Threshold= ",t)
evaluate(svm, X_test, y_test, thresh=t)

Neither model seems to perform particularly well, and the two perform almost the same. 