In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import tools
import plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import warnings
warnings.filterwarnings("ignore")

import os
# List every file found under the Kaggle input directory, one full path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)


# Exploratory Data Analysis - EDA

In [None]:
# Load the German credit data; the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer="../input/german-credit-data-with-risk/german_credit_data.csv",
    index_col=0,
)
df.head()

In [None]:
# Column dtypes and non-null counts, to spot missing values early.
df.info()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Print each column name on its own line.
for column_name in df.columns:
    print(column_name)

In [None]:
# For every column, print its name followed by the distinct values it contains.
for column_name, series in df.items():
    print(column_name)
    print(series.unique())
    

In [None]:
# Per-column count and percentage of missing values.
# The percentage is derived from the actual row count rather than a hard-coded
# 1000, so it stays correct if the data set changes size.
missing_values = pd.DataFrame({
    'Missing Values': df.isnull().sum(),
    'Missing % Values': df.isnull().sum() / len(df) * 100,
})
missing_values

In [None]:
# Column groups by type; `categorical` drives the imputation loop in the next cell.
numerical = ['Credit amount','Age','Duration']
categorical = ['Sex','Job','Housing','Saving accounts','Checking account','Purpose']
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Fill missing categorical entries with each column's most frequent value (mode),
# then re-count the remaining nulls.
# NOTE(review): a missing account value may mean "no account" rather than
# "unknown" in this data set — confirm before treating it as imputable.
for column in categorical:
    most_common = df[column].mode().iloc[0]
    df[column] = df[column].fillna(most_common)
df.isna().sum()

# Since we have a small data set, it is better to fill the missing values with the mode (most frequent value) of each column rather than removing rows or columns, which could bias the results and predictions.

In [None]:
# Re-check missing values after imputation.
# The percentage is derived from the actual row count rather than a hard-coded
# 1000, so it stays correct if the data set changes size.
missing_values = pd.DataFrame({
    'Missing Values': df.isnull().sum(),
    'Missing % Values': df.isnull().sum() / len(df) * 100,
})
missing_values

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Replace the spaces in column names with underscores so they are easier to reference.
column_renames = {
    "Credit amount": "Credit_amount",
    "Saving accounts": "Saving_accounts",
    "Checking account": "Checking_account",
}
df = df.rename(columns=column_renames)

In [None]:
# Display the frame to confirm the renamed columns.
df

In [None]:
# Re-check dtypes and non-null counts after imputation and renaming.
df.info()

In [None]:
# Credit-amount distributions for male applicants, female applicants and everyone.
male_credit = df.loc[df["Sex"] == "male", "Credit_amount"].values
female_credit = df.loc[df["Sex"] == "female", "Credit_amount"].values
total_credit = df['Credit_amount'].values

fig, ax = plt.subplots(1, 3, figsize=(15,5))

# sns.distplot is deprecated and scheduled for removal. Its documented
# replacement is histplot with a density-normalised histogram and a KDE overlay
# (cut=3 reproduces how far distplot extended the curve past the data).
panels = [
    (male_credit, "blue", "Male Credit Distribution"),
    (female_credit, "red", "Female Credit Distribution"),
    (total_credit, "green", "Total Credit Distribution"),
]
for axis, (values, colour, title) in zip(ax, panels):
    sns.histplot(values, kde=True, stat="density", kde_kws=dict(cut=3),
                 ax=axis, color=colour)
    axis.set_title(title, fontsize=15)
plt.show()

# The plots above show the distribution of Credit_amount for each gender and overall.

In [None]:
# Box plot comparing the age distribution (median, quartiles, outliers) of
# male and female applicants.
plt.figure(figsize=(14, 5))
gx = sns.boxplot(data=df, x='Sex', y='Age', palette="RdBu")
gx.set(title="Age vs Sex", xlabel="Sex", ylabel="Age")
plt.show()

In [None]:
# Number of loans per stated purpose.
# No KDE overlay: "Purpose" is a nominal category, so a smoothed density across
# its arbitrarily ordered values carries no meaning.
plt.figure(figsize=(15,5))
sns.histplot(data=df, x="Purpose", color="y")
plt.title("Loans by purpose")
plt.show()

In [None]:
# Count of applicants per housing status (own, rent or free).
plt.figure(figsize=(15, 5))
sns.histplot(df, x="Housing", color="b")

In [None]:
# Age histograms for male applicants, female applicants and all applicants,
# to compare how borrowers of each sex are spread across ages.
male_age = df.loc[df["Sex"] == "male", "Age"].values
female_age = df.loc[df["Sex"] == "female", "Age"].values
All_age = df['Age'].values
fig, ax = plt.subplots(1, 3, figsize=(15,5))
for axis, ages, colour, title in zip(
    ax,
    (male_age, female_age, All_age),
    ("blue", "red", "green"),
    ("Male Age", "Female Age", "All Age "),
):
    sns.histplot(ages, ax=axis, color=colour)
    axis.set_title(title, fontsize=15)
plt.show()

In [None]:
# Cross-tabulation of loan purpose against sex: number of applicants per combination.
# (A discarded `df["Purpose"].unique()` expression was removed; its result was never shown.)
sex_purpose = pd.crosstab(df['Purpose'], df['Sex'])
display(sex_purpose)

In [None]:
# Cross-tabulation of job category against sex: number of applicants per combination.
number_of_jobs = pd.crosstab(index=df["Job"], columns=df["Sex"])
number_of_jobs

In [None]:
# Bucket applicants into age groups. Intervals are right-inclusive:
#   (18, 29] -> Young, (29, 40] -> Young Adults, (40, 55] -> Senior, (55, inf) -> Elder
# Ages of 18 or below fall outside every bucket and stay NaN, exactly as before.
# NOTE(review): excluding age 18 itself looks unintentional — confirm the lower bound.
age_bins = [18, 29, 40, 55, np.inf]
age_labels = ['Young', 'Young Adults', 'Senior', 'Elder']

# pd.cut replaces the previous approach of creating a float NaN column and then
# overwriting it with strings (a dtype mismatch that newer pandas warns about).
# The cast to object keeps the column as plain strings rather than a categorical.
df['Age_Group'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels).astype(object)

# Show a preview instead of dumping the whole frame.
df.head()

# Finding correlated variables and variables affecting the target variable the most.

In [None]:
# Pearson correlation between the numeric columns.
# Non-numeric columns are dropped explicitly: pandas >= 2.0 raises on string
# columns in DataFrame.corr() instead of silently skipping them.
plt.figure(figsize=(12,10))
cor = df.select_dtypes(include="number").corr()
sns.heatmap(cor, annot=True)
plt.show()

In [None]:
# Distinct values of the target column before it is encoded.
df["Risk"].unique()

In [None]:
# Encode the target: good -> 1, bad -> 0.
# Only the Risk column is touched; a frame-wide replace would also rewrite any
# other column that happened to contain the strings 'good' or 'bad'.
# to_numeric guarantees a numeric dtype regardless of how replace infers it.
df["Risk"] = pd.to_numeric(df["Risk"].replace({"good": 1, "bad": 0}), errors="coerce")

In [None]:
# Store the target as a numeric dtype; entries that cannot be parsed become NaN.
# pd.to_numeric returns a new Series and does not modify df, so the result must
# be assigned back — previously it was computed and then discarded.
df["Risk"] = pd.to_numeric(df["Risk"], errors='coerce')
df["Risk"]

In [None]:
# Pearson correlation between the numeric columns, now including the encoded target.
# Non-numeric columns are dropped explicitly (pandas >= 2.0 raises on string
# columns in DataFrame.corr()). Risk is converted here as well, so it appears in
# the heatmap even if it is still stored with an object dtype.
numeric_df = df.select_dtypes(include="number").assign(
    Risk=pd.to_numeric(df["Risk"], errors="coerce")
)
plt.figure(figsize=(12,10))
cor = numeric_df.corr()
sns.heatmap(cor, annot=True)
plt.show()

#### Credit amount and Duration have a strong positive relationship: the greater the credit amount, the longer the duration.
#### Credit amount and Duration have a negative correlation with the target variable (Risk, encoded as good = 1, bad = 0), which means people who have larger loans carry a higher risk.
#### Loans with a longer duration tend towards bad risk.
#### The feature Purpose has no relation with the target variable (Risk).

# Applying several machine learning algorithms and assessing their accuracy, F1 score, precision and recall

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [None]:
# Work on a copy so the modelling steps read from df_clean rather than df.
df_clean=df.copy()

In [None]:
cat_features = ['Sex','Housing', 'Saving_accounts', 'Checking_account','Purpose']
num_features=['Age', 'Job', 'Credit_amount', 'Duration','Risk']

# One-hot encode all categorical columns in a single call. The previous loop
# repeated this identical computation once per feature and never used its
# loop variable.
dummies = pd.get_dummies(df_clean[cat_features])
df1 = pd.concat([df_clean[num_features], dummies], axis=1)

# Separate the target (Risk) from the feature matrix (df2).
Risk = df1['Risk']
df2 = df1.drop(['Risk'], axis=1)

## Input Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df2,
    Risk,
    test_size=0.20,
    random_state=30,
)

In [None]:
from yellowbrick.classifier import ConfusionMatrix, ClassificationReport, ROCAUC
from yellowbrick.features import FeatureImportances



def visualize(model):
    """Draw yellowbrick diagnostics for ``model`` in one row of three panels.

    Left to right the panels are the classification report, the ROC/AUC curves
    and the confusion matrix. Each visualizer is fitted on the notebook-level
    ``X_train``/``Y_train`` and scored on ``X_test``/``Y_test``.

    Args:
        model: the classifier passed to each yellowbrick visualizer.
    """
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    fig.subplots_adjust(wspace=0.7)
    report_ax, roc_ax, matrix_ax = axes

    # Processing order is ROC/AUC, then confusion matrix, then report.
    visualizers = (
        ROCAUC(model, ax=roc_ax, cmap='coolwarm'),
        ConfusionMatrix(model, cmap='PuOr', ax=matrix_ax),
        ClassificationReport(model, cmap='PuRd', ax=report_ax),
    )

    for visualizer in visualizers:
        visualizer.fit(X_train, Y_train)
        visualizer.score(X_test, Y_test)
        visualizer.finalize()

    plt.show()

# Logistic Regression

In [None]:
# Fit a logistic regression with scikit-learn's default settings, then keep both
# the hard class predictions and the class probabilities for the test set.
model = LogisticRegression().fit(X_train, Y_train)
predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)

In [None]:
# Test-set metrics for the logistic regression.
print ("\n Classification report : \n",classification_report(Y_test,predictions))
print ("Accuracy Score   : ",accuracy_score(Y_test,predictions))
# ROC AUC is a ranking metric: it must be computed from the positive-class
# probabilities. Passing hard 0/1 predictions collapses the curve to one point.
model_roc_auc = roc_auc_score(Y_test, model.predict_proba(X_test)[:, 1])
print ("Area under curve : ",model_roc_auc)

In [None]:
# Confusion matrix for a freshly fitted logistic regression
# (rows = actual class, columns = predicted class).
lr = LogisticRegression()
lr.fit(X_train, Y_train)
Y_test_pred = lr.predict(X_test)

# The result gets its own name: binding it to `confusion_matrix` would shadow the
# sklearn function and make every later confusion_matrix(...) call fail.
lr_conf_matrix = confusion_matrix(Y_test, Y_test_pred)
lr_conf_matrix

In [None]:
from sklearn.metrics import confusion_matrix
# sklearn expects (y_true, y_pred): rows are actual labels and columns are
# predictions, which is what the axis labels below state.
conf_matrix = confusion_matrix(Y_test, predictions)
sns.heatmap(conf_matrix,annot=True,fmt = "d",square = True)
plt.ylabel('Actual label')
plt.xlabel("Predicted")
plt.title("CONFUSION MATRIX",color = "grey")
plt.show()


In [None]:
# ROC curve for the logistic regression model.
y_pred_proba = model.predict_proba(X_test)[:,1]
# AUC is computed from the same positive-class probabilities that draw the
# curve, not from hard 0/1 predictions.
model_roc_auc = roc_auc_score(Y_test, y_pred_proba)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred_proba)
plt.plot([0,1],[0,1],'k--')  # chance-level diagonal
plt.plot(fpr,tpr, label='Logistic Regression')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC curve')
plt.legend()
print ("Area under curve : ",model_roc_auc)
plt.show()

In [None]:
# Classification report, ROC/AUC and confusion-matrix panels for the logistic regression.
visualize(model)

# KNN -  K-Nearest Neighbors Algorithm

In [None]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=15)
#Train the model using the training sets
knn.fit(X_train, Y_train)

#Predict the response for test dataset
predictions  = knn.predict(X_test)
# Probabilities must come from the KNN classifier itself; previously they were
# taken from `model` (the logistic regression), so the ROC points were not KNN's.
probabilities = knn.predict_proba(X_test)
fpr,tpr,thresholds = roc_curve(Y_test,probabilities[:,1])

In [None]:
# Test-set metrics for the KNN classifier.
print ("\n Classification report : \n",classification_report(Y_test,predictions))
print ("Accuracy Score   : ",accuracy_score(Y_test,predictions))
# ROC AUC is a ranking metric: compute it from KNN's positive-class
# probabilities rather than from hard 0/1 predictions.
model_roc_auc = roc_auc_score(Y_test, knn.predict_proba(X_test)[:, 1])
print ("Area under curve : ",model_roc_auc)

In [None]:
# Confusion matrix for KNN. sklearn expects (y_true, y_pred), giving
# rows = actual class and columns = predicted class.
conf_matrix = confusion_matrix(Y_test, predictions)
conf_matrix

In [None]:
# Confusion-matrix heatmap for KNN.
# sklearn expects (y_true, y_pred): rows = actual class, columns = predicted class.
conf_matrix = confusion_matrix(Y_test, predictions)
# Classes are ordered 0, 1, i.e. bad, good under the Risk encoding used above.
# The earlier "churn" tick labels did not belong to this data set.
risk_labels = ["bad", "good"]
sns.heatmap(conf_matrix,annot=True,fmt = "d",square = True,
            xticklabels=risk_labels,
            yticklabels=risk_labels,linewidths = 2,linecolor = "w",cmap = "Set1")
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.title("CONFUSION MATRIX",color = "Brown")
plt.show()

In [None]:
# ROC curve for the KNN classifier.
# Probabilities come from `knn`; using `model` here would redraw the
# logistic-regression curve under a KNN label.
y_pred_proba = knn.predict_proba(X_test)[:,1]
model_roc_auc = roc_auc_score(Y_test, y_pred_proba)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred_proba)
plt.plot([0,1],[0,1],'k--')  # chance-level diagonal
plt.plot(fpr,tpr, label='KNN')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC curve')
plt.legend()
print ("Area under curve : ",model_roc_auc)
plt.show()

In [None]:
# Classification report, ROC/AUC and confusion-matrix panels for the KNN classifier.
visualize(knn)

# Decision Tree Algorithm

In [None]:
#Decision Trees
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
# Entropy-based decision tree limited to depth 6.
# random_state is fixed (same seed as the train/test split) because scikit-learn
# permutes candidate features at each split, so an unseeded tree can differ
# between runs.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=6, random_state=30)
clf.fit(X_train,Y_train)

In [None]:
# Render the fitted decision tree.
# Feature and class names are passed so nodes read e.g. "Duration <= ..." and
# "class = good" instead of anonymous X[i] indices. Class order follows the
# Risk encoding (0 = bad, 1 = good).
plt.figure(figsize=(30,17))
plot_tree(
    clf,
    feature_names=list(X_train.columns),
    class_names=["bad", "good"],
    fontsize=9,
    filled=True,
)
plt.show()

In [None]:
# Predict class labels for the held-out test set with the fitted decision tree.
prediction_test=clf.predict(X_test) 

In [None]:
# Display the predicted labels.
prediction_test

In [None]:
# Confusion matrix for the decision tree. sklearn expects (y_true, y_pred),
# giving rows = actual class and columns = predicted class.
confusion_matrix(Y_test, prediction_test)

In [None]:
# Test-set metrics for the decision tree.
print ("\n Classification report : \n",classification_report(Y_test,prediction_test))
print ("Accuracy Score   : ",accuracy_score(Y_test,prediction_test))
# ROC AUC is a ranking metric: compute it from the tree's positive-class
# probabilities rather than from hard 0/1 predictions.
model_roc_auc = roc_auc_score(Y_test, clf.predict_proba(X_test)[:, 1])
print ("Area under curve : ",model_roc_auc)

In [None]:
# Confusion-matrix heatmap for the decision tree.
# sklearn expects (y_true, y_pred): rows are actual labels and columns are
# predictions, matching the axis labels below.
conf_matrix = confusion_matrix(Y_test, prediction_test)
sns.heatmap(pd.DataFrame(conf_matrix), annot=True, cmap="Greens" ,fmt='g')
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# ROC curve for the decision tree.
# Probabilities come from `clf`; using `model` here would redraw the
# logistic-regression curve.
y_pred_proba = clf.predict_proba(X_test)[:,1]
model_roc_auc = roc_auc_score(Y_test, y_pred_proba)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred_proba)
plt.plot([0,1],[0,1],'k--')  # chance-level diagonal
plt.plot(fpr,tpr, label='Decision Tree')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC curve')
plt.legend()
print ("Area under curve : ",model_roc_auc)
plt.show()

In [None]:
# Classification report, ROC/AUC and confusion-matrix panels for the decision tree.
visualize(clf)

# Gaussian NB 

In [None]:
#GaussianNB
from sklearn.naive_bayes import GaussianNB

In [None]:
# Train a Gaussian naive Bayes classifier on the training split.
model_nb = GaussianNB().fit(X_train, Y_train)

# Test-set class predictions, class probabilities and ROC curve points.
predictions_NB = model_nb.predict(X_test)
probabilities = model_nb.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(Y_test, probabilities[:, 1])

In [None]:
# Test-set metrics for Gaussian naive Bayes.
print ("\n Classification report : \n",classification_report(Y_test,predictions_NB))
print ("Accuracy Score   : ",accuracy_score(Y_test,predictions_NB))
# ROC AUC is a ranking metric: compute it from the model's positive-class
# probabilities rather than from hard 0/1 predictions.
model_roc_auc = roc_auc_score(Y_test, model_nb.predict_proba(X_test)[:, 1])
print ("Area under curve : ",model_roc_auc)

In [None]:
# Confusion-matrix heatmap for Gaussian naive Bayes.
# sklearn expects (y_true, y_pred): rows are actual labels and columns are
# predictions, matching the axis labels below.
conf_matrix = confusion_matrix(Y_test, predictions_NB)
sns.heatmap(pd.DataFrame(conf_matrix), annot=True, cmap="Greens" ,fmt='g')
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# ROC curve for Gaussian naive Bayes.
# Probabilities come from `model_nb`; using `model` here would redraw the
# logistic-regression curve.
y_pred_proba = model_nb.predict_proba(X_test)[:,1]
model_roc_auc = roc_auc_score(Y_test, y_pred_proba)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred_proba)
plt.plot([0,1],[0,1],'k--')  # chance-level diagonal
plt.plot(fpr,tpr, label='Gaussian NB')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC curve')
plt.legend()
print ("Area under curve : ",model_roc_auc)
plt.show()

In [None]:
# Classification report, ROC/AUC and confusion-matrix panels for Gaussian naive Bayes.
visualize(model_nb)

# Random Forest

In [None]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier

In [None]:

#Model Training
# random_state is fixed (same seed as the train/test split): a random forest
# draws bootstrap samples and feature subsets at random, so an unseeded model
# reports different scores on every run.
model_rfc = RandomForestClassifier(random_state=30)
model_rfc.fit(X_train, Y_train);

#Prediction
predictions_rfc = model_rfc.predict(X_test)
probabilities = model_rfc.predict_proba(X_test)
fpr,tpr,thresholds = roc_curve(Y_test,probabilities[:,1])

In [None]:
# Test-set metrics for the random forest.
print ("\n Classification report : \n",classification_report(Y_test,predictions_rfc))
print ("Accuracy Score   : ",accuracy_score(Y_test,predictions_rfc))
# ROC AUC is a ranking metric: compute it from the forest's positive-class
# probabilities rather than from hard 0/1 predictions.
model_roc_auc = roc_auc_score(Y_test, model_rfc.predict_proba(X_test)[:, 1])
print ("Area under curve : ",model_roc_auc)

In [None]:
# Confusion-matrix heatmap for the random forest.
# sklearn expects (y_true, y_pred): rows are actual labels and columns are
# predictions, matching the axis labels below.
conf_matrix = confusion_matrix(Y_test, predictions_rfc)
sns.heatmap(pd.DataFrame(conf_matrix), annot=True, cmap="Greens" ,fmt='g')
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# ROC curve for the random forest.
# Probabilities come from `model_rfc`; using `model` here would redraw the
# logistic-regression curve.
y_pred_proba = model_rfc.predict_proba(X_test)[:,1]
model_roc_auc = roc_auc_score(Y_test, y_pred_proba)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred_proba)
plt.plot([0,1],[0,1],'k--')  # chance-level diagonal
plt.plot(fpr,tpr, label='Random Forest')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC curve')
plt.legend()
print ("Area under curve : ",model_roc_auc)
plt.show()

In [None]:
# Classification report, ROC/AUC and confusion-matrix panels for the random forest.
visualize(model_rfc)

# XG BOOST

In [None]:
#XG BOOST
import xgboost as xgb

In [None]:
# Train an XGBoost classifier with the library's default settings.
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, Y_train)

# Test-set class predictions, class probabilities and ROC curve points.
predictions_xgb = model_xgb.predict(X_test)
probabilities = model_xgb.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(Y_test, probabilities[:, 1])

In [None]:
# Test-set metrics for XGBoost.
print ("\n Classification report : \n",classification_report(Y_test,predictions_xgb))
print ("Accuracy Score   : ",accuracy_score(Y_test,predictions_xgb))
# ROC AUC is a ranking metric: compute it from the model's positive-class
# probabilities rather than from hard 0/1 predictions.
model_roc_auc = roc_auc_score(Y_test, model_xgb.predict_proba(X_test)[:, 1])
print ("Area under curve : ",model_roc_auc)

In [None]:
# Confusion-matrix heatmap for XGBoost.
# sklearn expects (y_true, y_pred): rows are actual labels and columns are
# predictions, matching the axis labels below.
conf_matrix = confusion_matrix(Y_test, predictions_xgb)
sns.heatmap(pd.DataFrame(conf_matrix), annot=True, cmap="Blues" ,fmt='g')
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# ROC curve for XGBoost.
# Probabilities come from `model_xgb`; using `model` here would redraw the
# logistic-regression curve.
y_pred_proba = model_xgb.predict_proba(X_test)[:,1]
model_roc_auc = roc_auc_score(Y_test, y_pred_proba)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred_proba)
plt.plot([0,1],[0,1],'k--')  # chance-level diagonal
plt.plot(fpr,tpr, label='XGBoost')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC curve')
plt.legend()
print ("Area under curve : ",model_roc_auc)
plt.show()

In [None]:
# Classification report, ROC/AUC and confusion-matrix panels for XGBoost.
visualize(model_xgb)

# Ada - Boost

In [None]:
#Ada Boost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Train an AdaBoost classifier with scikit-learn's default settings.
model_adaboost = AdaBoostClassifier().fit(X_train, Y_train)

# Test-set class predictions, class probabilities and ROC curve points.
predictions_ada = model_adaboost.predict(X_test)
probabilities = model_adaboost.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(Y_test, probabilities[:, 1])

In [None]:
# Test-set metrics for AdaBoost.
print ("\n Classification report : \n",classification_report(Y_test,predictions_ada))
print ("Accuracy Score   : ",accuracy_score(Y_test,predictions_ada))
# ROC AUC is a ranking metric: compute it from the model's positive-class
# probabilities rather than from hard 0/1 predictions.
model_roc_auc = roc_auc_score(Y_test, model_adaboost.predict_proba(X_test)[:, 1])
print ("Area under curve : ",model_roc_auc)

In [None]:
# Confusion-matrix heatmap for AdaBoost.
# sklearn expects (y_true, y_pred): rows are actual labels and columns are
# predictions, matching the axis labels below.
conf_matrix = confusion_matrix(Y_test, predictions_ada)
sns.heatmap(pd.DataFrame(conf_matrix), annot=True, cmap="coolwarm" ,fmt='g')
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# ROC curve for AdaBoost.
# Probabilities come from `model_adaboost`; using `model` here would redraw the
# logistic-regression curve.
y_pred_proba = model_adaboost.predict_proba(X_test)[:,1]
model_roc_auc = roc_auc_score(Y_test, y_pred_proba)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred_proba)
plt.plot([0,1],[0,1],'k--')  # chance-level diagonal
plt.plot(fpr,tpr, label='AdaBoost')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC curve')
plt.legend()
print ("Area under curve : ",model_roc_auc)
plt.show()

In [None]:
# Classification report, ROC/AUC and confusion-matrix panels for AdaBoost.
visualize(model_adaboost)

# Conclusion

* The analysis above shows the classification reports, ROC curves and confusion matrices of all the algorithms used, which include:
1. Logistic Regression
2. KNN
3. Decision Trees
4. Gaussian NB
5. Random Forest
6. XG - Boost
7. Ada - Boost

* We obtained a different accuracy score for each of the models above, which shows that the modelling tests ran successfully.
Note: This is my first Kaggle notebook; if you have any suggestions or recommendations for improvement, please comment.
Looking forward to improving.