# <center><u> Credit DataSet </u></center>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected = True)
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split
from sklearn import metrics
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score,recall_score
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
import statsmodels.api as sm
import plotly.tools as tls

## 1. Data Loading

In [None]:

df =  pd.read_csv('../input/loadpred/train_AV3.csv')

## 2. Data Description

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

## 3. Data Preparation

### 3.1 Handling Null and Incorrect Values

In [None]:
df.isnull().sum()

#### Gender

In [None]:
df.Gender.unique()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute access: the chained in-place form is not guaranteed to update df.
df['Gender'] = df['Gender'].fillna(df['Gender'].mode().values[0])

In [None]:
df.Gender.isnull().sum()

#### Married

In [None]:
df.Married.unique()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute access: the chained in-place form is not guaranteed to update df.
df['Married'] = df['Married'].fillna(df['Married'].mode().values[0])

In [None]:
df.Married.isnull().sum()

#### Dependents

In [None]:
df.Dependents.unique()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute access: the chained in-place form is not guaranteed to update df.
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode().values[0])

In [None]:
df.Dependents.isnull().sum()

#### Self Employed

In [None]:
df.Self_Employed.unique()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute access: the chained in-place form is not guaranteed to update df.
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode().values[0])

In [None]:
df.Self_Employed.isnull().sum()

#### Loan Amount

In [None]:
df.LoanAmount.unique()

In [None]:
# Missing loan amounts are replaced by the column mean rounded to a whole number.
# Assign the result back rather than relying on a chained fillna(inplace=True).
df['LoanAmount'] = df['LoanAmount'].fillna(round(df['LoanAmount'].mean(), 0))

In [None]:
df.LoanAmount.isnull().sum()

#### Loan Amount Term

In [None]:
df.Loan_Amount_Term.unique()

In [None]:
# Missing terms are replaced by the column mean rounded to a whole number.
# Assign the result back rather than relying on a chained fillna(inplace=True).
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(round(df['Loan_Amount_Term'].mean(), 0))

In [None]:
df.Loan_Amount_Term.isnull().sum()

#### Credit History

In [None]:
df.Credit_History.unique()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute access: the chained in-place form is not guaranteed to update df.
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode().values[0])

In [None]:
df.Credit_History.isnull().sum()

In [None]:
df.isnull().sum()

## <center> 4. Uni-Variate Analysis</center>

The purpose of the univariate analysis here is only to get an idea of the distribution of each attribute in the data set.

In [None]:
def categorical_plots(var, data):
    """Draw a count plot, a pie chart and a line plot for one categorical column.

    Parameters
    ----------
    var : pandas.Series
        Column to summarise; ``var.name`` is used to build the x-axis labels.
    data : pandas.DataFrame
        Frame forwarded to ``sns.countplot`` as its ``data`` argument.

    Returns
    -------
    The return value of ``plt.show()`` (kept so existing callers are unaffected).
    """
    # Compute the frequency table once; the pie chart and the line plot both use it.
    counts = var.value_counts()
    axis_label = var.name + ' Distribution'

    # Wide figure with extra horizontal space between the three panels.
    fig = plt.figure(figsize=(15,5))
    fig.subplots_adjust(wspace=0.7)

    # 1st plot: bar chart of category counts.
    plt.subplot(1,3,1)
    sns.countplot(x=var, data=data)
    plt.xticks(rotation=45, horizontalalignment='right')
    plt.xlabel(axis_label)

    # 2nd plot: pie chart with the most frequent slice pulled out.
    # The explode tuple is sized from `counts` (the values actually drawn) rather
    # than from var.unique(): unique() also counts NaN, which value_counts() drops,
    # so the two lengths disagree whenever the column still contains nulls.
    explode1 = tuple(0.1 if position == 0 else 0 for position in range(len(counts)))
    ax1 = plt.subplot(1,3,2)
    ax1.pie(counts, labels=counts.index, autopct='%1.1f%%', shadow=True, explode=explode1)
    ax1.axis('equal')
    plt.xlabel(axis_label)

    # 3rd plot: counts ordered by category label, drawn as a line.
    plt.subplot(1,3,3)
    counts.sort_index().plot.line()
    plt.xticks(rotation=45, horizontalalignment='right')
    plt.xlabel(axis_label)

    show = plt.show()

    return show



In [None]:
# For numerical plots we will be using the following function

def numerical_plots(var):
    """Show three side-by-side views of a numeric column.

    The panels are, left to right: a histogram with a KDE overlay, a box plot,
    and a histogram without the KDE. Each panel's x-axis is labelled with
    ``var.name`` followed by ' Distribution'.

    Parameters
    ----------
    var : pandas.Series
        Numeric column to plot.

    Returns
    -------
    The return value of ``plt.show()``.
    """
    axis_label = var.name + ' Distribution'

    # Wide, short canvas with a modest gap between panels.
    figure = plt.figure(figsize=(15,4))
    figure.subplots_adjust(wspace=0.3)

    # Panel 1: histogram with KDE curve.
    plt.subplot(1,3,1)
    sns.distplot(var, color='b')
    plt.xlabel(axis_label)

    # Panel 2: box plot.
    plt.subplot(1,3,2)
    sns.boxplot(y=var)
    plt.xlabel(axis_label)

    # Panel 3: histogram only (KDE switched off).
    plt.subplot(1,3,3)
    sns.distplot(var, color='b', kde=False)
    plt.xlabel(axis_label)

    return plt.show()


In [None]:
categorical_plots(df.Gender, df)

In [None]:
categorical_plots(df.Married, df)

In [None]:
categorical_plots(df.Dependents,df)

In [None]:
categorical_plots(df.Education,df)

In [None]:
categorical_plots(df.Self_Employed,df)

In [None]:
numerical_plots(df.ApplicantIncome)

In [None]:
numerical_plots(df.CoapplicantIncome)

In [None]:
numerical_plots(df.LoanAmount)

In [None]:
categorical_plots(df.Loan_Amount_Term,df)

In [None]:
categorical_plots(df.Credit_History,df)

In [None]:
categorical_plots(df.Property_Area,df)

In [None]:
categorical_plots(df.Loan_Status, df)

## <center> 5. Bi-Variate Analysis</center>

<b>Our main target is to find out which customers are eligible for a loan. Therefore we will plot bar charts of every categorical attribute against Loan Status and box plots of every numerical attribute against Loan Status.</b>

For the purpose of better understanding of Credit History attribute relation with Target variable we will categorize it into Yes and No.

In [None]:
df.Credit_History = np.where(df.Credit_History== 1., 'Yes','No')
df.Credit_History.unique()

### 5.1. Barplots for Categorical Attributes

In [None]:
# One count plot per object-typed column (Loan_ID excluded), bars split by Loan_Status.
categorical_columns = [name for name in df.columns
                       if name != 'Loan_ID' and df[name].dtype == 'O']
for column_name in categorical_columns:
    sns.countplot(x=df[column_name], hue=df.Loan_Status)
    plt.show()

<b> 5.1.1 Results: </b>
    
Gender: There are more males than females who have been granted Loan.

Marital Status: From the plots it seems that people who are married have a greater probability of getting a loan.
    
Dependents: From the plots it seems that people who have no dependents are the one who have mostly applied for the loan. And interestingly are also in majority who have been granted loan.

Education: Majority of the people who applied for loan are graduates and also have a higher probability of getting loan as compared to non graduates.

Self Employed: A large portion of the people who applied for loan are of salaried class and majority of them have been granted loan.

Credit History: It is pretty clear from the plot that applicants who have a credit history are very likely to be granted a loan.

Property Area: Customers residing in Semiurban area seem to have a higher probability of getting loans.





### 5.2 Box plots for Numeric Attributes

In [None]:
# One box plot per non-object column, grouped by Loan_Status on the x-axis.
numeric_columns = [name for name in df.columns if not df[name].dtype == 'O']
for column_name in numeric_columns:
    sns.boxplot(y=df[column_name], x=df.Loan_Status)
    plt.show()

<b>5.2.1 Results: </b> Numerical attributes don't seem to have a noticeable relation with the target variable. This will be further confirmed in the correlation matrix.

## 6. Feature Selection

To find if any categorical attribute is independent of our target variable (Loan Status), we will calculate chi square statistics.

### 6.1. Chi Square Statistics

In [None]:
df.select_dtypes(include='O').columns

In [None]:
import scipy.stats as s

In [None]:
def chi2(data,target,alpha):
    """Run a chi-square test of independence between each object column and the target.

    For every object-typed column of ``data`` other than ``target`` this builds
    a contingency table against ``target``, draws it as a stacked bar chart,
    prints the test statistics and states whether the null hypothesis of
    independence is rejected at the given significance level.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the candidate columns and the target column.
    target : str
        Name of the target column.
    alpha : float
        Significance level used for the critical value and the p-value check.

    Returns
    -------
    None
    """
    # Iterate over the frame that was passed in. The previous version looped over
    # the notebook-level `df`, so the `data` argument was ignored when selecting columns.
    for col in data.columns:
        if data[col].dtype != 'O' or col == target:
            continue

        # Observed frequencies: rows are categories of `col`, columns are target classes.
        ov = pd.crosstab(data[col], data[target])
        plt.style.use('ggplot')
        ov.plot(kind='bar', figsize=(5,5), stacked=True)
        plt.xlabel(col.title())

        # chi2_contingency returns (statistic, p-value, degrees of freedom, expected).
        chi2_s, p_value, dof = s.chi2_contingency(ov)[:3]
        critical_value = s.chi2.ppf(q=1-alpha, df=dof)

        print('\n\033[1m\033[4m', col.upper(),':\033[0m \n')
        print('Significance Level = ', alpha)
        print('Degree of Freedom = ', dof)
        print('chi2 = ', chi2_s)
        print('Critical Value = ',critical_value)
        print('p-value = ', p_value)

        if chi2_s >= critical_value or p_value <= alpha:
            print('\nWe reject the null hypotheses, there is a relationship between the two variables \n')
        else:
            print('\nThere is no relationship between the two variables and the null hypotheses is retained \n')

        plt.show()

In [None]:
chi2(df, 'Loan_Status', 0.05)

In [None]:
df.drop(columns=['Loan_ID'], inplace=True)

In [None]:
df.info()

## 6.2 Pearson Correlation of Target Variable and Numerical Attributes

For numerical attributes we will check the correlation of each with the target variable. For this we will first manually encode our target variable to 0 and 1.

#### 6.2.1 Manual Encoding of Target Variable

In [None]:
df.Loan_Status.value_counts()

In [None]:
df.Loan_Status.dtype

In [None]:
df.Loan_Status = np.where(df.Loan_Status=='Y', 1, 0)

In [None]:
df.Loan_Status.value_counts()

#### 6.2.2 Pearson Correlation

In [None]:
# df still contains object columns at this point (Gender, Married, ...).
# Restrict to numeric columns explicitly so corr() does not depend on the
# pandas version's handling of non-numeric data.
correlation = df.select_dtypes(include='number').corr()
correlation.Loan_Status

In [None]:
# Full Pearson correlation matrix over the numeric columns only (object columns excluded).
df.select_dtypes(include='number').corr()

<b> Based on chi square statistics and Pearson correlation matrix we can drop some attributes but first we will train and evaluate model without dropping any and later on evaluate model using selected features. </b>

## 7. Pre-processing

### 7.1 Feature Encoding and Discretization

In [None]:
df.head()

For all categorical variables that have more than two unique values we will one-hot encode them, and the rest we will encode manually.

#### 7.1.1 Manual Encoding

In [None]:
# Binary flags: 1 marks the category named in the comparison, 0 the other one.
# NOTE: this cell overwrites its own inputs, so running it a second time would
# turn every column into 0 -- re-run from the data-loading cell if needed.
df['Gender'] = np.where(df['Gender'] == 'Male', 1, 0)
df['Married'] = np.where(df['Married'] == 'Yes', 1, 0)
df['Education'] = np.where(df['Education'] == 'Graduate', 1, 0)
# 1 now means "self employed", matching the column name and the other Yes -> 1
# flags (it was previously inverted: 'No' -> 1).
df['Self_Employed'] = np.where(df['Self_Employed'] == 'Yes', 1, 0)
df['Credit_History'] = np.where(df['Credit_History'] == 'Yes', 1, 0)

#### 7.1.2 One Hot Encoding

In [None]:
df.Dummies = pd.get_dummies(df.Property_Area)

In [None]:
df.Dummies.head()

In [None]:
df = pd.concat([df,df.Dummies], axis=1)

In [None]:
df.head()

In [None]:
df.drop(columns=['Property_Area'], inplace=True)

In [None]:
df.head()

In [None]:
from sklearn.preprocessing import StandardScaler
zscore = StandardScaler()

In [None]:
# The '3+' category has to become a number before the column can be used as a
# numeric feature. np.where alone leaves an object column that mixes the int 4
# with the strings '0', '1', '2', so cast the result to int to get one
# consistent numeric column.
df['Dependents'] = np.where(df.Dependents == '3+', 4, df.Dependents).astype(int)

In [None]:
# Continuous columns to be standardised with the StandardScaler instance `zscore`.
cols = ['ApplicantIncome', 'CoapplicantIncome','LoanAmount','Loan_Amount_Term']
for i in cols:
    # The scaler is re-fitted on each column in turn, using every row of df
    # (this runs before the train/test split further down the notebook).
    df[i] = zscore.fit_transform(df[[i]])

In [None]:
x = df
df.head()

In [None]:
df1 =df.copy() # Saving a copy of dataframe to be utilized later on to see the effects of normalization and feature selection on model

# <center><u>8. Machine Learning Models </u></center>

## 8.1 Logistic Regression

### 8.1.1 Transforming Features and Target Variable into Arrays

In [None]:
y = df['Loan_Status'] #Separating Target Variable
# Build the feature frame with a non-mutating drop so that df keeps its
# Loan_Status column and this cell can be re-run without a KeyError.
x = df.drop(columns=['Loan_Status'])
x = x.to_dict(orient='records')

from sklearn.feature_extraction import DictVectorizer
# `vec` is reused later in the notebook for the selected-feature model.
vec = DictVectorizer()
x = vec.fit_transform(x).toarray()
x

In [None]:
y = np.asarray(y)

### 8.1.2 Train Test Split

In [None]:
# We will use this split data for all algorithms
xtrain,xtest,ytrain,ytest =train_test_split(x,y,test_size=0.2, random_state=0)

### 8.1.3 Applying Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
model_LG =LogisticRegression()
model_LG.fit(xtrain,ytrain);

In [None]:
y_pred_LG = model_LG.predict(xtest)
probabilities = model_LG.predict_proba(xtest)
fpr,tpr,thresholds = roc_curve(ytest,probabilities[:,1])

In [None]:
# Defining a function to be used for evaluation of all algorithms
def evaluation(algorithm, y_true=None, proba=None):
    """Print classification metrics and plot the ROC curve and confusion matrix.

    Parameters
    ----------
    algorithm : array-like
        Predicted class labels for the test set.
    y_true : array-like, optional
        True labels. Defaults to the notebook-level ``ytest``.
    proba : array-like, optional
        Class-probability matrix whose column 1 is used as the positive-class
        score. Defaults to the notebook-level ``probabilities``.

    Returns
    -------
    None
        Output is printed and rendered with ``py.iplot``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if y_true is None:
        y_true = ytest
    if proba is None:
        proba = probabilities
    positive_scores = proba[:,1]

    #Classification Report
    print ("\n \033[1m Classification report : \033[0m\n",classification_report(y_true,algorithm ))

    #Accuracy
    print ("\033[1mAccuracy Score   : \033[0m",accuracy_score(y_true, algorithm))

    #conf_matrix
    conf_matrix = confusion_matrix(y_true,algorithm)

    #roc_auc_score
    # Computed from the positive-class scores, the same values the ROC curve
    # below is drawn from, so the reported area matches the plotted curve
    # (it was previously computed from the hard 0/1 predictions).
    model_roc_auc = round(roc_auc_score(y_true, positive_scores),3)
    print ("\033[1mArea under curve : \033[0m",model_roc_auc)
    fpr,tpr,thresholds = roc_curve(y_true,positive_scores)

    # roc curve plot
    trace1 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : " + str(model_roc_auc),
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2),
                       )
    #confusion matrix plot
    trace2 = go.Heatmap(z = conf_matrix ,
                        x = ["Not Granted","Granted"],
                        y = ["Not Granted","Granted"],
                        colorscale = "Viridis",name = "matrix" )
    #subplots
    fig = tls.make_subplots(rows=1, cols=2, horizontal_spacing = 0.40,subplot_titles=('ROC Curve','Confusion Matrix'))

    fig.append_trace(trace1,1,1)
    fig.append_trace(trace2,1,2)


    fig['layout'].update(showlegend=False, title="Model performance" ,
                         autosize = False,height = 400,width = 800,
                         plot_bgcolor = 'rgba(240,240,240, 0.95)',
                         paper_bgcolor = 'rgba(240,240,240, 0.95)',
                         xaxis = dict(title = "false positive rate",
                                 gridcolor = 'rgb(255, 255, 255)',
                                 domain=[0, 0.6],
                                 ticklen=5,gridwidth=2),
                        yaxis = dict(title = "true positive rate",
                                  gridcolor = 'rgb(255, 255, 255)',
                                  zerolinewidth=1),
                        margin = dict(b = 20))

    py.iplot(fig)


### 8.1.4 Logistic Regression Evaluation

In [None]:
# evaluation() already prints the classification report and the accuracy score,
# so printing them here as well only duplicated the output.
evaluation(y_pred_LG)

### 8.1.5 Actual vs Prediction

In [None]:
data = pd.DataFrame({'Actual': ytest.flatten(), 'Predicted': y_pred_LG.flatten()})
data.head(10)

#### 8.1.5.1 Actual vs Predicted Graph

In [None]:
data = data.head(20)
data.plot(kind='bar',figsize=(15,5))
plt.title('Actual vs Predicted')
plt.grid(which='major', linestyle=':', linewidth='0.99', color='black')
plt.show()

## 8.2 Logistic Regression (Features Selected)

In [None]:
df1.head()

### 8.2.1 Dropping Attributes

In [None]:
# df1 is the normalized dataframe saved earlier
y2 =df1['Loan_Status']
df1.drop(columns=['Gender','Dependents','Self_Employed','ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term','Loan_Status'], inplace=True)

df1.columns

### 8.2.2 Transforming into arrays

In [None]:
x2=df1
x2.head()

In [None]:
x2=df1
x2 = x2.to_dict(orient='records')
x2 =vec.fit_transform(x2).toarray()
y2=np.asarray(y2)

### 8.2.3 Train Test Split

In [None]:
xtrain2,xtest2,ytrain2,ytest2 =train_test_split(x2,y2,test_size=0.2, random_state=0)

### 8.2.4 Applying Logistic Regression (Selected Features)

In [None]:
#Logistic Regression model training
model_LG.fit(xtrain2,ytrain2); 

In [None]:
#Prediction
y_pred_LG2 = model_LG.predict(xtest2)
probabilities = model_LG.predict_proba(xtest2)
# Score against ytest2, the labels that belong to the xtest2 split used above.
fpr,tpr,thresholds = roc_curve(ytest2,probabilities[:,1])

### 8.2.5 Logistic Regression Evaluation (Selected Features)

In [None]:
evaluation(y_pred_LG2)

## 8.3 KNN Algorithm

### 8.3.1 Applying KNN 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

#Model Traning
model_knn = KNeighborsClassifier()
model_knn.fit(xtrain,ytrain);


In [None]:
#Prediction
y_pred_knn = model_knn.predict(xtest)
probabilities = model_knn.predict_proba(xtest)
fpr,tpr,thresholds = roc_curve(ytest,probabilities[:,1])

### 8.3.2 Knn Evaluation

In [None]:
evaluation(y_pred_knn)

## 8.4 Naive Bayes Classifier

### 8.4.1 Applying Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
#Model Training
model_nb = GaussianNB()
model_nb.fit(xtrain, ytrain);

In [None]:
# Model Prediction
y_pred_nb = model_nb.predict(xtest)
probabilities = model_nb.predict_proba(xtest)
fpr,tpr,thresholds = roc_curve(ytest,probabilities[:,1])

### 8.4.2 Naive Bayes Evaluation

In [None]:
evaluation(y_pred_nb)

## 8.5 Decision Tree Classifier

### 8.5.1 Applying Decision Tree

In [None]:
print(xtrain.shape,xtest.shape,ytrain.shape,ytest.shape)

In [None]:
from sklearn import tree

# Model Traning
model_DT = tree.DecisionTreeClassifier()
model_DT.fit(xtrain,ytrain)
y_pred_DT = model_DT.predict(xtest)
probabilities = model_DT.predict_proba(xtest)
fpr,tpr,thresholds = roc_curve(ytest,probabilities[:,1])

### 8.5.2 Decision Tree Evaluation

In [None]:
evaluation(y_pred_DT)

## 8.6. SVM Classifier

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

### 8.6.1. Applying SVM Classifier

In [None]:

svm_classifier = SVC(kernel='rbf', random_state=0, probability=True)
svm_classifier.fit(xtrain,ytrain);

In [None]:
y_pred_svm = svm_classifier.predict(xtest)
probabilities = svm_classifier.predict_proba(xtest)
fpr,tpr,thresholds = roc_curve(ytest,probabilities[:,1])

### 8.6.2 SVM Evaluation

In [None]:
evaluation(y_pred_svm)

## 8.7 Random Forest

### 8.7.1 Applying Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
model = rfc.fit(xtrain, ytrain)


In [None]:
y_pred_rfc = rfc.predict(xtest)
probabilities = rfc.predict_proba(xtest)
fpr,tpr,thresholds = roc_curve(ytest,probabilities[:,1])

### 8.7.2 Random Forest Evaluation

In [None]:
evaluation(y_pred_rfc)

## 8.8 Model Metrics Comparison

In [None]:
from sklearn.metrics import f1_score

def model_report(model,training_x,testing_x,training_y,testing_y,name) :
    """Fit ``model`` and return its test-set metrics as a one-row DataFrame.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    training_x, training_y : array-like
        Data the model is fitted on.
    testing_x, testing_y : array-like
        Data the metrics are computed on.
    name : str
        Label stored in the "Model" column.

    Returns
    -------
    pandas.DataFrame
        One row with the columns Model, Accuracy_score, Recall_score,
        Precision, f1_score and Area Under Curve.
    """
    model.fit(training_x,training_y)
    predictions  = model.predict(testing_x)

    # ROC AUC is a ranking metric, so feed it continuous scores when the model
    # offers them; hard 0/1 predictions are only used as a last resort.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(testing_x)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(testing_x)
    else:
        scores = predictions

    accuracy     = accuracy_score(testing_y,predictions)
    recallscore  = recall_score(testing_y,predictions)
    precision    = precision_score(testing_y,predictions)
    f1score      = f1_score(testing_y,predictions)
    ROC          = roc_auc_score(testing_y,scores)

    # Named `report` rather than `df` to avoid shadowing the notebook's main frame.
    report = pd.DataFrame({"Model"           : [name],
                           "Accuracy_score"  : [accuracy],
                           "Recall_score"    : [recallscore],
                           "Precision"       : [precision],
                           "f1_score"        : [f1score],
                           "Area Under Curve": [ROC]
                           })
    return report

# Each call refits the given estimator and returns a one-row metrics frame.
model1 = model_report(model_LG,xtrain,xtest,ytrain,ytest,"Logistic Reg. ")
model2 = model_report(model_LG,xtrain2,xtest2,ytrain2,ytest2,"Log.Reg.Selected Feat.")
model3 = model_report(rfc,xtrain,xtest,ytrain,ytest,"Random Forest")
model4 = model_report(model_knn,xtrain,xtest,ytrain,ytest,"KNN Classifier")
model5 = model_report(model_nb,xtrain,xtest,ytrain,ytest,"Naive Bayes")
model6 = model_report(model_DT,xtrain,xtest,ytrain,ytest,"Decision Tree")
model7 = model_report(svm_classifier,xtrain,xtest,ytrain,ytest,"SVM Classifier")

# Stack the rows and renumber them 0..n-1, discarding the old per-frame index.
report_rows = [model1, model2, model3, model4, model5, model6, model7]
model_performances = pd.concat(report_rows, axis=0).reset_index(drop=True)

# Render the metrics, rounded to four decimals, as a plotly table.
table = ff.create_table(model_performances.round(4))

py.iplot(table)

## <center>------End------</center>