In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

For this notebook project, we will explore loan data from a platform that connects people who need money (borrowers) with people who have money (investors). Using the features in each borrower's profile, we try to predict whether the borrower will fully pay back the loan (the `not.fully.paid` column), which is what an investor needs to know before deciding to invest.


# 1. Import Libraries and Data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import cufflinks as cf
cf.go_offline()

In [None]:
# Read the loan data set into a DataFrame and preview its first three rows.
loan_csv_path = "../input/loan_data.csv"
df = pd.read_csv(loan_csv_path)
df.head(n=3)

# 2. Exploratory Data Analysis

In [None]:
# Column names, dtypes and non-null counts.
# There are 9578 rows and 14 columns in our dataset.
df.info()

Here is what the columns represent:
* credit.policy: 1 if the customer meets the credit underwriting criteria of LendingClub.com, and 0 otherwise.
* purpose: The purpose of the loan (takes values "credit_card", "debt_consolidation", "educational", "major_purchase", "small_business", and "all_other").
* int.rate: The interest rate of the loan, as a proportion (a rate of 11% would be stored as 0.11). Borrowers judged by LendingClub.com to be more risky are assigned higher interest rates.
* installment: The monthly installments owed by the borrower if the loan is funded.
* log.annual.inc: The natural log of the self-reported annual income of the borrower.
* dti: The debt-to-income ratio of the borrower (amount of debt divided by annual income).
* fico: The FICO credit score of the borrower.
* days.with.cr.line: The number of days the borrower has had a credit line.
* revol.bal: The borrower's revolving balance (amount unpaid at the end of the credit card billing cycle).
* revol.util: The borrower's revolving line utilization rate (the amount of the credit line used relative to total credit available).
* inq.last.6mths: The borrower's number of inquiries by creditors in the last 6 months.
* delinq.2yrs: The number of times the borrower had been 30+ days past due on a payment in the past 2 years.
* pub.rec: The borrower's number of derogatory public records (bankruptcy filings, tax liens, or judgments).

In [None]:
# Overall statistical summary of the data; include="all" asks for every column,
# not only the numeric ones.
summary_stats = df.describe(include="all")
summary_stats

* Here we create a histogram of two FICO distributions on top of each other, one for each credit.policy outcome.

In [None]:
# Overlay two FICO histograms on one figure: one for borrowers whose
# credit.policy is 1 and one for those whose credit.policy is 0, so that
# the credit scores of the two groups can be compared directly.
plt.figure(figsize=(15, 10))
policy_groups = (
    (1, "blue", "Credit Policy = 1"),
    (0, "red", "Credit Policy = 0"),
)
for policy_value, bar_colour, legend_label in policy_groups:
    in_group = df["credit.policy"] == policy_value
    df.loc[in_group, "fico"].hist(bins=50, alpha=0.4, color=bar_colour, label=legend_label)
plt.legend()

* This figure shows that people with a lower FICO score tend to have a credit policy of 0,
which means that they do not meet the criteria for borrowing money.

* People with a FICO score of 660 or lower do not meet the criteria.

In [None]:
# The same two histograms drawn with cufflinks' iplot, which produces
# interactive charts so that exact values can be read off the figure.
meets_policy = df["credit.policy"] == 1
fails_policy = df["credit.policy"] == 0
df.loc[meets_policy, "fico"].iplot(kind="hist", bins=24, colors="blue")
df.loc[fails_policy, "fico"].iplot(kind="hist", bins=24, colors="orange")

* Now we will create a similar figure, except this time we select by the not.fully.paid column, which is our target column.

In [None]:
# Overlay the FICO histograms of the two not.fully.paid groups (1 and 0)
# on a single labelled figure.
plt.figure(figsize=(15, 10))
unpaid_mask = df["not.fully.paid"] == 1
paid_mask = df["not.fully.paid"] == 0
df.loc[unpaid_mask, "fico"].hist(bins=30, color="blue", alpha=0.6, label="not fully paid = 1")
df.loc[paid_mask, "fico"].hist(bins=30, color="red", alpha=0.6, label="not fully paid = 0")
plt.xlabel("FICO")
plt.title("The FICO credit score of the borrower")
plt.legend()

* The figure shows that the majority of people pay back these loans.

* Fully paid and not fully paid loans have almost the same distribution shape, but the loans that were not fully paid tend to have lower FICO scores.

* Here we create a countplot using seaborn showing the counts of loans by purpose, with the color hue defined by not.fully.paid.

In [None]:
# Bar chart of the number of loans per purpose, with each purpose split
# by the value of not.fully.paid.
plt.figure(figsize=(15, 10))
sns.countplot(
    data=df,
    x="purpose",
    hue="not.fully.paid",
    palette="Set1",
)
plt.title("The Counts of Loans by Purpose")
plt.legend()

* Based on the countplot of loan purposes, we can say that debt consolidation is the most popular reason for taking a loan.

* Secondly, the ratio of fully paid to not fully paid loans is similar across the different loan purposes.

* Here we will see the trend between FICO score and interest rate with the following jointplot

In [None]:
# Joint distribution of FICO score (x) and interest rate (y).
sns.jointplot(
    data=df,
    x="fico",
    y="int.rate",
    space=0.2,
    color="green",
)

* The figure above shows that as the FICO score increases, the interest rate decreases: people with better credit get lower interest rates, and vice versa.

* Here we create the following lmplots to see whether the trend differs between the not.fully.paid and credit.policy groups.

In [None]:
# Regression plots of interest rate against FICO score: one panel per
# not.fully.paid value (col), coloured by credit.policy (hue).
sns.lmplot(
    data=df,
    x="fico",
    y="int.rate",
    hue="credit.policy",
    col="not.fully.paid",
    palette="Set1",
)

* The col parameter of lmplot lets us create more than one plot, one for each distinct value in the given column.

* For example, the "not.fully.paid" column has just two values, so we get two separate plots because we passed "not.fully.paid" as the col parameter.

* This gives us a more detailed version of the previous plot and shows a more complex relationship between the columns in a single figure.

# 3. Feature Engineering:



In [None]:
# Look again at the overall information about the data before engineering features.
df.info()

* We need to deal with categorical columns.

* We have one categorical column: **purpose**.

* This means we need to transform the values in this column into dummy variables so that sklearn will be able to understand them.

In [None]:
# Replace the categorical "purpose" column with dummy/indicator columns.
# drop_first=True leaves out the indicator for the first category.
cat_feature = ["purpose"]
final_data = pd.get_dummies(
    data=df,
    columns=cat_feature,
    drop_first=True,
)
final_data.info()

get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None) -> 'DataFrame'
    Convert categorical variable into dummy/indicator variables.

It turns a categorical variable into a series of zeros and ones, which makes them a lot easier to quantify and compare.

In [None]:
# Preview the encoded frame: the categorical purpose column has been replaced
# by indicator (dummy) columns, one new column per encoded category.
final_data.head(n=5)

## 4. Splitting the Data and Training the Decision Tree Model

* Now it's time to split our data into a training set and a testing set before applying the algorithm.

* We use sklearn to split our data into a training set and a testing set.

In [None]:
# Target: the "not.fully.paid" column.
y = final_data["not.fully.paid"]
# Features: every remaining column of the encoded frame.
X = final_data.drop(columns="not.fully.paid")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set.
# random_state pins the shuffle so that the split, and every metric computed
# from it, is the same on each Restart & Run All. stratify=y keeps the
# proportion of the two not.fully.paid classes equal in the training and
# test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Create the decision tree and fit it to the training data.
# random_state fixes the estimator's internal randomness (used when choosing
# among candidate splits) so that the fitted tree is reproducible.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

* Here we create an instance of DecisionTreeClassifier() called dtree and fit it to the training data.

## 5. Predictions and Evaluation of Decision Tree Model

We will make predictions from the test set and create a classification report and a confusion matrix to compare the results.

In [None]:
# Predict the test-set labels with the decision tree, then plot how many
# test rows were assigned to each predicted class.
predictions = dtree.predict(X_test)
plt.figure(figsize=(15, 10))
# Pass the values through the x= keyword: the first positional argument of
# countplot is `data` in current seaborn, so a positional array is not
# interpreted as the variable to count.
sns.countplot(x=predictions, palette="Set1")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Text report of the decision tree's test-set classification metrics.
dtree_report = classification_report(y_test, predictions)
print(dtree_report)

* According to the classification report:

  Precision = True Positives / Total Predicted Positives. The macro average of precision (the unweighted mean per label) is 0.54 (85% for 0's and 22% for 1's); the weighted average (the support-weighted mean per label) is 0.75.
            



In [None]:
# Confusion matrix of the decision tree's predictions against the true test labels.
dtree_confusion = confusion_matrix(y_test, predictions)
print(dtree_confusion)

According to confusion matrix; 

            -True Negatives: 2019
            -False Positive:398
            -False Negative:345
            -True Positive:112

*The results are not good, so we will try Random Forest Model and compare the results with Decision Tree Model

## 6. Training the Random Forest model

* Now it's time to train our new model.

* We will create an instance of the RandomForestClassifier class and fit it to our training data from the previous step.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 300 trees.
# random_state fixes the bootstrap sampling and per-split feature sampling so
# that the fitted forest, and therefore its reported metrics, are reproducible.
rfc = RandomForestClassifier(n_estimators=300, random_state=42)

In [None]:
# Fit the random forest on the training set, then predict the labels of the
# test set. fit() returns the fitted estimator, so the two calls can be chained.
rfc_predictions = rfc.fit(X_train, y_train).predict(X_test)

## 7. Predictions and Evaluation



Now we will create a classification report and a confusion matrix from the results.

In [None]:
# Random forest evaluation on the test set: classification report first,
# then a block of blank lines as a visual separator, then the confusion matrix.
rfc_report = classification_report(y_test, rfc_predictions)
rfc_confusion = confusion_matrix(y_test, rfc_predictions)
print(rfc_report)
print(5*"\n")
print(rfc_confusion)

* When we compare the results of the two models, the Random Forest model performs better overall than the Decision Tree model.

* However, when it comes to the recall and F1 score for the target class 1, the Decision Tree model performs far better than the Random Forest model.

* Therefore, before choosing an algorithm, we have to keep in mind our priorities and the pros and cons of the different ML models.

**Show the Confusion Matrix for the predictions.**

**What performed better the random forest or the decision tree?**

# Great Job!