# Project Loans

### Predicting whether a loan will get approved or not

In [None]:
#import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Configure seaborn first: sns.set() resets matplotlib's rcParams (font.size
# included), so a plt.rc() override only takes effect when issued afterwards.
# One call is enough -- a preceding style="white" call would be fully
# overridden by style="whitegrid".
sns.set(style="whitegrid", color_codes=True)

# set font size to 20 (must come after sns.set, see above)
plt.rc("font", size=20)

# Read the training CSV; its first column is used as the row index
TRAIN_CSV = '../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv'
df = pd.read_csv(TRAIN_CSV, index_col=0)
df.head()

In [None]:
# Number of columns per dtype. DataFrame.get_dtype_counts() was deprecated in
# pandas 0.25 and removed in 1.0; dtypes.value_counts() is its replacement.
df.dtypes.value_counts()

In [None]:
# Discard every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

In [None]:
#Convert the text values in Education, Gender, Married, Self_Employed, Loan_Status,
#Property_Area and Dependents to integers. The keys are disjoint and no replacement
#value is itself a key, so one replace() call is equivalent to chaining one per mapping.
df = df.replace({
    "Graduate": 1, "Not Graduate": 0,
    "Female": 1, "Male": 0,
    "Yes": 1, "No": 0,
    "Y": 1, "N": 0,
    "Rural": 0, "Semiurban": 1, "Urban": 2,
    "3+": 3,
})

#Replacing only "3+" leaves Dependents as an object column that mixes the
#integer 3 with the remaining values still stored as text; cast the whole
#column so the model receives a uniformly numeric feature.
df["Dependents"] = pd.to_numeric(df["Dependents"])

#Education should now be 0 or 1 values in the dataframe
df.head()

In [None]:
# How many applications fall into each outcome class
df.Loan_Status.value_counts()

In [None]:
# Bar chart of the outcome class balance
sns.countplot(data=df, x='Loan_Status', palette='hls')

In [None]:
# Tally rejected (0) and approved (1) applications via boolean masks
count_no_loan = int((df['Loan_Status'] == 0).sum())
count_loan = int((df['Loan_Status'] == 1).sum())

# Share of approved applications among all decided ones
total_decided = count_no_loan + count_loan
pct_of_loans = count_loan / total_decided

print("Percentage of loans granted: ", pct_of_loans*100)
print("Percentage of loans not granted: ", (1-pct_of_loans)*100)

In [None]:
# Per-outcome mean of every numeric column. numeric_only=True is required on
# pandas >= 2.0, where mean() raises TypeError on non-numeric columns instead
# of silently dropping them as older versions did.
df.groupby('Loan_Status').mean(numeric_only=True)

In [None]:
%matplotlib inline
#Analyse whether property area is apredictor of loan approval
pd.crosstab(df.Property_Area,df.Loan_Status).plot(kind='bar')
plt.title('Approval frequency per property area')
plt.xlabel('Property Area')
plt.ylabel('Loan Status Frequency')

Property Area appears to be a good indicator of loan approval

In [None]:
# Is marital status a predictor of loan approval?
married_ax = pd.crosstab(df['Married'], df['Loan_Status']).plot(kind='bar')
married_ax.set_title('Approval frequency per marital status')
married_ax.set_xlabel('Married')
married_ax.set_ylabel('Loan Status Frequency')

Marital status appears to be a good indicator of loan approval.

In [None]:
# Stacked bar chart of loan outcome per gender
gender_ax = pd.crosstab(df['Gender'], df['Loan_Status']).plot(kind='bar', stacked=True)
gender_ax.set_title('Approval frequency per gender')
gender_ax.set_xlabel('Gender')
gender_ax.set_ylabel('Loan Status Frequency')

gender_df = df[['Gender', 'Loan_Status']]

# After encoding, Gender is 0 for male / 1 for female and Loan_Status is 1 when approved
is_male = gender_df['Gender'] == 0
is_female = gender_df['Gender'] == 1
is_approved = gender_df['Loan_Status'] == 1

male_approved_df = gender_df.loc[is_male & is_approved]
female_approved_df = gender_df.loc[is_female & is_approved]

# Approved applications divided by all applications of that gender
male_total = int((df['Gender'] == 0).sum())
female_total = int((df['Gender'] == 1).sum())
male_approval_rate = len(male_approved_df) / male_total
female_approval_rate = len(female_approved_df) / female_total

print("Percentage of male loans granted: ", male_approval_rate*100)
print("Percentage of female loans granted: ", female_approval_rate*100)

Although the male approval rate is slightly higher, gender does not appear to be a strong indicator of loan approval, as the approval rates are similar for both genders.

In [None]:
# Approval counts split by education level (1 = graduate, 0 = not graduate)
education_ax = pd.crosstab(df['Education'], df['Loan_Status']).plot(kind='bar')
education_ax.set_title('Approval frequency per education')
education_ax.set_xlabel('Education')
education_ax.set_ylabel('Loan Status Frequency')

Education appears to play a role, albeit not a very strong one. There is still a high rate of approval among the non-graduate applicants

In [None]:
# Histogram of applicant income, drawn once per loan outcome
df.groupby('Loan_Status')['ApplicantIncome'].hist()

In [None]:
# Histogram of co-applicant income, drawn once per loan outcome
df.groupby('Loan_Status')['CoapplicantIncome'].hist()

Applicant and co-applicant income do not appear to have an impact on approvals

In [None]:
# Histogram of the requested loan amount, drawn once per loan outcome
df.groupby('Loan_Status')['LoanAmount'].hist()

Loan amount does not appear to have a large impact on approvals.

In [None]:
# Approval counts per loan term. The title now names the variable actually
# plotted (it was copy-pasted from the education chart).
# NOTE(review): the x-label states the unit as days -- confirm the unit of
# Loan_Amount_Term against the dataset description.
pd.crosstab(df.Loan_Amount_Term,df.Loan_Status).plot(kind='bar')
plt.title('Approval frequency per loan term')
plt.xlabel('Loan Term (Days)')
plt.ylabel('Loan Status Frequency')

In [None]:
# Approval counts per credit-history flag. The title now names the variable
# actually plotted (it was copy-pasted from the education chart).
pd.crosstab(df.Credit_History,df.Loan_Status).plot(kind='bar')
plt.title('Approval frequency per credit history')
plt.xlabel('Credit History')
plt.ylabel('Loan Status Frequency')

Credit history is clearly an important predictor of loan approval.

In [None]:
# Columns fed to the model as predictors
feature_cols = [
    'Property_Area',
    'Married',
    'Dependents',
    'Education',
    'Gender',
    'ApplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

# Feature matrix and target vector (Loan_Status: 1 = approved, 0 = rejected)
X = df[feature_cols]
y = df['Loan_Status']

In [None]:
# split X and y into training and testing sets (75% / 25%, fixed seed)
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
# import LogisticRegression from sklearn
from sklearn.linear_model import LogisticRegression

# Build the classifier with default parameters and train it in one step
# (fit() returns the estimator itself)
logreg = LogisticRegression().fit(X_train, y_train)

# Predicted class for every row of the held-out split
y_pred = logreg.predict(X_test)

In [None]:
# import the metrics module
from sklearn import metrics

# Rows are the actual classes, columns the predicted classes
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

### Heatmap of confusion matrix

In [None]:
%matplotlib inline
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)


# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
# Report accuracy, precision and recall of the predictions, in that order
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(y_test, y_pred))

### ROC Curve
The Receiver Operating Characteristic (ROC) curve is a plot of the true positive rate against the false positive rate. It shows the tradeoff between sensitivity and specificity.

In [None]:
# Second column of predict_proba: the predicted probability of class 1
y_pred_proba = logreg.predict_proba(X_test)[:, 1]

# ROC points and the area under the curve
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

roc_label = "data 1, auc=" + str(auc)
plt.plot(fpr, tpr, label=roc_label)
plt.legend(loc=4)  # loc=4 is the lower-right corner
plt.show()

In [None]:
#Load and convert test data
test_df = pd.read_csv('../input/loan-prediction-problem-dataset/test_Y3wMUE5_7gLdaTN.csv', index_col=0)

#Mirror the training preprocessing: rows with missing values are dropped,
#since LogisticRegression.predict rejects NaN input.
test_df = test_df.dropna()

#Encode the categorical text values as integers with the same mapping used for
#the training data. The replacements must be applied to test_df itself --
#deriving them from the training frame `df` would silently substitute the
#training data for the test data.
test_df = test_df.replace({
    "Graduate": 1, "Not Graduate": 0,
    "Female": 1, "Male": 0,
    "Yes": 1, "No": 0,
    "Y": 1, "N": 0,
    "Rural": 0, "Semiurban": 1, "Urban": 2,
    "3+": 3,
})

#Replacing only "3+" leaves Dependents as a mixed text/integer object column;
#cast it so the feature is uniformly numeric.
test_df["Dependents"] = pd.to_numeric(test_df["Dependents"])

In [None]:
# Select the model's feature columns from test_df and predict their classes.
# NOTE(review): this rebinds X_test and y_pred, which the evaluation cells
# paired with y_test -- re-running those cells after this one would compare
# mismatched data.
X_test = test_df.loc[:, feature_cols]
y_pred = logreg.predict(X_test)