In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import plotly_express as px
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings("ignore")
import datetime
import os
from sklearn.preprocessing import StandardScaler

from matplotlib import pyplot 
%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the loan payments CSV into the notebook-wide `data` frame used by every later cell.
data = pd.read_csv('../input/loandata/Loan payments data.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

# **Data Cleaning, Understanding & Visualising**

In [None]:
# Column dtypes and non-null counts before any cleaning.
data.info()

The aim is to replace NaN values for the columns in accordance with their distribution.

In [None]:
# Number of missing values per column.
data.isnull().sum()

Empty values in the data are filled as "0" and "1" for past_due_days and paid_off_time.

In [None]:
# Fill the two columns that contain gaps: 0 for past_due_days and the
# placeholder 1 for paid_off_time.
# NOTE(review): 1 is a marker, not a timestamp; it needs care when paid_off_time is parsed as a date.
data = data.fillna({'past_due_days': 0, 'paid_off_time': 1})
data.isna().sum()

In [None]:
# (rows, columns) of the frame.
data.shape

In [None]:
# Last five rows of the frame.
data.tail(5)

In [None]:
# Count of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Summary statistics, transposed so each described column becomes a row.
data.describe().T

From Analysis:
There are no duplicated rows in the data. The loan data set has 500 records and 11 columns/features. There are 100 null values in the "paid_off_time" feature and 300 null values in "past_due_days". Some columns also need to be converted to a datetime datatype: effective_date, due_date and paid_off_time are therefore converted to datetime format.

In [None]:
# Parse the loan dates into datetime64 so that date arithmetic works downstream.
data['effective_date'] = pd.to_datetime(data['effective_date'])
data['due_date'] = pd.to_datetime(data['due_date'])

# Unpaid loans can carry the integer placeholder 1 written by the earlier fillna(1).
# It is not a timestamp (it would parse as 1970-01-01 or make to_datetime raise),
# so turn it back into a missing value before parsing; those rows become NaT.
paid_off_raw = data['paid_off_time']
paid_off_raw = paid_off_raw.mask(paid_off_raw == 1)
# normalize() keeps only the calendar day, as the former .dt.date round-trip did.
data['paid_off_time'] = pd.to_datetime(paid_off_raw).dt.normalize()

In [None]:
# Re-check the dtypes after the date columns were converted.
data.info()


In [None]:
# One histogram per column that DataFrame.hist can plot, drawn in teal.
data.hist(figsize = (15,11), color="#008080")

# Loan Status

In [None]:
# Number of loans in each status, shown as a one-column table.
loan_status = data['loan_status'].value_counts()
loan_status.to_frame()

In [None]:
# Bar chart of how many loans fall into each status.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=data, x='loan_status', ax=ax)
ax.set_title('Loan Status Distribution', fontsize=20)
ax.set_xlabel('Loan Status', fontsize=16)
ax.set_ylabel('Count', fontsize=16)

In [None]:
# Pie chart of the loan status shares.
# Sizes and labels are both taken from the same value_counts() result so each
# wedge is guaranteed to carry its own label (unique() returns first-appearance
# order, which need not match the count-sorted order of value_counts()).
status_counts = data['loan_status'].value_counts()
# Pull out the second wedge, whatever the number of statuses turns out to be.
explode = [0.1 if position == 1 else 0 for position in range(len(status_counts))]

plt.figure(figsize = [10,5])
plt.pie(status_counts, labels=status_counts.index, explode=explode, startangle=145,
        autopct='%1.f%%', colors=['#1e847f', '#ecc19c', '#000000'])
plt.title('Loan Status Distribution',fontsize = 15)
plt.show()

In [None]:
# Relation between loan_status and past_due_days
%matplotlib inline
plt.figure(figsize = [9,5])
sns.boxplot(x='loan_status', y='past_due_days', data=data)
plt.xlabel('Loan Status', fontsize=16)
plt.ylabel('Past Due Days', fontsize=16)
plt.show()

Observations:
It can be clearly seen that, out of 500 people, 300 (60%) repaid the full amount on time. "Collection paid off" shows that 100 people (20%) repaid the loan, but late, after the due date. "Collection" shows that 100 people (20%) have not repaid the loan.



# **Principal Analysis**


In [None]:
# How often each principal amount occurs, shown as a one-column table.
Principal = data['Principal'].value_counts()
Principal.to_frame()

In [None]:
# Loans per principal amount, split by loan status.
# x/hue are passed as keyword column names: newer seaborn releases treat the
# first positional argument as `data`, so the positional-vector form breaks.
plt.figure(figsize = [10,5])
sns.countplot(x='Principal', hue='loan_status', data=data,
              palette=['#1e847f', '#ecc19c', '#000000'])
plt.legend(loc='upper left')
plt.title('Principal vs Loan Status',fontsize=20)
plt.xlabel('Principal', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
# Principal frequencies within each loan status, as a table.
p = data.groupby('loan_status')['Principal'].value_counts()
p.to_frame()

In [None]:
# Distribution of the principal amounts over 40 bins.
# NOTE(review): seaborn documents distplot as deprecated in favour of histplot/displot - confirm
# against the installed seaborn version before upgrading.
fig=plt.figure(figsize=(12,6))
sns.distplot(data['Principal'], bins=40)

Observation:

Majority of the people have opted for "Principal" of 800 and 1000.
It can be seen that most of the principal amount is at 1000 USD



# Term Analysis


In [None]:
# How often each loan term occurs, shown as a one-column table.
Terms = data['terms'].value_counts()
Terms.to_frame()

In [None]:
# Loans per term length, split by loan status.
# x/hue are passed as keyword column names: newer seaborn releases treat the
# first positional argument as `data`, so the positional-vector form breaks.
plt.figure(figsize = [10,5])
sns.countplot(x='terms', hue='loan_status', data=data,
              palette=['#1e847f', '#ecc19c', '#000000'])
plt.legend(loc='upper left')
plt.title('Terms vs Loan Status',fontsize=20)
plt.xlabel('Terms', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
# Whole days between the loan's effective date and the day it was paid off.
# Both timestamps are truncated to midnight first so the result is a day count.
paid_day = pd.DatetimeIndex(data['paid_off_time']).normalize()
start_day = pd.DatetimeIndex(data['effective_date']).normalize()
data['Days to pay'] = (paid_day - start_day) / np.timedelta64(1, 'D')

In [None]:
# Pay-off timestamp truncated to midnight (the calendar day only).
data['paid_off_date'] = pd.DatetimeIndex(data['paid_off_time']).normalize()

In [None]:
# Number of loans per "Days to pay" value, coloured by loan term.
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(data=data, x='Days to pay', hue='terms', ax=ax)
# Rotate the tick labels so the many day values stay legible.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

In [None]:
# Same "Days to pay" chart, restricted to loans whose status is PAIDOFF.
paid_off_loans = data[data['loan_status'] == 'PAIDOFF']
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(data=paid_off_loans, x='Days to pay', hue='terms', ax=ax)
# Rotate the tick labels so the many day values stay legible.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

In [None]:
# PAIDOFF loans that nevertheless took more than 30 days to pay.
tmp = data.loc[(data['Days to pay'] > 30) & (data['loan_status'] == 'PAIDOFF')]
# The placeholders were previously printed verbatim; fill them with the
# status being checked and the number of matching rows.
print("{}: Incorrect status: {} observations".format('PAIDOFF', len(tmp)))
print(tmp[['loan_status', 'terms', 'effective_date', 'due_date', 'paid_off_time']])

Observation:

From above graph it can be seen that only few people have opted loan for 7 days term. Majority of the late payments are from people who have their loan terms as 15 days and 30 days. 

Most of the applications are having monthly mode as people do get monthly wages mostly from where they would like to pay off for the loan amount.


# Effective Date Analysis

In [None]:
# Number of loans issued on each effective date, as a one-column table.
Effective_Date = data['effective_date'].value_counts()
Effective_Date.to_frame()

In [None]:
# Loan status frequencies within each effective date, as a table.
status_per_date = data.groupby('effective_date')['loan_status'].value_counts()
pd.DataFrame(status_per_date)

In [None]:
# Loans per effective date, split by loan status.
fig, ax = plt.subplots(figsize=(10, 5))
# Use plain dates so the tick labels carry no time component.
dates = data['effective_date'].dt.date
sns.countplot(x=dates, hue=data['loan_status'],
              palette=('#1e847f', '#ecc19c', '#000000'), ax=ax)
ax.legend(loc='upper right')
ax.set_title('Effective Date vs Loan Status', fontsize=20)
ax.set_xlabel('Effective Date', fontsize=16)
ax.set_ylabel('Count', fontsize=16)
plt.show()

In [None]:
# Loans per effective date, split by principal amount, with the count written on each bar.
# x is passed by keyword: a positional vector together with data= gives
# countplot two values for `data` on newer seaborn releases.
fig = plt.figure(figsize=(15,5))
ax = sns.countplot(x=data['effective_date'].dt.date, hue='Principal', data=data)
for p in ax.patches:
    height = p.get_height()
    # Skip empty or NaN-height patches so no stray labels are drawn.
    if height > 0:
        ax.annotate(int(height), (p.get_x(), height*1.01))

Observation:
The loan data is for Sept 8th to 14th. With most of the loans originating/drawn on Sept 11. Sep 10,11 and 12 make for the bulk of the business for the week. 400+ loans out of a total of 500 for the week. Most of people who paid latety(or yet to pay) are from these 2 days.

The majority of loans seem to be paid off on time. The loans that were paid after moving to collections are a bit confusing, as their graphs look identical to those of the paid-off loans. The loans with a 7-day credit period are paid off without any adverse behavior.

# **Age Analysis**

In [None]:
# Number of borrowers at each age, as a one-column table.
Age = data['age'].value_counts()
Age.to_frame()

In [None]:
# Loans per borrower age, split by loan status.
# x/hue are passed as keyword column names: newer seaborn releases treat the
# first positional argument as `data`, so the positional-vector form breaks.
plt.figure(figsize = [18,7])
sns.countplot(x='age', hue='loan_status', data=data,
              palette=['#1e847f', '#ecc19c', '#000000'])
plt.legend(loc='upper left')
plt.title('Age vs Loan Status',fontsize=20)
plt.xlabel('Age', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
# Mean and (first) modal age of the borrowers in each loan status.
for i in data['loan_status'].unique():
    # Select the ages once and reuse them for both statistics.
    ages = data.loc[data['loan_status'] == i, 'age']
    agemean = ages.mean()
    agemode = ages.mode()
    # The message no longer embeds a backspace control character, which
    # notebook front ends render inconsistently.
    print("average age of people whose loan status is '{0}': {1:2.2f} and mode is {2}".format(i, agemean, agemode[0]))
    
    
    

In [None]:
# Interactive scatter of age (log-scaled x axis) against days past due;
# marker size follows the loan term and colour follows the loan status.
px.scatter(data, x="age", y="past_due_days", size ="terms" ,color="loan_status",
           hover_data=['Gender','Principal'], log_x=True, size_max=8)

Observation:

The majority of the people who took a loan are aged between 24 and 38 years. The majority of people repaid their loan.

# Education Analysis
******

Lets move further and see how education affects the loan amount and payment status ?

In [None]:
# Correct the misspelled category value 'Bechalor' to 'Bachelor'.
data['education']= data['education'].replace('Bechalor','Bachelor')

In [None]:
# Number of borrowers per education level, as a one-column table.
Education = data['education'].value_counts()
Education.to_frame()

In [None]:
# Loan status frequencies within each education level, as a table.
EducationvsLoanStatus = data.groupby('education')['loan_status'].value_counts()
EducationvsLoanStatus.to_frame()

In [None]:
# Loans per education level, split by loan status.
# x/hue are passed as keyword column names: newer seaborn releases treat the
# first positional argument as `data`, so the positional-vector form breaks.
plt.figure(figsize = [10,5])
sns.countplot(x='education', hue='loan_status', data=data,
              palette=['#1e847f', '#ecc19c', '#000000'])
plt.legend(loc='upper right')
plt.title('Education vs Loan Status',fontsize=20)
plt.xlabel('Education', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
# Borrowers per education level, split by gender, with the count written on each bar.
# x is passed by keyword: a positional vector together with data= gives
# countplot two values for `data` on newer seaborn releases.
ax = sns.countplot(x='education', hue='Gender', data=data)
for p in ax.patches:
    height = p.get_height()
    # Skip empty or NaN-height patches so no stray labels are drawn.
    if height > 0:
        ax.annotate(int(height), (p.get_x(), height*1.01))

In [None]:
# One count panel per education level showing the loan status breakdown.
# Rows without a loan status are excluded before plotting.
# x is passed by keyword: given positionally it collides with data= on
# newer seaborn releases, where the first positional parameter is `data`.
g = sns.catplot(x="loan_status", col="education", col_wrap=4,
                 data=data[data.loan_status.notnull()],
                 kind="count", height=12.5, aspect=.6)

Observations:

The majority of loan takers are from College, followed by High School. Very few people with a Masters or Above background took a loan. Irrespective of education category, most of them repaid their loan. 

From the above visualisation it can be deduced that mostly college or high school students apply for the loan and pay it back well in time, so they are better candidates for loans by banks.

However for Bachelors degree students - Loan applications are less and return is also not so rewarding means a bit less preferable to return the money by themselves.

Lastly - Very few loan applications for students going for Masters and above degree which is a valid insight as very few people (from crowd) opt for masters degree or higher (bit costly than others) so less applicants for loan. Such candidates pay off well and very few people take time to pay back to bank.

# Gender Analysis

In [None]:
# Number of borrowers per gender, as a one-column table.
Gender = data['Gender'].value_counts()
Gender.to_frame()

In [None]:
# Number of borrowers per gender.
plt.figure(figsize = [10,5])
sns.countplot(x='Gender', data=data)
plt.title('Gender Count',fontsize=20)
# Gender is on the x axis and the bar height is the count; the two labels
# were previously swapped.
plt.xlabel('Gender', fontsize=16)
plt.ylabel('Count', fontsize=16)

In [None]:
# Loan status frequencies within each gender, laid out as a single wide row.
GendervsLoan = data.groupby('Gender')['loan_status'].value_counts()
GendervsLoan.to_frame().T

In [None]:
# Loans per gender, split by loan status.
# x/hue are passed as keyword column names: newer seaborn releases treat the
# first positional argument as `data`, so the positional-vector form breaks.
plt.figure(figsize = [10,5])
sns.countplot(x='Gender', hue='loan_status', data=data,
              palette=['#1e847f', '#ecc19c', '#000000'])
plt.legend(loc='upper right')
plt.title('Gender vs Loan Status',fontsize=20)
plt.xlabel('Gender', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
# Bar per principal amount showing the age estimate seaborn computes, split by gender.
ax = sns.barplot(data=data, x="Principal", y="age", hue="Gender")
ax.legend(loc="upper right")

In [None]:
## exploring gender +education
# Combine gender and education into one "<gender>_<education>" key, then
# cross-tabulate it against loan status with row/column totals.
gender_education = data['Gender'] + "_" + data['education']
pd.crosstab(data['loan_status'], gender_education, margins=True)

In [None]:
# Pairwise relationship grid over the frame's plottable columns, coloured by gender.
sns.pairplot(data, hue='Gender')

Observations:
Out of 500 borrowers there are 423 males and 77 females. The graphs in this section show that females account for a smaller share of the loans.
Almost 60% of the male population have repaid their loan and almost 40% paid lately or not yet. Almost 70% of the female population have repaid their loan and almost 30% paid lately or not yet. Irrespective of gender, most of the population tend to pay the loan on time

Below visualizations is best example as it clearly shows that Male candidates do have more Loan applications compared to Female candidates and most of the candidates who apply for the loan are in higher side of Principal Amount means they wish to go for better studies.



# Correlation

In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# Restricting to numeric dtypes first keeps corr() from raising on the
# string and datetime columns under pandas >= 2.0.
correlation = data.select_dtypes(include='number').corr()
sns.heatmap(correlation, vmin=-1, vmax=1, annot=True)

In [None]:
# Correlation table of the numeric columns; non-numeric columns are
# excluded first because corr() raises on them under pandas >= 2.0.
data.select_dtypes(include='number').corr()

# Conclusion for Data Analysis

In [None]:
## exploring demographic
# 3x2 grid: age (row 1), education (row 2) and gender (row 3).
fig, axs = plt.subplots(3, 2, figsize=(20, 15))
(ax_age, ax_age_status), (ax_edu, ax_edu_status), (ax_gender, ax_gender_edu) = axs

# Row 1: overall age distribution, then age spread per loan status.
sns.distplot(data['age'], ax=ax_age)
ax_age.set_title("Total age distribution across dataset")

sns.boxplot(data=data, x='loan_status', y='age', ax=ax_age_status)
ax_age_status.set_title("Age distribution by loan status")

# Row 2: education counts, then the same counts split by loan status.
sns.countplot(data=data, x='education', ax=ax_edu)
ax_edu.set_title("Education count")

sns.countplot(data=data, x='education', hue='loan_status', ax=ax_edu_status)
ax_edu_status.set_title("Education by loan status")
ax_edu_status.legend(loc='upper right')

# Row 3: gender counts, then gender split by education level.
sns.countplot(data=data, x='Gender', ax=ax_gender)
ax_gender.set_title(" Gender")

sns.countplot(data=data, x='Gender', hue='education', ax=ax_gender_edu)
ax_gender_edu.set_title("Education of the gender")

Conclusion:

	It can be clearly seen that 60% (300) people repaid the full amount of loan. 
	20% of the people have not repaid the loan.
	20% of the people have repaid the loan, but late, after the due date.
	Almost 60% of the male population have repaid their loan and almost 40% paid lately or not yet.
	60% of the people have repaid the loan on time. 
	Almost 70% of the female population have repaid their loan and almost 30% paid lately or not yet.
	Majority of the people who took loan have age ranging from 24 years to 38 years. Most of the elder people (35 - 50 years) have paid back loan on time.
	Majority of the loan takers are from High School or College background. Majority of the people have opted for Principal of 800 and 1000.
	Only few people have opted loan for 7 days term.
	Majority of the late payments are from people who have their loan terms as 15 days and 30 days.
	On 11th and 12th September, loan was given to many people maybe as part of a drive.

