### WELCOME TO THE ANALYSIS OF A LENDING CLUB LOAN DATASET

<h5>1. Importing Relevant libraries and Modules</h5>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#from datetime import datetime

In [2]:
#Remove Warnings
import warnings
warnings.filterwarnings("ignore")

<h5>2. Load your CSV File</h5>

In [None]:
# Load the raw loan records from loan.csv into a DataFrame.
# NOTE(review): ISO-8859-1 is passed explicitly, presumably because the file
# contains bytes that are not valid UTF-8 — confirm against the data source.
loan_dataset = pd.read_csv("loan.csv", encoding= "ISO-8859-1")

In [None]:
#create a deep (fully independent) copy of this dataset so that our original is unaffected
loan_dataset_cp = loan_dataset.copy(deep = True)

#### Inspect the data to know what we have

In [None]:
loan_dataset_cp.shape

In [None]:
loan_dataset_cp.head()

#### Dropping Some Irrelevant Columns and Taking only the Ones we need

In [None]:
#let's know or let's see the columns we have in our dataset already

In [None]:
loan_dataset_cp.columns

In [None]:
# Keep the column labels in a plain Python list (a copy, independent of the
# DataFrame's Index) so they can be counted, searched and sorted in place.
loan_dataset_columns = list(loan_dataset_cp.columns)

In [None]:
len(loan_dataset_columns)

In [None]:
for column in loan_dataset_columns:
    print(column)

In [None]:
'acc_now_delinq' in loan_dataset_columns

In [None]:
loan_dataset_columns.sort()

In [None]:
for columns in loan_dataset_columns:
    print(columns)

In [None]:
#Rather than dropping, which is more tedious let's carry the ones we need

In [None]:
#let's just drop regardless

In [None]:
# Remove the columns listed below and rebind loan_dataset_cp to the result;
# every column not named here is kept.
# NOTE: DataFrame.drop raises KeyError by default if any listed label is
# absent from the DataFrame.
loan_dataset_cp = loan_dataset_cp.drop([
'acc_now_delinq'
,'acc_open_past_24mths'
,'all_util'
,'annual_inc_joint'
,'application_type'
,'avg_cur_bal'
,'bc_open_to_buy'
,'bc_util'
,'chargeoff_within_12_mths'
,'collection_recovery_fee'
,'collections_12_mths_ex_med'
,'delinq_2yrs'
,'delinq_amnt'
,'desc'
,'dti_joint'
,'earliest_cr_line'
,'home_ownership'
,'id'
,'il_util'
,'initial_list_status'
,'inq_fi'
,'inq_last_12m'
,'max_bal_bc'
,'member_id'
,'mo_sin_old_il_acct'
,'mo_sin_old_rev_tl_op'
,'mo_sin_rcnt_rev_tl_op'
,'mo_sin_rcnt_tl'
,'mort_acc'
,'mths_since_last_delinq'
,'mths_since_last_major_derog'
,'mths_since_last_record'
,'mths_since_rcnt_il'
,'mths_since_recent_bc'
,'mths_since_recent_bc_dlq'
,'mths_since_recent_inq'
,'mths_since_recent_revol_delinq'
,'num_accts_ever_120_pd'
,'num_actv_bc_tl'
,'num_actv_rev_tl'
,'num_bc_sats'
,'num_bc_tl'
,'num_il_tl'
,'num_op_rev_tl'
,'num_rev_accts'
,'num_rev_tl_bal_gt_0'
,'num_sats'
,'num_tl_120dpd_2m'
,'num_tl_30dpd'
,'num_tl_90g_dpd_24m'
,'num_tl_op_past_12m'
,'open_acc_6m'
,'open_il_12m'
,'open_il_24m'
,'open_il_6m'
,'open_rv_12m'
,'open_rv_24m'
,'out_prncp'
,'out_prncp_inv'
,'pct_tl_nvr_dlq'
,'percent_bc_gt_75'
,'policy_code'
,'pymnt_plan'
,'recoveries'
,'tax_liens'
,'title'
,'tot_coll_amt'
,'tot_cur_bal'
,'tot_hi_cred_lim'
,'total_bal_ex_mort'
,'total_bal_il'
,'total_bc_limit'
,'total_cu_tl'
,'next_pymnt_d'
,'total_il_high_credit_limit'
,'total_pymnt'
,'total_pymnt_inv'
,'total_rec_int'
,'total_rec_late_fee'
,'total_rec_prncp'
,'verification_status_joint'
,'total_rev_hi_lim'
,'last_pymnt_amnt'
,'last_credit_pull_d'
,'url'], axis='columns')

In [None]:
loan_dataset_cp.shape

In [None]:
loan_dataset_cp.head()

In [None]:
#let's gain more insight into our data

In [None]:
loan_dataset_cp.info()

In [None]:
#let's take a look at the memory usage alright

In [None]:
loan_dataset_cp.memory_usage()

In [None]:
type(loan_dataset_cp.memory_usage())

In [None]:
# Per-column memory footprint in kilobytes, computed from the DataFrame.
# NOTE(review): the original literal 317736 appears to be a per-column byte
# count copied from the memory_usage() output — confirm. Computing it keeps
# the figure correct if the data or the kept columns change.
loan_dataset_cp.memory_usage(index=False) / 1000

In [None]:
# Total memory taken by the DataFrame's columns, in megabytes.
# NOTE(review): the original 317/1000 * 26 looks like a truncated per-column
# size in KB multiplied by a hand-typed column count — confirm. Summing
# memory_usage() avoids both the truncation and the hard-coded count.
loan_dataset_cp.memory_usage(index=False).sum() / 1000 ** 2

#### Let's do a quick transformation using Apply/Map - Lambda
<p>This is actually a little bit of feature engineering.</p>

In [None]:
loan_dataset_cp.loan_status.head()

In [None]:
#let's create a new column called defaulted that holds 1 (True) if the loan was charged off, i.e. defaulted,
#and 0 (False) for every other loan status (not only fully paid loans)

In [None]:
# Flag charged-off loans: 1 where loan_status is exactly 'Charged Off', 0 for
# every other value (missing statuses compare unequal and also become 0).
# A vectorized comparison replaces the per-row lambda passed to map/apply;
# the explicit 'int64' keeps the integer width the same on every platform.
loan_dataset_cp['defaulted'] = (loan_dataset_cp['loan_status'] == 'Charged Off').astype('int64')

In [None]:
#observe, we can't see all the columns, let's set the pd options of the column so that we can see it

In [None]:
pd.options.display.max_columns = 30

In [None]:
loan_dataset_cp.head()

In [None]:
#Let's see some statistical analysis of this data

In [None]:
loan_dataset_cp.describe()

In [None]:
type(loan_dataset_cp.describe())

In [None]:
loan_dataset_cp.describe().loan_amnt

In [None]:
#general information of the dataset

In [None]:
loan_dataset_cp.info()

#### UNIVARIATE ANALYSIS

In [None]:
#Aim: To checkout the distribution of Loan Amounts and Funded Amounts

In [None]:
# List every column whose name ends with 'amnt' (the amount columns to plot).
for column_name in loan_dataset_cp.columns:
    if column_name.endswith('amnt'):
        print(column_name)

In [None]:
# Plot histograms of the loan_amnt and funded_amnt columns on the same axes
# so their distributions can be compared.
fig = plt.figure(figsize = (8,4))
plt.hist(x = [loan_dataset_cp.loan_amnt, loan_dataset_cp.funded_amnt], label = ['Loan Amount', 'Funded Amount'])
plt.xlabel("Loan Amount/Funded Amount")
plt.ylabel("The count of these")
# The previous title ended in "VS TITLE", which describes nothing on this plot.
plt.title("LOAN AMOUNT/FUNDED AMOUNT DISTRIBUTION")
plt.legend()
# plt.show() works with every backend; Figure.show() only warns on non-GUI
# backends such as the notebook inline one.
plt.show()

##### Just a pivot_table analysis

In [None]:
#let's compute the average funded amount for defaulters and non-defaulters over the 36- and 60-month terms

In [None]:
loan_dataset_cp.columns

In [None]:
loan_dataset_cp.pivot_table(index = 'defaulted', columns = 'term', values = 'funded_amnt', aggfunc = 'mean')