In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec # Alignments 

import seaborn as sns # theme & dataset
# Report the plotting library versions in use for this run.
print(f"Matplotlib Version : {mpl.__version__}")
print(f"Seaborn Version : {sns.__version__}")


# Apply the ggplot style sheet to the figures created below.
plt.style.use('ggplot')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Location of the loan dataset inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/loan-data-for-dummy-bank/loan_final313.csv'

# Load the loan table; the cells below all work from this `df`.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

### Data Description 

* emp_length_int : Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.
* home_ownership : The home ownership status provided by the borrower during registration. Our values are: RENT, OWN, MORTGAGE, OTHER.
* income_category : Categorized Income (Low, Medium, High) 
* annual_inc : The self-reported annual income provided by the borrower during registration.

* loan_amount : The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.
* term : The number of payments on the loan. Values are in months and can be either 36 or 60.

* application_type : Indicates whether the loan is an individual application or a joint application with two co-borrowers
* purpose : A category provided by the borrower for the loan request.
* interest_payments : Categorized interest payments (Low, High)
* loan_condition : Condition of the Loan [TARGET] (Good Loan = 0 , Bad Loan = 1)
* interest_rate :  Interest Rate on the loan
* grade : LC assigned loan grade
* dti : A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.
* total_pymnt : Payments received to date for total amount funded
* total_rec_prncp : Principal received to date
* recoveries : post charge off gross recovery
* installment : The monthly payment owed by the borrower if the loan originates.
* region : region of Loan being executed

### Data Description (Korean)

* year : 대출 발생 연도
* issue_d : 대출 발생 일자
* final_d : 마지막 거래일자
* emp_length_int : "근속년수. 0은 1년 미만, 10은 10년 이상"
* home_ownership : "등록 시 대출자에게서 제공된 집 보유 상태. RENT(대여) = 1, OWN(소유) = 2, MORTGAGE(담보대출) = 3 "
* home_ownership_cat : ;;
* income_category : "수익 Low = 1, Medium = 2, High = 3 으로 분류"
* annual_inc : 등록시 대출자에게서 제공된 연간 소득
* income_cat : ;;
* loan_amount : 대출금액(달러)
* term : "대출기간(36개월 = 1, 60개월 = 2)"
* term_cat : ;;
* application_type : "개인 대출 신청(=1) 인지, 2명의 대출자에 의해 공동으로 신청된 대출 신청 (=2)인지 여부"
* application_type_cat : ;;
* purpose : 대출이유
* purpose_cat : "대출용도(빚 청산, 카드 대금 결제, 집 개발 등등) 
[credit_card = 1, car = 2, small_business = 3, other = 4, wedding = 5, debt_consolidation =6, 
home_improvement = 7, major_purchase = 8, medical = 9, moving = 10, vacation = 11, house =12,
renewable_energy = 13,  educational = 14]"
* interest_payments : "이자 지불? (Low = 1, High =2 로 분류)"
* interest_payments_cat : ;;
* loan_condition : 대출의 상태(TARGET) (Good Loan = 0 , Bad Loan = 1)
* loan_condition_cat : ;;
* interest_rate : 대출의 이자율
* grade : 대출 등급 ( A ~ G , 1~7)
* grade_cat : ;;
* dti : 금융부채 상환능력을 소득으로 따져서 대출한도를 정하는 계산비율
* total_pymnt : 총 상환금액
* total_rec_prncp : 현재까지 상환된 원금
* recoveries : 회수
* installment : 분할 불입금
* region : 거래지역


In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Raise the default figure resolution for the plots created after this cell.
plt.rcParams['figure.dpi'] = 200

In [None]:
# List every column name available in the table.
df.columns

In [None]:
# Print dtypes, non-null counts and memory usage for each column.
df.info()

In [None]:
# Number of missing values in each column.
missing_mask = df.isna()
missing_mask.sum()

In [None]:
# Check how imbalanced the TARGET is: number of rows per loan_condition value.
df['loan_condition'].value_counts()

In [None]:
# Look at grade 'G' rows to see which value the matching `_cat` column holds for them.
is_grade_g = df['grade'] == 'G'
df3 = df.loc[is_grade_g]
df3['grade_cat'].head()

## Data Visualization

In [None]:
# Target distribution. Left: share of each loan_condition value as a pie chart.
# Right: the raw row count per value.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# explode has two entries, so this call expects exactly two loan_condition values;
# the second slice is pulled out of the pie.
df['loan_condition'].value_counts().plot.pie(
    explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True
)
ax[0].set_title('loan_condition')
ax[0].set_ylabel('')
# The column is passed as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='loan_condition', data=df, ax=ax[1])
ax[1].set_title('loan_condition')
plt.show()

**Categorical Features in the dataset**: home_ownership, term, application_type, purpose, region.

**Ordinal Features in the dataset** :emp_length_int, grade, interest_payments, income_category

**Continuous Features in the dataset**: loan_amount, annual_inc, interest_rate, dti, total_pymnt, recoveries, installment

## Analysing The Features

## Categorical Features

#### : home_ownership,  term, application_type, purpose, region.


### home_ownership 

: 등록 시 대출자에게서 제공된 집 보유 상태. RENT(대여) = 1, OWN(소유) = 2, MORTGAGE(담보대출) = 3 

In [None]:
# Number of loans for every (home_ownership, loan_condition) combination.
df.groupby(['home_ownership','loan_condition'])['loan_condition'].count()

In [None]:
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# Left: mean of loan_condition_cat per home_ownership value
# (presumably the bad-loan rate, if loan_condition_cat encodes Bad Loan as 1 — confirm).
df[['home_ownership', 'loan_condition_cat']].groupby(['home_ownership']).mean().plot.bar(ax=ax[0])
ax[0].set_title('loan_condition vs home_ownership')
# Right: loan counts per home_ownership value, split by loan_condition_cat.
# The column is passed as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='home_ownership', hue='loan_condition_cat', data=df, ax=ax[1])
ax[1].set_title('home_ownership:Bad Loan vs Good loan')
plt.show()

### term 

: 대출기간(36개월 = 1, 60개월 = 2)

In [None]:
# Number of loans for every (term, loan_condition) combination.
df.groupby(['term','loan_condition'])['loan_condition'].count()

In [None]:
# Cross-tabulate term against loan_condition (with totals) and shade the cells.
term_vs_condition = pd.crosstab(index=df['term'], columns=df['loan_condition'], margins=True)
term_vs_condition.style.background_gradient(cmap='summer_r')

In [None]:
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# Left: mean of loan_condition_cat per term
# (presumably the bad-loan rate, if loan_condition_cat encodes Bad Loan as 1 — confirm).
df[['term', 'loan_condition_cat']].groupby(['term']).mean().plot.bar(ax=ax[0])
ax[0].set_title('loan_condition vs term')
# Right: loan counts per term, split by loan_condition_cat.
# The column is passed as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='term', hue='loan_condition_cat', data=df, ax=ax[1])
ax[1].set_title('term:Bad Loan vs Good loan')
plt.show()

### application_type 

: 개인 대출 신청(=1) 인지, 2명의 대출자에 의해 공동으로 신청된 대출 신청 (=2)인지 여부

In [None]:
# Cross-tabulate application_type against loan_condition (with totals) and shade the cells.
application_vs_condition = pd.crosstab(
    index=df['application_type'], columns=df['loan_condition'], margins=True
)
application_vs_condition.style.background_gradient(cmap='summer_r')

### purpose
: 대출이유

In [None]:
# Cross-tabulate purpose against loan_condition (with totals) and shade the cells.
purpose_vs_condition = pd.crosstab(index=df['purpose'], columns=df['loan_condition'], margins=True)
purpose_vs_condition.style.background_gradient(cmap='summer_r')

In [None]:
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# Left: mean of loan_condition_cat per purpose label
# (presumably the bad-loan rate, if loan_condition_cat encodes Bad Loan as 1 — confirm).
df[['purpose', 'loan_condition_cat']].groupby(['purpose']).mean().plot.bar(ax=ax[0])
ax[0].set_title('loan_condition vs purpose')
# Right: loan counts split by loan_condition_cat. The x axis uses purpose_cat, not purpose.
# The column is passed as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='purpose_cat', hue='loan_condition_cat', data=df, ax=ax[1])
ax[1].set_title('purpose:Bad Loan vs Good Loan')
plt.show()

#### purpose_cat = 6 refers to debt_consolidation

### region 

: 거래지역

In [None]:
# Cross-tabulate region against loan_condition (with totals) and shade the cells.
region_vs_condition = pd.crosstab(index=df['region'], columns=df['loan_condition'], margins=True)
region_vs_condition.style.background_gradient(cmap='summer_r')

In [None]:
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# Left: mean of loan_condition_cat per region
# (presumably the bad-loan rate, if loan_condition_cat encodes Bad Loan as 1 — confirm).
df[['region', 'loan_condition_cat']].groupby(['region']).mean().plot.bar(ax=ax[0])
ax[0].set_title('loan_condition vs region')
# Right: loan counts per region, split by loan_condition_cat.
# The column is passed as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='region', hue='loan_condition_cat', data=df, ax=ax[1])
# Title follows the same "<feature>:Bad Loan vs Good loan" pattern as the other count plots.
ax[1].set_title('region:Bad Loan vs Good loan')
plt.show()

## Ordinal Features

#### : emp_length_int, grade, interest_payments, income_category


## emp_length_int

: 근속년수. 0은 1년 미만, 10은 10년 이상

In [None]:
# Cross-tabulate emp_length_int against loan_condition (with totals) and shade the cells.
emp_length_vs_condition = pd.crosstab(
    index=df['emp_length_int'], columns=df['loan_condition'], margins=True
)
emp_length_vs_condition.style.background_gradient(cmap='summer_r')

In [None]:
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# Left: number of loans per emp_length_int value (value_counts orders bars by frequency).
df['emp_length_int'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Number Of Loans By emp_length_int')
ax[0].set_ylabel('Count')
# Right: the same counts split by loan_condition_cat.
# The column is passed as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='emp_length_int', hue='loan_condition_cat', data=df, ax=ax[1])
ax[1].set_title('emp_length_int: Bad Loan vs Good Loan')
plt.show()

## grade

: 대출 등급 ( A ~ G , 1~7)

In [None]:
# Cross-tabulate grade against loan_condition (with totals) and shade the cells.
grade_vs_condition = pd.crosstab(index=df['grade'], columns=df['loan_condition'], margins=True)
grade_vs_condition.style.background_gradient(cmap='summer_r')

In [None]:
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# Left: number of loans per grade (value_counts orders bars by frequency).
df['grade'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Number Of Loans By Grade')
ax[0].set_ylabel('Count')
# Right: the same counts split by loan_condition_cat.
# The column is passed as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='grade', hue='loan_condition_cat', data=df, ax=ax[1])
ax[1].set_title('grade: Bad Loan vs Good Loan')
plt.show()

## interest_payments

: 이자 지불? (Low = 1, High =2 로 분류)

In [None]:
# Cross-tabulate interest_payments against loan_condition (with totals) and shade the cells.
interest_vs_condition = pd.crosstab(
    index=df['interest_payments'], columns=df['loan_condition'], margins=True
)
interest_vs_condition.style.background_gradient(cmap='summer_r')

In [None]:
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# Left: number of loans per interest_payments value (value_counts orders bars by frequency).
df['interest_payments'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Number Of Loans By Interest_payments')
ax[0].set_ylabel('Count')
# Right: the same counts split by loan_condition_cat.
# The column is passed as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='interest_payments', hue='loan_condition_cat', data=df, ax=ax[1])
ax[1].set_title('interest_payments: Bad Loan vs Good Loan')
plt.show()

## income_category

: 수익 Low = 1, Medium = 2, High = 3 으로 분류

In [None]:
# Cross-tabulate income_category against loan_condition (with totals) and shade the cells.
income_vs_condition = pd.crosstab(
    index=df['income_category'], columns=df['loan_condition'], margins=True
)
income_vs_condition.style.background_gradient(cmap='summer_r')

In [None]:
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# Left: number of loans per income_category value (value_counts orders bars by frequency).
df['income_category'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Number Of Loans By Income_category')
ax[0].set_ylabel('Count')
# Right: the same counts split by loan_condition_cat.
# The column is passed as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='income_category', hue='loan_condition_cat', data=df, ax=ax[1])
ax[1].set_title('Income_category: Bad Loan vs Good Loan')
plt.show()

## Continuous Features

#### : loan_amount, annual_inc, interest_rate, dti, total_pymnt, recoveries, installment


## loan_amount

: 대출금액(달러)

In [None]:
# Max, min and mean of the loan_amount column.
loan_amount = df['loan_amount']
print('Highest Loan Amount was:', loan_amount.max(), '$')
print('Lowest Loan Amount was:', loan_amount.min(), '$')
print('Average Loan Amount was:', loan_amount.mean(), '$')

In [None]:
# Distribution of loan_amount split by loan_condition_cat:
# per income_category (left) and per term (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.violinplot(x="income_category", y="loan_amount", hue="loan_condition_cat",
               data=df, split=True, ax=ax[0])
ax[0].set_title('income_category and loan_amount vs Loan_condition')
ax[0].set_yticks(range(0, 40000, 5000))
sns.violinplot(x="term", y="loan_amount", hue="loan_condition_cat",
               data=df, split=True, ax=ax[1])
ax[1].set_title('Term and loan_amount vs Loan_condition')
ax[1].set_yticks(range(0, 40000, 5000))
plt.show()

## installment

: 분할 불입금

In [None]:
# Max, min and mean of the installment column.
installment = df['installment']
print('Highest installment was:', installment.max(), '$')
print('Lowest installment was:', installment.min(), '$')
print('Average installment was:', installment.mean(), '$')

In [None]:
# Distribution of installment split by loan_condition_cat:
# per income_category (left) and per term (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.violinplot(x="income_category", y="installment", hue="loan_condition_cat",
               data=df, split=True, ax=ax[0])
ax[0].set_title('income_category and installment vs Loan_condition')
ax[0].set_yticks(range(0, 1500, 300))
sns.violinplot(x="term", y="installment", hue="loan_condition_cat",
               data=df, split=True, ax=ax[1])
ax[1].set_title('Term and installment vs Loan_condition')
ax[1].set_yticks(range(0, 1500, 300))
plt.show()

## interest_rate

: 대출의 이자율

In [None]:
# Max, min and mean of the interest_rate column.
interest_rate = df['interest_rate']
print('Highest interest_rate was:', interest_rate.max(), '%')
print('Lowest interest_rate was:', interest_rate.min(), '%')
print('Average interest_rate was:', interest_rate.mean(), '%')

In [None]:
# Distribution of interest_rate split by loan_condition_cat:
# per income_category (left) and per term (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.violinplot(x="income_category", y="interest_rate", hue="loan_condition_cat",
               data=df, split=True, ax=ax[0])
ax[0].set_title('income_category and interest_rate vs Loan_condition')
ax[0].set_yticks(range(0, 30, 3))
sns.violinplot(x="term", y="interest_rate", hue="loan_condition_cat",
               data=df, split=True, ax=ax[1])
ax[1].set_title('Term and interest_rate vs Loan_condition')
ax[1].set_yticks(range(0, 30, 3))
plt.show()

### Correlation Between The Features

In [None]:
# Frame used for the correlation heatmap: removes the id, date/year columns and the
# raw categorical columns (presumably leaving their numeric *_cat counterparts — confirm).
# Each label appears exactly once in the list.
df_labeled = df.drop(
    columns=['id', 'final_d', 'year', 'issue_d', 'home_ownership', 'income_category',
             'term', 'application_type', 'purpose', 'interest_payments',
             'loan_condition', 'grade', 'region']
)

In [None]:
# Annotated heatmap of the pairwise correlation matrix of df_labeled, drawn on a 20x16 inch figure.
corr_matrix = df_labeled.corr()
fig, heat_ax = plt.subplots(figsize=(20, 16))
sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', linewidths=0.2, ax=heat_ax)
plt.show()

#### Highly correlated Features
: loan_amount & installment ,
  total_rec_prncp & total_pymnt ,
  grade_cat & interest_rate
  

### **Drop columns for the baseline model (without feature engineering or feature elimination)**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each distinct region label with an integer code.
# This overwrites df['region'] in place, so cells run after this one see integer codes.
le = LabelEncoder()
le.fit(df['region'])
df['region'] = le.transform(df['region'])

In [None]:
# Baseline modelling frame: removes the id, date/year columns and the raw categorical
# columns. 'region' is not in the list, so it stays in the frame.
# Each label appears exactly once in the list.
df_for_use = df.drop(
    columns=['id', 'final_d', 'year', 'issue_d', 'home_ownership', 'income_category',
             'term', 'application_type', 'purpose', 'interest_payments',
             'loan_condition', 'grade']
)

In [None]:
# Preview the first rows of the baseline modelling frame.
df_for_use.head()

In [None]:
# Export: write the baseline modelling frame to a pickle file in the working directory.
baseline_pickle = 'df_for_use.pkl'
df_for_use.to_pickle(baseline_pickle)

## Feature_Engineering

In [None]:
# Starting frame for feature engineering: removes the id, date/year columns, the raw
# categorical columns and 'region'. Each label appears exactly once in the list.
df_fe = df.drop(
    columns=['id', 'final_d', 'year', 'issue_d', 'home_ownership', 'income_category',
             'term', 'application_type', 'purpose', 'interest_payments',
             'loan_condition', 'grade', 'region']
)

In [None]:
# Columns that remain available for feature engineering.
df_fe.columns

In [None]:
# Disabled step (left commented out): would drop three columns from df_fe in place.
# df_fe.drop(['total_rec_prncp', 'installment', 'interest_rate' ] , axis = 1, inplace = True)

In [None]:
# Ratio features: loan size and repayments relative to annual income,
# and installment relative to loan size.
income = df_fe['annual_inc']
principal = df_fe['loan_amount']
df_fe['LoanAmntOverIncome'] = principal.div(income)
df_fe['installmentOverLoanAmnt'] = df_fe['installment'].div(principal)
df_fe['totalPymntOverIncome'] = df_fe['total_pymnt'].div(income)
df_fe['totalRecPrncpOverIncome'] = df_fe['total_rec_prncp'].div(income)



In [None]:
# Column dtypes of the source frame `df`.
# NOTE(review): the surrounding cells work on df_fe — confirm df is the intended frame here.
df.dtypes

In [None]:
# Show the dtypes that result from casting LoanAmntOverIncome to float32.
# The cast result is not assigned, so df_fe itself is left unchanged.
df_fe.astype({'LoanAmntOverIncome': 'float32'}).dtypes

In [None]:
# Display the engineered totalRecPrncpOverIncome column.
df_fe['totalRecPrncpOverIncome']

In [None]:
# Count missing values per column of the engineered frame.
# DataFrame.isna() handles every dtype, whereas np.isnan raises on non-float columns.
# NOTE(review): isna() does not flag +/-inf, which a ratio with a zero denominator
# can produce — check for infinities separately if that matters downstream.
nanCounter = df_fe.isna().sum()

In [None]:
# Display the per-column missing-value counts.
nanCounter

In [None]:
# Preview the first rows of the feature-engineered frame.
df_fe.head()

In [None]:
# Export: write the feature-engineered frame to a pickle file in the working directory.
fe_pickle = 'df_fe.pkl'
df_fe.to_pickle(fe_pickle)