In [1]:
import numpy as np
import pandas as pd

In [2]:
# Show the result of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Suppress only future/deprecation warnings. A blanket filterwarnings('ignore')
# would silence every other warning category as well, which hides real problems.
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [3]:
# Notebook-wide pandas display settings: raise the row/column/width limits,
# keep frames from wrapping, and print floats with two decimals.
display_settings = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
    'display.expand_frame_repr': False,
    'display.max_colwidth': 2000,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

**Read the cleaned data from CSV**

In [4]:
# Load the cleaned dataset from CSV (relative path, resolved against the working directory)
data = pd.read_csv('../output/CleanedData.csv')

In [5]:
# Summarize the frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88910 entries, 0 to 88909
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Loan ID                       88910 non-null  object 
 1   Customer ID                   88910 non-null  object 
 2   Loan Status                   88910 non-null  object 
 3   Current Loan Amount           88910 non-null  float64
 4   Term                          88910 non-null  object 
 5   Credit Score                  88910 non-null  float64
 6   Years in current job          88910 non-null  int64  
 7   Home Ownership                88910 non-null  object 
 8   Annual Income                 88910 non-null  float64
 9   Purpose                       88910 non-null  object 
 10  Monthly Debt                  88910 non-null  float64
 11  Years of Credit History       88910 non-null  float64
 12  Months since last delinquent  88910 non-null  float64
 13  N

### Data Preprocessing

#### **Dropping Loan ID and Customer ID Features**

In [6]:
# Drop the two identifier columns by name rather than by position. A positional
# slice (iloc[:, 2:]) removes two more columns every time the cell is re-run and
# depends on column order; errors='ignore' makes this step safe to repeat.
data = data.drop(columns=['Loan ID', 'Customer ID'], errors='ignore')
data.head()

Unnamed: 0,Loan Status,Current Loan Amount,Term,Credit Score,Years in current job,Home Ownership,Annual Income,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,Loan Refused,12232.0,Short Term,997.5,1,Rent,46643.0,Debt Consolidation,777.39,18.0,10.0,12,0,6762.0,7946.0,0.0,0.0
1,Loan Refused,25014.0,Long Term,997.5,10,Home Mortgage,81099.0,Debt Consolidation,892.09,26.7,32.0,14,0,35706.0,77961.0,0.0,0.0
2,Loan Refused,16117.0,Short Term,997.5,9,Home Mortgage,60438.0,Home Improvements,1244.02,16.7,32.0,11,1,11275.0,14815.0,1.0,0.0
3,Loan Refused,11716.0,Short Term,997.5,3,Rent,34171.0,Debt Consolidation,990.94,10.0,32.0,21,0,7009.0,43533.0,0.0,0.0
4,Loan Refused,9789.0,Long Term,997.5,10,Home Mortgage,47003.0,Home Improvements,503.71,16.7,25.0,13,1,16913.0,19553.0,1.0,0.0


####  **Encoding Categorical Features**
  - Loan Status
  - Term
  - Home Ownership
  - Purpose

In [7]:
# Text-valued columns that are converted to integer category codes
columns_to_be_encoded = [
    'Loan Status',
    'Term',
    'Home Ownership',
    'Purpose',
]

In [8]:
# Replace each text column with integer category codes.
# Codes follow the sorted order of the distinct labels (see the printed
# categories), e.g. 'Loan Given' -> 0 and 'Loan Refused' -> 1.
for feature in columns_to_be_encoded:
    column = data[feature]
    # A bare `dtype == 'object'` test misses pandas' dedicated string dtype, in
    # which case the encoding would be skipped silently. Columns that are already
    # integer-coded fail both checks, so re-running this cell is a no-op.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        distinct_labels = pd.Categorical(column.unique())
        print('\n')
        print('feature:',feature)
        # Labels in order of first appearance, then the code assigned to each
        print(distinct_labels)
        print(distinct_labels.codes)
        data[feature] = pd.Categorical(column).codes

data.info()



feature: Loan Status
['Loan Refused', 'Loan Given']
Categories (2, object): ['Loan Given', 'Loan Refused']
[1 0]


feature: Term
['Short Term', 'Long Term']
Categories (2, object): ['Long Term', 'Short Term']
[1 0]


feature: Home Ownership
['Rent', 'Home Mortgage', 'Own Home']
Categories (3, object): ['Home Mortgage', 'Own Home', 'Rent']
[2 0 1]


feature: Purpose
['Debt Consolidation', 'Home Improvements', 'other', 'Business Loan', 'small_business', ..., 'vacation', 'major_purchase', 'Educational Expenses', 'wedding', 'renewable_energy']
Length: 15
Categories (15, object): ['Business Loan', 'Buy House', 'Buy a Car', 'Debt Consolidation', ..., 'renewable_energy', 'small_business', 'vacation', 'wedding']
[ 3  5 10  0 12  9  2  6  1  7 13  8  4 14 11]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88910 entries, 0 to 88909
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Loan 

In [9]:
# Write the preprocessed frame to CSV; index=False leaves the row index out of the file
data.to_csv('../output/PreProcessedData.csv', index=False)