# Machine Learning Project - Data Cleaning

We will focus on credit modeling, a well-known data science problem concerned with estimating a borrower's credit risk. We'll be working with financial lending data from Lending Club, a marketplace for personal loans that matches borrowers seeking a loan with investors looking to lend money and earn a return. The data set covers all of the approved and declined loan applications from 2007 to 2011.
The goal is to build machine learning models that reliably predict whether a loan will be paid off.

In [3]:
import pandas as pd

## Exploring and Cleaning the Data

In [6]:
# Read the raw export; skiprows=1 skips the first line of the file so the
# second line is parsed as the header row.
loans_2007 = pd.read_csv('LoanStats3a.csv', skiprows=1, low_memory=False)

# Keep only the columns that have at least half_count non-missing values,
# then remove the 'desc' column as well.
half_count = len(loans_2007) / 2
loans_2007 = (
    loans_2007
    .dropna(axis=1, thresh=half_count)
    .drop(columns=['desc'])
)

# Persist the pruned frame (without the index) for the following cells.
loans_2007.to_csv('loans_2007.csv', index=False)

In [7]:
# Reload the pruned data set from 'loans_2007.csv'.
loans_2007 = pd.read_csv('loans_2007.csv')

In [8]:
loans_2007.shape

(42538, 53)

In [10]:
# DataFrame.drop_duplicates returns a new frame and leaves the original
# untouched, so the result must be assigned back for the de-duplication to
# carry over into later cells. Row counts reported further down may change
# if the data actually contains duplicate rows.
loans_2007 = loans_2007.drop_duplicates()
# Echo the frame so the cell still displays the de-duplicated data.
loans_2007

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,hardship_flag,disbursement_method,debt_settlement_flag
0,5000.0,5000.0,4975.000000,36 months,10.65%,162.87,B,B2,,10+ years,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
1,2500.0,2500.0,2500.000000,60 months,15.27%,59.83,C,C4,Ryder,< 1 year,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
2,2400.0,2400.0,2400.000000,36 months,15.96%,84.33,C,C5,,10+ years,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
3,10000.0,10000.0,10000.000000,36 months,13.49%,339.31,C,C1,AIR RESOURCES BOARD,10+ years,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
4,3000.0,3000.0,3000.000000,60 months,12.69%,67.79,B,B5,University Medical Group,1 year,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
5,5000.0,5000.0,5000.000000,36 months,7.90%,156.46,A,A4,Veolia Transportaton,3 years,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
6,7000.0,7000.0,7000.000000,60 months,15.96%,170.08,C,C5,Southern Star Photography,8 years,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
7,3000.0,3000.0,3000.000000,36 months,18.64%,109.43,E,E1,MKC Accounting,9 years,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
8,5600.0,5600.0,5600.000000,60 months,21.28%,152.39,F,F2,,4 years,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
9,5375.0,5375.0,5350.000000,60 months,12.69%,121.45,B,B5,Starbucks,< 1 year,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N


In [11]:
loans_2007.shape

(42538, 53)

In [12]:
loans_2007.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,hardship_flag,disbursement_method,debt_settlement_flag
0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,,10+ years,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
1,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,Ryder,< 1 year,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
2,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,,10+ years,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
3,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,AIR RESOURCES BOARD,10+ years,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
4,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,University Medical Group,1 year,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N


In [13]:
loans_2007.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'annual_inc', 'verification_status', 'issue_d',
       'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code',
       'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
       'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt',
       'pub_rec_bankruptcies', 'tax_liens', 'hardship_flag',
       'disbursement_method', 'debt_settlement_flag'],
      dtype='object')

## Removing columns that are not useful

In [15]:
# First batch of columns removed before modelling.
loans_2007 = loans_2007.drop(
    columns=[
        "funded_amnt",
        "funded_amnt_inv",
        "grade",
        "sub_grade",
        "emp_title",
        "issue_d",
    ]
)

In [16]:
# Second batch of columns removed before modelling.
loans_2007 = loans_2007.drop(
    columns=[
        "zip_code",
        "out_prncp",
        "out_prncp_inv",
        "total_pymnt",
        "total_pymnt_inv",
        "total_rec_prncp",
    ]
)

In [17]:
# Third batch of columns removed before modelling.
loans_2007 = loans_2007.drop(
    columns=[
        "total_rec_int",
        "total_rec_late_fee",
        "recoveries",
        "collection_recovery_fee",
        "last_pymnt_d",
        "last_pymnt_amnt",
    ]
)

In [18]:
loans_2007.shape

(42538, 35)

In [19]:
loans_2007.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,hardship_flag,disbursement_method,debt_settlement_flag
0,5000.0,36 months,10.65%,162.87,10+ years,RENT,24000.0,Verified,Fully Paid,n,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
1,2500.0,60 months,15.27%,59.83,< 1 year,RENT,30000.0,Source Verified,Charged Off,n,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
2,2400.0,36 months,15.96%,84.33,10+ years,RENT,12252.0,Not Verified,Fully Paid,n,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
3,10000.0,36 months,13.49%,339.31,10+ years,RENT,49200.0,Source Verified,Fully Paid,n,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
4,3000.0,60 months,12.69%,67.79,1 year,RENT,80000.0,Source Verified,Fully Paid,n,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N


## Choosing the target column

In [21]:
# Name of the column chosen as the prediction target.
target = 'loan_status'

In [22]:
loans_2007[target].value_counts()

Fully Paid                                             34116
Charged Off                                             5670
Does not meet the credit policy. Status:Fully Paid      1988
Does not meet the credit policy. Status:Charged Off      761
Name: loan_status, dtype: int64

## Preparing the target column for binary classification

In [25]:
loans_2007.shape

(42538, 35)

In [26]:
# Keep only the rows whose status is one of the two outcomes used for
# binary classification; every other status (including missing) is dropped.
keep_status = loans_2007['loan_status'].isin(["Fully Paid", "Charged Off"])
loans_2007 = loans_2007[keep_status]

In [27]:
loans_2007.shape

(39786, 35)

In [28]:
# Column-scoped replacement table: within 'loan_status', a paid-off loan
# becomes 1 and a charged-off loan becomes 0.
mapping_dict = {"loan_status": {"Fully Paid": 1, "Charged Off": 0}}

# Apply the replacements; values not listed in the table are left as they are.
loans_2007 = loans_2007.replace(to_replace=mapping_dict)

In [29]:
loans_2007.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,hardship_flag,disbursement_method,debt_settlement_flag
0,5000.0,36 months,10.65%,162.87,10+ years,RENT,24000.0,Verified,1,n,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
1,2500.0,60 months,15.27%,59.83,< 1 year,RENT,30000.0,Source Verified,0,n,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
2,2400.0,36 months,15.96%,84.33,10+ years,RENT,12252.0,Not Verified,1,n,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
3,10000.0,36 months,13.49%,339.31,10+ years,RENT,49200.0,Source Verified,1,n,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N
4,3000.0,60 months,12.69%,67.79,1 year,RENT,80000.0,Source Verified,1,n,...,1.0,Individual,0.0,0.0,0.0,0.0,0.0,N,Cash,N


## Removing single-value columns

In [30]:
# A column that holds exactly one distinct non-missing value carries no
# information for a model, so collect every such column and drop them all.
cols = loans_2007.columns

# Series.nunique() ignores missing values by default, so it counts the same
# thing as taking the length of dropna().unique().
single_val_cols = [
    name for name in cols
    if loans_2007[name].nunique() == 1
]

loans_2007 = loans_2007.drop(columns=single_val_cols)

In [31]:
loans_2007.shape

(39786, 24)

In [32]:
loans_2007.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,...,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,last_credit_pull_d,pub_rec_bankruptcies,debt_settlement_flag
0,5000.0,36 months,10.65%,162.87,10+ years,RENT,24000.0,Verified,1,credit_card,...,Jan-1985,1.0,3.0,0.0,13648.0,83.7%,9.0,Oct-2018,0.0,N
1,2500.0,60 months,15.27%,59.83,< 1 year,RENT,30000.0,Source Verified,0,car,...,Apr-1999,5.0,3.0,0.0,1687.0,9.4%,4.0,Oct-2016,0.0,N
2,2400.0,36 months,15.96%,84.33,10+ years,RENT,12252.0,Not Verified,1,small_business,...,Nov-2001,2.0,2.0,0.0,2956.0,98.5%,10.0,Jun-2017,0.0,N
3,10000.0,36 months,13.49%,339.31,10+ years,RENT,49200.0,Source Verified,1,other,...,Feb-1996,1.0,10.0,0.0,5598.0,21%,37.0,Apr-2016,0.0,N
4,3000.0,60 months,12.69%,67.79,1 year,RENT,80000.0,Source Verified,1,other,...,Jan-1996,0.0,15.0,0.0,27783.0,53.9%,38.0,Apr-2018,0.0,N
