# Mod 5 Project Central Notebook

## Imports and Data Ingestion

In [234]:
#import libraries

import mod_5_project_helper as hp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

#set environment - REMEMBER TO COPY OVER mod_5_project_helper.py

hp.set_environment()
%matplotlib inline
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

#initiliase variables

%run variables.py

#import data

df = pd.read_csv("data.csv")

## Data Cleanup

In [235]:
df.drop_duplicates(inplace = True) #drop 10,728 duplicate rows

In [236]:
df.drop(index = 100000, axis = 0, inplace = True) #delete last blank row in the data set

In [237]:
df.drop(df[df["Current Loan Amount"] == 99999999].index, inplace = True) #delete 11484 rows where the loan amount is 99999999

In [238]:
df.dropna(subset = ['Credit Score'], inplace = True) #delete 19154 rows where Credit Score is missing

In [239]:
df.dropna(subset = ['Years in current job'], inplace = True) #delete 2564 rows where Years in current job is missing 

In [240]:
df["Years in current job"].replace(to_replace = "year.*", value = "",inplace = True, regex = True) #remove 'years' or 'year' from string

In [241]:
df.drop(df[df["Home Ownership"] == "HaveMortgage"].index, inplace = True) #delete 120 rows with "HaveMortgage" as meaning isn't clear

In [242]:
df.dropna(subset = ['Months since last delinquent'], inplace = True) #delete 30,000 rows where data is missing 

In [243]:
df.drop(columns = "Maximum Open Credit", inplace = True) #delete Maximum Open Credit columns as some of these numbers are huge e.g. 798255370.0

In [244]:
df = hp.drop_column_keyword_search(df, ["Loan ID", "Customer ID"]) #delete features 'Loan ID' and 'Customer ID' as they don't add anything

In [245]:
df = df.fillna(df.median()) #fill remaining values with 

In [246]:
#rename and consolidate categorical variables for purpose

df["Purpose"].replace({
              "Business Loan": "business_loan",
              "Medical Bills": "medical bills",
              "Educational Expenses": "educational_expenses",
              "Buy House": "buy_house",
              "Buy a Car": "buy_a_car",
              "Debt Consolidation": "debt_consolidation",
              "Home Improvements": "home_improvements",
              "Take a Trip": "take_a_trip",
              "vacation": "take_a_trip",
              "Major Purchase": "other:",
              "Other": "other", 
              "renewable_energy": "home_improvements",
              "small_business": "business_loan",
              "moving": "home_improvements",
              "major_purchase": "major_purchase",
              "wedding": "wedding"
              }, inplace=True)

In [247]:
#rename categorical variables for Home Ownership

df["Home Ownership"].replace({
              "Home Mortgage": "mortgage",
              "Rent": "rent",
              "Own Home": "own_home",
              }, inplace=True)

In [248]:
#rename categorical variables for Term

df["Term"].replace({
              "Long Term": "long_term",
              "Short Term": "short_term",
              }, inplace=True)

In [249]:
#rename categorical variables for Loan Status

df["Loan Status"].replace({
              "Fully Paid": "fully_paid",
              "Charged Off": "default",
              }, inplace=True)

In [250]:
#rename columns to make the dataset easier to work with using . notation

df.columns = ['loan_status',
              'loan_amount',
              'term',
              'credit_score',
              'annual_income',
              'years_in_current_job',
              'home_ownership',
              'loan_purpose',
              'monthly_debt',
              'years_of_credit_history',
              'months_since_last_delinquent',
              'number_of_open_accounts',
              'number_of_credit_problems',
              'current_credit_balance',
              'bankruptcies',
              'tax_liens']

In [251]:
df.reset_index(inplace = True); #reset index

## Cleaned Dataset Information

In [252]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26401 entries, 0 to 26400
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   index                         26401 non-null  int64  
 1   loan_status                   26401 non-null  object 
 2   loan_amount                   26401 non-null  float64
 3   term                          26401 non-null  object 
 4   credit_score                  26401 non-null  float64
 5   annual_income                 26401 non-null  float64
 6   years_in_current_job          26401 non-null  object 
 7   home_ownership                26401 non-null  object 
 8   loan_purpose                  26401 non-null  object 
 9   monthly_debt                  26401 non-null  float64
 10  years_of_credit_history       26401 non-null  float64
 11  months_since_last_delinquent  26401 non-null  float64
 12  number_of_open_accounts       26401 non-null  float64
 13  n

In [253]:
df.shape

(26401, 17)

In [254]:
df.head()

Unnamed: 0,index,loan_status,loan_amount,term,credit_score,annual_income,years_in_current_job,home_ownership,loan_purpose,monthly_debt,years_of_credit_history,months_since_last_delinquent,number_of_open_accounts,number_of_credit_problems,current_credit_balance,bankruptcies,tax_liens
0,6,fully_paid,217646.0,short_term,730.0,1184194.0,< 1,mortgage,debt_consolidation,10855.08,19.6,10.0,13.0,1.0,122170.0,1.0,0.0
1,8,fully_paid,548746.0,short_term,678.0,2559110.0,2,rent,debt_consolidation,18660.28,22.6,33.0,4.0,0.0,437171.0,0.0,0.0
2,14,fully_paid,234124.0,short_term,727.0,693234.0,10+,rent,debt_consolidation,14211.24,24.7,46.0,10.0,1.0,28291.0,1.0,0.0
3,17,fully_paid,666204.0,long_term,723.0,1821967.0,10+,mortgage,debt_consolidation,17612.24,22.0,34.0,15.0,0.0,813694.0,0.0,0.0
4,20,default,317108.0,long_term,687.0,1133274.0,8,rent,debt_consolidation,9632.81,17.4,53.0,4.0,0.0,60287.0,0.0,0.0


## Train, test split, sampling and K-folds

In [294]:
#everything below here is a work in progress so please ignore

In [274]:
#X = df.drop(columns = "loan_status")

In [275]:
#y = df.loan_status

In [284]:
X_pre_sample = df

In [285]:
y = df

In [286]:
X_train_pre_sample, X_test, y_train, y_test = train_test_split(X_pre_sample, y, test_size=0.2, random_state=42)

In [287]:
X_train_pre_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21120 entries, 16900 to 23654
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   index                         21120 non-null  int64  
 1   loan_status                   21120 non-null  object 
 2   loan_amount                   21120 non-null  float64
 3   term                          21120 non-null  object 
 4   credit_score                  21120 non-null  float64
 5   annual_income                 21120 non-null  float64
 6   years_in_current_job          21120 non-null  object 
 7   home_ownership                21120 non-null  object 
 8   loan_purpose                  21120 non-null  object 
 9   monthly_debt                  21120 non-null  float64
 10  years_of_credit_history       21120 non-null  float64
 11  months_since_last_delinquent  21120 non-null  float64
 12  number_of_open_accounts       21120 non-null  float64
 1

In [293]:
X_train_pre_sample.groupby(X_pre_sample.loan_status).count()

Unnamed: 0_level_0,index,loan_status,loan_amount,term,credit_score,annual_income,years_in_current_job,home_ownership,loan_purpose,monthly_debt,years_of_credit_history,months_since_last_delinquent,number_of_open_accounts,number_of_credit_problems,current_credit_balance,bankruptcies,tax_liens
loan_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
default,6023,6023,6023,6023,6023,6023,6023,6023,6023,6023,6023,6023,6023,6023,6023,6023,6023
fully_paid,15097,15097,15097,15097,15097,15097,15097,15097,15097,15097,15097,15097,15097,15097,15097,15097,15097


In [None]:
X_train = X_train_pre_sample[X_pre_sample()

In [113]:
sample = df.sample(7000, random_state = 42)