# Mod 5 Project Central Notebook

## Imports and Data Ingestion

In [72]:
#import libraries

import mod_5_project_helper as hp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder

#set environment - hp is the local mod_5_project_helper module, so
#REMEMBER TO COPY OVER mod_5_project_helper.py before running this notebook

hp.set_environment()
%matplotlib inline
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)   (left disabled: uncomment to also lift the row display limit)

#initialise variables by running variables.py
#NOTE(review): what variables.py defines is not visible from this notebook - confirm before relying on it

%run variables.py

#import data - loads data.csv from the working directory into the raw dataframe

df = pd.read_csv("data.csv")

## Data Cleanup

In [37]:
#drop 10,728 duplicate rows (first occurrence of each row is kept)
df = df.drop_duplicates()

In [38]:
#delete the blank row left at the end of the data set
#select it by content (every column missing) rather than by the hard-coded label 100000,
#so the cell does not raise KeyError when it is re-run or the extract has a different length
df = df.dropna(how = "all")

In [39]:
#delete 11484 rows where the loan amount is 99999999
placeholder_amount_rows = df.index[df["Current Loan Amount"] == 99999999]
df = df.drop(index = placeholder_amount_rows)

In [40]:
#delete 19154 rows where Credit Score is missing
df = df.dropna(subset = ["Credit Score"])

In [41]:
#delete 2564 rows where Years in current job is missing
df = df.dropna(subset = ["Years in current job"])

In [42]:
#remove 'years' or 'year' from the string, together with the space in front of it,
#so the categories read "10+", "< 1", "3" instead of keeping a trailing blank
#the result is assigned back to the column: replace(inplace=True) on df[...] is chained
#assignment, which does not update df when pandas Copy-on-Write is enabled
df["Years in current job"] = df["Years in current job"].replace(to_replace = r"\s*year.*", value = "", regex = True)

In [43]:
#delete 120 rows with "HaveMortgage" as meaning isn't clear
have_mortgage_rows = df.index[df["Home Ownership"] == "HaveMortgage"]
df = df.drop(index = have_mortgage_rows)

In [44]:
#delete 30,000 rows where Months since last delinquent is missing
df = df.dropna(subset = ["Months since last delinquent"])

In [45]:
#delete the Maximum Open Credit column as some of these numbers are huge e.g. 798255370.0
df = df.drop(columns = ["Maximum Open Credit"])

In [46]:
#delete features 'Loan ID' and 'Customer ID' as they don't add anything
id_columns = ["Loan ID", "Customer ID"]
df = hp.drop_column_keyword_search(df, id_columns)

In [47]:
#fill remaining missing values in the numeric columns with that column's median
#numeric_only=True is required: the frame still holds string columns (Term, Purpose, ...)
#and pandas >= 2.0 raises TypeError when asked for the median of an object column
df = df.fillna(df.median(numeric_only = True))

In [48]:
#rename and consolidate categorical variables for purpose
#every target label is snake_case ("medical_bills" previously contained a space)
#NOTE(review): "Major Purchase" is folded into "other" while "major_purchase" stays its own
#category - confirm whether these two were meant to be merged
#the result is assigned back to the column because replace(inplace=True) on df[...] is
#chained assignment and does not update df when pandas Copy-on-Write is enabled

df["Purpose"] = df["Purpose"].replace({
              "Business Loan": "business_loan",
              "Medical Bills": "medical_bills",
              "Educational Expenses": "educational_expenses",
              "Buy House": "buy_house",
              "Buy a Car": "buy_a_car",
              "Debt Consolidation": "debt_consolidation",
              "Home Improvements": "home_improvements",
              "Take a Trip": "take_a_trip",
              "vacation": "take_a_trip",
              "Major Purchase": "other",
              "Other": "other",
              "renewable_energy": "home_improvements",
              "small_business": "business_loan",
              "moving": "home_improvements",
              "major_purchase": "major_purchase",
              "wedding": "wedding"
              })

In [49]:
#rename categorical variables for Home Ownership
#assigned back to the column: replace(inplace=True) on df[...] is chained assignment
#and does not update df when pandas Copy-on-Write is enabled

df["Home Ownership"] = df["Home Ownership"].replace({
              "Home Mortgage": "mortgage",
              "Rent": "rent",
              "Own Home": "own_home",
              })

In [50]:
#rename categorical variables for Term
#assigned back to the column: replace(inplace=True) on df[...] is chained assignment
#and does not update df when pandas Copy-on-Write is enabled

df["Term"] = df["Term"].replace({
              "Long Term": "long_term",
              "Short Term": "short_term",
              })

In [51]:
#rename categorical variables for Loan Status ("Charged Off" is treated as a default)
#assigned back to the column: replace(inplace=True) on df[...] is chained assignment
#and does not update df when pandas Copy-on-Write is enabled

df["Loan Status"] = df["Loan Status"].replace({
              "Fully Paid": "fully_paid",
              "Charged Off": "default",
              })

In [52]:
#rename columns to snake_case so the dataset is easier to work with using . notation
#the names are applied by position, so they must stay in the same order as the frame's columns

snake_case_columns = [
    'loan_status',
    'loan_amount',
    'term',
    'credit_score',
    'annual_income',
    'years_in_current_job',
    'home_ownership',
    'loan_purpose',
    'monthly_debt',
    'years_of_credit_history',
    'months_since_last_delinquent',
    'number_of_open_accounts',
    'number_of_credit_problems',
    'current_credit_balance',
    'bankruptcies',
    'tax_liens',
]
df.columns = snake_case_columns

In [53]:
#reset index - the old row labels are kept as a new "index" column
#(that column is dropped from X_train again in the Decision Trees section)
df = df.reset_index()

## Cleaned Dataset Information

In [54]:
#summarise column dtypes and non-null counts of the cleaned dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26401 entries, 0 to 26400
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   index                         26401 non-null  int64  
 1   loan_status                   26401 non-null  object 
 2   loan_amount                   26401 non-null  float64
 3   term                          26401 non-null  object 
 4   credit_score                  26401 non-null  float64
 5   annual_income                 26401 non-null  float64
 6   years_in_current_job          26401 non-null  object 
 7   home_ownership                26401 non-null  object 
 8   loan_purpose                  26401 non-null  object 
 9   monthly_debt                  26401 non-null  float64
 10  years_of_credit_history       26401 non-null  float64
 11  months_since_last_delinquent  26401 non-null  float64
 12  number_of_open_accounts       26401 non-null  float64
 13  n

In [55]:
#(rows, columns) of the cleaned dataframe
df.shape

(26401, 17)

In [56]:
#preview the first five rows of the cleaned dataframe
df.head()

Unnamed: 0,index,loan_status,loan_amount,term,credit_score,annual_income,years_in_current_job,home_ownership,loan_purpose,monthly_debt,years_of_credit_history,months_since_last_delinquent,number_of_open_accounts,number_of_credit_problems,current_credit_balance,bankruptcies,tax_liens
0,6,fully_paid,217646.0,short_term,730.0,1184194.0,< 1,mortgage,debt_consolidation,10855.08,19.6,10.0,13.0,1.0,122170.0,1.0,0.0
1,8,fully_paid,548746.0,short_term,678.0,2559110.0,2,rent,debt_consolidation,18660.28,22.6,33.0,4.0,0.0,437171.0,0.0,0.0
2,14,fully_paid,234124.0,short_term,727.0,693234.0,10+,rent,debt_consolidation,14211.24,24.7,46.0,10.0,1.0,28291.0,1.0,0.0
3,17,fully_paid,666204.0,long_term,723.0,1821967.0,10+,mortgage,debt_consolidation,17612.24,22.0,34.0,15.0,0.0,813694.0,0.0,0.0
4,20,default,317108.0,long_term,687.0,1133274.0,8,rent,debt_consolidation,9632.81,17.4,53.0,4.0,0.0,60287.0,0.0,0.0


## Train, test split, sampling and K-folds

In [57]:
#create a new dataframe of loan defaulters
default_df = df.loc[df["loan_status"] == "default"]

In [58]:
#create a new dataframe of fully paid loans
fully_paid_df = df.loc[df["loan_status"] == "fully_paid"]

In [59]:
#downsample the fully paid loans to exactly as many rows as there are defaulters
#the size is taken from default_df instead of a hard-coded count, so the classes stay
#balanced if the cleaning steps above ever change the number of defaults
fully_paid_sample = fully_paid_df.sample(n = len(default_df), random_state = 42)

In [60]:
#create a new dataframe with a 50/50 split of defaulters and paid loans
balanced_parts = [default_df, fully_paid_sample]
update_df = pd.concat(balanced_parts)

In [61]:
#shuffle the new 50/50 split dataframe (sampling 100% of the rows reorders them)
update_df = update_df.sample(frac = 1.0, random_state = 42)

In [62]:
#create predictor dataframe: every column except the target
X = update_df.drop("loan_status", axis = 1)

In [63]:
#create target variable series
y = update_df["loan_status"]

In [64]:
#create train/test split: 20% of the rows are held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [65]:
#initialise cross validation object: 10 shuffled folds with a fixed seed
crossvalidation = KFold(
    n_splits = 10,
    shuffle = True,
    random_state = 42,
)

In [66]:
#export the four splits to CSVs for ease (row labels are written as the first column)

splits_to_export = {
    "X_train.csv": X_train,
    "X_test.csv": X_test,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}
for csv_name, split in splits_to_export.items():
    split.to_csv(csv_name)

## Decision Trees

In [67]:
#renumber the training rows from 0; the previous row labels are kept as an extra column
X_train = X_train.reset_index()

In [70]:
#remove the leftover row-label columns so they are not used as features
#errors="ignore" keeps the cell re-runnable: without it a second run raises KeyError
#because the columns are already gone
X_train = X_train.drop(columns = ["level_0", "index"], errors = "ignore")

In [71]:
#preview the first five training rows after the index columns were removed
X_train.head()

Unnamed: 0,loan_amount,term,credit_score,annual_income,years_in_current_job,home_ownership,loan_purpose,monthly_debt,years_of_credit_history,months_since_last_delinquent,number_of_open_accounts,number_of_credit_problems,current_credit_balance,bankruptcies,tax_liens
0,130724.0,short_term,732.0,1975658.0,10+,mortgage,debt_consolidation,28153.06,18.0,4.0,9.0,0.0,102220.0,0.0,0.0
1,77088.0,short_term,658.0,665741.0,5,mortgage,debt_consolidation,13869.43,17.0,6.0,11.0,0.0,542659.0,0.0,0.0
2,88484.0,short_term,7330.0,382090.0,3,rent,other,7227.79,17.1,27.0,7.0,0.0,5320.0,0.0,0.0
3,178464.0,short_term,745.0,751374.0,6,mortgage,debt_consolidation,14150.82,18.6,66.0,11.0,0.0,131024.0,0.0,0.0
4,190234.0,short_term,709.0,657172.0,1,mortgage,debt_consolidation,16977.07,10.4,61.0,12.0,1.0,200811.0,0.0,1.0


In [73]:
#one-hot encoder for the categorical features
#handle_unknown="ignore" encodes a category that was not seen during fit as all zeros,
#instead of raising when the fitted encoder is later applied to held-out data
cat_encoder = OneHotEncoder(handle_unknown = "ignore")

In [74]:
#one-hot encode only the string-valued (categorical) columns
#passing the whole frame would also treat loan_amount, credit_score, ... as categories
#and create one output column per distinct numeric value
categorical_columns = X_train.select_dtypes(include = "object").columns
X_train_one_hot_encoded = cat_encoder.fit_transform(X_train[categorical_columns])

In [82]:
#convert the encoder output to a dense numpy array
#(the previous expression, pd.DataX_train_one_hot_encoded, was a garbled edit and raised AttributeError)
a = X_train_one_hot_encoded.toarray()

In [83]:
#check what kind of object the encoded training data is
type(a)

numpy.ndarray

In [None]:
#shallow decision tree (at most two levels of splits)
#random_state is fixed like every other stochastic step in this notebook so reruns give the same tree
tree_clf = DecisionTreeClassifier(max_depth = 2, random_state = 42)

In [None]:
#DecisionTreeClassifier only accepts numeric input, and X_train still holds string columns
#(term, home_ownership, ...), so fitting on it directly raises ValueError
#dummy-encode the string columns and keep the numeric ones as they are
#NOTE: X_test must be encoded the same way and reindexed to X_train_encoded.columns before predicting
X_train_encoded = pd.get_dummies(X_train)
tree_clf.fit(X_train_encoded, y_train)