# Mod 5 Project - Sandy's Notebook

## Imports and Data Ingestion

In [1]:
#import libraries

import mod_5_project_helper as hp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import export_graphviz


# Set up the environment through the project helper module.
# REMEMBER TO COPY OVER mod_5_project_helper.py so that `hp` can be imported.

hp.set_environment()
%matplotlib inline
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# Initialise variables by running variables.py (IPython %run magic).

%run variables.py

# Import the raw data set.

df = pd.read_csv("data.csv")

## Data Cleanup

In [2]:
# Remove duplicated rows in place, keeping the first copy of each
# (10,728 duplicate rows per the original note).
df.drop_duplicates(subset=None, keep="first", inplace=True)

In [3]:
# Remove the row labelled 100000 (the blank last row of the data set).
df.drop(labels=100000, axis="index", inplace=True)

In [4]:
# Remove every row whose loan amount is exactly 99999999
# (11,484 rows per the original note).
rows_to_remove = df.index[df["Current Loan Amount"] == 99999999]
df.drop(index=rows_to_remove, inplace=True)

In [5]:
# Remove rows with no Credit Score (19,154 rows per the original note).
df.dropna(axis="index", how="any", subset=["Credit Score"], inplace=True)

In [6]:
# Remove rows with no "Years in current job" value (2,564 rows per the original note).
df.dropna(axis="index", how="any", subset=["Years in current job"], inplace=True)

In [7]:
# Strip "year"/"years" (and anything after it) from the job-tenure strings.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on df[...]: the chained in-place form warns on
# pandas 2.x and leaves df unchanged once copy-on-write is enabled.
# NOTE(review): if values look like "10+ years", a trailing space remains
# after the replacement — confirm whether it should be stripped.
df["Years in current job"] = df["Years in current job"].replace(
    to_replace=r"year.*", value="", regex=True
)

In [8]:
# Remove the rows whose Home Ownership is "HaveMortgage", as its meaning
# isn't clear (120 rows per the original note).
unclear_ownership_rows = df.index[df["Home Ownership"] == "HaveMortgage"]
df.drop(index=unclear_ownership_rows, inplace=True)

In [9]:
# Remove rows with no "Months since last delinquent" value
# (about 30,000 rows per the original note).
df.dropna(axis="index", how="any", subset=["Months since last delinquent"], inplace=True)

In [10]:
# Discard the Maximum Open Credit column: some of its values are huge
# (e.g. 798255370.0).
del df["Maximum Open Credit"]

In [11]:
# 'Loan ID' and 'Customer ID' don't add anything as features, so hand those
# keywords to the project helper that removes the matching columns.
identifier_keywords = ["Loan ID", "Customer ID"]
df = hp.drop_column_keyword_search(df, identifier_keywords)

In [12]:
# Fill the remaining missing values with the median of each numeric column.
# numeric_only=True is required because the frame still holds string columns
# (e.g. "Purpose", "Term"); without it DataFrame.median raises a TypeError on
# pandas >= 2.0. Non-numeric columns are left untouched by the fill.
df = df.fillna(df.median(numeric_only=True))

In [13]:
# Rename and consolidate the categorical values of "Purpose".
# The result is assigned back to the column rather than using a chained
# replace(inplace=True), which warns on pandas 2.x and does not update df
# under copy-on-write.
# "Medical Bills" now maps to "medical_bills" so that every category uses
# snake_case like the rest of the mapping.
# NOTE(review): "Major Purchase" is folded into "other" while "major_purchase"
# stays a category of its own — confirm that this split is intended.

df["Purpose"] = df["Purpose"].replace({
              "Business Loan": "business_loan",
              "Medical Bills": "medical_bills",
              "Educational Expenses": "educational_expenses",
              "Buy House": "buy_house",
              "Buy a Car": "buy_a_car",
              "Debt Consolidation": "debt_consolidation",
              "Home Improvements": "home_improvements",
              "Take a Trip": "take_a_trip",
              "vacation": "take_a_trip",
              "Major Purchase": "other",
              "Other": "other",
              "renewable_energy": "home_improvements",
              "small_business": "business_loan",
              "moving": "home_improvements",
              "major_purchase": "major_purchase",
              "wedding": "wedding",
              })

In [14]:
# Rename the categorical values of "Home Ownership" to snake_case.
# Assigned back to the column instead of a chained replace(inplace=True),
# which warns on pandas 2.x and does not update df under copy-on-write.

df["Home Ownership"] = df["Home Ownership"].replace({
              "Home Mortgage": "mortgage",
              "Rent": "rent",
              "Own Home": "own_home",
              })

In [15]:
# Rename the categorical values of "Term" to snake_case.
# Assigned back to the column instead of a chained replace(inplace=True),
# which warns on pandas 2.x and does not update df under copy-on-write.

df["Term"] = df["Term"].replace({
              "Long Term": "long_term",
              "Short Term": "short_term",
              })

In [16]:
# Rename the categorical values of "Loan Status"; "Charged Off" becomes the
# "default" class used as the modelling target later on.
# Assigned back to the column instead of a chained replace(inplace=True),
# which warns on pandas 2.x and does not update df under copy-on-write.

df["Loan Status"] = df["Loan Status"].replace({
              "Fully Paid": "fully_paid",
              "Charged Off": "default",
              })

In [17]:
# Give every column a snake_case label so the data set is easier to work
# with using dot notation. The labels are applied by position, so df must
# hold exactly these 16 columns, in this order, when the cell runs.

snake_case_columns = [
    'loan_status',
    'loan_amount',
    'term',
    'credit_score',
    'annual_income',
    'years_in_current_job',
    'home_ownership',
    'loan_purpose',
    'monthly_debt',
    'years_of_credit_history',
    'months_since_last_delinquent',
    'number_of_open_accounts',
    'number_of_credit_problems',
    'current_credit_balance',
    'bankruptcies',
    'tax_liens',
]
df.columns = snake_case_columns

In [18]:
# Reset the row index after the row deletions above.
# NOTE(review): drop=True is not passed, so the previous index is kept as a
# new "index" column and travels with df into the feature frames. The
# Decision Trees section removes "level_0"/"index" from X_train only —
# confirm that X_test and the exported CSVs are meant to keep that column.
df.reset_index(inplace = True);

## Cleaned Dataset Information

In [None]:
# Print the column names, non-null counts and dtypes of the cleaned frame.
df.info()

In [None]:
# Display (rows, columns) of the cleaned frame.
df.shape

In [None]:
# Display the first rows of the cleaned frame.
df.head()

## Train, test split, sampling and K-folds

In [19]:
# Subset of the cleaned data holding only the loans labelled "default".
default_df = df.loc[df["loan_status"] == "default"]

In [20]:
# Subset of the cleaned data holding only the loans labelled "fully_paid".
fully_paid_df = df.loc[df["loan_status"] == "fully_paid"]

In [21]:
# Down-sample the fully paid loans to the same number of rows as the
# defaulters. The size is read from default_df instead of being hard-coded
# (it was 7479), so the two classes stay balanced if the cleaning steps
# change the row counts.
fully_paid_sample = fully_paid_df.sample(n=len(default_df), random_state=42)

In [22]:
# Stack the defaulters on top of the sampled fully paid loans to get a frame
# with a 50/50 split of the two classes.
balanced_parts = [default_df, fully_paid_sample]
update_df = pd.concat(balanced_parts, axis="index")

In [23]:
# Shuffle the balanced frame by sampling all of its rows without replacement
# (fixed seed for reproducibility).
update_df = update_df.sample(random_state=42, frac=1.0)

In [24]:
# Predictor frame: everything except the target column.
X = update_df.drop("loan_status", axis="columns")

In [25]:
# Target series: the loan status label of each row.
y = update_df["loan_status"]

In [26]:
# Hold out 20% of the balanced data as a test set (fixed seed).
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [27]:
# Cross-validation splitter: 10 shuffled folds with a fixed seed.
crossvalidation = KFold(
    n_splits=10,
    random_state=42,
    shuffle=True,
)

In [28]:
# Write each split out to its own CSV file for ease of reuse.

for split_data, csv_name in (
    (X_train, "X_train.csv"),
    (X_test, "X_test.csv"),
    (y_train, "y_train.csv"),
    (y_test, "y_test.csv"),
):
    split_data.to_csv(csv_name)

## Decision Trees

In [29]:
X_train.reset_index(inplace = True); #reset index

In [30]:
X_train.drop(columns = ["level_0", "index"], inplace = True); #drop old index columns

In [31]:
# Collect the categorical predictors into a frame of their own.

categorical_columns = [
    "term",
    "years_in_current_job",
    "home_ownership",
    "loan_purpose",
]
X_train_categorical = X_train[categorical_columns].copy()

In [32]:
# Collect the continuous predictors into a frame of their own.
# NOTE(review): number_of_credit_problems is not listed here, so it is left
# out of the model inputs — confirm that this is intended.

continuous_columns = [
    "loan_amount",
    "credit_score",
    "annual_income",
    "monthly_debt",
    "years_of_credit_history",
    "months_since_last_delinquent",
    "number_of_open_accounts",
    "current_credit_balance",
    "bankruptcies",
    "tax_liens",
]
X_train_continuous = X_train[continuous_columns].copy()

In [56]:
# One-hot encode the categorical predictors, keeping every level.
# For everything but decision trees we should use drop_first=True instead.
X_train_one_hot_encoded = pd.get_dummies(data=X_train_categorical, drop_first=False)

In [34]:
# Put the continuous columns and the one-hot encoded columns side by side
# to form the final training matrix.

feature_parts = [X_train_continuous, X_train_one_hot_encoded]
X_train_updated = pd.concat(feature_parts, axis="columns")

In [53]:
# Shallow decision tree: at most 3 levels deep, at least 10 samples per
# leaf, fixed seed.

tree_clf = DecisionTreeClassifier(
    random_state=42,
    max_depth=3,
    min_samples_leaf=10,
)

In [54]:
# Train the tree on the encoded training features and their labels.

tree_clf.fit(X=X_train_updated, y=y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [55]:
# Export the fitted tree to a dot file so it can be converted to an image
# using the CLI:
# dot -Tpng tree.dot -o tree.png
#
# class_names must hold one name per target class, in the order of the
# classifier's classes_ attribute. Passing y_train.values (one label per
# training row) labels the nodes with whichever labels happen to come first
# in y_train, so the plotted class names could be wrong or swapped.

export_graphviz(tree_clf,
                out_file="tree.dot",
                feature_names=list(X_train_updated.columns),
                class_names=[str(label) for label in tree_clf.classes_],
                rounded=True,
                filled=True)