In [0]:
!wget "https://spotleai.sgp1.digitaloceanspaces.com/course/data/credit_data.csv"

Import important libraries

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

Read the data and perform some analysis

In [0]:
df = pd.read_csv('credit_data.csv')
df.sample(5)

Unnamed: 0,age,gender,education,occupation,organization_type,seniority,annual_income,disposable_income,house_type,vehicle_type,marital_status,no_card,default
46352,26,Male,Graduate,Business,,,388090,11021,Owned,,Married,0,0
13384,39,Female,Post Graduate,Professional,,,210462,30699,Owned,Two Wheeler,Single,1,0
39327,28,Female,Graduate,Professional,,,440845,38212,Family,Two Wheeler,Single,1,1
32354,34,Female,Post Graduate,Salaried,,Mid-level 1,346317,47410,Rented,,Married,0,0
19488,29,Male,Graduate,Student,,,92743,24728,Rented,,Married,0,0


In [0]:
df.describe()

Unnamed: 0,age,annual_income,disposable_income,no_card,default
count,50636.0,50636.0,50636.0,50636.0,50636.0
mean,29.527411,277243.989889,18325.788569,0.509815,0.158425
std,8.816532,153838.973755,12677.864844,0.669883,0.365142
min,18.0,50000.0,1000.0,0.0,0.0
25%,25.0,154052.25,8317.75,0.0,0.0
50%,27.0,258860.5,15770.0,0.0,0.0
75%,30.0,385071.5,24135.0,1.0,0.0
max,64.0,999844.0,49999.0,2.0,1.0


In [0]:
df.shape

(50636, 13)

In [0]:
categorical = ['gender', 'education', 'occupation', 'organization_type', 'seniority',
               'house_type', 'vehicle_type', 'marital_status'] # attributes that need to be one hot ecoded

In [0]:
df.isna().count()

age                  50636
gender               50636
education            50636
occupation           50636
organization_type    50636
seniority            50636
annual_income        50636
disposable_income    50636
house_type           50636
vehicle_type         50636
marital_status       50636
no_card              50636
default              50636
dtype: int64

No missing values

In [0]:
df.sample()

Unnamed: 0,age,gender,education,occupation,organization_type,seniority,annual_income,disposable_income,house_type,vehicle_type,marital_status,no_card,default
18851,38,Male,Graduate,Salaried,,Mid-level 1,385743,22253,Rented,,Married,2,0


In [0]:
for category in categorical:
  print(category)
  one_hot = pd.get_dummies(df[category], prefix=category)
  
  #df = df.drop(category, axis = 1, inplace=True)
  df = df.join(one_hot)
  
df.sample(10)

gender
education
occupation
organization_type
seniority
house_type
vehicle_type
marital_status


Unnamed: 0,age,gender,education,occupation,organization_type,seniority,annual_income,disposable_income,house_type,vehicle_type,marital_status,no_card,default,gender_Female,gender_Male,education_Graduate,education_Other,education_Post Graduate,education_Under Graduate,occupation_Business,occupation_Professional,occupation_Salaried,occupation_Student,organization_type_None,organization_type_Tier 1,organization_type_Tier 2,organization_type_Tier 3,seniority_Entry,seniority_Junior,seniority_Mid-level 1,seniority_Mid-level 2,seniority_None,seniority_Senior,house_type_Company provided,house_type_Family,house_type_Owned,house_type_Rented,vehicle_type_Four Wheeler,vehicle_type_None,vehicle_type_Two Wheeler,marital_status_Married,marital_status_Other,marital_status_Single
48822,29,Male,Post Graduate,Salaried,Tier 1,Entry,140290,14807,Family,,Married,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0
49095,27,Male,Graduate,Salaried,Tier 3,Junior,305143,9640,Rented,Two Wheeler,Married,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0
3335,26,Male,Under Graduate,Salaried,,Mid-level 1,103497,17966,Rented,Two Wheeler,Married,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0
30687,49,Female,Graduate,Salaried,Tier 3,Mid-level 1,445212,13329,Rented,Two Wheeler,Other,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0
3850,25,Female,Graduate,Student,,,432594,16372,Rented,,Married,0,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0
33207,37,Male,Other,Student,,,145802,15204,Rented,Four Wheeler,Single,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1
31948,30,Female,Post Graduate,Professional,,,253029,17254,Rented,,Single,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1
4534,23,Male,Post Graduate,Salaried,,Entry,183668,17246,Family,,Single,1,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1
43019,25,Female,Graduate,Salaried,,Entry,165926,5592,Rented,Two Wheeler,Single,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1
22080,27,Male,Post Graduate,Professional,,,235006,13503,Rented,Two Wheeler,Single,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1


In [0]:
processed_df = df.drop(categorical, axis=1)
processed_df.sample(5)

Unnamed: 0,age,annual_income,disposable_income,no_card,default,gender_Female,gender_Male,education_Graduate,education_Other,education_Post Graduate,education_Under Graduate,occupation_Business,occupation_Professional,occupation_Salaried,occupation_Student,organization_type_None,organization_type_Tier 1,organization_type_Tier 2,organization_type_Tier 3,seniority_Entry,seniority_Junior,seniority_Mid-level 1,seniority_Mid-level 2,seniority_None,seniority_Senior,house_type_Company provided,house_type_Family,house_type_Owned,house_type_Rented,vehicle_type_Four Wheeler,vehicle_type_None,vehicle_type_Two Wheeler,marital_status_Married,marital_status_Other,marital_status_Single
37286,29,882201,47131,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0
46866,34,270923,25678,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0
22430,36,73946,11412,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0
22514,27,155468,3157,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1
33687,27,110506,24957,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0


In [0]:
defaulter = ['No', 'Yes']
y = processed_df.default
X = processed_df.drop('default', axis=1)
print(X.sample(5))
y.sample(5)

       age  annual_income  ...  marital_status_Other  marital_status_Single
4952    22          95790  ...                     0                      1
7879    27         237466  ...                     0                      0
3328    29         297999  ...                     0                      1
3160    26         197930  ...                     0                      1
45629   38         174485  ...                     0                      1

[5 rows x 34 columns]


23682    0
31890    0
24109    0
29299    0
31281    0
Name: default, dtype: int64

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print(X_train.shape,y_train.shape)
print(X_test.shape, y_test.shape)

(37977, 34) (37977,)
(12659, 34) (12659,)
