# Decision model validation

## Loading and preparing customer delinquency data

In [3]:
import pandas as pd

# Load the raw customer delinquency extract.
df = pd.read_csv('./data/Delinquent_customer-Insurance_Premium.csv')

# Remove columns that are not used by this analysis.
df = df.drop(columns=["APPLICATION_SUBMISSION_TYPE",
                      "POSTAL_ADDRESS_TYPE",
                      "RESIDENTIAL_PHONE",
                      "EMAIL",
                      "PROFESSIONAL_PHONE",
                      "MONTHS_IN_RESIDENCE",
                      "OTHER_INCOMES",
                      "PERSONAL_ASSETS_VALUE",
                      "QUANT_CARS",
                      "MONTHS_IN_THE_JOB",
                      "QUANT_ADDITIONAL_CARDS"])

# Fill missing values with each column's most frequent value (mode).
# This has to happen BEFORE the categorical encoding below:
# pd.Categorical(...).codes maps NaN to -1, after which fillna() has nothing
# left to fill and missing genders would silently become their own "-1" class.
for column in ("GENDER", "RESIDENCE_TYPE", "PROFESSION_CODE", "OCCUPATION_TYPE"):
    df[column] = df[column].fillna(df[column].mode()[0])

# Convert categorical columns to integer codes.
# NOTE(review): MARITAL_STATUS is not imputed, so any missing value in it is
# encoded as -1 -- confirm that column has no gaps.
df['MARITAL_STATUS'] = pd.Categorical(df['MARITAL_STATUS']).codes
df['GENDER'] = pd.Categorical(df['GENDER']).codes

display(df)

Unnamed: 0,GENDER,MARITAL_STATUS,RESIDENCE_TYPE,PROFESSION_CODE,OCCUPATION_TYPE,PRODUCT,QUANT_DEPENDANTS,PERSONAL_MONTHLY_INCOME,QUANT_BANKING_ACCOUNTS,AGE,TARGET_LABEL
0,0,6,1.0,9.0,4.0,1,1,900.00,0,32,1
1,0,2,1.0,11.0,4.0,1,0,750.00,0,34,1
2,0,2,1.0,11.0,2.0,1,0,500.00,0,27,0
3,0,2,1.0,9.0,2.0,1,0,500.00,0,61,0
4,1,2,1.0,9.0,5.0,1,0,1200.00,0,48,1
...,...,...,...,...,...,...,...,...,...,...,...
49995,0,1,1.0,9.0,4.0,1,2,1451.00,1,36,1
49996,0,1,1.0,9.0,2.0,2,0,450.00,0,21,0
49997,1,2,2.0,9.0,2.0,1,3,1555.00,0,41,0
49998,0,1,1.0,9.0,2.0,1,1,1443.27,0,28,1


## Preparing data for machine learning

In [4]:
# Label vector: the TARGET_LABEL column as a NumPy array.
y = df.loc[:, 'TARGET_LABEL'].to_numpy()

# Feature matrix: every remaining column once the label has been removed.
df = df.drop('TARGET_LABEL', axis=1)
X = df.to_numpy()

## Splitting data for training and testing

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

## Building the model with training data

In [15]:
from sklearn import tree

# Gini-criterion decision tree limited to depth 8, with a fixed seed for repeatable fits.
tree_clf = tree.DecisionTreeClassifier(
    max_depth=8, criterion="gini", random_state=42
)

# Fit on the training portion only; the bare expression shows the fitted estimator.
tree_clf.fit(X_train, y_train)

## Evaluating the model with test data

In [16]:
# Predict a class label for every row of the held-out test matrix.
y_pred = tree_clf.predict(X_test)

In [25]:
# Side-by-side table of the true test labels and the model's predictions.
outputDf = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

outputDf

Unnamed: 0,Actual,Predicted
0,1,0
1,0,0
2,0,0
3,0,0
4,1,0
...,...,...
16495,0,0
16496,0,0
16497,0,0
16498,0,0


In [17]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 computed on the held-out test set.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.74      0.99      0.85     12215
           1       0.36      0.02      0.04      4285

    accuracy                           0.74     16500
   macro avg       0.55      0.50      0.44     16500
weighted avg       0.64      0.74      0.64     16500



## Splitting data for training and testing (tree without a depth limit)

In [5]:
from sklearn.model_selection import train_test_split 

# Reserve 33% of the rows as a test set; the seed pins the shuffle so reruns match.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

## Training a decision tree classifier with training data

In [20]:
from sklearn import tree

# Gini-criterion decision tree with a fixed seed; max_depth is not passed,
# so the library default applies.
tree_clf = tree.DecisionTreeClassifier(random_state=42, criterion="gini")

# Fit on the training portion only; the bare expression shows the fitted estimator.
tree_clf.fit(X_train, y_train)

## Testing the decision tree with test data

In [21]:
# Predict a class label for every test row with the tree fitted in the previous cell.
# This rebinds y_pred, replacing any predictions stored under that name earlier.
y_pred = tree_clf.predict(X_test)

In [22]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 computed on the held-out test set.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.75      0.74      0.75     12215
           1       0.29      0.30      0.29      4285

    accuracy                           0.63     16500
   macro avg       0.52      0.52      0.52     16500
weighted avg       0.63      0.63      0.63     16500

