In [1]:
import pandas as pd
import numpy as np

# machine learning
import scipy.stats as stats
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor 
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import precision_score 
from sklearn.metrics import f1_score, accuracy_score, recall_score
from sklearn.metrics import RocCurveDisplay, roc_auc_score, roc_curve, f1_score, accuracy_score, recall_score
from sklearn.metrics import silhouette_samples,silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, cross_val_score, cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer

# visualization
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv(r'C:\Users\ReinisFals\OneDrive - Peero, SIA\Desktop\Bachelors Research\Bachelors-Research\data\credit-score-data-cleaned.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
0,0,100002,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,...,0,0,0,0,0,0,0,0,0,1
1,5,100008,0,0,99000.0,490495.5,27517.5,454500.0,0.035792,-16941,...,0,0,0,0,0,0,0,0,0,1
2,6,100009,0,1,171000.0,1560726.0,41301.0,1395000.0,0.035792,-13778,...,0,0,0,0,0,0,1,0,0,0
3,7,100010,0,0,360000.0,1530000.0,42075.0,1530000.0,0.003122,-18850,...,0,0,0,0,1,0,0,0,0,0
4,8,100011,0,0,112500.0,1019610.0,33826.5,913500.0,0.018634,-20099,...,0,0,0,0,0,0,0,0,0,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222308 entries, 0 to 222307
Data columns (total 94 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   Unnamed: 0                                         222308 non-null  int64  
 1   SK_ID_CURR                                         222308 non-null  int64  
 2   TARGET                                             222308 non-null  int64  
 3   CNT_CHILDREN                                       222308 non-null  int64  
 4   AMT_INCOME_TOTAL                                   222308 non-null  float64
 5   AMT_CREDIT                                         222308 non-null  float64
 6   AMT_ANNUITY                                        222308 non-null  float64
 7   AMT_GOODS_PRICE                                    222308 non-null  float64
 8   REGION_POPULATION_RELATIVE                         222308 non-null  float6

## Data Scaling

Before we can train our model, we need to scale the data. We will use the Min-Max Scaler to scale the data to the range of 0 to 1. Min Max Scaler formula:

$$
\text{Scaled Feature} = \frac{\text{Feature} - \text{Min(Feature)}}{\text{Max(Feature)} - \text{Min(Feature)}}
$$


In [7]:
scaler = MinMaxScaler()
scaler_vars = df.columns
df[scaler_vars] = scaler.fit_transform(df[scaler_vars])
df

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
0,0.000000,0.000000,1.0,0.000000,0.001508,0.090287,0.090032,0.077441,0.253810,0.898146,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.000016,0.000017,0.0,0.000000,0.000623,0.111235,0.101018,0.103255,0.489878,0.471327,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.000020,0.000020,0.0,0.052632,0.001239,0.378458,0.154774,0.337823,0.489878,0.651812,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.000023,0.000022,0.0,0.000000,0.002854,0.370787,0.157792,0.371493,0.035971,0.362397,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.000026,0.000025,0.0,0.000000,0.000739,0.243348,0.125623,0.217733,0.251490,0.291127,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222303,0.999977,0.999978,0.0,0.000000,0.000739,0.075034,0.063005,0.051627,0.309371,0.760685,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
222304,0.999984,0.999983,0.0,0.000000,0.000739,0.044944,0.079695,0.046016,0.309371,0.046619,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
222305,0.999993,0.999994,0.0,0.000000,0.001085,0.157969,0.110618,0.135802,0.062091,0.584023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
222306,0.999997,0.999997,1.0,0.000000,0.001239,0.081175,0.072499,0.069585,0.066412,0.755492,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Train Test Split

In [8]:
X = df.drop('TARGET', axis=1)
y = df['TARGET']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.20, random_state = 101)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(177846, 93)
(44462, 93)
(177846,)
(44462,)


## ML Model Development

This part goes through the process of developing a machine learning model to predict employee churn. We will use the following machine learning algorithms:

 - **Logistic Regression**
 - **Decision Tree**
 - **Random Forest**
 - **Gradient Boosting**

Each model will be optimized and evaluated using the following metrics:

 - **Accuracy:** The proportion of correct predictions:
$$
\text{Accuracy} = \frac{\text{Number of Correct Predictions}}{\text{Total Number of Predictions}}
$$
 - **Precision:** The proportion of positive predictions that are correct:
$$
\text{Precision} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}}
$$
 - **Recall:** The proportion of actual positives that are correctly classified:

$$
\text{Recall} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}
$$
 - **F1 Score:** The harmonic mean of precision and recall:
 
$$
\text{F1 Score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}
$$
 - **ROC AUC:** The area under the Receiver Operating Characteristic (ROC) curve:
$$
\text{TPR} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}
$$
$$
\text{FPR} = \frac{\text{False Positives}}{\text{False Positives} + \text{True Negatives}}
$$
$$
\text{ROC AUC} = \int_{0}^{1} \text{TPR}(f(x)) \text{FPR}(f(x)) \text{dx}
$$

 - **Confusion Matrix:** A table showing correct predictions and types of incorrect predictions:

| | Predicted: No | Predicted: Yes |
| --- | --- | --- |
| **Actual: No** | True Negative | False Positive |
| **Actual: Yes** | False Negative | True Positive |


In [10]:
# TODO: include in helpers class
# functions to help with model evaluation
def eval(model, X_train, X_test):
    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    
    print(confusion_matrix(y_test, y_pred))
    print("Test_Set")
    print(classification_report(y_test,y_pred))
    print("Train_Set")
    print(classification_report(y_train,y_pred_train))
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    disp.plot()

def train_val(y_train, y_train_pred, y_test, y_pred):
    
    scores = {"train_set": {"Accuracy" : accuracy_score(y_train, y_train_pred),
                            "Precision" : precision_score(y_train, y_train_pred),
                            "Recall" : recall_score(y_train, y_train_pred),                          
                            "f1" : f1_score(y_train, y_train_pred)},
    
              "test_set": {"Accuracy" : accuracy_score(y_test, y_pred),
                           "Precision" : precision_score(y_test, y_pred),
                           "Recall" : recall_score(y_test, y_pred),                          
                           "f1" : f1_score(y_test, y_pred)}}
    
    return pd.DataFrame(scores) 

### 9.1 Logistic Regression

**Logistic Regression** is a binary classification algorithm used to predict the probability of an event occurring. It is a linear model that uses the logistic sigmoid function to predict the probability of an event occurring. The logistic sigmoid function is defined as:

$$
\sigma(x) = \frac{1}{1 + e^{-x}}
$$

where $x$ is the input to the function. The output of the function is a value between 0 and 1, which can be interpreted as the probability of an event occurring. For example, if the output of the function is 0.8, then the probability of the event occurring is 80%.

In [11]:
log_model = LogisticRegression(max_iter=1000)
log_model = log_model.fit(X_train, y_train)

In [12]:
# make predictions
y_pred_log = log_model.predict(X_test)
y_train_pred_log = log_model.predict(X_train)

In [13]:
log_accuracy = metrics.accuracy_score(y_test, y_pred_log)
print("Accuracy: {:.4%}".format(log_accuracy))

Accuracy: 91.9684%


#### 9.1.1 Logistic Regression Model Fine-Tuning

In [14]:
### Manipulating the parameters - second model
model = LogisticRegression(max_iter=1000)
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='recall',error_score=0)
log_model_tuned = grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# make predictions with the tuned model
y_pred_logt = log_model_tuned.predict(X_test)
y_train_pred_logt = log_model_tuned.predict(X_train)

In [None]:
logt_accuracy = metrics.accuracy_score(y_test, y_pred_logt)
print("Accuracy: {:.4%}".format(logt_accuracy))

#### Logistics Model Evaluation

In [None]:
log_model_tuned_f1 = f1_score(y_test, y_pred_logt)
log_model_tuned_acc = accuracy_score(y_test, y_pred_logt)
log_model_tuned_recall = recall_score(y_test, y_pred_logt)
log_model_tuned_auc = roc_auc_score(y_test, y_pred_logt)

print("F1 Score: {:.4%}".format(log_model_tuned_f1))
print("Accuracy: {:.4%}".format(log_model_tuned_acc))
print("Recall: {:.4%}".format(log_model_tuned_recall))
print("AUC: {:.4%}".format(log_model_tuned_auc))

In [None]:
eval(log_model_tuned, X_train, X_test)

In [None]:
log_model_scores = train_val(y_train, y_train_pred_logt, y_test, y_pred_logt)
log_model_scores

In [None]:
# plot the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_logt)
roc_auc = roc_auc_score(y_test, y_pred_logt)
plt.figure(figsize=(10, 10))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
print("AUC: ", roc_auc_score(y_test, y_pred_logt))

## Decision Tree Model

### 9.2 Decision Tree

**Decision Tree** is a non-parametric supervised learning algorithm that can be used for both classification and regression tasks. It works by recursively partitioning the feature space into smaller and smaller regions, where each region is associated with a class label or a regression value. The goal of the algorithm is to find the optimal partitioning of the feature space that results in the most homogeneous regions. The algorithm works by following a 3-step procedure:

1. **Select the best feature** The algorithm selects the feature that best splits the dataset into the most homogeneous regions. The most common metric used to measure the homogeneity of a region is the Gini impurity, which is defined as:

$$
\text{Gini Impurity} = 1 - \sum_{i=1}^n p_i^2
$$

where $p_i$ is the probability of an instance belonging to class $i$. The Gini impurity is minimized when all instances in a region belong to the same class.

2. **Split the dataset** The algorithm splits the dataset into two subsets using the selected feature. The first subset contains all instances where the selected feature is less than or equal to a threshold value, and the second subset contains all instances where the selected feature is greater than the threshold value.

3. **Repeat** The algorithm repeats steps 1 and 2 for each subset until a stopping criterion is met. The stopping criterion can be a maximum depth, a minimum number of instances, or a minimum number of instances per class.

In [None]:
dtree_model = DecisionTreeClassifier(criterion='gini', max_depth=10, min_samples_split=10, min_samples_leaf=5, random_state=42)
dtree_model = dtree_model.fit(X_train, y_train)

In [None]:
# make predictions
y_pred_dtree = dtree_model.predict(X_test)
y_train_pred_dtree = dtree_model.predict(X_train)

In [None]:
dtree_accuracy = metrics.accuracy_score(y_test, y_pred_dtree)
print("Accuracy: {:.4%}".format(dtree_accuracy))

In [None]:
# plot the decision tree
plt.figure(figsize=(20, 10))
plot_tree(dtree_model, filled=True, rounded=True, class_names=['Active', 'Terminated'], feature_names=X.columns)
plt.show()