In [303]:
import numpy as np
from numpy.random import default_rng
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import f1_score, classification_report
from sklearn.tree import DecisionTreeRegressor

# The Dataset
The UCI Credit Card Default Clients Dataset contains information on a number of defaulters and non-defaulters. The features include age, sex, maritial status, education, bill statements and payment status over the past few months. In this work, we analyze the efficiency of various imputation methods. Since the dataset is clean and does not contain missing values, we artificially induce MAR (Missing At Random) values. 

In [304]:
dataset = pd.read_csv("UCI_Credit_Card.csv")
dataset.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [305]:
len(dataset)

30000

## One-Hot Encoding
We need to perform one-hot encoding on some of the features (sex, education and marriage) as they are discrete and are not quantitative. Payment status, although discrete, does not require one-hot encoding. This is because it is a quantitative feature and has gradations.

In [306]:
#Performing one-hot encoding on sex, education and marriage columns
dataset = pd.get_dummies(dataset, columns=['SEX', 'EDUCATION', 'MARRIAGE'])
dataset.head()

Unnamed: 0,ID,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
0,1,20000.0,24,2,2,-1,-1,-2,-2,3913.0,...,False,True,False,False,False,False,False,True,False,False
1,2,120000.0,26,-1,2,0,0,0,2,2682.0,...,False,True,False,False,False,False,False,False,True,False
2,3,90000.0,34,0,0,0,0,0,0,29239.0,...,False,True,False,False,False,False,False,False,True,False
3,4,50000.0,37,0,0,0,0,0,0,46990.0,...,False,True,False,False,False,False,False,True,False,False
4,5,50000.0,57,-1,0,-1,0,0,0,8617.0,...,False,True,False,False,False,False,False,True,False,False


The 'ID' column is dropped as it is not a feature

In [307]:
dataset = dataset.drop(['ID'], axis=1)

# Imputation

## Simple Imputation
We remove 10% of values in 'AGE', 'BILL_AMT1' and 'LIMIT_BAL' columns and impute with median. Imputaion with median is preferrable to mean because if the dataset is skewed, the mean might not be a good measure of central tendency.

In [308]:
#For generating random indices
rng = default_rng()
indices = np.arange(0, len(dataset))

dataset_A = dataset.copy()
dataset_A.loc[rng.choice(indices, len(dataset)//10, replace=False), 'AGE'] = np.nan
dataset_A.loc[rng.choice(indices, len(dataset)//10, replace=False), 'BILL_AMT1'] = np.nan
dataset_A.loc[rng.choice(indices, len(dataset)//10, replace=False), 'LIMIT_BAL'] = np.nan
print(dataset_A['AGE'].isnull().sum(), dataset_A['BILL_AMT1'].isnull().sum(), dataset_A['LIMIT_BAL'].isnull().sum())
dataset_A.head()

3000 3000 3000


Unnamed: 0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
0,20000.0,24.0,2,2,-1,-1,-2,-2,3913.0,3102.0,...,False,True,False,False,False,False,False,True,False,False
1,120000.0,26.0,-1,2,0,0,0,2,2682.0,1725.0,...,False,True,False,False,False,False,False,False,True,False
2,90000.0,34.0,0,0,0,0,0,0,29239.0,14027.0,...,False,True,False,False,False,False,False,False,True,False
3,50000.0,37.0,0,0,0,0,0,0,46990.0,48233.0,...,False,True,False,False,False,False,False,True,False,False
4,50000.0,57.0,-1,0,-1,0,0,0,8617.0,5670.0,...,False,True,False,False,False,False,False,True,False,False


In [309]:
#Imputation with median
dataset_A = dataset_A.fillna(dataset_A.median())
print(dataset_A['AGE'].isnull().sum(), dataset_A['BILL_AMT1'].isnull().sum(), dataset_A['LIMIT_BAL'].isnull().sum())

0 0 0


## Linear Regression
Here, we remove values from the 'LIMIT_BAL' column and use linear regression for imputation. The underlying assumption of MAR (Missing At Random) is that the probability that a value is missing is independent of all the features in the dataset. We use normalization using standard scaler since different columns have vastly different orders of magnitude.

In [310]:
missing_indices_BC = rng.choice(indices, len(dataset)//10, replace=False)
dataset_BC = dataset.copy()
dataset_BC.loc[missing_indices_BC, 'LIMIT_BAL'] = np.nan

In [311]:
x_train_imp_BC = dataset_BC.dropna().drop(['LIMIT_BAL', 'default.payment.next.month'], axis=1)
#Normalization
scaler = StandardScaler()
x_train_imp_BC = scaler.fit_transform(x_train_imp_BC)

#Linear regression for imputation
y_train_imp_BC = dataset_BC.dropna()['LIMIT_BAL']
x_pred_imp_BC = (dataset_BC[dataset_BC['LIMIT_BAL'].isnull()].drop(['LIMIT_BAL', 'default.payment.next.month'], axis=1)-scaler.mean_)/scaler.scale_

linear_imp_model = LinearRegression()
linear_imp_model.fit(x_train_imp_BC, y_train_imp_BC)


dataset_B = dataset_BC.copy()
dataset_B.loc[missing_indices_BC, 'LIMIT_BAL'] = linear_imp_model.predict(x_pred_imp_BC)



## Non linear imputation
Here, we use KNN model for imputation. We use the same dataset that was used for linear regression imputation

In [312]:
knn_imputation_model = KNeighborsRegressor(n_neighbors=5)
knn_imputation_model.fit(x_train_imp_BC, y_train_imp_BC)

dataset_C = dataset_BC.copy()
dataset_C.loc[missing_indices_BC, 'LIMIT_BAL'] = knn_imputation_model.predict(x_pred_imp_BC)




## Listwise Deletion
In this approach, rows containing missing values are dropped. A drawback is that a lot of data is lost with this approach.

In [313]:
dataset_D = dataset.copy()
dataset_D.loc[rng.choice(indices, len(dataset)//10, replace=False), 'AGE'] = np.nan
dataset_D.loc[rng.choice(indices, len(dataset)//10, replace=False), 'BILL_AMT1'] = np.nan
dataset_D.loc[rng.choice(indices, len(dataset)//10, replace=False), 'LIMIT_BAL'] = np.nan
dataset_D = dataset_D.dropna()

In [314]:
#Fraction of rows with missing values
print((len(dataset)-len(dataset_D))/len(dataset))

0.27213333333333334


The size of the dataset is reduced by around 27%

# Model training

In [315]:
x_A = dataset_A.drop('default.payment.next.month', axis=1)
y_A = dataset_A['default.payment.next.month']

x_B = dataset_B.drop('default.payment.next.month', axis=1)
y_B = dataset_B['default.payment.next.month']

x_C = dataset_C.drop('default.payment.next.month', axis=1)  
y_C = dataset_C['default.payment.next.month']

x_D = dataset_D.drop('default.payment.next.month', axis=1)
y_D = dataset_D['default.payment.next.month']

x_A = scaler.fit_transform(x_A)
x_B = scaler.fit_transform(x_B)
x_C = scaler.fit_transform(x_C)
x_D = scaler.fit_transform(x_D)

x_train_A, x_test_A, y_train_A, y_test_A = train_test_split(x_A, y_A, test_size=0.2, random_state=42, stratify=y_A)
x_train_B, x_test_B, y_train_B, y_test_B = train_test_split(x_B, y_B, test_size=0.2, random_state=42, stratify=y_B)
x_train_C, x_test_C, y_train_C, y_test_C = train_test_split(x_C, y_C, test_size=0.2, random_state=42, stratify=y_C)
x_train_D, x_test_D, y_train_D, y_test_D = train_test_split(x_D, y_D, test_size=0.2, random_state=42, stratify=y_D)

# Performance evaluation

In [316]:
model_A = LogisticRegression()
model_A.fit(x_train_A, y_train_A)
y_pred_A = model_A.predict(x_test_A)
print(classification_report(y_test_A, y_pred_A, digits=3))
#Get f1-score for both 0 and 1
f1_score_A = f1_score(y_test_A, y_pred_A, average=None)

              precision    recall  f1-score   support

           0      0.819     0.969     0.888      4673
           1      0.692     0.244     0.361      1327

    accuracy                          0.809      6000
   macro avg      0.755     0.607     0.624      6000
weighted avg      0.791     0.809     0.771      6000



In [317]:
model_B = LogisticRegression()
model_B.fit(x_train_B, y_train_B)
y_pred_B = model_B.predict(x_test_B)
print(classification_report(y_test_B, y_pred_B, digits=3))
f1_score_B = f1_score(y_test_B, y_pred_B, average=None)

              precision    recall  f1-score   support

           0      0.819     0.969     0.888      4673
           1      0.692     0.244     0.361      1327

    accuracy                          0.809      6000
   macro avg      0.755     0.607     0.624      6000
weighted avg      0.791     0.809     0.771      6000



In [318]:
model_C = LogisticRegression()
model_C.fit(x_train_C, y_train_C)
y_pred_C = model_C.predict(x_test_C)
print(classification_report(y_test_C, y_pred_C, digits=3))
f1_score_C = f1_score(y_test_C, y_pred_C, average=None)

              precision    recall  f1-score   support

           0      0.819     0.969     0.888      4673
           1      0.694     0.246     0.363      1327

    accuracy                          0.809      6000
   macro avg      0.756     0.607     0.625      6000
weighted avg      0.791     0.809     0.772      6000



In [319]:
model_D = LogisticRegression()
model_D.fit(x_train_D, y_train_D)
y_pred_D = model_D.predict(x_test_D)
print(classification_report(y_test_D, y_pred_D, digits=3))
f1_score_D = f1_score(y_test_D, y_pred_D, average=None)

              precision    recall  f1-score   support

           0      0.821     0.976     0.892      3397
           1      0.752     0.255     0.381       971

    accuracy                          0.816      4368
   macro avg      0.786     0.616     0.636      4368
weighted avg      0.806     0.816     0.778      4368



# Summary table

In [320]:
summary_table = pd.DataFrame({
    'Imputation Method': ['Median Imputation', 'Linear Regression Imputation', 'KNN Imputation', 'Listwise Deletion'],
    'F1 Score (Class 0)': [f1_score_A[0], f1_score_B[0], f1_score_C[0], f1_score_D[0]],
    'F1 Score (Class 1)': [f1_score_A[1], f1_score_B[1], f1_score_C[1], f1_score_D[1]]
})
summary_table

Unnamed: 0,Imputation Method,F1 Score (Class 0),F1 Score (Class 1)
0,Median Imputation,0.887604,0.361003
1,Linear Regression Imputation,0.887604,0.361003
2,KNN Imputation,0.887778,0.362827
3,Listwise Deletion,0.891728,0.381245


# Efficacy Discussion
We observe that listwise deletion performs slightly better than imputation methods. A disadvantage of listwise deletion is that we lose data when we remove rows. This is a problem if the missing values are not random (because in this case, the model would perform poorly on features that correlate with missing datapoints). The advantage over impuatation methods is that we do not use any synthetic data. This is the trade-off when using listwise deletion.

KNN performed slightly better than linear regression, but the difference is very less. This suggests that the relation between imputed feature and predictors is almost linear. 

Listwise deletion is the optimal strategy for this datset based on the performance on both classes. The reason why this approach works well here is that the missing values are random and do not correlate with any feature. Therefore, the reduced dataset is a good representation of the original one. However, if the missing values are not random or if too many rows have missing values, imputation strategies are preferrable.