In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
import xgboost as xgb


In [2]:
PATH = "dataset.csv"
df = pd.read_csv(PATH, delimiter=';')

In [3]:
df.head()

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,,...,1,1,1,1,0,0,0,178839,9.653333,1.0
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0,0.0,0.0,0.0,,1.0,1.0,1.0,...,1,1,2,2,0,0,0,49014,13.181389,
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0,0.0,0.0,0.0,,,,,...,1,1,2,2,0,0,0,124839,11.561944,1.0
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0,,,,,,,,...,1,1,1,1,0,0,0,324676,15.751111,1.0
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0,0.0,0.0,0.0,,,,,...,0,1,1,1,0,0,0,7100,12.698611,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99976 entries, 0 to 99975
Data columns (total 43 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   uuid                                 99976 non-null  object 
 1   default                              89976 non-null  float64
 2   account_amount_added_12_24m          99976 non-null  int64  
 3   account_days_in_dc_12_24m            88140 non-null  float64
 4   account_days_in_rem_12_24m           88140 non-null  float64
 5   account_days_in_term_12_24m          88140 non-null  float64
 6   account_incoming_debt_vs_paid_0_24m  40661 non-null  float64
 7   account_status                       45603 non-null  float64
 8   account_worst_status_0_3m            45603 non-null  float64
 9   account_worst_status_12_24m          33215 non-null  float64
 10  account_worst_status_3_6m            42274 non-null  float64
 11  account_worst_status_6_12m  

In [5]:
df.describe()

Unnamed: 0,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,account_worst_status_3_6m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
count,89976.0,99976.0,88140.0,88140.0,88140.0,40661.0,45603.0,45603.0,33215.0,42274.0,...,99976.0,99976.0,99976.0,99976.0,99976.0,99976.0,99976.0,99976.0,99976.0,30461.0
mean,0.014315,12255.15,0.223043,5.044622,0.286896,1.331292,1.042168,1.172905,1.337348,1.185291,...,0.744299,0.800582,1.052233,1.226164,4.035429,10816.065386,6542.895325,39208.8,15.32978,1.121762
std,0.118786,35481.48,5.808117,22.863971,2.92991,26.482299,0.202713,0.420142,0.575043,0.443309,...,0.634912,0.719946,0.786121,0.833502,163.934564,26463.97217,19041.223585,90649.29,5.03136,0.34366
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000278,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2600.0,11.622708,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.152082,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,15995.0,15.792778,1.0
75%,0.0,4937.25,0.0,0.0,0.0,0.662952,1.0,1.0,2.0,1.0,...,1.0,1.0,1.0,2.0,0.0,9029.75,85.0,43844.25,19.542014,1.0
max,1.0,1128775.0,365.0,365.0,97.0,3914.0,4.0,4.0,4.0,4.0,...,5.0,3.0,5.0,5.0,36479.0,571475.0,341859.0,2962870.0,23.999722,3.0


Num data Needs to be scaled in some way before passing into model

In [40]:
df['default'].value_counts(dropna=False)
final_test = df.iloc[-10000:,:]
final_test

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,age,avg_payment_span_0_12m,merchant_category,merchant_group,...,status_2nd_last_archived_0_24m,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours
79976,e640a7eb-d00c-4201-91f2-ee98cb7f72c3,0.0,0,,,,20,10.285714,Diversified entertainment,Entertainment,...,1,1,1,1,1,0,0,0,40062,17.612500
79977,96de59b7-bf9e-4282-8b2a-daa70ee17e3c,0.0,0,,,,63,9.666667,Diversified entertainment,Entertainment,...,1,1,1,1,1,0,0,0,5055,22.060000
79978,1271315b-daed-4179-9050-0e38dbcd59a0,0.0,0,0.0,0.0,0.0,20,23.666667,Youthful Shoes & Clothing,Clothing & Shoes,...,1,2,2,2,2,0,0,0,11885,14.789167
79979,39fd0320-09be-43d6-97ad-d1f7e2a45fb6,0.0,0,0.0,0.0,0.0,56,7.000000,Diversified entertainment,Entertainment,...,0,0,0,1,1,0,0,0,3775,0.358056
79980,ecac106a-f49c-49b3-a899-fbb04570226e,0.0,0,,,,29,30.642857,Books & Magazines,Entertainment,...,2,1,2,2,2,0,0,0,41046,15.491667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89971,3df1babe-26b4-42e3-b3e0-5e29e2100851,0.0,0,0.0,0.0,0.0,70,,Concept stores & Miscellaneous,"Leisure, Sport & Hobby",...,0,0,0,0,0,0,0,0,0,14.919722
89972,da140349-0d40-40ac-85c9-939036eaf4ac,0.0,0,0.0,0.0,0.0,25,10.166667,Diversified entertainment,Entertainment,...,1,1,1,1,1,0,0,0,6535,11.846667
89973,24223123-a12a-4182-aafb-63c6a37f158b,0.0,0,0.0,0.0,0.0,34,13.555556,Youthful Shoes & Clothing,Clothing & Shoes,...,1,1,1,1,1,0,0,0,47306,18.681944
89974,451fc324-9ad3-4e28-93f9-cce455e664a1,0.0,0,0.0,0.0,0.0,51,13.400000,Books & Magazines,Entertainment,...,1,1,1,1,1,0,0,0,13530,11.964444


In [7]:
print(df.shape)

#Dropping columns with very high number of Nans and targat values at Nan
temp_df = df.dropna(axis = 'columns', thresh=int(df.shape[0]*0.6)).dropna(axis = 0, subset="default")
print(temp_df.shape)

df = temp_df

(99976, 43)
(89976, 35)


In [8]:
np.corrcoef(df['has_paid'], df['default'])

array([[ 1.       , -0.0320813],
       [-0.0320813,  1.       ]])

Was looking for correlation between has_paid column and default which is the target, but it seems okay to keep in the features.

In [9]:
df['has_paid'] = df['has_paid'].astype("int64")

In [10]:
df.isna().mean().sort_values(ascending=False)

avg_payment_span_0_12m              0.238597
num_active_div_by_paid_inv_0_12m    0.229595
num_arch_written_off_0_12m          0.181215
num_arch_written_off_12_24m         0.181215
account_days_in_dc_12_24m           0.118732
account_days_in_rem_12_24m          0.118732
account_days_in_term_12_24m         0.118732
status_max_archived_0_6_months      0.000000
num_unpaid_bills                    0.000000
status_last_archived_0_24m          0.000000
status_2nd_last_archived_0_24m      0.000000
status_3rd_last_archived_0_24m      0.000000
uuid                                0.000000
status_max_archived_0_12_months     0.000000
num_arch_rem_0_12m                  0.000000
recovery_debt                       0.000000
sum_capital_paid_account_0_12m      0.000000
sum_capital_paid_account_12_24m     0.000000
sum_paid_inv_0_12m                  0.000000
status_max_archived_0_24_months     0.000000
num_arch_dc_12_24m                  0.000000
num_arch_ok_12_24m                  0.000000
num_arch_o

In [11]:
ids = df['uuid']
y = df['default']
X = df.drop(columns=['uuid','default'])

In [12]:
# First approximation will try a prediction with an imbalanced target in the dataframe
# Expecting a very low score on predicting class 1
y.value_counts(normalize=True)

0.0    0.985685
1.0    0.014315
Name: default, dtype: float64

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, stratify=y, random_state=42)

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (71980, 33)
X_test shape:  (17996, 33)
y_train shape:  (71980,)
y_test shape:  (17996,)


## Making Pipelines

In [14]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),#Median is more robust to outliers, might be safer
    ('scaler', RobustScaler())
])
num_pipe

In [15]:
cat_pipe = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', min_frequency=0.05))
])
cat_pipe

In [16]:
num_columns = X.select_dtypes(include=(float, int)).columns.tolist()
cat_columns = X.select_dtypes(exclude=(float, int)).columns.tolist()
cat_columns

['merchant_category', 'merchant_group', 'name_in_email']

In [17]:
Preproc_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipe, num_columns),
    ('cat', cat_pipe, cat_columns)
])

Preproc_pipeline

checking how the feature matrix looks like after preprocessing

In [18]:
pd.DataFrame(Preproc_pipeline.fit_transform(X_train),columns=Preproc_pipeline.get_feature_names_out())

Unnamed: 0,num__account_amount_added_12_24m,num__account_days_in_dc_12_24m,num__account_days_in_rem_12_24m,num__account_days_in_term_12_24m,num__age,num__avg_payment_span_0_12m,num__has_paid,num__max_paid_inv_0_12m,num__max_paid_inv_0_24m,num__num_active_div_by_paid_inv_0_12m,...,cat__merchant_group_Health & Beauty,"cat__merchant_group_Leisure, Sport & Hobby",cat__merchant_group_infrequent_sklearn,cat__name_in_email_F,cat__name_in_email_F+L,cat__name_in_email_F1+L,cat__name_in_email_L1+F,cat__name_in_email_Nick,cat__name_in_email_no_match,cat__name_in_email_infrequent_sklearn
0,0.000000,0.0,0.0,0.0,0.35,-0.362375,0.0,2.393666,1.980554,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,1.10,1.730564,0.0,-0.416532,-0.514751,0.000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,-0.15,-0.607799,0.0,-0.330327,-0.287880,0.625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.000000,0.0,0.0,0.0,0.50,-1.225006,0.0,-0.191090,0.857442,0.000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,8.497729,0.0,0.0,0.0,-0.05,0.019445,0.0,2.063875,1.687717,0.000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71975,0.000000,0.0,0.0,0.0,-0.80,-0.136112,0.0,-0.105743,-0.238787,0.000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
71976,1.150504,0.0,0.0,0.0,-0.25,0.486113,0.0,0.698229,0.475097,0.000,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
71977,0.514912,0.0,0.0,0.0,-0.30,-1.225006,0.0,0.637144,0.420857,0.000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
71978,0.000000,0.0,0.0,0.0,0.80,1.125623,0.0,-0.352657,-0.353177,2.000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
Full_pipe = Pipeline([
    ('preproc', Preproc_pipeline),
    ('estimator',LogisticRegression(solver ='lbfgs',max_iter=5000))
    ])
Full_pipe

## Testing model

### Simple first approach

In [20]:
scores = cross_val_score(Full_pipe, X_train, y_train, cv = 5, scoring='f1')

print('Cross validation scores are: ', scores)
print('Average score of cross validation is: ', scores.mean())

Cross validation scores are:  [0.03669725 0.05504587 0.03555556 0.03571429 0.08695652]
Average score of cross validation is:  0.04999389645500535


In [21]:
y_pred = Full_pipe.fit(X_train,y_train).predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
confusion_matrix(y_test, y_pred)

array([[17726,    12],
       [  252,     6]])

As expected this simple method is very bad at predicting class 1.

Mutiple options are available to make it better

### First try 

In [23]:
Full_pipe = Pipeline([
    ('preproc', Preproc_pipeline),
    ('estimator',LogisticRegression(solver ='lbfgs',max_iter=5000, class_weight='balanced'))
    ])
Full_pipe

In [24]:
scores = cross_val_score(Full_pipe, X_train, y_train, cv = 5, scoring='f1')

print('Cross validation scores are: ', scores)
print('Average score of cross validation is: ', scores.mean())

Cross validation scores are:  [0.09594096 0.0952381  0.097922   0.09062589 0.09675583]
Average score of cross validation is:  0.09529655549178444


In [25]:
y_pred = Full_pipe.fit(X_train,y_train).predict(X_test)

In [26]:
confusion_matrix(y_test, y_pred)

array([[13743,  3995],
       [   53,   205]])

Added the class_weight argument to the logistic regression model, results imporoved slightly but still but still not great

### Second try 

In [27]:
y.value_counts()

0.0    88688
1.0     1288
Name: default, dtype: int64

In [28]:
rus = RandomUnderSampler(random_state=42)
X_rus , y_rus = rus.fit_resample(X,y)

print(X_rus.shape, y_rus.shape)
print(y_rus.value_counts())

(2576, 33) (2576,)
0.0    1288
1.0    1288
Name: default, dtype: int64


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus, test_size=0.2, random_state=42)

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (2060, 33)
X_test shape:  (516, 33)
y_train shape:  (2060,)
y_test shape:  (516,)


In [30]:
Full_pipe = Pipeline([
    ('preproc', Preproc_pipeline),
    ('estimator',LogisticRegression(solver ='lbfgs',max_iter=5000, class_weight='balanced'))
    ])

scores = cross_val_score(Full_pipe, X_train, y_train, cv = 5, scoring='f1')

print('Cross validation scores are: ', scores)
print('Average score of cross validation is: ', scores.mean())

y_pred = Full_pipe.fit(X_train,y_train).predict(X_test)

print(confusion_matrix(y_test, y_pred))

Cross validation scores are:  [0.76555024 0.81690141 0.80285036 0.78773585 0.81030445]
Average score of cross validation is:  0.7966684605370012
[[193  74]
 [ 33 216]]


This seems to be getting better, we now have a better score for prediction of class 1

Let s try Oversampling

### Third try

In [31]:
ros = RandomOverSampler(random_state=42)
X_ros , y_ros = ros.fit_resample(X,y)

print(X_ros.shape, y_ros.shape)
print(y_ros.value_counts(normalize=True))

(177376, 33) (177376,)
0.0    0.5
1.0    0.5
Name: default, dtype: float64


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_ros, y_ros, test_size=0.2, random_state=42)

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (141900, 33)
X_test shape:  (35476, 33)
y_train shape:  (141900,)
y_test shape:  (35476,)


In [33]:
Full_pipe = Pipeline([
    ('preproc', Preproc_pipeline),
    ('estimator',LogisticRegression(solver ='lbfgs',max_iter=5000))
    ])

scores = cross_val_score(Full_pipe, X_train, y_train, cv = 5, scoring='f1')

print('Cross validation scores are: ', scores)
print('Average score of cross validation is: ', scores.mean())

y_pred = Full_pipe.fit(X_train,y_train).predict(X_test)

print(confusion_matrix(y_test, y_pred))

Cross validation scores are:  [0.81104889 0.80845975 0.81214795 0.80433306 0.80339076]
Average score of cross validation is:  0.8078760822386715
[[13442  4233]
 [ 2866 14935]]


Note that with Oversampling the size of our dataset has greatly increased. But an obvious drawback is the fact that the dataset is now biased towards the particular datapoints that have been oversampled. So there is a risk (a quite obvious one when you think about it of over fitting here!!)

### Back to second try

Let s go back to the second try and try to improve the score a little bit

In [34]:
rus = RandomUnderSampler(random_state=42)
X_rus , y_rus = rus.fit_resample(X,y)

print(X_rus.shape, y_rus.shape)
print(y_rus.value_counts(normalize=True))

X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus, test_size=0.2, random_state=42)

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

(2576, 33) (2576,)
0.0    0.5
1.0    0.5
Name: default, dtype: float64
X_train shape:  (2060, 33)
X_test shape:  (516, 33)
y_train shape:  (2060,)
y_test shape:  (516,)


In [35]:
Full_pipe = Pipeline([
    ('preproc', Preproc_pipeline),
    ('estimator',xgb.XGBClassifier(max_depth = 5,
                                   n_estimators = 10,
                                   n_jobs = -1))
    ])

scores = cross_val_score(Full_pipe, X_train, y_train, cv = 5, scoring='f1')

print('Cross validation scores are: ', scores)
print('Average score of cross validation is: ', scores.mean())

y_pred = Full_pipe.fit(X_train,y_train).predict(X_test)

print(confusion_matrix(y_test, y_pred))

Cross validation scores are:  [0.77108434 0.77068558 0.8156682  0.82242991 0.82134571]
Average score of cross validation is:  0.800242746701852
[[195  72]
 [ 37 212]]


In [41]:
finalX_test = final_test.drop(columns = "default")
finalX_test.head()

Unnamed: 0,uuid,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,age,avg_payment_span_0_12m,merchant_category,merchant_group,has_paid,...,status_2nd_last_archived_0_24m,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours
79976,e640a7eb-d00c-4201-91f2-ee98cb7f72c3,0,,,,20,10.285714,Diversified entertainment,Entertainment,1,...,1,1,1,1,1,0,0,0,40062,17.6125
79977,96de59b7-bf9e-4282-8b2a-daa70ee17e3c,0,,,,63,9.666667,Diversified entertainment,Entertainment,1,...,1,1,1,1,1,0,0,0,5055,22.06
79978,1271315b-daed-4179-9050-0e38dbcd59a0,0,0.0,0.0,0.0,20,23.666667,Youthful Shoes & Clothing,Clothing & Shoes,1,...,1,2,2,2,2,0,0,0,11885,14.789167
79979,39fd0320-09be-43d6-97ad-d1f7e2a45fb6,0,0.0,0.0,0.0,56,7.0,Diversified entertainment,Entertainment,1,...,0,0,0,1,1,0,0,0,3775,0.358056
79980,ecac106a-f49c-49b3-a899-fbb04570226e,0,,,,29,30.642857,Books & Magazines,Entertainment,1,...,2,1,2,2,2,0,0,0,41046,15.491667


In [42]:
finalX_test = final_test.drop(columns = "default")
final_predictions = pd.DataFrame({"uuid" : finalX_test['uuid'].to_numpy(),
                                  "pd" : np.round(Full_pipe.predict_proba(finalX_test)[:,1]*100,2)
                                  })
final_predictions

Unnamed: 0,uuid,pd
0,e640a7eb-d00c-4201-91f2-ee98cb7f72c3,5.160000
1,96de59b7-bf9e-4282-8b2a-daa70ee17e3c,9.820000
2,1271315b-daed-4179-9050-0e38dbcd59a0,42.619999
3,39fd0320-09be-43d6-97ad-d1f7e2a45fb6,22.670000
4,ecac106a-f49c-49b3-a899-fbb04570226e,52.830002
...,...,...
9995,3df1babe-26b4-42e3-b3e0-5e29e2100851,30.780001
9996,da140349-0d40-40ac-85c9-939036eaf4ac,10.210000
9997,24223123-a12a-4182-aafb-63c6a37f158b,4.830000
9998,451fc324-9ad3-4e28-93f9-cce455e664a1,7.530000
