In [704]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, f1_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN
import seaborn as sns
import matplotlib.pyplot as plt

In [705]:
features_df = pd.read_csv("https://raw.githubusercontent.com/pujan08/ML_Logistic_Regression_For_Credit_Card/main/Credit_card.csv")

Pre-Processing Part

In [706]:
# Converting categorical features into numerical value with both label encoder and one hot encoder
label_encoder = preprocessing.LabelEncoder()
onehotencoder = OneHotEncoder()

features_df['GENDER'] = label_encoder.fit_transform(features_df['GENDER'])
features_df['Car_Owner'] = label_encoder.fit_transform(features_df['Car_Owner'])
features_df['Propert_Owner'] = label_encoder.fit_transform(features_df['Propert_Owner'])
'''
df = onehotencoder.fit_transform(features_df[['Type_Income']]).toarray()
df2 = pd.DataFrame(df)
df2_new = pd.concat([features_df,df2], axis=1)
df3=pd.get_dummies(features_df[["Type_Income"]])
features_df=pd.concat([features_df,df3],axis=1)
del features_df['Type_Income']
'''
features_df['Type_Income'] = label_encoder.fit_transform(features_df['Type_Income'])
'''
df = onehotencoder.fit_transform(features_df[['EDUCATION']]).toarray()
df2 = pd.DataFrame(df)
df2_new = pd.concat([features_df,df2], axis=1)
df3=pd.get_dummies(features_df[["EDUCATION"]])
features_df=pd.concat([features_df,df3],axis=1)
del features_df['EDUCATION']
'''
features_df['EDUCATION'] = label_encoder.fit_transform(features_df['EDUCATION'])
features_df['Marital_status'] = label_encoder.fit_transform(features_df['Marital_status'])
'''
df = onehotencoder.fit_transform(features_df[['Housing_type']]).toarray()
df2 = pd.DataFrame(df)
df2_new = pd.concat([features_df,df2], axis=1)
df3=pd.get_dummies(features_df[["Housing_type"]])
features_df=pd.concat([features_df,df3],axis=1)
del features_df['Housing_type']
'''
features_df['Housing_type'] = label_encoder.fit_transform(features_df['Housing_type'])
'''
df = onehotencoder.fit_transform(features_df[['Type_Occupation']]).toarray()
df2 = pd.DataFrame(df)
df2_new = pd.concat([features_df,df2], axis=1)
df3=pd.get_dummies(features_df[["Type_Occupation"]])
features_df=pd.concat([features_df,df3],axis=1)
del features_df['Type_Occupation']
'''
features_df['Type_Occupation'] = label_encoder.fit_transform(features_df['Type_Occupation'])
print(features_df)

       Ind_ID  GENDER  Car_Owner  Propert_Owner  CHILDREN  Annual_income  \
0     5008827       1          1              1         0       180000.0   
1     5009744       0          1              0         0       315000.0   
2     5009746       0          1              0         0       315000.0   
3     5009749       0          1              0         0            NaN   
4     5009752       0          1              0         0       315000.0   
...       ...     ...        ...            ...       ...            ...   
1543  5028645       0          0              1         0            NaN   
1544  5023655       0          0              0         0       225000.0   
1545  5115992       1          1              1         2       180000.0   
1546  5118219       1          1              0         0       270000.0   
1547  5053790       0          1              1         0       225000.0   

      Type_Income  EDUCATION  Marital_status  Housing_type  Birthday_count  \
0        

In [707]:
#Find missing values and impute them
string_imputer = SimpleImputer(strategy='mean')
features_df_imputed = pd.DataFrame(string_imputer.fit_transform(features_df), columns=features_df.columns)
string_imputer.fit(features_df)
missing_values_after_imputation = features_df_imputed.isna().sum().sum()
print(features_df_imputed)

         Ind_ID  GENDER  Car_Owner  Propert_Owner  CHILDREN  Annual_income  \
0     5008827.0     1.0        1.0            1.0       0.0   180000.00000   
1     5009744.0     0.0        1.0            0.0       0.0   315000.00000   
2     5009746.0     0.0        1.0            0.0       0.0   315000.00000   
3     5009749.0     0.0        1.0            0.0       0.0   191399.32623   
4     5009752.0     0.0        1.0            0.0       0.0   315000.00000   
...         ...     ...        ...            ...       ...            ...   
1543  5028645.0     0.0        0.0            1.0       0.0   191399.32623   
1544  5023655.0     0.0        0.0            0.0       0.0   225000.00000   
1545  5115992.0     1.0        1.0            1.0       2.0   180000.00000   
1546  5118219.0     1.0        1.0            0.0       0.0   270000.00000   
1547  5053790.0     0.0        1.0            1.0       0.0   225000.00000   

      Type_Income  EDUCATION  Marital_status  Housing_type  Bir

In [708]:
#check and print if there are any missing values
if missing_values_after_imputation == 0:
    print("Sanity Check Passed: No missing values after imputation in features_df.")
else:
    print(f"Sanity Check Failed: There are still {missing_values_after_imputation} missing values in features_df after imputation.")

Sanity Check Passed: No missing values after imputation in features_df.


In [709]:
# Normalizing the data to remove bias
min_max = MinMaxScaler()
features_df_imputed_min_max = min_max.fit_transform(features_df_imputed[['Ind_ID', 'Annual_income', 'CHILDREN',
                                                               'Birthday_count', 'Employed_days','Family_Members', 'Type_Income','EDUCATION'
                                                                        , 'Housing_type', 'Type_Occupation']])

features_df_imputed[['Ind_ID', 'Annual_income', 'CHILDREN','Birthday_count', 'Employed_days','Family_Members', 'Type_Income', 'EDUCATION'
                    , 'Housing_type', 'Type_Occupation']] = features_df_imputed_min_max

print(features_df_imputed_min_max)
print(features_df_imputed)

[[0.         0.09489051 0.         ... 0.25       0.2        1.        ]
 [0.00647667 0.18248175 0.         ... 0.25       0.2        1.        ]
 [0.0064908  0.18248175 0.         ... 0.25       0.2        1.        ]
 ...
 [0.75689515 0.09489051 0.14285714 ... 0.25       0.2        0.55555556]
 [0.77262422 0.15328467 0.         ... 1.         0.2        0.22222222]
 [0.31756895 0.12408759 0.         ... 0.25       0.2        1.        ]]
        Ind_ID  GENDER  Car_Owner  Propert_Owner  CHILDREN  Annual_income  \
0     0.000000     1.0        1.0            1.0  0.000000       0.094891   
1     0.006477     0.0        1.0            0.0  0.000000       0.182482   
2     0.006491     0.0        1.0            0.0  0.000000       0.182482   
3     0.006512     0.0        1.0            0.0  0.000000       0.102287   
4     0.006533     0.0        1.0            0.0  0.000000       0.182482   
...        ...     ...        ...            ...       ...            ...   
1543  0.139972   

In [710]:
target_df = pd.read_csv("https://raw.githubusercontent.com/pujan08/ML_Logistic_Regression_For_Credit_Card/main/Credit_card_label.csv")

In [711]:
#Find missing values and impute them
numeric_imputer = SimpleImputer(strategy='mean')
target_df_imputed = pd.DataFrame(numeric_imputer.fit_transform(target_df), columns=target_df.columns)
numeric_imputer.fit(target_df)
print(target_df_imputed)

         Ind_ID  label
0     5008827.0    1.0
1     5009744.0    1.0
2     5009746.0    1.0
3     5009749.0    1.0
4     5009752.0    1.0
...         ...    ...
1543  5028645.0    0.0
1544  5023655.0    0.0
1545  5115992.0    0.0
1546  5118219.0    0.0
1547  5053790.0    0.0

[1548 rows x 2 columns]


In [712]:
#check and print if there are any missing values
missing_values_after_imputation = target_df_imputed.isna().sum().sum()
if missing_values_after_imputation == 0:
    print("Sanity Check Passed: No missing values after imputation in target_df.")
else:
    print(f"Sanity Check Failed: There are still {missing_values_after_imputation} missing values in target_df after imputation.")


Sanity Check Passed: No missing values after imputation in target_df.


2. Training and Testing Part

In [713]:
# Defining features and labels
X = features_df_imputed
y = target_df_imputed['label']

In [714]:
# Defining train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69420)

In [715]:
# Oversampling the dataset
ros = RandomOverSampler(random_state=69420)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
print(pd.Series(y_resampled).value_counts())

0.0    1094
1.0    1094
Name: label, dtype: int64


In [716]:
# Build a model of random forest with random hyperparameters 
rf_model = RandomForestClassifier(n_estimators=10, max_features='auto', random_state=69420)

# Train the model
rf_model.fit(X_resampled, y_resampled)

RandomForestClassifier(n_estimators=10, random_state=69420)

In [717]:
# Run Test set on the model and print the confusion matrix
rf_pred = rf_model.predict(X_test)
print(confusion_matrix(y_test, rf_pred))
print('accuracy: ', accuracy_score(y_test, rf_pred))
print('f1 score: ', f1_score(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

[[272   7]
 [ 19  12]]
accuracy:  0.9161290322580645
f1 score:  0.48000000000000004
              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95       279
         1.0       0.63      0.39      0.48        31

    accuracy                           0.92       310
   macro avg       0.78      0.68      0.72       310
weighted avg       0.90      0.92      0.91       310



In [718]:
# Use GridSearchCV to tune the hyperparameters
rf_model_gscv = RandomForestClassifier()

param_grid = {
    'n_estimators': [1, 10, 50, 100, 150, 200, 500],
    'max_features': [None, 'sqrt', 'log2'],
    'random_state': [69420]
}

grid_search = GridSearchCV(rf_model_gscv, param_grid,  cv=5, scoring='f1')
grid_search.fit(X_resampled, y_resampled) 

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': [None, 'sqrt', 'log2'],
                         'n_estimators': [1, 10, 50, 100, 150, 200, 500],
                         'random_state': [69420]},
             scoring='f1')

In [719]:
# Build a model with the best params and run test set on it
print("Best Parameters:", grid_search.best_params_)
best_grid = grid_search.best_estimator_
rf_model_gscv_pred = best_grid.predict(X_test)
print(confusion_matrix(y_test, rf_model_gscv_pred))
print('accuracy: ', accuracy_score(y_test, rf_model_gscv_pred))
print('f1 score: ', f1_score(y_test, rf_model_gscv_pred))
print(classification_report(y_test, rf_model_gscv_pred))

Best Parameters: {'max_features': 'sqrt', 'n_estimators': 150, 'random_state': 69420}
[[274   5]
 [ 19  12]]
accuracy:  0.9225806451612903
f1 score:  0.5
              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96       279
         1.0       0.71      0.39      0.50        31

    accuracy                           0.92       310
   macro avg       0.82      0.68      0.73       310
weighted avg       0.91      0.92      0.91       310



In [720]:
# Removing features
features_df_removed = features_df_imputed.drop(['Ind_ID', 'GENDER', 'Mobile_phone', 'Work_Phone', 'Phone', 'EMAIL_ID'], axis=1)
print(features_df_removed)
X_dropped = features_df_removed
X_train, X_test, y_train, y_test = train_test_split(X_dropped, y, test_size=0.2, random_state=69420)

      Car_Owner  Propert_Owner  CHILDREN  Annual_income  Type_Income  \
0           1.0            1.0  0.000000       0.094891     0.333333   
1           1.0            0.0  0.000000       0.182482     0.000000   
2           1.0            0.0  0.000000       0.182482     0.000000   
3           1.0            0.0  0.000000       0.102287     0.000000   
4           1.0            0.0  0.000000       0.182482     0.000000   
...         ...            ...       ...            ...          ...   
1543        0.0            1.0  0.000000       0.102287     0.000000   
1544        0.0            0.0  0.000000       0.124088     0.000000   
1545        1.0            1.0  0.142857       0.094891     1.000000   
1546        1.0            0.0  0.000000       0.153285     1.000000   
1547        1.0            1.0  0.000000       0.124088     1.000000   

      EDUCATION  Marital_status  Housing_type  Birthday_count  Employed_days  \
0          0.25             1.0           0.2        0.

In [721]:
# Oversampling the dataset
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
print(pd.Series(y_resampled).value_counts())

0.0    1094
1.0    1094
Name: label, dtype: int64


In [722]:
# Run GridSearchCV again, run test set on the best model, and print the accuracy
grid_search.fit(X_resampled, y_resampled) 
print("Best Parameters:", grid_search.best_params_)
best_grid = grid_search.best_estimator_
rf_model_gscv_pred = best_grid.predict(X_test)
print(confusion_matrix(y_test, rf_model_gscv_pred))
print('accuracy: ', accuracy_score(y_test, rf_model_gscv_pred))
print('f1 score: ', f1_score(y_test, rf_model_gscv_pred))
print(classification_report(y_test, rf_model_gscv_pred))

Best Parameters: {'max_features': 'sqrt', 'n_estimators': 200, 'random_state': 69420}
[[275   4]
 [ 17  14]]
accuracy:  0.932258064516129
f1 score:  0.5714285714285714
              precision    recall  f1-score   support

         0.0       0.94      0.99      0.96       279
         1.0       0.78      0.45      0.57        31

    accuracy                           0.93       310
   macro avg       0.86      0.72      0.77       310
weighted avg       0.93      0.93      0.92       310

