In [53]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC



In [54]:
# Load the feature table (Credit_card.csv) from the project's GitHub repository.
FEATURES_CSV_URL = "https://raw.githubusercontent.com/pujan08/ML_Logistic_Regression_For_Credit_Card/main/Credit_card.csv"
features_df = pd.read_csv(FEATURES_CSV_URL)

1. Pre-Processing Part

In [55]:
# Convert the categorical feature columns into numeric values.
#   * Label-encoded columns are replaced in place by one integer code per distinct value.
#   * One-hot columns are replaced by one indicator column per category
#     (named "<column>_<category>" by pd.get_dummies).
label_encoder = preprocessing.LabelEncoder()

LABEL_ENCODED_COLUMNS = ['GENDER', 'Car_Owner', 'Propert_Owner', 'Marital_status']
ONE_HOT_COLUMNS = ['Type_Income', 'EDUCATION', 'Housing_type', 'Type_Occupation']

for column in LABEL_ENCODED_COLUMNS:
    # The single encoder is refitted for every column, so the integer codes of
    # one column are unrelated to those of another.
    features_df[column] = label_encoder.fit_transform(features_df[column])

for column in ONE_HOT_COLUMNS:
    indicator_columns = pd.get_dummies(features_df[[column]])
    # Remove the source column and append its indicator columns at the end of
    # the frame; processing the columns in list order fixes the final layout.
    features_df = pd.concat(
        [features_df.drop(columns=[column]), indicator_columns],
        axis=1,
    )

print(features_df)

       Ind_ID  GENDER  Car_Owner  Propert_Owner  CHILDREN  Annual_income  \
0     5008827       1          1              1         0       180000.0   
1     5009744       0          1              0         0       315000.0   
2     5009746       0          1              0         0       315000.0   
3     5009749       0          1              0         0            NaN   
4     5009752       0          1              0         0       315000.0   
...       ...     ...        ...            ...       ...            ...   
1543  5028645       0          0              1         0            NaN   
1544  5023655       0          0              0         0       225000.0   
1545  5115992       1          1              1         2       180000.0   
1546  5118219       1          1              0         0       270000.0   
1547  5053790       0          1              1         0       225000.0   

      Marital_status  Birthday_count  Employed_days  Mobile_phone  ...  \
0            

In [56]:
# Find missing values and impute them: every NaN is replaced by the mean of its column.
# Despite its name, `string_imputer` is a numeric mean imputer; the name is kept
# so that anything referring to it keeps working.
string_imputer = SimpleImputer(strategy='mean')

# fit_transform() both fits the imputer and returns the filled array, so no
# separate fit() call is needed afterwards.
features_df_imputed = pd.DataFrame(
    string_imputer.fit_transform(features_df),
    columns=features_df.columns,
    index=features_df.index,  # keep the row labels of the source frame
)

# Total number of NaN cells left after imputation (consumed by the sanity check).
missing_values_after_imputation = features_df_imputed.isna().sum().sum()
print(features_df_imputed)

         Ind_ID  GENDER  Car_Owner  Propert_Owner  CHILDREN  Annual_income  \
0     5008827.0     1.0        1.0            1.0       0.0   180000.00000   
1     5009744.0     0.0        1.0            0.0       0.0   315000.00000   
2     5009746.0     0.0        1.0            0.0       0.0   315000.00000   
3     5009749.0     0.0        1.0            0.0       0.0   191399.32623   
4     5009752.0     0.0        1.0            0.0       0.0   315000.00000   
...         ...     ...        ...            ...       ...            ...   
1543  5028645.0     0.0        0.0            1.0       0.0   191399.32623   
1544  5023655.0     0.0        0.0            0.0       0.0   225000.00000   
1545  5115992.0     1.0        1.0            1.0       2.0   180000.00000   
1546  5118219.0     1.0        1.0            0.0       0.0   270000.00000   
1547  5053790.0     0.0        1.0            1.0       0.0   225000.00000   

      Marital_status  Birthday_count  Employed_days  Mobile_pho

In [57]:
# Report whether any NaN cells survived the imputation of features_df.
if missing_values_after_imputation != 0:
    print(f"Sanity Check Failed: There are still {missing_values_after_imputation} missing values in features_df after imputation.")
else:
    print("Sanity Check Passed: No missing values after imputation in features_df.")

Sanity Check Passed: No missing values after imputation in features_df.


In [58]:
# Rescale the selected numeric columns with min-max scaling so they share a common range.
scaled_columns = ['Ind_ID', 'Annual_income', 'CHILDREN',
                  'Birthday_count', 'Employed_days', 'Family_Members']

min_max = MinMaxScaler()
features_df_imputed_min_max = min_max.fit_transform(features_df_imputed[scaled_columns])

# Write the scaled values back over the original columns of the imputed frame.
features_df_imputed[scaled_columns] = features_df_imputed_min_max

print(features_df_imputed_min_max)
print(features_df_imputed)

[[0.         0.09489051 0.         0.35809988 1.         0.07142857]
 [0.00647667 0.18248175 0.         0.66057653 0.03762134 0.07142857]
 [0.0064908  0.18248175 0.         0.51653952 0.03762134 0.07142857]
 ...
 [0.75689515 0.09489051 0.14285714 0.68279102 0.03264673 0.21428571]
 [0.77262422 0.15328467 0.         0.55994432 0.03746613 0.07142857]
 [0.31756895 0.12408759 0.         0.48402065 0.03164181 0.07142857]]
        Ind_ID  GENDER  Car_Owner  Propert_Owner  CHILDREN  Annual_income  \
0     0.000000     1.0        1.0            1.0  0.000000       0.094891   
1     0.006477     0.0        1.0            0.0  0.000000       0.182482   
2     0.006491     0.0        1.0            0.0  0.000000       0.182482   
3     0.006512     0.0        1.0            0.0  0.000000       0.102287   
4     0.006533     0.0        1.0            0.0  0.000000       0.182482   
...        ...     ...        ...            ...       ...            ...   
1543  0.139972     0.0        0.0        

In [59]:
# Load the label table (Credit_card_label.csv) from the project's GitHub repository.
LABELS_CSV_URL = "https://raw.githubusercontent.com/pujan08/ML_Logistic_Regression_For_Credit_Card/main/Credit_card_label.csv"
target_df = pd.read_csv(LABELS_CSV_URL)

In [60]:
# Find missing values in the label table and impute them with the column mean.
# NOTE(review): mean-imputing `label` would yield a value that is not a valid
# class if a label were ever missing -- confirm the label file has no gaps, or
# handle missing labels explicitly.
numeric_imputer = SimpleImputer(strategy='mean')

# fit_transform() both fits the imputer and returns the filled array, so no
# separate fit() call is needed afterwards.
target_df_imputed = pd.DataFrame(
    numeric_imputer.fit_transform(target_df),
    columns=target_df.columns,
    index=target_df.index,  # keep the row labels of the source frame
)
print(target_df_imputed)

         Ind_ID  label
0     5008827.0    1.0
1     5009744.0    1.0
2     5009746.0    1.0
3     5009749.0    1.0
4     5009752.0    1.0
...         ...    ...
1543  5028645.0    0.0
1544  5023655.0    0.0
1545  5115992.0    0.0
1546  5118219.0    0.0
1547  5053790.0    0.0

[1548 rows x 2 columns]


In [61]:
# Count the NaN cells left in target_df_imputed and report the result.
missing_values_after_imputation = target_df_imputed.isna().sum().sum()
if missing_values_after_imputation != 0:
    print(f"Sanity Check Failed: There are still {missing_values_after_imputation} missing values in target_df after imputation.")
else:
    print("Sanity Check Passed: No missing values after imputation in target_df.")


Sanity Check Passed: No missing values after imputation in target_df.


2. Training and Testing Part

In [62]:
# Defining features and labels
# X and y come from two separate tables and are paired purely by row position,
# so verify first that both source tables list the same Ind_ID values in the
# same order. The raw frames are compared because Ind_ID has been rescaled in
# features_df_imputed by this point.
if not np.array_equal(features_df['Ind_ID'].to_numpy(), target_df['Ind_ID'].to_numpy()):
    raise ValueError(
        "features_df and target_df are not aligned on Ind_ID; "
        "rows cannot be paired by position."
    )

X = features_df_imputed
y = target_df_imputed['label']

In [63]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the reports below show a
# strong class imbalance -- consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=69420,
    test_size=0.2,
)

In [64]:
# Build a baseline SVM model with fixed, hand-picked (untuned) hyperparameters.
# gamma is the kernel coefficient for the 'rbf', 'poly' and 'sigmoid' kernels
# and has no effect on a linear kernel, so it is not passed here.
# random_state is only used when probability=True; it is kept for explicitness.
svm_model = SVC(C=1, kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

In [65]:
# Score the baseline model on the held-out test set.
svm_pred = svm_model.predict(X_test)

baseline_confusion = confusion_matrix(y_test, svm_pred)
baseline_accuracy = accuracy_score(y_test, svm_pred)
baseline_report = classification_report(y_test, svm_pred)

print(baseline_confusion)
print('accuracy: ', baseline_accuracy)
print(baseline_report)

[[279   0]
 [ 30   1]]
accuracy:  0.9032258064516129
              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95       279
         1.0       1.00      0.03      0.06        31

    accuracy                           0.90       310
   macro avg       0.95      0.52      0.51       310
weighted avg       0.91      0.90      0.86       310



In [66]:
# Tune the SVC hyperparameters with an exhaustive 5-fold cross-validated grid search.
# NOTE(review): gamma has no effect on the 'linear' kernel, so the linear
# combinations in this grid are evaluated once per gamma value with identical results.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
}

svm_model_gscv = SVC()
grid_search = GridSearchCV(estimator=svm_model_gscv, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [67]:
# Take the best estimator found by the grid search and score it on the test set.
best_grid = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

svm_model_gscv_pred = best_grid.predict(X_test)
tuned_confusion = confusion_matrix(y_test, svm_model_gscv_pred)
tuned_accuracy = accuracy_score(y_test, svm_model_gscv_pred)
tuned_report = classification_report(y_test, svm_model_gscv_pred)

print(tuned_confusion)
print('accuracy: ', tuned_accuracy)
print(tuned_report)

Best Parameters: {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
[[266  13]
 [ 21  10]]
accuracy:  0.8903225806451613
              precision    recall  f1-score   support

         0.0       0.93      0.95      0.94       279
         1.0       0.43      0.32      0.37        31

    accuracy                           0.89       310
   macro avg       0.68      0.64      0.66       310
weighted avg       0.88      0.89      0.88       310



In [68]:
# Drop a subset of feature columns, then redo the train/test split on the reduced frame.
# Note: this overwrites X_train / X_test / y_train / y_test from the earlier split.
dropped_columns = ['Ind_ID', 'GENDER', 'Mobile_phone', 'Work_Phone', 'Phone', 'EMAIL_ID']
features_df_removed = features_df_imputed.drop(columns=dropped_columns)
print(features_df_removed)

X_dropped = features_df_removed
X_train, X_test, y_train, y_test = train_test_split(
    X_dropped,
    y,
    random_state=69420,
    test_size=0.2,
)

      Car_Owner  Propert_Owner  CHILDREN  Annual_income  Marital_status  \
0           1.0            1.0  0.000000       0.094891             1.0   
1           1.0            0.0  0.000000       0.182482             1.0   
2           1.0            0.0  0.000000       0.182482             1.0   
3           1.0            0.0  0.000000       0.102287             1.0   
4           1.0            0.0  0.000000       0.182482             1.0   
...         ...            ...       ...            ...             ...   
1543        0.0            1.0  0.000000       0.102287             1.0   
1544        0.0            0.0  0.000000       0.124088             3.0   
1545        1.0            1.0  0.142857       0.094891             1.0   
1546        1.0            0.0  0.000000       0.153285             0.0   
1547        1.0            1.0  0.000000       0.124088             1.0   

      Birthday_count  Employed_days  Family_Members  \
0           0.358100       1.000000        0

In [69]:
# Refit the same grid search on the reduced feature set, then score its best
# estimator on the matching test set.
grid_search.fit(X_train, y_train)

best_grid = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

svm_model_gscv_pred = best_grid.predict(X_test)
reduced_confusion = confusion_matrix(y_test, svm_model_gscv_pred)
reduced_accuracy = accuracy_score(y_test, svm_model_gscv_pred)
reduced_report = classification_report(y_test, svm_model_gscv_pred)

print(reduced_confusion)
print('accuracy: ', reduced_accuracy)
print(reduced_report)

Best Parameters: {'C': 10, 'gamma': 1, 'kernel': 'linear'}
[[279   0]
 [ 30   1]]
accuracy:  0.9032258064516129
              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95       279
         1.0       1.00      0.03      0.06        31

    accuracy                           0.90       310
   macro avg       0.95      0.52      0.51       310
weighted avg       0.91      0.90      0.86       310

