# Model training

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data/CreditCardFaultPredictionDataset.csv")
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
1,50000,1,1,2,37,0,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
2,500000,1,1,2,29,0,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
3,100000,2,2,2,23,0,-1,-1,0,0,...,221,-159,567,380,601,0,581,1687,1542,0
4,140000,2,3,1,28,0,0,2,0,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0


In [3]:
# Dependent and independent variables

X =df.drop(labels=["default payment next month"], axis=1)
y = df["default payment next month"]

In [4]:
X.shape, y.shape

((1001, 23), (1001,))

In [5]:
numerical_cols = X.select_dtypes(exclude='object').columns

In [6]:
list(numerical_cols)

['LIMIT_BAL',
 'SEX',
 'EDUCATION',
 'MARRIAGE',
 'AGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6']

In [7]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

In [9]:
# Column transformation
preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols)
])

In [10]:
# Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.28,random_state=40)

In [11]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [12]:
X_train.head()

Unnamed: 0,num_pipeline__LIMIT_BAL,num_pipeline__SEX,num_pipeline__EDUCATION,num_pipeline__MARRIAGE,num_pipeline__AGE,num_pipeline__PAY_0,num_pipeline__PAY_2,num_pipeline__PAY_3,num_pipeline__PAY_4,num_pipeline__PAY_5,...,num_pipeline__BILL_AMT3,num_pipeline__BILL_AMT4,num_pipeline__BILL_AMT5,num_pipeline__BILL_AMT6,num_pipeline__PAY_AMT1,num_pipeline__PAY_AMT2,num_pipeline__PAY_AMT3,num_pipeline__PAY_AMT4,num_pipeline__PAY_AMT5,num_pipeline__PAY_AMT6
0,-0.148407,0.837931,-1.040392,0.762055,-0.438334,1.789609,-0.695827,-0.670331,-0.609972,-0.62217,...,-0.65792,-0.529934,-0.548763,-0.595037,-0.458626,-0.291815,0.076783,0.012158,-0.270694,-0.197288
1,1.446418,0.837931,-1.040392,-1.097876,0.538421,1.789609,1.855933,1.750932,0.239506,0.223042,...,0.951612,1.044354,1.21898,1.273145,0.617866,-0.314548,0.183748,-0.001409,-0.041894,-0.015649
2,-0.528128,0.837931,0.283075,0.762055,-1.089503,1.789609,0.15476,0.136757,0.239506,0.223042,...,0.346318,0.401914,0.463708,0.196053,-0.17286,-0.192604,-0.204312,-0.194362,-0.198117,-0.137038
3,-1.211624,-1.193416,0.283075,0.762055,-1.198032,-0.863483,1.855933,1.750932,1.938461,0.223042,...,-0.620492,-0.558764,-0.598888,-0.573894,-0.494787,-0.227112,-0.403317,-0.337214,-0.309442,-0.197288
4,0.535089,0.837931,0.283075,-1.097876,0.104308,0.905245,-1.546414,-1.477419,-1.45945,-1.467383,...,-0.677159,-0.612628,-0.645246,-0.62203,-0.494787,-0.314548,-0.403317,-0.337214,-0.318668,-0.197288


In [13]:
## Model Training

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [14]:
regression = LogisticRegression()
regression.fit(X_train,y_train)

In [15]:
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    confusionmat = confusion_matrix(true, predicted)
    report = classification_report(true, predicted)
    return accuracy,confusionmat,report

In [16]:
y_pred = regression.predict(X_test)
accuracyy,confusionmatt,reportt = evaluate_model(y_test,y_pred)

In [17]:
print('Model Training Performance')
print("\nConfusion Matrix:\n",confusionmatt)
print("\nClassification Report: \n\n",reportt)
print("\nAccuracy:",accuracyy)

Model Training Performance

Confusion Matrix:
 [[215   9]
 [ 49   8]]

Classification Report: 

               precision    recall  f1-score   support

           0       0.81      0.96      0.88       224
           1       0.47      0.14      0.22        57

    accuracy                           0.79       281
   macro avg       0.64      0.55      0.55       281
weighted avg       0.74      0.79      0.75       281


Accuracy: 0.7935943060498221


In [18]:
models={
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(kernel='linear'),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(max_depth=10),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Decision Tree": DecisionTreeClassifier(max_depth=10),
    "Neural Network (MLP)": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
}
trained_model_list=[]
model_list=[]
accuracy_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    accuracy,confusionmat,report =evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    #print('Model Training Performance')
    print("\nConfusion Matrix:\n",confusionmat)
    print("\nClassification Report: \n\n",report)
    print("\nAccuracy:",accuracy*100)

    accuracy_list.append(accuracy)
    
    print('='*35)

Logistic Regression

Confusion Matrix:
 [[215   9]
 [ 49   8]]

Classification Report: 

               precision    recall  f1-score   support

           0       0.81      0.96      0.88       224
           1       0.47      0.14      0.22        57

    accuracy                           0.79       281
   macro avg       0.64      0.55      0.55       281
weighted avg       0.74      0.79      0.75       281


Accuracy: 79.35943060498221


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Support Vector Machine

Confusion Matrix:
 [[224   0]
 [ 57   0]]

Classification Report: 

               precision    recall  f1-score   support

           0       0.80      1.00      0.89       224
           1       0.00      0.00      0.00        57

    accuracy                           0.80       281
   macro avg       0.40      0.50      0.44       281
weighted avg       0.64      0.80      0.71       281


Accuracy: 79.7153024911032
Naive Bayes

Confusion Matrix:
 [[154  70]
 [ 23  34]]

Classification Report: 

               precision    recall  f1-score   support

           0       0.87      0.69      0.77       224
           1       0.33      0.60      0.42        57

    accuracy                           0.67       281
   macro avg       0.60      0.64      0.60       281
weighted avg       0.76      0.67      0.70       281


Accuracy: 66.90391459074732
Random Forest

Confusion Matrix:
 [[216   8]
 [ 46  11]]

Classification Report: 

               precision    rec

In [19]:
my_dict = dict(zip(model_list, accuracy_list))
my_dict

{'Logistic Regression': 0.7935943060498221,
 'Support Vector Machine': 0.797153024911032,
 'Naive Bayes': 0.6690391459074733,
 'Random Forest': 0.8078291814946619,
 'Gradient Boosting': 0.8042704626334519,
 'Decision Tree': 0.7544483985765125,
 'Neural Network (MLP)': 0.7473309608540926}