<a href="https://colab.research.google.com/github/suchi-ta/Jupiter_Internship/blob/main/Model_building%26tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Building

In [None]:
# Loading the preprocessed data
import numpy as np
import pandas as pd

# The input location lives in one named constant so it is easy to repoint.
# NOTE(review): the variable name suggests the CSV is already SMOTE-resampled
# by an earlier notebook — confirm against the preprocessing step.
DATA_PATH = "processed_creditcard.csv"
smote_data = pd.read_csv(DATA_PATH)


In [None]:
# Preview the first rows to confirm the file loaded with the expected columns.
smote_data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class,Hour
0,1.946747,-0.752526,-1.35513,-0.66163,1.502822,4.024933,-1.479661,1.13988,1.406819,-0.157403,...,0.297537,0.307915,0.69098,-0.350316,-0.388907,0.077641,-0.032248,-0.407614,0,1.193277
1,2.035149,-0.04888,-3.058693,0.247945,2.943487,3.298697,-0.002192,0.674782,0.045826,0.284864,...,0.228197,0.035542,0.70709,0.512885,-0.471198,0.00252,-0.069002,-0.426767,0,-0.441027
2,-0.99192,0.603193,0.711976,-0.992425,-0.825838,1.956261,-2.212603,-5.037523,0.000772,-2.009561,...,0.109526,-0.43653,-0.932803,0.826684,0.913773,0.038049,0.18534,0.334559,0,-0.305105
3,2.285718,-1.500239,-0.747565,-1.668119,-1.394143,-0.350339,-1.427984,0.01001,-1.118447,1.756121,...,0.077013,0.20831,-0.538236,-0.278032,-0.162068,0.018045,-0.063005,-0.41301,0,-0.085419
4,-0.448747,-1.01144,0.115903,-3.454854,0.715771,-0.14749,0.504347,-0.113817,-0.044782,-0.558955,...,-0.173298,-0.006692,-1.362383,-0.292234,-0.144622,-0.03258,-0.064194,-0.059131,0,0.48791


In [None]:
# Splitting the data

# "Class" is the label column; everything else is used as a model input.
TARGET_COLUMN = "Class"

y = smote_data.loc[:, TARGET_COLUMN]
X = smote_data.drop(columns=TARGET_COLUMN)

In [None]:
# Sanity check: the feature frame should no longer contain the "Class" column.
X.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Hour
0,1.946747,-0.752526,-1.35513,-0.66163,1.502822,4.024933,-1.479661,1.13988,1.406819,-0.157403,...,0.076197,0.297537,0.307915,0.69098,-0.350316,-0.388907,0.077641,-0.032248,-0.407614,1.193277
1,2.035149,-0.04888,-3.058693,0.247945,2.943487,3.298697,-0.002192,0.674782,0.045826,0.284864,...,0.038628,0.228197,0.035542,0.70709,0.512885,-0.471198,0.00252,-0.069002,-0.426767,-0.441027
2,-0.99192,0.603193,0.711976,-0.992425,-0.825838,1.956261,-2.212603,-5.037523,0.000772,-2.009561,...,-2.798352,0.109526,-0.43653,-0.932803,0.826684,0.913773,0.038049,0.18534,0.334559,-0.305105
3,2.285718,-1.500239,-0.747565,-1.668119,-1.394143,-0.350339,-1.427984,0.01001,-1.118447,1.756121,...,-0.13967,0.077013,0.20831,-0.538236,-0.278032,-0.162068,0.018045,-0.063005,-0.41301,-0.085419
4,-0.448747,-1.01144,0.115903,-3.454854,0.715771,-0.14749,0.504347,-0.113817,-0.044782,-0.558955,...,-0.243245,-0.173298,-0.006692,-1.362383,-0.292234,-0.144622,-0.03258,-0.064194,-0.059131,0.48791


In [None]:
# Sanity check: the target should hold only the "Class" labels.
y.head()

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0


In [None]:
# Model selection
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression fitted with the liblinear solver,
# capped at 1000 solver iterations.
# random_state is pinned (same seed as the train/test split) because liblinear
# draws on a random number generator while fitting; without a seed, repeated
# runs of the notebook can produce slightly different coefficients and scores.
log_reg = LogisticRegression(max_iter=1000, solver="liblinear", random_state=42)

In [None]:
# Train-test split
from sklearn.model_selection import train_test_split

# 20% of the rows are held out for evaluation. Stratifying on y keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# partitioning repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Save and download the test data

# Write the held-out features and labels to CSV, omitting the pandas index so
# each file contains data columns only.
for held_out, csv_name in ((X_test, "X_test.csv"), (y_test, "y_test.csv")):
    held_out.to_csv(csv_name, index=False)


In [None]:
from google.colab import files

# Trigger a browser download for each exported test file, features first.
# This cell relies on the Colab runtime (google.colab).
for exported_name in ("X_test.csv", "y_test.csv"):
    files.download(exported_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Model training
# Fit the baseline logistic regression on the training partition only; the
# test partition is kept aside for the evaluation cell below.
log_reg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Hard class predictions feed the per-class report and the confusion matrix.
y_pred = log_reg.predict(X_test)
report_text = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", matrix)

# AUC score is very important in fraud detection
# ROC-AUC is computed from scores rather than hard labels, so take column 1 of
# predict_proba (the second class; the labels in this data are 0 and 1).
positive_column = 1
y_pred_prob = log_reg.predict_proba(X_test)[:, positive_column]
auc_value = roc_auc_score(y_test, y_pred_prob)
print("ROC-AUC Score:", auc_value)


Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     45349
           1       0.99      0.97      0.98     44789

    accuracy                           0.98     90138
   macro avg       0.98      0.98      0.98     90138
weighted avg       0.98      0.98      0.98     90138

Confusion Matrix:
 [[44878   471]
 [ 1374 43415]]
ROC-AUC Score: 0.9979521872190047


# Model Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Defining hyperparameter grid
# 5 values of C x 2 penalties x 1 solver = 10 candidate configurations.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],   # regularisation setting, searched over five orders of magnitude
    'penalty': ['l1', 'l2'],        # l1 = Lasso, l2 = Ridge
    'solver': ['liblinear']         # single solver used for both penalties
}

# Exhaustive search over the grid, scoring each candidate by mean ROC-AUC
# across 5 cross-validation folds of the training data.
# random_state is pinned on the estimator (same seed as the train/test split)
# because liblinear uses a random number generator while fitting; without it
# the CV scores, and potentially the selected parameters, can vary between runs.
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    verbose=1,       # print a one-line summary of the number of fits
    n_jobs=-1        # run the fits in parallel on all available cores
)

# Only the training partition is used, so the held-out test set stays unseen.
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC Score:", grid_search.best_score_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best ROC-AUC Score: 0.9976745545011696


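Hyperparameter tuning was applied, but the baseline model already performed strongly (test ROC-AUC ≈ 0.998), so tuning brought little improvement: the best cross-validated ROC-AUC was also ≈ 0.998. Note that the two scores are not strictly comparable — the first is measured on the held-out test set, the second is a 5-fold cross-validation mean on the training set.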

In [None]:
# Keep a handle on the estimator the grid search selected as best, so it can
# be saved alongside the baseline model.
best_log_reg = grid_search.best_estimator_

In [None]:
import joblib
from google.colab import files

In [None]:
# Persist the baseline (untuned) logistic regression to disk.
joblib.dump(log_reg, "logistic_fraud_model.pkl")


['logistic_fraud_model.pkl']

In [None]:
# Persist the grid-search-selected logistic regression to a separate file.
joblib.dump(best_log_reg, "tuned_logistic_fraud_model.pkl")

['tuned_logistic_fraud_model.pkl']

In [None]:
# Download the saved baseline model via the Colab files helper.
files.download("logistic_fraud_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download the saved tuned model via the Colab files helper.
files.download("tuned_logistic_fraud_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Component 3 Completed