In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score 
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    precision_recall_fscore_support,
    precision_recall_curve,
    auc,
    accuracy_score  
) 
from sklearn.metrics import precision_recall_curve, auc 
from imblearn.over_sampling import SMOTE 
from imblearn.pipeline import Pipeline as ImbPipeline 
import xgboost as xgb

In [None]:
# Load the dataset from a CSV file located relative to the working directory.
DATA_PATH = "fraudcreditcard.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Quick structural overview of the loaded frame; uncomment the other calls as needed.
# df.head()      # first 5 rows
df.shape  # tuple of (number of rows, number of columns)
# df.info()      # column dtypes and memory usage
# df.describe()  # summary statistics

In [None]:
# Relative frequency of each class, to check whether the data is imbalanced.
# `normalize` expects a boolean; the string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have normalized as well).
df['Class'].value_counts(normalize=True)  # this dataset is highly imbalanced

In [None]:
# Bar chart of the per-class row counts --> dataset is highly imbalanced
fig, ax = plt.subplots(figsize=(3, 3))
class_counts = df['Class'].value_counts()
class_counts.plot(kind='bar', color='black', ax=ax)
ax.set_title("Class Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")

In [None]:
# Data cleaning: count the missing values in every column
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Feature scaling (exploration only)
# The scaler below is fitted on the FULL dataset, so its output must not be fed
# to a model: doing so would leak test-set statistics into the features.
# The scaled values therefore go into a separate copy and `df` keeps the raw
# 'Amount' / 'Time' columns, so the modelling step can fit its own scaler on the
# training split only. This also makes the cell safe to re-run.
eda_scaler = StandardScaler()
df_scaled_eda = df.copy()
df_scaled_eda[['Amount', 'Time']] = eda_scaler.fit_transform(df[['Amount', 'Time']])

In [None]:
# Preview the first rows of df (head() defaults to 5 rows).
df.head()

In [None]:
# Correlation Analysis
plt.figure(figsize=(3,3))
numeric_df = df.select_dtypes(include=['number'])
correlation = numeric_df.corr()  # corr() needs numeric columns, hence numeric_df
sns.heatmap(correlation, annot=False, cmap='coolwarm', center=0)
plt.title('HeatMap')
plt.show()

# Features most correlated with the target column 'Class'.
# Drop the target's correlation with itself (always 1.0) and rank by absolute
# value so strong negative and strong positive correlations both count as "top";
# the printed values keep their sign.
class_corr = correlation['Class'].drop('Class')
strongest_first = class_corr.abs().sort_values(ascending=False).index
class_corr = class_corr.reindex(strongest_first)
print("Top features correlated with class")
print(class_corr.head(10))

In [None]:
# Separate features and target
X = df.drop('Class', axis=1)
y = df['Class']

# Stratified split: keeps the class ratio the same in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Take explicit copies before assigning columns: writing into the frames
# returned by train_test_split can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training split only, then reuse the fitted parameters
# for the test split so no test-set statistics reach the model.
scale_cols = ['Amount', 'Time']
scaler = StandardScaler()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

# Apply SMOTE on the training set only; the test set keeps its original class ratio
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [None]:
# Build the three classifiers first (same hyperparameters and seed for each run) ...
log_reg = LogisticRegression(random_state=42, max_iter=1000)  # Logistic Regression
rf = RandomForestClassifier(random_state=42, n_estimators=100)  # Random Forest
xgb_clf = xgb.XGBClassifier(  # XGBoost
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)

# ... then train each one on the SMOTE-resampled training data.
log_reg.fit(X_train_resampled, y_train_resampled)
rf.fit(X_train_resampled, y_train_resampled)
xgb_clf.fit(X_train_resampled, y_train_resampled)

In [None]:
# For every fitted model keep both the hard class predictions and the score in
# column 1 of predict_proba (presumably the fraud class, Class == 1 - verify
# against the estimator's classes_).

# Logistic Regression
y_pred_lr = log_reg.predict(X_test)
y_pred_pro_lr = log_reg.predict_proba(X_test)[:, 1]

# Random Forest
y_pred_rf = rf.predict(X_test)
y_pred_pro_rf = rf.predict_proba(X_test)[:, 1]

# XGBoost
y_pred_xgb = xgb_clf.predict(X_test)
y_pred_pro_xgb = xgb_clf.predict_proba(X_test)[:, 1]

In [None]:
# Evaluation Metrics
def eval_models(y_true , y_pred , y_pred_proba , model_name):
    """Print and plot evaluation metrics for one classifier's predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Hard class predictions; used for accuracy, the confusion matrix and
        the classification report.
    y_pred_proba : array-like
        Predicted score for the positive class; used for ROC-AUC.
    model_name : str
        Label shown in the printed header and in the plot title.

    Returns
    -------
    None
        Results are printed and the confusion matrix is shown as a heatmap.
    """
    print(f"\n{ '=' *50}")
    print(f"{model_name} Results")
    print(f"{ '=' *50}")

    # Accuracy
    accuracy = accuracy_score(y_true , y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    # ROC-AUC: must be computed from the continuous scores. Passing the hard
    # 0/1 predictions collapses the ROC curve to a single operating point and
    # misreports how well the model ranks the positive class.
    roc_auc = roc_auc_score(y_true , y_pred_proba)
    print(f"ROC-AUC: {roc_auc:.4f}")

    # Confusion Matrix
    cm = confusion_matrix(y_true , y_pred)
    print(f"Confusion-Matrix:\n {cm}")

    print(f"Classification Report:\n {classification_report(y_true , y_pred)}")

    # Same confusion matrix as an annotated heatmap (rows = actual, columns = predicted)
    plt.figure(figsize=(2, 2))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
    
# Evaluate every model on the held-out test set, in the same order as they were trained.
model_outputs = [
    ("Logistic Regression", y_pred_lr, y_pred_pro_lr),
    ("Random Forest", y_pred_rf, y_pred_pro_rf),
    ("XGB Model", y_pred_xgb, y_pred_pro_xgb),
]
for model_label, hard_preds, positive_scores in model_outputs:
    eval_models(y_test, hard_preds, positive_scores, model_label)

### Takeaways
1) Logistic Regression shows very low precision.<br>
 Because it is a linear model, it struggles to separate the classes even after SMOTE.<br>
 SMOTE balances the training data, but LR still cannot capture the complex patterns needed to correctly identify minority (fraud) cases.<br>
<br>
2) Random Forest performs well with SMOTE.<br>
It achieves a strong balance of precision and recall, meaning it catches many frauds while keeping false alarms low.<br>
<br>
3) XGBoost performs the best.<br>
It learns from the synthetic samples created by SMOTE even more effectively, giving the highest recall and the best overall performance (including ROC-AUC).<br>