In [20]:
!pip install xgboost imbalanced-learn


Defaulting to user installation because normal site-packages is not writeable


In [1]:
# Install necessary libraries (run once)
!pip install xgboost imbalanced-learn

# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')


Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Load the dataset
df = pd.read_csv('C:/Users/Pranjal/Downloads/Fraud.csv')
print("Initial Shape of the Data:", df.shape)

# Quick check
df.head()


Initial Shape of the Data: (1048575, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [23]:
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])


In [24]:
df = df.drop_duplicates()
print("Shape after removing duplicates:", df.shape)


Shape after removing duplicates: (1048575, 11)


In [31]:
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower, lower, df[col])
    df[col] = np.where(df[col] > upper, upper, df[col])


In [33]:
# Select only numeric columns for correlation
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Calculate correlation matrix only on numeric columns
corr_matrix = numeric_df.corr()

# Get upper triangle of the correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation > 0.9
to_drop = [column for column in upper.columns if any(upper[column].abs() > 0.9)]

print("Columns dropped due to multicollinearity:", to_drop)

# Drop the highly correlated features from df
df = df.drop(columns=to_drop)


Columns dropped due to multicollinearity: ['newbalanceDest']


In [36]:
df = df.drop(['nameOrig', 'nameDest'], axis=1)


In [37]:
df = pd.get_dummies(df, columns=['type'], drop_first=True)

print("Shape after encoding:", df.shape)

Shape after encoding: (1048575, 11)


In [41]:
scaler = StandardScaler()
scaled_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])


In [42]:
# Target variable (change if needed)
target = 'isFraud'

X = df.drop(target, axis=1)
y = df[target]


In [45]:
print("Target value counts before SMOTE:\n", y.value_counts())


Target value counts before SMOTE:
 isFraud
0.0    1048575
Name: count, dtype: int64


In [47]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Define Features & Target again (in case you filtered data)
X = df.drop('isFraud', axis=1)
y = df['isFraud']

# Check class distribution
print("Class distribution before SMOTE:\n", Counter(y))

# Only apply SMOTE if both classes exist
if len(np.unique(y)) > 1:
    sm = SMOTE(random_state=42)
    X_resampled, y_resampled = sm.fit_resample(X, y)
    print("Class distribution after SMOTE:\n", Counter(y_resampled))
else:
    print("SMOTE skipped: Only one class present in target.")
    X_resampled, y_resampled = X, y  # Use original data without SMOTE



Class distribution before SMOTE:
 Counter({0.0: 1048575})
SMOTE skipped: Only one class present in target.


In [55]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, 
    test_size=0.3, 
    random_state=42, 
    stratify=y_resampled
)


In [56]:
import numpy as np
print("Train set labels:", np.unique(y_train, return_counts=True))



Train set labels: (array([0.]), array([734002], dtype=int64))


In [57]:
if len(np.unique(y_train)) > 1:
    model = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )
    model.fit(X_train, y_train)
    print("Model trained successfully.")
else:
    print("Model training skipped: y_train has only one class. Cannot train classifier.")


Model training skipped: y_train has only one class. Cannot train classifier.


In [59]:
import numpy as np

# Check if model should be trained
if len(np.unique(y_train)) > 1:

    # Train model
    model = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )
    
    model.fit(X_train, y_train)
    print("Model trained successfully.")

    # Evaluation
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_prob))
    print("F1 Score:", f1_score(y_test, y_pred))

else:
    print("Model training & evaluation skipped because y_train has only one class.")



Model training & evaluation skipped because y_train has only one class.


In [61]:
import numpy as np

# Only plot ROC if model was trained
if len(np.unique(y_train)) > 1:
    
    # ROC Curve
    from sklearn.metrics import roc_curve, roc_auc_score

    fpr, tpr, _ = roc_curve(y_test, y_prob)

    plt.figure(figsize=(8,6))
    plt.plot(fpr, tpr, label='XGBoost AUC = %.3f' % roc_auc_score(y_test, y_prob))
    plt.plot([0,1], [0,1], 'k--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.show()

else:
    print("ROC Curve skipped: Model was not trained (only one class in y_train).")



ROC Curve skipped: Model was not trained (only one class in y_train).


In [62]:
from sklearn.metrics import precision_recall_curve

if len(np.unique(y_train)) > 1:
    precision, recall, _ = precision_recall_curve(y_test, y_prob)

    plt.figure(figsize=(8,6))
    plt.plot(recall, precision, label='XGBoost PR Curve')
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision-Recall Curve")
    plt.legend()
    plt.show()
else:
    print("Precision-Recall Curve skipped: Model was not trained.")


Precision-Recall Curve skipped: Model was not trained.


In [64]:
from sklearn.metrics import precision_recall_curve

# Only plot if model was trained successfully
if len(np.unique(y_train)) > 1:
    precision, recall, _ = precision_recall_curve(y_test, y_prob)

    plt.figure(figsize=(8,6))
    plt.plot(recall, precision, label='XGBoost PR Curve')
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision-Recall Curve")
    plt.legend()
    plt.show()
else:
    print("Precision-Recall Curve skipped: Model was not trained because y_train has only one class.")


Precision-Recall Curve skipped: Model was not trained because y_train has only one class.


In [66]:
# Only show feature importance if model is trained
if len(np.unique(y_train)) > 1:
    
    importances = model.feature_importances_
    features = X.columns

    importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10,6))
    sns.barplot(data=importance_df.head(10), x='Importance', y='Feature', palette='viridis')
    plt.title("Top 10 Important Features")
    plt.show()

    print("Top Features:\n", importance_df.head(10))

else:
    print("Feature importance skipped: Model was not trained because y_train has only one class.")


Feature importance skipped: Model was not trained because y_train has only one class.


In [72]:
import numpy as np

if len(np.unique(y_train)) > 1:
    # Model was trained, so it's safe to predict
    y_pred = model.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
else:
    # No fraud cases in training data
    print("Skipping prediction: Model was not trained because y_train has only one class.")



Skipping prediction: Model was not trained because y_train has only one class.


In [74]:
import numpy as np

if len(np.unique(y_train)) > 1:
    # Model is trained, so we can predict
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    # Evaluate
    from sklearn.metrics import confusion_matrix, classification_report
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

else:
    # Model was not trained because there's no fraud in the training data
    print("⚠️ Skipping prediction: Model was not trained because y_train has only one class.")



⚠️ Skipping prediction: Model was not trained because y_train has only one class.


In [77]:
df = pd.read_csv('C:/Users/Pranjal/Downloads/Fraud.csv')
print(df['isFraud'].value_counts())

isFraud
0    1047433
1       1142
Name: count, dtype: int64
