In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import pickle

# Read the raw transaction table and immediately discard the columns that are
# pure identifiers / contact details / free text; none of them is used as a
# feature anywhere below.
# NOTE(review): the path is an absolute Google Drive mount location, so this
# only runs where that mount exists — consider a configurable data directory.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Bank_Transaction_Fraud_Detection.csv"
).drop(
    columns=[
        'Customer_ID',
        'Customer_Name',
        'Transaction_ID',
        'Merchant_ID',
        'Customer_Contact',
        'Customer_Email',
        'Transaction_Description',
    ]
)

# Parse the calendar date (day-month-year) and the clock time (H:M:S) once;
# only derived values (hour of day, day of week) are kept as features.
txn_date = pd.to_datetime(df['Transaction_Date'], format='%d-%m-%Y')
txn_time = pd.to_datetime(df['Transaction_Time'], format='%H:%M:%S')

# log1p compresses the long right tail of the monetary columns.
log_amount = np.log1p(df['Transaction_Amount'])
log_balance = np.log1p(df['Account_Balance'])

# Replace the two monetary columns with their log versions (same position),
# then append the engineered columns in this exact order:
#   Transaction_Hour, Amt_to_Balance_Ratio, Day_of_Week
# The raw date/time columns are removed because they were only needed to
# derive the hour and the weekday.
# NOTE(review): the ratio is built from the *log-transformed* amount and
# balance, not the raw values — confirm this is intended.
df = (
    df.drop(columns=['Transaction_Date', 'Transaction_Time'])
      .assign(
          Transaction_Amount=log_amount,
          Account_Balance=log_balance,
          Transaction_Hour=txn_time.dt.hour,
          Amt_to_Balance_Ratio=log_amount / (log_balance + 1),
          Day_of_Week=txn_date.dt.dayofweek,  # Monday=0 ... Sunday=6
      )
)

# Encode categorical features using OneHotEncoder
categorical_cols = ['Gender', 'Account_Type', 'Transaction_Type', 'Merchant_Category', 'State',
                    'City', 'Bank_Branch', 'Transaction_Device', 'Transaction_Location',
                    'Device_Type', 'Transaction_Currency']

# Explicitly cast categorical columns to string to avoid issues with OneHotEncoding
# (mixed types in one column would otherwise make the encoder fail; missing
# values become the literal string 'nan' and are encoded as their own category).
df[categorical_cols] = df[categorical_cols].astype(str)

# handle_unknown='ignore' makes categories unseen at fit time encode as an
# all-zero row at prediction time instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(df[categorical_cols])

# Build the encoded frame on df's own index. pd.concat(axis=1) aligns rows by
# index label, so a frame with a fresh 0..n-1 index would be silently
# misaligned (and NaN-padded) whenever df does not carry the default index,
# e.g. after rows have been filtered out upstream.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Drop original categorical columns and add encoded ones
df = df.drop(columns=categorical_cols)
df = pd.concat([df, encoded_df], axis=1)

# Separate features (X) and target variable (y)
y = df['Is_Fraud']
X = df.drop(columns=['Is_Fraud'])

# Split FIRST, on the original rows only.
# Every fitted step below (scaler, imputer, SMOTE) must learn from the training
# split alone. Fitting them on the full data, or oversampling before the split,
# leaks test-set information into training and fills the test set with
# synthetic, artificially balanced samples, which inflates every metric.
# stratify=y keeps the original fraud rate in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Explicit copies so the column assignments below never write back into X.
# X itself is left as the unscaled feature frame.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numerical features: statistics come from the training split only and
# are then applied unchanged to the test split.
numerical_cols = X.select_dtypes(exclude=['object']).columns  # Exclude object type columns
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Handle missing values (NaN) using imputation; the fill values are learned
# from the training split only. Both outputs are NumPy arrays from here on.
imputer = SimpleImputer(strategy='most_frequent')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Apply SMOTE for oversampling to the TRAINING split only, so the test split
# stays real data with the real class imbalance.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# The model is trained on the balanced training data; X_test / y_test are
# deliberately left un-resampled.
X_train, y_train = X_res, y_res

# Gradient-boosted tree classifier. Hyper-parameters are collected in one
# mapping so they can be inspected or logged alongside the fitted model.
xgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8,          # row subsampling per tree
    'colsample_bytree': 0.8,   # column subsampling per tree
    'eval_metric': 'logloss',
}
model = XGBClassifier(**xgb_params)

# The held-out split is passed as eval_set so its logloss is tracked per
# boosting round; no early stopping is configured here, and verbose=False
# suppresses the per-round log lines.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=False,
)

# Score the held-out split. Column 1 of predict_proba is the probability of
# the positive class (label 1).
threshold = 0.4  # decision cut-off applied to the positive-class probability
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = np.where(y_proba >= threshold, 1, 0)

# Multi-line reports first, then the scalar metrics in a fixed order.
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Each metric is computed lazily, right before it is printed.
# AUC-ROC is threshold-free, so it uses the probabilities rather than y_pred.
scalar_metrics = {
    "Accuracy:": lambda: (y_pred == y_test).mean(),
    "Precision:": lambda: precision_score(y_test, y_pred),
    "Recall:": lambda: recall_score(y_test, y_pred),
    "F1-score:": lambda: f1_score(y_test, y_pred),
    "AUC-ROC:": lambda: roc_auc_score(y_test, y_proba),
}
for label, compute in scalar_metrics.items():
    print(label, compute())

# Persist the fitted model together with every fitted preprocessing object, so
# the same encode -> scale -> impute steps can be replayed at prediction time.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file from a trusted location.
artifacts = {
    'model': model,
    'scaler': scaler,
    'encoder': encoder,  # fitted OneHotEncoder
    'imputer': imputer,  # fitted SimpleImputer
}
with open("xgboost_fraud_model.pkl", "wb") as fh:
    pickle.dump(artifacts, fh)


Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     37983
           1       1.00      0.94      0.97     37982

    accuracy                           0.97     75965
   macro avg       0.97      0.97      0.97     75965
weighted avg       0.97      0.97      0.97     75965

Confusion Matrix:
 [[37982     1]
 [ 2206 35776]]
Accuracy: 0.9709471467123018
Precision: 0.9999720490818124
Recall: 0.9419198567742615
F1-score: 0.9700782277416993
AUC-ROC: 0.9743205531118764
