In [1]:
!pip install xgboost scikit-learn



In [2]:
import pandas as pd

# Load the generated dataset
df = pd.read_csv('data/transactions.csv')
print("Data shape:", df.shape)
df['is_fraud'].value_counts(normalize=True) * 100  # Check imbalance
df.head()

Data shape: (100000, 9)


Unnamed: 0,transaction_id,seller_id,buyer_id,amount,payment_method,device_type,location,timestamp,is_fraud
0,T075721,S101,B4020,2568.07,Net Banking,Android,Others,2025-05-04 17:37:43.843410,0
1,T080184,S171,B5128,1772.28,Net Banking,iOS,Kolkata,2025-04-01 20:03:58.324349,0
2,T019864,S168,B8791,1011.18,Credit Card,Tablet,Chennai,2025-04-06 19:09:20.679503,0
3,T076699,S198,B2414,923.68,UPI,Desktop,Mumbai,2025-04-29 12:14:29.747324,0
4,T092991,S187,B9719,3866.55,COD,Tablet,Others,2025-04-25 20:58:47.209029,0


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode all categorical columns including seller_id
cat_cols = ['payment_method', 'device_type', 'location', 'seller_id']
le_dict = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le

# Drop irrelevant columns
X = df.drop(['transaction_id', 'timestamp', 'is_fraud', 'buyer_id'], axis=1)
y = df['is_fraud']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
import xgboost as xgb

# Set up the classifier with class imbalance handling
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    scale_pos_weight=(len(y_train) - sum(y_train)) / sum(y_train),  # imbalance
    use_label_encoder=False,
    random_state=42
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [7]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

[[17541  1859]
 [  333   267]]
              precision    recall  f1-score   support

           0       0.98      0.90      0.94     19400
           1       0.13      0.45      0.20       600

    accuracy                           0.89     20000
   macro avg       0.55      0.67      0.57     20000
weighted avg       0.96      0.89      0.92     20000



In [9]:
import os
os.makedirs('model', exist_ok=True)
model.save_model('model/xgb_fraud_model.json')
print("Model saved.")

Model saved.


In [10]:
import pickle

with open('model/label_encoders.pkl', 'wb') as f:
    pickle.dump(le_dict, f)