In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import pickle



In [None]:
# Read the transactions CSV and discard the nameOrig / nameDest columns,
# which are not used as features anywhere below.
data = pd.read_csv("sample_data/Fraud.csv").drop(  # Replace with your dataset path
    columns=["nameOrig", "nameDest"]
)

# Map the 'type' column to integer codes. The fitted encoder is kept in
# `encoder` because it is pickled later alongside the model.
encoder = LabelEncoder()
data["type"] = encoder.fit_transform(data["type"])

# Report the per-column count of missing values (after encoding, before imputation).
print("Missing values:\n", data.isnull().sum())

# Replace any missing values with 0, then keep a reproducible 50% random
# sample of the rows.
data = data.fillna(0).sample(frac=0.5, random_state=42)

# Target is 'isFraud'; features are every remaining column except the
# target itself and 'isFlaggedFraud'.
y = data["isFraud"]
X = data.drop(columns=["isFraud", "isFlaggedFraud"])


Missing values:
 step              0
type              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [None]:
# Split FIRST, so that the held-out test set contains only original rows.
# Running SMOTE before the split puts synthetic samples (interpolated from
# neighbours that may end up in the training set) into the test set, which
# leaks information and makes the evaluation artificially balanced.
# `stratify=y` keeps the class ratio of `y` the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle class imbalance using SMOTE -- on the training split only.
try:
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
except ValueError as e:
    # Report the problem, then stop: continuing would either hit a NameError
    # on X_resampled (fresh kernel) or silently train on stale variables left
    # over from a previous run of this cell.
    print(f"SMOTE error: {e}")
    raise

# Train a Random Forest model on the balanced (resampled) training data.
model = RandomForestClassifier(random_state=42)
model.fit(X_resampled, y_resampled)

# Evaluate the model on the untouched test split, which keeps the original
# class distribution of the data.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    635348
           1       1.00      1.00      1.00    635530

    accuracy                           1.00   1270878
   macro avg       1.00      1.00      1.00   1270878
weighted avg       1.00      1.00      1.00   1270878



In [None]:
# Sanity check: list the column labels currently present in `data`
# (the frame the features X and target y were taken from).
print(data.columns)

Index(['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
       'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud'],
      dtype='object')


In [None]:

# Persist the trained classifier and the fitted 'type' encoder as pickle
# files in the current working directory, one file per object.
for artifact_path, artifact in (
    ("fraud_model.pkl", model),
    ("type_encoder.pkl", encoder),
):
    with open(artifact_path, "wb") as out_file:
        pickle.dump(artifact, out_file)
