In [None]:
import pandas as pd

# Load the JSON file into a DataFrame
input_data = pd.read_json('input.json')

# Optionally, save as CSV if you need to reuse your CSV-based pipeline.
# The destination is held in a single variable so the confirmation message
# always names the file that was actually written.
# NOTE(review): the inference step later in this file reads 'input.csv',
# not 'input_2.csv' -- confirm which file name is intended.
converted_csv_path = 'input_2.csv'
input_data.to_csv(converted_csv_path, index=False)
print(f"JSON converted to CSV: '{converted_csv_path}'")


In [None]:
import pandas as pd
import numpy as np
import joblib
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

# -------------------------------
# Step 1: Load the Input CSV
# -------------------------------
# Point this path at the CSV file that holds the rows to score.
input_csv_path = 'input.csv'
input_data = pd.read_csv(input_csv_path)

# -------------------------------
# Step 2: Preprocess the Data
# -------------------------------
# When a 'transaction_date' column exists, split it into year / month / day /
# hour columns and then remove the raw date column.
if 'transaction_date' in input_data.columns:
    # errors='coerce' turns unparseable values into NaT instead of raising.
    parsed_dates = pd.to_datetime(input_data['transaction_date'], errors='coerce')
    input_data['transaction_date'] = parsed_dates
    for date_part in ('year', 'month', 'day', 'hour'):
        input_data[date_part] = getattr(parsed_dates.dt, date_part)
    input_data.drop(columns=['transaction_date'], inplace=True)

# Feature columns consumed by the rest of the script (must match training).
# The list is assembled from three groups; the resulting order is significant
# because the columns are selected with this list further down.
_transaction_fields = ['transaction_amount', 'transaction_channel']
_anonymous_fields = [
    'transaction_payment_mode_anonymous',
    'payment_gateway_bank_anonymous',
    'payer_browser_anonymous',
    'payer_email_anonymous',
    'payee_ip_anonymous',
    'payer_mobile_anonymous',
]
_date_part_fields = ['year', 'month', 'day', 'hour']
expected_features = _transaction_fields + _anonymous_fields + _date_part_fields

# Any expected feature that the input lacks is created as an all-NaN column
# so that the column selection further down never fails on a missing name.
absent_features = [name for name in expected_features if name not in input_data.columns]
for name in absent_features:
    input_data[name] = np.nan

# Fill the gaps: object (string) columns get the literal 'missing', every
# other column gets its own mean.
# Note: a column that is entirely NaN has a NaN mean, so it is left as NaN.
for name in expected_features:
    column = input_data[name]
    if column.dtype == 'object':
        fill_value = 'missing'
    else:
        fill_value = column.mean()
    input_data[name] = column.fillna(fill_value)

# Convert every object-typed expected feature into integer codes.
# NOTE(review): a fresh LabelEncoder is fitted on this input for each column,
# so the integer assigned to a category depends on the values present in this
# file. Confirm this agrees with the encoding used when the model was trained.
categorical_cols = input_data[expected_features].select_dtypes(include=['object']).columns
for cat_name in categorical_cols:
    text_values = input_data[cat_name].astype(str)
    le = LabelEncoder()
    input_data[cat_name] = le.fit_transform(text_values)

# -------------------------------
# Step 3: Load Scaler and Transform Data
# -------------------------------
# Restore the scaler saved during training (adjust filename as needed).
# joblib.load unpickles the file, so only load artifacts you trust.
scaler_path = 'src/weight/scaler_sota.pkl'
scaler = joblib.load(scaler_path)

# Scale only the expected feature columns, in their declared order.
feature_frame = input_data[expected_features]
X_input = scaler.transform(feature_frame)

# -------------------------------
# Step 4: Load Trained Model and Run Inference
# -------------------------------
# Restore the saved model (adjust filename as needed).
model_path = 'src/weight/xgb_model.pkl'
model = joblib.load(model_path)

# Column 1 of predict_proba is taken as the fraud probability
# (class 1 is assumed to be the fraud class).
class_probabilities = model.predict_proba(X_input)
fraud_probabilities = class_probabilities[:, 1]

# fraud_score is the fraud probability itself; confidence_score is its
# complement (1 - fraud probability).
fraud_score = fraud_probabilities
confidence_score = 1 - fraud_probabilities

# A row is labelled as fraud when its score reaches the 0.5 threshold.
threshold = 0.5
is_fraud_predicted = fraud_score >= threshold

# -------------------------------
# Step 5: Append Predictions to Data and Save as New CSV
# -------------------------------
from pathlib import Path

# Attach the three prediction columns to the frame. At this point the feature
# columns hold the imputed / label-encoded values produced in Step 2.
input_data['is_fraud_predicted'] = is_fraud_predicted
input_data['confidence_score'] = confidence_score
input_data['fraud_score'] = fraud_score

# Save the augmented CSV; adjust output filename as desired.
output_path = Path('src/output/output_with_predictions.csv')
# DataFrame.to_csv does not create missing directories, so make sure the
# destination folder exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
input_data.to_csv(output_path, index=False)
print("Output saved to 'output_with_predictions.csv'")
