In [None]:
# Standard Python Libraries
import sys
import json
import warnings
from datetime import timedelta

In [None]:
# 70/30 train/test partition of the source dataset (seed fixed at 0)
train, test = ds_sp_ohe.random_split(weights=[0.70, 0.30], seed=0)

# Feature columns, grouped by the preprocessing step that handles them
categorical_features = [
    'MOST_COMMON_DEVICE_TYPE',
    'MOST_COMMON_LOCATION',
    'MOST_COMMON_MERCHANT_CATEGORY',
    'MOST_COMMON_TRANSACTION_DAY',
]
numerical_features = [
    'AVG_TRANSACTION_AMOUNT',
    'TRANSACTION_COUNT',
    'AVG_SENTIMENT_SCORE',
]

# Output column requested from the encoder for each categorical input
encoded_features = [f"{col}_encoded" for col in categorical_features]

# Preprocessing steps: median-impute the numeric columns in place, then
# one-hot encode the categorical columns (unknown categories are ignored).
median_imputer = SimpleImputer(
    input_cols=numerical_features,
    output_cols=numerical_features,
    strategy='median',
)
category_encoder = OneHotEncoder(
    input_cols=categorical_features,
    output_cols=encoded_features,
    handle_unknown='ignore',
)
preprocessing_pipeline = Pipeline([
    ('num_imputer', median_imputer),
    ('cat_encoder', category_encoder),
])

# Columns handed to every classifier: numeric features first, then the
# encoded categorical outputs.
all_features = numerical_features + encoded_features

# Candidate classifiers: (display name, estimator class, extra keyword args).
# Every candidate shares the same feature/label/prediction column wiring and
# the same preprocessing pipeline object; only the estimator differs.
classifier_specs = [
    ("XGBoost", XGBClassifier, {'eval_metric': 'logloss'}),
    ("RandomForest", RandomForestClassifier, {}),
    ("LogisticRegression", LogisticRegression, {}),
]

# One end-to-end pipeline (preprocessing + classifier) per candidate,
# keyed by display name and kept in the order listed above.
models = {
    label: Pipeline([
        ('preprocessor', preprocessing_pipeline),
        ('classifier', estimator_cls(
            input_cols=all_features,
            label_cols=['IS_FRAUD'],
            output_cols=['PREDICTION'],
            **extra_kwargs
        ))
    ])
    for label, estimator_cls, extra_kwargs in classifier_specs
}

# Model training and evaluation with new splits.
#
# Every candidate pipeline is fitted on `train` and scored on `test`. The
# winner is chosen by F1, and its precision/recall are remembered alongside it
# so that the metrics written to the registry describe the selected model
# rather than whichever model happened to be trained last.
best_model = None
best_score = 0
best_precision = None
best_recall = None

for name, model in models.items():
    print(f"Training {name} model...")
    model.fit(train)
    preds = model.predict(test)

    # Calculate metrics
    # NOTE(review): this assumes `test[...]` / `preds[...]` yield values the
    # metric functions accept directly - confirm against the imported metrics.
    f1 = f1_score(test['IS_FRAUD'], preds['PREDICTION'])
    precision = precision_score(test['IS_FRAUD'], preds['PREDICTION'])
    recall = recall_score(test['IS_FRAUD'], preds['PREDICTION'])

    print(f"{name} Metrics:")
    print(f"F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}\n")

    # Using F1 as selection metric for fraud detection. The first candidate is
    # always accepted so a run in which every model scores F1 == 0 still ends
    # with a model selected instead of `best_model` staying None.
    if best_model is None or f1 > best_score:
        best_model = model
        best_score = f1
        best_precision = precision
        best_recall = recall

if best_model is None:
    # Only reachable when `models` is empty; fail clearly instead of passing
    # None to the registry.
    raise RuntimeError("No model was trained; nothing to register.")

# Register best model
model_name = "Fraud_Detection_Model"
try:
    # Keep only the lookup inside the try so that a failure while writing
    # metrics is not mistaken for "model does not exist yet".
    model_version = model_registry.get_model(model_name).version("v1")
except Exception:
    # Any lookup failure is treated as "not registered yet" (the exact
    # exception type raised by the registry is not visible from this file).
    # `except Exception` lets KeyboardInterrupt/SystemExit propagate.
    print("Logging new model...")
    model_version = model_registry.log_model(
        model_name=model_name,
        model=best_model,
        version_name="v1",
        sample_input_data=train,
        comment="Best performing fraud detection model with 70-30 split"
    )
else:
    # NOTE(review): this path refreshes only the metrics; the model artifact
    # stored under "v1" is not replaced - confirm that this is intended.
    print("Updating existing model version...")

# Record the selected model's own metrics on both paths.
model_version.set_metric("F1_score", best_score)
model_version.set_metric("Precision", best_precision)
model_version.set_metric("Recall", best_recall)

print(f"Best model: {type(best_model.steps[-1][1]).__name__} with F1-score {best_score:.4f}")
