In [1]:
"""
05_modeling.ipynb


Key Steps:
1. Load and preprocess the data using previously defined functions.
2. Split the dataset into training and testing sets.
3. Construct a model pipeline with preprocessing steps (imputation, scaling, polynomial feature generation) and RandomForestClassifier.
4. Perform hyperparameter tuning using GridSearchCV to find the best model parameters.
5. Evaluate the tuned model's performance on the test set using metrics like ROC AUC, precision, recall, and F1-score.
6. Optionally, adjust the decision threshold to balance the precision-recall trade-off based on the application's requirements.

Components:
- Data Loading and Preprocessing
- Data Splitting
- Model Pipeline Construction
- Hyperparameter Tuning
- Model Evaluation
- Threshold Adjustment

"""

"\n05_modeling.ipynb\n\n\nKey Steps:\n1. Load and preprocess the data using previously defined functions.\n2. Split the dataset into training and testing sets.\n3. Construct a model pipeline with preprocessing steps (imputation, scaling, polynomial feature generation) and RandomForestClassifier.\n4. Perform hyperparameter tuning using GridSearchCV to find the best model parameters.\n5. Evaluate the tuned model's performance on the test set using metrics like ROC AUC, precision, recall, and F1-score.\n6. Optionally, adjust the decision threshold to balance the precision-recall trade-off based on the application's requirements.\n\nComponents:\n- Data Loading and Preprocessing\n- Data Splitting\n- Model Pipeline Construction\n- Hyperparameter Tuning\n- Model Evaluation\n- Threshold Adjustment\n\n"

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.utils.class_weight import compute_class_weight
import sys
sys.path.append('../Helpers')  # Ensure this path is correct for your project structure
from data_helpers import load_config, load_data

# End-to-end modeling run: load the preprocessed frame, split it, tune the
# pipeline on the training partition, then evaluate on the held-out partition.

# Tunable run settings, kept together so a reader can adjust them in one place.
CONFIG_PATH = '../config/config.json'
TARGET_COLUMN = 'Outcome'
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Load and preprocess data.
# NOTE(review): load_and_preprocess_data, compute_class_weights,
# hyperparameter_tuning and evaluate_model are not among the names imported
# above (only load_config and load_data come from data_helpers). Confirm they
# are defined or imported before this cell runs on a fresh kernel, otherwise
# the first call below raises NameError.
df = load_and_preprocess_data(CONFIG_PATH)

# Separate the feature matrix from the target column.
X = df.drop(columns=TARGET_COLUMN)
y = df[TARGET_COLUMN]

# Stratify on y so the train and test partitions keep the same class
# proportions as the full dataset; an unstratified random split can skew the
# class ratio, which distorts both the class weights computed below and the
# test-set metrics. The fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Build and tune the model pipeline using training data only, so no
# information from the test partition reaches the tuning step.
# presumably class_weight_dict maps class label -> weight; verify against the
# helper's definition.
class_weight_dict = compute_class_weights(y_train)
best_pipeline = hyperparameter_tuning(X_train, y_train, class_weight_dict)

# Evaluate the tuned pipeline on the held-out test partition.
evaluate_model(best_pipeline, X_test, y_test)

# Include additional code for plotting and threshold adjustment as needed
