# Binary Classification Model
## 1. Project Setup and Data Loading

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import custom modules
from src.data_preprocessing import DataLoader, DataPreprocessor, create_pipeline
from src.models import BinaryClassifier, ModelTrainer, HyperparameterTuner
from src.evaluation import ClassificationEvaluator, evaluate_model, compare_models
from src.feature_engineering import FeatureCreator, FeatureSelector

# Seed NumPy's global random state so NumPy-driven randomness is repeatable.
np.random.seed(42)

# Project data loader, constructed with its default arguments.
loader = DataLoader()

# Fetch the "diabetes" dataset through the loader. save_raw=True is forwarded
# as-is (presumably persists a raw copy -- confirm in DataLoader).
print("Loading Diabetes Dataset...")
df = loader.load_sklearn_dataset("diabetes", save_raw=True)

# Quick sanity check of what was loaded.
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
column_names = df.columns.tolist()
print(f"Columns: {column_names}")

## 2. Data Exploration and Analysis

In [None]:
# Summarise the loaded frame with the project helper, then show a sample and
# pandas' descriptive statistics.
from src.data_preprocessing import get_dataset_info

info = get_dataset_info(df)
print("Dataset Information:")
print(f"  - Shape: {info['shape']}")
print(f"  - Numeric columns: {info['numeric_columns']}")
# 'missing_values' is consumed as a mapping; its values are summed to a total.
missing_total = sum(info['missing_values'].values())
print(f"  - Missing values: {missing_total}")
memory_mb = info['memory_usage_mb']
print(f"  - Memory usage: {memory_mb:.2f} MB")

# Leading rows of the frame.
print("\nFirst 5 rows:")
first_rows = df.head()
print(first_rows)

# Descriptive statistics as produced by DataFrame.describe().
print("\nStatistical Summary:")
summary_stats = df.describe()
print(summary_stats)

## 3. Convert to Binary Classification Problem

In [None]:
# Turn the continuous 'target' column into a two-class label.
# The cut-off is the median of 'target': rows strictly above it become class 1,
# all other rows (including ties with the median) become class 0.
threshold = df['target'].median()
above_threshold = df['target'].gt(threshold)
df['binary_target'] = above_threshold.astype(int)

print(f"Threshold for binary classification: {threshold:.2f}")
print("Class distribution:")
class_counts = df['binary_target'].value_counts()
print(class_counts)
class_shares = df['binary_target'].value_counts(normalize=True)
print(f"Class balance: {class_shares}")

# From here on use a frame without the original regression target.
df_binary = df.drop(columns=['target'])

## 4. Data Visualization

In [None]:
# Plot styling shared by every figure in this cell.
plt.style.use('default')
sns.set_palette("husl")

# --- Figure 1: pairwise correlation of the features (label column excluded) ---
plt.figure(figsize=(12, 8))
features_only = df_binary.drop(columns=['binary_target'])
correlation_matrix = features_only.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()

# --- Figure 2: number of rows per class ---
plt.figure(figsize=(8, 6))
sns.countplot(data=df_binary, x='binary_target')
plt.title('Target Distribution')
plt.xlabel('Class (0: Low Progression, 1: High Progression)')
plt.ylabel('Count')
plt.show()

# --- Figure 3: per-class box plots for the first six feature columns ---
feature_cols = df_binary.drop(columns=['binary_target']).columns[:6]
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.ravel()  # flatten the 2x3 grid so panels can be indexed linearly

for i, col in enumerate(feature_cols):
    panel = axes[i]
    sns.boxplot(data=df_binary, x='binary_target', y=col, ax=panel)
    panel.set_title(f'{col} by Class')

plt.tight_layout()
plt.show()

## 5. Data Processing Pipeline

In [None]:
# Run the project's preprocessing pipeline on the binary-labelled frame.
# All options are collected in one mapping and forwarded unchanged as keywords.
pipeline_options = {
    'target_column': 'binary_target',
    'handle_missing': True,
    'missing_strategy': 'mean',
    'encode_categorical': True,
    'scale_features': True,
    'scaling_method': 'standard',
    'remove_outliers': False,
    'test_size': 0.2,
    'random_state': 42,
}
X_train, X_test, y_train, y_test, preprocessor = create_pipeline(
    df_binary, **pipeline_options
)

# Report the resulting split sizes and the label counts of the training part.
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print("Training class distribution:")
train_class_counts = pd.Series(y_train).value_counts().sort_index()
print(train_class_counts)

## 6. Feature Engineering (Optional)

In [None]:
# Create additional features if needed
feature_creator = FeatureCreator()

# Convert back to DataFrame for feature engineering.
# np.asarray strips any existing column labels first: if X_train / X_test are
# already DataFrames, pd.DataFrame(frame, columns=new_names) would *select*
# the (non-existent) new names and silently produce all-NaN columns instead of
# relabelling the data. For plain arrays the result is unchanged.
train_feature_names = [f'feature_{i}' for i in range(X_train.shape[1])]
test_feature_names = [f'feature_{i}' for i in range(X_test.shape[1])]
X_train_df = pd.DataFrame(np.asarray(X_train), columns=train_feature_names)
X_test_df = pd.DataFrame(np.asarray(X_test), columns=test_feature_names)

# Column pairs to combine; defined once so train and test cannot drift apart.
interaction_pairs = [('feature_0', 'feature_1'), ('feature_2', 'feature_3')]

# Create interaction features
X_train_enhanced = feature_creator.create_interaction_features(
    X_train_df,
    column_pairs=list(interaction_pairs),
    operation='multiply'
)

# Apply same transformations to test set
X_test_enhanced = feature_creator.create_interaction_features(
    X_test_df,
    column_pairs=list(interaction_pairs),
    operation='multiply'
)

print(f"Enhanced training set shape: {X_train_enhanced.shape}")

## 7. Model Training - Single Model

In [None]:
# Fit one Random Forest through the project's BinaryClassifier wrapper and
# keep both hard predictions and predicted probabilities for the test split.
print("Training Random Forest Classifier...")
rf_settings = {
    'model_type': 'random_forest',
    'n_estimators': 100,
    'random_state': 42,
}
rf_classifier = BinaryClassifier(**rf_settings)

rf_classifier.fit(X_train, y_train)

# Predictions on the held-out split.
y_pred_rf = rf_classifier.predict(X_test)
y_pred_proba_rf = rf_classifier.predict_proba(X_test)

print("Random Forest training completed!")

## 8. Model Training - Multiple Models

In [None]:
# Fit several model families on the same training split so they can be
# compared later. Each entry maps a display name to the keyword settings that
# are handed to the trainer.
trainer = ModelTrainer(random_state=42)

model_configs = {
    'Random Forest': dict(
        model_type='random_forest',
        n_estimators=100,
        max_depth=10,
    ),
    'Logistic Regression': dict(
        model_type='logistic_regression',
        C=1.0,
    ),
    'SVM': dict(
        model_type='svm',
        C=1.0,
        kernel='rbf',
    ),
    'Gradient Boosting': dict(
        model_type='gradient_boosting',
        n_estimators=100,
        learning_rate=0.1,
    ),
}

print("Training multiple models...")
trained_models = trainer.train_multiple_models(X_train, y_train, model_configs)
n_trained = len(trained_models)
print(f"Successfully trained {n_trained} models")

## 9. Hyperparameter Tuning

In [None]:
# Search Random Forest hyperparameters with the project's tuner.
from sklearn.ensemble import RandomForestClassifier

base_model = RandomForestClassifier(random_state=42)

# Candidate values per hyperparameter.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

print("Starting hyperparameter tuning...")
tuner_options = {
    'model': base_model,
    'param_grid': param_grid,
    'cv': 5,
    'scoring': 'f1',
    'random_state': 42,
}
tuner = HyperparameterTuner(**tuner_options)

# Randomised search limited to 20 iterations; n_jobs=-1 is forwarded to the
# tuner (presumably "use all cores" -- confirm in HyperparameterTuner).
best_rf_model = tuner.random_search(X_train, y_train, n_iter=20, n_jobs=-1)

best_params = tuner.get_best_params()
print(f"\nBest parameters: {best_params}")
best_cv_score = tuner.get_best_score()
print(f"Best CV F1 score: {best_cv_score:.4f}")

## 10. Model Evaluation

In [None]:
# Score every trained model on the test split and print the comparison table.
print("Evaluating all models...")
results_df = compare_models(trained_models, X_test, y_test, average='weighted')
print("\nModel Comparison Results:")
print(results_df.round(4))

# The first row label of the comparison table is taken as the best model.
# NOTE(review): this relies on compare_models returning rows best-first -- confirm.
best_model_name = results_df.index[0]
best_model = trained_models[best_model_name]

print(f"\nDetailed evaluation of {best_model_name}:")
evaluator = ClassificationEvaluator(model_name=best_model_name)

# Hard predictions and probabilities of the selected model on the test split.
y_pred_best = best_model.predict(X_test)
y_pred_proba_best = best_model.predict_proba(X_test)

metrics = evaluator.evaluate(y_test, y_pred_best, y_pred_proba_best)

# One line per metric; each value is formatted as a float with 4 decimals.
for metric_name, metric_value in metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## 11. Visualization of Results

In [None]:
# Diagnostic plots for the selected model, followed by a cross-model comparison.

# Confusion matrix (normalised) with human-readable class labels.
class_labels = ['Low Progression', 'High Progression']
fig_cm = evaluator.plot_confusion_matrix(
    y_test, y_pred_best,
    class_names=class_labels,
    normalize=True,
    figsize=(8, 6)
)
plt.show()

# ROC curve, using column 1 of the probability matrix as the score.
roc_scores = y_pred_proba_best[:, 1]
fig_roc = evaluator.plot_roc_curve(y_test, roc_scores, figsize=(8, 6))
plt.show()

# Precision-recall curve on the same column of probabilities.
pr_scores = y_pred_proba_best[:, 1]
fig_pr = evaluator.plot_precision_recall_curve(y_test, pr_scores, figsize=(8, 6))
plt.show()

# Collect per-model metrics, then draw them side by side.
metrics_dict = {}
for name, model in trained_models.items():
    metrics_dict[name] = evaluate_model(model, X_test, y_test, name)

fig_comparison = evaluator.plot_metrics_comparison(metrics_dict, figsize=(12, 6))
plt.show()

## 12. Feature Importance Analysis

In [None]:
# Feature-importance report; runs only when the selected model object exposes
# a 'feature_importances_' attribute.
if hasattr(best_model, 'feature_importances_'):
    from src.models import get_feature_importance

    # Generic positional names, one per column of the training matrix.
    n_features = X_train.shape[1]
    feature_names = [f'feature_{idx}' for idx in range(n_features)]
    importance_df = get_feature_importance(best_model, feature_names)

    print("\nTop 10 Most Important Features:")
    print(importance_df.head(10))

    # Horizontal bar chart of the first ten rows of the importance table.
    plt.figure(figsize=(10, 6))
    top_features = importance_df.head(10)
    bar_positions = range(len(top_features))
    plt.barh(bar_positions, top_features['importance'])
    plt.yticks(bar_positions, top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Top 10 Feature Importances - {best_model_name}')
    plt.gca().invert_yaxis()  # put the first table row at the top of the chart
    plt.tight_layout()
    plt.show()

## 13. Model Persistence

In [None]:
# Save the best model and the fitted preprocessing pipeline, then reload the
# model once to confirm the saved artifact is usable.
import pickle

from src.models import save_model, load_model

model_save_path = 'models/best_diabetes_classifier.pkl'
preprocessor_save_path = 'models/diabetes_preprocessor.pkl'

# Create the output directory up front: open(..., 'wb') does not create
# missing parent directories and raises FileNotFoundError on a fresh checkout.
for artifact_path in (model_save_path, preprocessor_save_path):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

save_model(best_model, model_save_path)

# Save preprocessing pipeline
with open(preprocessor_save_path, 'wb') as f:
    pickle.dump(preprocessor, f)

print("Model and preprocessor saved successfully!")

# Demonstrate loading and using the saved model
loaded_model = load_model(model_save_path)
test_predictions = loaded_model.predict(X_test[:5])
print(f"Test predictions from loaded model: {test_predictions}")

## 14. Cross-Validation Analysis

In [None]:
# 5-fold cross-validation of the selected model on the training split,
# scored with F1, followed by a box plot of the per-fold scores.
from src.models import cross_validate_model

cv_options = {'cv': 5, 'scoring': 'f1'}
cv_results = cross_validate_model(best_model, X_train, y_train, **cv_options)

print("\nCross-Validation Results:")
mean_f1 = cv_results['mean_score']
print(f"  Mean F1 Score: {mean_f1:.4f}")
std_f1 = cv_results['std_score']
print(f"  Standard Deviation: {std_f1:.4f}")
fold_scores = cv_results['scores']
print(f"  Individual Scores: {fold_scores}")

# Single box summarising the spread of fold scores, labelled with the model name.
plt.figure(figsize=(8, 6))
plt.boxplot(cv_results['scores'])
plt.ylabel('F1 Score')
plt.title('Cross-Validation F1 Scores Distribution')
plt.xticks([1], [best_model_name])
plt.show()

## 15. Prediction on New Data

In [None]:
# Function to make predictions on new data
def predict_diabetes_progression(new_data, model_path='models/best_diabetes_classifier.pkl',
                                preprocessor_path='models/diabetes_preprocessor.pkl'):
    """
    Predict diabetes progression for new data.

    The saved preprocessor is loaded but NOT applied: ``new_data`` is passed
    to the model unchanged, so it must already be in the same preprocessed
    form the model was trained on.

    Args:
        new_data: Already-preprocessed feature matrix accepted by the saved
            model's ``predict`` / ``predict_proba``
        model_path: Path to saved model
        preprocessor_path: Path to saved (pickled) preprocessor

    Returns:
        Tuple ``(predictions, probabilities)`` as returned by the model's
        ``predict`` and ``predict_proba``

    Raises:
        Whatever ``load_model`` / ``open`` / ``pickle.load`` raise when an
        artifact is missing or unreadable (e.g. ``FileNotFoundError``).
    """
    # Imported locally so the function also works when the persistence cell
    # that imports these names has not been executed in the current session.
    import pickle

    from src.models import load_model

    # Load model and preprocessor
    model = load_model(model_path)

    # SECURITY: pickle.load can execute arbitrary code; only point
    # preprocessor_path at files produced by this project.
    with open(preprocessor_path, 'rb') as f:
        # Loaded to fail fast when the artifact is missing or corrupt.
        # TODO: apply it to new_data once the pipeline's transform API is
        # wired in; until then the input is assumed to be preprocessed.
        _preprocessor = pickle.load(f)

    predictions = model.predict(new_data)
    probabilities = model.predict_proba(new_data)

    return predictions, probabilities

# Demo: run the prediction helper on the first three rows of the (already
# preprocessed) test split, using the default artifact paths.
print("\nExample prediction on test data:")
sample_data = X_test[:3]
preds, probs = predict_diabetes_progression(new_data=sample_data)
print(f"Predictions: {preds}")
print(f"Probabilities: {probs}")