In [9]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Load Dataset
def load_data(file_path):
    """
    Function to load the dataset from a CSV file into a Pandas DataFrame.
    Args:
    - file_path: The path to the CSV file containing the dataset.
    
    Returns:
    - df: A Pandas DataFrame containing the dataset.
    """
    return pd.read_csv(file_path)

# 2. Custom Data Preprocessing
def custom_label_encoder(df, columns):
    """
    Function to manually encode categorical columns into numerical values.
    For each categorical column, the unique values are mapped to integer labels.
    
    Args:
    - df: The Pandas DataFrame containing the dataset.
    - columns: A list of column names to be encoded.
    
    Returns:
    - df: The DataFrame with encoded columns.
    - label_encoders: A dictionary containing mappings of categorical values to integers for each column.
    """
    label_encoders = {}  # Dictionary to store the label encoders for each column
    
    for col in columns:
        # Get unique values in the categorical column
        unique_values = df[col].unique()
        
        # Create a mapping of unique values to integers
        label_map = {value: idx for idx, value in enumerate(unique_values)}
        
        # Map the original values in the column to the new integer labels
        df[col] = df[col].map(label_map)
        
        # Store the label map in the dictionary for reference
        label_encoders[col] = label_map
    
    return df, label_encoders

def custom_standard_scaler(df, columns):
    """
    Function to manually standardize numerical columns to have zero mean and unit variance.
    
    Args:
    - df: The Pandas DataFrame containing the dataset.
    - columns: A list of numerical column names to be scaled.
    
    Returns:
    - df: The DataFrame with scaled numerical columns.
    """
    for col in columns:
        # Calculate the mean and standard deviation of the column
        mean = df[col].mean()
        std = df[col].std()
        
        # Apply the standardization formula (z = (x - mean) / std)
        df[col] = (df[col] - mean) / std
    
    return df

# 3. Manual Train-Test Split
def custom_train_test_split(df, target_column, test_size=0.2):
    """
    Function to manually split the dataset into training and testing sets.
    The function shuffles the dataset and then splits it into training and testing based on the test size.
    
    Args:
    - df: The DataFrame containing the dataset.
    - target_column: The column name containing the target variable (the label to predict).
    - test_size: The fraction of the dataset to be used as the test set (default is 20%).
    
    Returns:
    - X_train: The features of the training data.
    - X_test: The features of the testing data.
    - y_train: The target values of the training data.
    - y_test: The target values of the testing data.
    """
    # Shuffle the dataset randomly
    shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Calculate the split index based on the test_size
    split_idx = int(len(shuffled_df) * (1 - test_size))
    
    # Split the data into training and testing sets
    train_df = shuffled_df[:split_idx]
    test_df = shuffled_df[split_idx:]
    
    # Separate the features (X) and target (y) for both training and testing sets
    X_train = train_df.drop(target_column, axis=1)
    y_train = train_df[target_column]
    X_test = test_df.drop(target_column, axis=1)
    y_test = test_df[target_column]
    
    return X_train, X_test, y_train, y_test

# 4. Train and Evaluate Model with Manual Hyperparameter Tuning
def manual_hyperparameter_tuning(X_train, X_test, y_train, y_test):
    """
    Function to manually tune the hyperparameters of an XGBoost classifier.
    It evaluates different combinations of hyperparameters and selects the best model based on accuracy.
    
    Args:
    - X_train: The features of the training data.
    - X_test: The features of the testing data.
    - y_train: The target values of the training data.
    - y_test: The target values of the testing data.
    
    Returns:
    - best_model: The best XGBoost model after tuning.
    """
    best_accuracy = 0  # Variable to store the best accuracy
    best_params = None  # Variable to store the best hyperparameters
    best_model = None  # Variable to store the best model
    
    # Define the hyperparameter ranges to tune
    n_estimators_range = [50, 100, 150]  # Number of boosting rounds
    learning_rate_range = [0.01, 0.1, 0.2]  # Learning rate
    max_depth_range = [3, 5, 7]  # Maximum depth of each tree
    
    # Loop through all combinations of hyperparameters
    for n_estimators in n_estimators_range:
        for learning_rate in learning_rate_range:
            for max_depth in max_depth_range:
                print(f"Training model with n_estimators={n_estimators}, learning_rate={learning_rate}, max_depth={max_depth}")
                
                # Train the XGBoost model with the current combination of hyperparameters
                model = XGBClassifier(
                    n_estimators=n_estimators,
                    learning_rate=learning_rate,
                    max_depth=max_depth,
                    random_state=42,
                    use_label_encoder=False,
                    eval_metric="mlogloss"
                )
                model.fit(X_train, y_train)
                
                # Make predictions on the test data
                y_pred = model.predict(X_test)
                
                # Calculate accuracy of the model
                accuracy = accuracy_score(y_test, y_pred)
                
                print(f"Accuracy: {accuracy:.4f}")
                
                # If the current model's accuracy is better, update the best model and parameters
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params = {
                        "n_estimators": n_estimators,
                        "learning_rate": learning_rate,
                        "max_depth": max_depth
                    }
                    best_model = model
    
    print("\nBest Hyperparameters:", best_params)
    print(f"Best Accuracy: {best_accuracy:.4f}")
    return best_model

# 5. Evaluate the Final Model
def evaluate_model(model, X_test, y_test):
    """
    Function to evaluate the final trained model on the test data.
    It calculates accuracy and provides a detailed classification report.
    
    Args:
    - model: The trained model to be evaluated.
    - X_test: The features of the testing data.
    - y_test: The target values of the testing data.
    """
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Print the accuracy of the final model
    print("Final Model Accuracy:", accuracy_score(y_test, y_pred))
    
    # Print the classification report with precision, recall, and F1-score
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# 6. Main Function
def main():
    """
    Main function that orchestrates the entire pipeline:
    - Loads the data
    - Preprocesses the data
    - Splits the data
    - Tunes the model hyperparameters
    - Evaluates the final model.
    """
    # File path to your dataset
    file_path = "./data/combined_output1.csv"  # Update with your dataset file name
    
    # Load and preprocess the dataset
    df = load_data(file_path)
    
    # Step 1: Manually encode categorical columns
    categorical_columns = ["state", "state_name", "disease"]
    df, label_encoders = custom_label_encoder(df, categorical_columns)
    
    # Step 2: Manually scale numerical features
    numerical_columns = ["cases", "incidence_per_capita"]
    df = custom_standard_scaler(df, numerical_columns)
    
    # Split data into training and testing sets
    target_column = "disease"  # Column to predict
    X_train, X_test, y_train, y_test = custom_train_test_split(df, target_column)
    
    # Perform manual hyperparameter tuning
    best_model = manual_hyperparameter_tuning(X_train, X_test, y_train, y_test)
    
    # Evaluate the final model
    evaluate_model(best_model, X_test, y_test)

# Run the pipeline
if __name__ == "__main__":
    main()

Training model with n_estimators=50, learning_rate=0.01, max_depth=3
Accuracy: 0.8179
Training model with n_estimators=50, learning_rate=0.01, max_depth=5


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8295
Training model with n_estimators=50, learning_rate=0.01, max_depth=7


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8402
Training model with n_estimators=50, learning_rate=0.1, max_depth=3


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8295
Training model with n_estimators=50, learning_rate=0.1, max_depth=5


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8393
Training model with n_estimators=50, learning_rate=0.1, max_depth=7


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8518
Training model with n_estimators=50, learning_rate=0.2, max_depth=3


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8339
Training model with n_estimators=50, learning_rate=0.2, max_depth=5


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8473
Training model with n_estimators=50, learning_rate=0.2, max_depth=7


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8527
Training model with n_estimators=100, learning_rate=0.01, max_depth=3


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8205
Training model with n_estimators=100, learning_rate=0.01, max_depth=5


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8295
Training model with n_estimators=100, learning_rate=0.01, max_depth=7


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8429
Training model with n_estimators=100, learning_rate=0.1, max_depth=3


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8330
Training model with n_estimators=100, learning_rate=0.1, max_depth=5


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8420
Training model with n_estimators=100, learning_rate=0.1, max_depth=7


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8500
Training model with n_estimators=100, learning_rate=0.2, max_depth=3


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8429
Training model with n_estimators=100, learning_rate=0.2, max_depth=5


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8545
Training model with n_estimators=100, learning_rate=0.2, max_depth=7


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8491
Training model with n_estimators=150, learning_rate=0.01, max_depth=3


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8223
Training model with n_estimators=150, learning_rate=0.01, max_depth=5


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8313
Training model with n_estimators=150, learning_rate=0.01, max_depth=7


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8473
Training model with n_estimators=150, learning_rate=0.1, max_depth=3


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8330
Training model with n_estimators=150, learning_rate=0.1, max_depth=5


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8554
Training model with n_estimators=150, learning_rate=0.1, max_depth=7


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8509
Training model with n_estimators=150, learning_rate=0.2, max_depth=3


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8562
Training model with n_estimators=150, learning_rate=0.2, max_depth=5


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8545
Training model with n_estimators=150, learning_rate=0.2, max_depth=7


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8473

Best Hyperparameters: {'n_estimators': 150, 'learning_rate': 0.2, 'max_depth': 3}
Best Accuracy: 0.8562
Final Model Accuracy: 0.85625
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.90      0.84       173
           1       0.81      0.84      0.83       154
           2       1.00      1.00      1.00       157
           3       1.00      1.00      1.00       180
           4       0.78      0.90      0.83       151
           5       0.87      0.75      0.80       155
           6       0.71      0.57      0.63       150

    accuracy                           0.86      1120
   macro avg       0.85      0.85      0.85      1120
weighted avg       0.86      0.86      0.85      1120

