In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt

def load_and_preprocess_data(file_path, synthetic_labels=False, *,
                             feature_columns=('MLII', 'V5'), random_state=42):
    """
    Load a CSV dataset and return scaled train/test splits with encoded labels.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    synthetic_labels : bool, default False
        When the CSV has no 'label' column, generate random integer labels
        in the range 0-4 instead of raising. Intended for smoke-testing the
        pipeline only; metrics computed on such labels are meaningless.
    feature_columns : sequence of str, default ('MLII', 'V5')
        Names of the columns used as model features.
    random_state : int or None, default 42
        Seed used for both the synthetic label generator and the
        train/test split, so repeated runs produce the same data.
        Pass None for non-deterministic behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, classes)`` where the feature
        arrays are standardized and ``classes`` is the array of original
        label values, indexed by their encoded integer.

    Raises
    ------
    ValueError
        If there is no 'label' column and ``synthetic_labels`` is False.
    KeyError
        Raised by pandas if any of ``feature_columns`` is absent.
    """
    df = pd.read_csv(file_path)

    # Drop the index column pandas writes when a frame is saved with its index
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])

    # Extract features (ECG signal channels by default)
    X = df[list(feature_columns)].values

    # Handle labels
    if 'label' in df.columns:
        y = df['label'].values  # Use existing labels if available
    elif synthetic_labels:
        print("Generating synthetic labels for testing purposes...")
        # Seeded generator: the split and the model are already seeded, so
        # unseeded labels were the only source of run-to-run variation.
        rng = np.random.default_rng(random_state)
        y = rng.integers(0, 5, size=len(X))  # Random labels (0 to 4)
    else:
        raise ValueError("The dataset does not contain a 'label' column. Please ensure labels are included.")

    # Encode labels into consecutive integers
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    # Hold out 20% of the rows for testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # Fit the scaler on the training split only, then apply it to both
    # splits, so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, label_encoder.classes_

def train_random_forest_with_grid_search(X_train, y_train):
    """
    Tune a Random Forest classifier by exhaustive grid search and return
    the best estimator found.

    The grid spans 3 x 4 x 3 x 3 = 108 hyperparameter combinations, each
    scored by accuracy with 3-fold cross-validation (324 fits in total),
    using all available cores.
    """
    print("Training the Random Forest Classifier with Grid Search...")

    # Hyperparameter values to try
    search_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    # Build the search around a seeded base forest
    searcher = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=search_space,
        cv=3,
        n_jobs=-1,
        verbose=2,
        scoring='accuracy',
    )
    searcher.fit(X_train, y_train)

    # Keep a handle on the winning estimator before reporting its parameters
    winner = searcher.best_estimator_
    print(f"Best Parameters: {searcher.best_params_}")
    return winner

def evaluate_model(rf_model, X_test, y_test):
    """
    Print test-set accuracy, the classification report and the confusion
    matrix for a fitted model. Returns nothing.
    """
    print("Evaluating the Random Forest Model...")

    # Predict once and reuse the predictions for every metric
    predictions = rf_model.predict(X_test)

    print(f"Test Accuracy: {accuracy_score(y_test, predictions):.4f}")

    # Each remaining report is a heading followed by the metric's output
    reports = (
        ("\nClassification Report:", classification_report),
        ("\nConfusion Matrix:", confusion_matrix),
    )
    for heading, metric in reports:
        print(heading)
        print(metric(y_test, predictions))

def analyze_feature_importance(rf_model, feature_names):
    """
    Print the model's feature importances, highest first, and show them
    as a horizontal bar chart.
    """
    print("Analyzing Feature Importance...")

    # Pair each feature name with its importance and rank them
    ranking = pd.DataFrame(
        {'Feature': feature_names, 'Importance': rf_model.feature_importances_}
    )
    ranking = ranking.sort_values(by='Importance', ascending=False)

    print("\nFeature Importance:")
    print(ranking)

    # Horizontal bars, most important feature at the top
    _, ax = plt.subplots(figsize=(8, 6))
    ax.barh(ranking['Feature'], ranking['Importance'], color='skyblue')
    ax.set_xlabel('Importance Score')
    ax.set_ylabel('Feature')
    ax.set_title('Feature Importance Analysis')
    ax.invert_yaxis()  # barh draws the first row at the bottom by default
    plt.show()

# Script entry point: run the full pipeline (load -> tune -> evaluate -> explain).
if __name__ == "__main__":
    # Load and preprocess the dataset.
    # NOTE: the path is hard-coded to one machine; change it before running elsewhere.
    file_path = "C:/Users/abdulssekyanzi/EDA Dataset.csv/100.csv"  # Replace with your dataset path
    # synthetic_labels=True lets the loader fall back to random labels when
    # the CSV has no 'label' column; in that case the metrics below only
    # exercise the pipeline and say nothing about real model quality.
    # class_names is not used in the rest of this section.
    X_train, X_test, y_train, y_test, class_names = load_and_preprocess_data(file_path, synthetic_labels=True)

    # Train the Random Forest model with grid search
    best_rf_model = train_random_forest_with_grid_search(X_train, y_train)

    # Evaluate the best model on the held-out test split
    evaluate_model(best_rf_model, X_test, y_test)

    # Analyze feature importance.
    # These names must be in the same order as the feature columns the loader selected.
    feature_names = ['MLII', 'V5']  # Replace with actual feature names if different
    analyze_feature_importance(best_rf_model, feature_names)


Generating synthetic labels for testing purposes...
Training the Random Forest Classifier with Grid Search...
Fitting 3 folds for each of 108 candidates, totalling 324 fits
