In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score
from scipy.signal import find_peaks

# Feature Extraction (Optimized)
def extract_features_optimized(window_mlii, window_v5):
    """Build a 10-element feature vector from one pair of signal windows.

    The vector holds, in order: mean, standard deviation, minimum and maximum
    of ``window_mlii``; the same four statistics of ``window_v5``; then the
    mean and standard deviation of the spacing (in samples) between peaks
    found in ``window_mlii``.

    Args:
        window_mlii: 1-D sequence of samples; also used for peak detection.
        window_v5: 1-D sequence of samples.

    Returns:
        A 1-D NumPy array of length 10.
    """
    def _summary(signal):
        # mean / std / min / max, in the order they appear in the output.
        return [np.mean(signal), np.std(signal), np.min(signal), np.max(signal)]

    mlii_stats = _summary(window_mlii)
    v5_stats = _summary(window_v5)

    # Peaks must be at least 50 samples apart; their spacing gives the
    # peak-to-peak interval statistics.
    peak_positions, _ = find_peaks(window_mlii, distance=50)
    spacings = np.diff(peak_positions)
    if len(spacings) > 0:
        interval_stats = [np.mean(spacings), np.std(spacings)]
    else:
        # Fewer than two peaks: no interval can be measured.
        interval_stats = [0, 0]

    return np.array(mlii_stats + v5_stats + interval_stats)

# Create Windows and Labels
def create_windows_and_labels(df, window_size, overlap, labels_dict):
    """Slide a fixed-size window over ``df`` and build one feature row and one label per window.

    Args:
        df: DataFrame providing the 'MLII' and 'V5' signal columns and an
            'Unnamed: 0' column whose values are used as keys into
            ``labels_dict``.
        window_size: Number of consecutive rows in each window.
        overlap: Fraction of a window shared with the next one. The hop
            between window starts is ``window_size * (1 - overlap)``,
            truncated to an integer.
        labels_dict: Mapping from an 'Unnamed: 0' value to a label. A window
            takes the label stored for its last row, or 0 when that row has
            no entry.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays with one entry per
        window. Both arrays are empty when ``df`` has fewer than
        ``window_size`` rows.

    Raises:
        ValueError: If the resulting hop is smaller than one row (for
            example ``overlap >= 1``), which cannot advance the window.
    """
    # The small epsilon stops float error from truncating one short,
    # e.g. 300 * (1 - 0.9) evaluates to 29.999999999999993.
    step = int(window_size * (1 - overlap) + 1e-9)
    if step < 1:
        raise ValueError(
            f"window_size={window_size!r} with overlap={overlap!r} gives a "
            f"step of {step}; the step must be at least 1"
        )

    # Pull the columns out once instead of on every iteration.
    mlii = df['MLII'].values
    v5 = df['V5'].values
    row_ids = df['Unnamed: 0']

    windows = []
    labels = []
    # "+ 1" keeps the last full window, whose end coincides with len(df).
    for start in range(0, len(df) - window_size + 1, step):
        end = start + window_size
        windows.append(extract_features_optimized(mlii[start:end], v5[start:end]))
        # NOTE(review): only the id of the window's last row is looked up, so
        # an entry in labels_dict that falls elsewhere inside the window does
        # not affect its label -- confirm this is the intended labelling rule.
        labels.append(labels_dict.get(row_ids.iloc[end - 1], 0))
    return np.array(windows), np.array(labels)

# Load Data
# NOTE(review): absolute, machine-specific path -- point it at your local copy of the record.
df = pd.read_csv('C:/Users/abdulssekyanzi/EDA Dataset.csv/100.csv')

# Load Labels (Dummy Dictionary for Demonstration)
# NOTE(review): while this dictionary is empty every window is labelled 0, so
# the model below is trained and evaluated on a single class.
labels_dict = {}
# Example: labels_dict[1000] = 1

# Windowing and Feature Extraction
window_size = 300
overlap = 0.5
X, y = create_windows_and_labels(df, window_size, overlap, labels_dict)

# Label Sanity Check
# Binary F1 and ROC-AUC need both classes; say so up front instead of letting
# the problem surface later as NaN scores and undefined-metric warnings.
label_values, label_counts = np.unique(y, return_counts=True)
if label_values.size < 2:
    print("Warning: only one class present in labels; F1 and ROC-AUC are not meaningful.")

# Data Splitting
# Stratified splitting raises when a class has fewer than two samples, so fall
# back to a plain shuffled split in that case.
stratify_labels = y if label_counts.size > 0 and label_counts.min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Check Class Distribution
print("Train Class Distribution:", np.unique(y_train, return_counts=True))
print("Test Class Distribution:", np.unique(y_test, return_counts=True))

# Model Training and Hyperparameter Tuning (Optimized)
# The grid holds a single candidate, so this is one 3-fold CV fit plus a refit.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
param_grid = {'n_estimators': [100], 'max_depth': [10], 'min_samples_split': [5], 'min_samples_leaf': [2]}
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Cross-Validation (Optimized)
cv_scores = cross_val_score(best_model, X_train, y_train, cv=3, scoring='f1', n_jobs=-1)
print(f"Cross-Validation F1 Scores: {cv_scores}")
print(f"Mean Cross-Validation F1 Score: {np.mean(cv_scores)}")

# Evaluation
y_pred = best_model.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")
# zero_division=0 keeps the 0.0 result when the positive class is absent,
# without emitting an undefined-metric warning.
print(f"Test F1 Score: {f1_score(y_test, y_pred, zero_division=0)}")

# Handle ROC-AUC
y_proba = best_model.predict_proba(X_test)
# roc_auc_score needs both classes in y_test as well as a two-column
# probability matrix; checking only the model's output is not enough.
if y_proba.shape[1] == 2 and np.unique(y_test).size == 2:
    print(f"Test ROC-AUC Score: {roc_auc_score(y_test, y_proba[:, 1])}")
else:
    print("ROC-AUC Score not calculated: Single-class prediction.")

print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Train Class Distribution: (array([0]), array([3465]))
Test Class Distribution: (array([0]), array([867]))




Cross-Validation F1 Scores: [nan nan nan]
Mean Cross-Validation F1 Score: nan
Test Accuracy: 1.0
Test F1 Score: 0.0
ROC-AUC Score not calculated: Single-class prediction.
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       867

    accuracy                           1.00       867
   macro avg       1.00      1.00      1.00       867
weighted avg       1.00      1.00      1.00       867



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
