<a href="https://colab.research.google.com/github/saleemhamo/ecg-data-feature-engineering/blob/main/ML_Case_Study_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare Data

In [None]:
# Colab-only setup: mount Google Drive into the runtime's filesystem.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder so the relative CSV
# paths used by the data-loading cell resolve against it.
# NOTE(review): this is a user-specific absolute path; anyone else running the
# notebook has to point it at their own copy of the data.
%cd /content/drive/MyDrive/University/AI & ML - Case Study 3/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/University/AI & ML - Case Study 3


In [None]:
import pandas as pd
# Column names are taken from the header row of feature_names.csv; data.csv is
# read as a headerless file and labelled with those names.
feature_names = list(pd.read_csv('feature_names.csv').columns)
df_data = pd.read_csv('data.csv', names=feature_names, header=None)
labels = pd.read_csv('labels.csv')


# Feature Selection

## Filter Methods

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from scipy.stats import pearsonr
from sklearn.model_selection import LeaveOneGroupOut


# Split the frame into predictors (all but the last column) and target (last column).
# NOTE(review): the separately loaded `labels` frame is not used here -- confirm
# that the last column of data.csv really is the intended target.
# .copy() gives X its own storage so the per-column assignment below cannot
# trigger SettingWithCopyWarning.
X = df_data.iloc[:, :-1].copy()
y = df_data.iloc[:, -1]

# Integer-encode any string-valued predictors (the encoder is refit per column).
label_encoder = LabelEncoder()
for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_encoder.fit_transform(X[column])

# Discretize the target into equal-width bins: chi2, f_classif and
# mutual_info_classif all expect a categorical target.
num_bins = 5  # Adjust as needed
est = KBinsDiscretizer(n_bins=num_bins, encode='ordinal', strategy='uniform')
y_discrete = est.fit_transform(y.values.reshape(-1, 1)).flatten()

# Step 1: Measure relevance of individual features

# Pearson correlation (absolute value) against the raw target.
correlation_scores = [np.abs(pearsonr(X[column], y)[0]) for column in X.columns]

# Chi-square (scikit-learn's chi2 rejects negative feature values).
chi2_scores, _ = chi2(X, y_discrete)

# ANOVA F-statistic
f_statistic, _ = f_classif(X, y_discrete)

# Mutual information. The estimator is randomized, so the seed is pinned to
# keep the ranking identical from run to run.
mutual_info_scores = mutual_info_classif(X, y_discrete, random_state=42)

# Step 2: Rank features by relevance

feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Correlation': correlation_scores,
    'Chi2': chi2_scores,
    'F_statistic': f_statistic,
    'Mutual_Information': mutual_info_scores
})

# Sum only the numeric score columns. Summing the whole frame also touches the
# string 'Feature' column, which pandas warns about (and newer versions reject).
# NOTE(review): the four scores live on very different scales, so the total is
# dominated by the largest-magnitude ones; consider rank- or min-max-normalising.
score_columns = ['Correlation', 'Chi2', 'F_statistic', 'Mutual_Information']
feature_scores['Total_Score'] = feature_scores[score_columns].sum(axis=1)
feature_scores = feature_scores.sort_values(by='Total_Score', ascending=False)

# Step 3: Keep the top K features and evaluate them with cross-validation

# Number of top-ranked features to keep.
k = 100  # Adjust this value based on your preference or cross-validation results

# Positional slice: the frame is already sorted by descending total score.
selected_features = feature_scores['Feature'].iloc[:k]

print(selected_features)

X_selected = X[selected_features]

# Every run of 10 consecutive rows forms one group. Integer division always
# yields exactly len(X) labels, even when the row count is not a multiple of 10.
# NOTE(review): presumably each block of 10 rows belongs to one subject -- confirm.
groups = np.arange(len(X)) // 10
logo = LeaveOneGroupOut()

# Define classifiers
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
knn_clf = KNeighborsClassifier(n_neighbors=5)
svm_clf = SVC(kernel='linear', C=1)

# Binary classification target: is the value above the overall mean?
y_binary = (y > y.mean()).astype(int)

# NOTE(review): the ranking above was computed on all rows, including the ones
# each fold holds out, so the accuracies below are optimistically biased.
for clf, clf_name in zip([rf_clf, knn_clf, svm_clf], ['Random Forest', 'KNN', 'SVM']):
    # Leave-one-group-out CV: each fold holds out one complete group. Passing
    # the splitter plus `groups` avoids handing over a one-shot generator.
    cv_scores = cross_val_score(clf, X_selected, y_binary, groups=groups, cv=logo, error_score='raise')

    # Display cross-validation scores
    print(f'Leave-One-Out Cross-Validation Scores with {k} selected features ({clf_name}):')
    print(cv_scores)
    print(f'Mean Accuracy: {np.mean(cv_scores)}')
    print('\n')


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


384     ratio_theta_0
414    ratio_theta_30
401    ratio_theta_17
388     ratio_theta_4
385     ratio_theta_1
            ...      
300    ratio_alpha_12
130       theta_ec_34
138       theta_ec_42
321    ratio_alpha_33
125       theta_ec_29
Name: Feature, Length: 100, dtype: object
Leave-One-Out Cross-Validation Scores with 100 selected features (Random Forest):
[1.  0.8 0.8 1.  0.8 1.  0.9 0.6 0.9 0.9 0.9 0.8 0.8 0.7 0.9 1.  0.7 0.8]
Mean Accuracy: 0.8500000000000001


Leave-One-Out Cross-Validation Scores with 100 selected features (KNN):
[0.9 0.8 0.8 0.9 0.6 0.9 0.9 0.6 0.8 0.9 0.8 0.8 0.8 0.7 0.9 1.  0.5 0.9]
Mean Accuracy: 0.8055555555555557


Leave-One-Out Cross-Validation Scores with 100 selected features (SVM):
[0.9 0.7 0.9 0.7 0.8 0.8 1.  0.9 0.8 0.9 0.8 0.8 0.8 0.8 0.7 1.  0.7 0.6]
Mean Accuracy: 0.8111111111111111




In [None]:
# Maps k -> {classifier name: mean cross-validated accuracy}.
result = {}

# Cache for the k-independent work (encoded predictors, target and ranked
# feature names) so that sweeping k does not recompute every relevance score.
_ranking_cache = {'source': None}


def _ranked_inputs():
  """Return ``(X, y, ranked_features)`` for the current ``df_data``.

  ``X`` is every column of ``df_data`` except the last (string columns
  integer-encoded), ``y`` is the last column, and ``ranked_features`` is a
  Series of feature names sorted by descending summed relevance score
  (|Pearson r| + chi2 + ANOVA F + mutual information).

  The result is cached and only recomputed when ``df_data`` is rebound to a
  different object; in-place edits to ``df_data`` are not detected, so re-run
  this cell after making any.
  """
  if _ranking_cache['source'] is not df_data:
    # .copy() so the per-column assignment below cannot trigger
    # SettingWithCopyWarning.
    X = df_data.iloc[:, :-1].copy()
    y = df_data.iloc[:, -1]

    # Integer-encode any string-valued predictors (the encoder is refit per column).
    label_encoder = LabelEncoder()
    for column in X.select_dtypes(include=['object']).columns:
      X[column] = label_encoder.fit_transform(X[column])

    # Equal-width discretization of the target for the classification-style scorers.
    num_bins = 5  # Adjust as needed
    est = KBinsDiscretizer(n_bins=num_bins, encode='ordinal', strategy='uniform')
    y_discrete = est.fit_transform(y.values.reshape(-1, 1)).flatten()

    correlation_scores = [np.abs(pearsonr(X[column], y)[0]) for column in X.columns]
    chi2_scores, _ = chi2(X, y_discrete)
    f_statistic, _ = f_classif(X, y_discrete)
    # Seeded: the estimator is randomized and the ranking must not change between calls.
    mutual_info_scores = mutual_info_classif(X, y_discrete, random_state=42)

    feature_scores = pd.DataFrame({
        'Feature': X.columns,
        'Correlation': correlation_scores,
        'Chi2': chi2_scores,
        'F_statistic': f_statistic,
        'Mutual_Information': mutual_info_scores
    })

    # Sum only the numeric score columns; including the string 'Feature'
    # column is what pandas warns about (and newer versions reject).
    score_columns = ['Correlation', 'Chi2', 'F_statistic', 'Mutual_Information']
    feature_scores['Total_Score'] = feature_scores[score_columns].sum(axis=1)
    ranked = feature_scores.sort_values(by='Total_Score', ascending=False)['Feature']

    _ranking_cache.update(source=df_data, X=X, y=y, ranked=ranked)

  return _ranking_cache['X'], _ranking_cache['y'], _ranking_cache['ranked']


def apply(k):
  """Evaluate the top-``k`` ranked features with three classifiers.

  Args:
    k: number of highest-ranked features to keep.

  Returns:
    ``result[k]``: a dict mapping 'Random Forest', 'KNN' and 'SVM' to the mean
    leave-one-group-out accuracy on the binarised target (value above the
    target mean). The same dict is stored in the module-level ``result``.

  Raises:
    Whatever the underlying fit/score raises (``error_score='raise'``).

  NOTE(review): the feature ranking is computed on all rows, including those
  held out in each fold, so the reported accuracies are optimistically biased.
  """
  X, y, ranked_features = _ranked_inputs()
  X_selected = X[ranked_features.iloc[:k]]

  # Every run of 10 consecutive rows forms one group; integer division always
  # yields exactly len(X) labels, even when the row count is not a multiple of 10.
  groups = np.arange(len(X)) // 10
  logo = LeaveOneGroupOut()
  y_binary = (y > y.mean()).astype(int)

  classifiers = {
      'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
      'KNN': KNeighborsClassifier(n_neighbors=5),
      'SVM': SVC(kernel='linear', C=1),
  }

  result[k] = {}
  for clf_name, clf in classifiers.items():
    # Leave-one-group-out CV: each fold holds out one complete group.
    cv_scores = cross_val_score(clf, X_selected, y_binary, groups=groups, cv=logo, error_score='raise')
    result[k][clf_name] = np.mean(cv_scores)
  return result[k]

In [None]:
# Sweep k over every possible number of selected features. apply() treats all
# but the last column of df_data as predictors, so there are shape[1] - 1 of
# them; deriving the bound from the data replaces the hard-coded 432.
n_features = df_data.shape[1] - 1
for i in range(1, n_features + 1):
  print(i)
  print(apply(i))
  print('------------------------------')

1


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.7166666666666667, 'KNN': 0.7166666666666667, 'SVM': 0.7166666666666667}
------------------------------
2


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.7666666666666666, 'KNN': 0.788888888888889, 'SVM': 0.8000000000000002}
------------------------------
3


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8722222222222222, 'KNN': 0.8444444444444444, 'SVM': 0.8222222222222223}
------------------------------
4


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8444444444444446, 'KNN': 0.8444444444444446, 'SVM': 0.8444444444444446}
------------------------------
5


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8555555555555556, 'KNN': 0.8333333333333334, 'SVM': 0.8222222222222223}
------------------------------
6


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8611111111111112, 'KNN': 0.8555555555555556, 'SVM': 0.7833333333333334}
------------------------------
7


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8833333333333333, 'KNN': 0.8833333333333334, 'SVM': 0.8666666666666667}
------------------------------
8


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8666666666666667, 'KNN': 0.8777777777777778, 'SVM': 0.8555555555555556}
------------------------------
9


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8444444444444446, 'KNN': 0.8555555555555556, 'SVM': 0.8555555555555556}
------------------------------
10


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8666666666666667, 'KNN': 0.8444444444444446, 'SVM': 0.8666666666666667}
------------------------------
11


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8611111111111112, 'KNN': 0.85, 'SVM': 0.8666666666666667}
------------------------------
12


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8944444444444445, 'KNN': 0.8444444444444446, 'SVM': 0.8611111111111112}
------------------------------
13


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8833333333333333, 'KNN': 0.8500000000000001, 'SVM': 0.8555555555555556}
------------------------------
14


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8722222222222222, 'KNN': 0.8722222222222222, 'SVM': 0.8666666666666667}
------------------------------
15


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8722222222222222, 'KNN': 0.8666666666666667, 'SVM': 0.8555555555555556}
------------------------------
16


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8722222222222222, 'KNN': 0.8666666666666667, 'SVM': 0.8555555555555556}
------------------------------
17


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8777777777777778, 'KNN': 0.85, 'SVM': 0.8666666666666667}
------------------------------
18


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8888888888888888, 'KNN': 0.8388888888888889, 'SVM': 0.8611111111111112}
------------------------------
19


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8944444444444445, 'KNN': 0.8388888888888889, 'SVM': 0.8611111111111112}
------------------------------
20


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8833333333333333, 'KNN': 0.8333333333333334, 'SVM': 0.8722222222222222}
------------------------------
21


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8944444444444445, 'KNN': 0.8444444444444446, 'SVM': 0.8666666666666667}
------------------------------
22


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8888888888888888, 'KNN': 0.838888888888889, 'SVM': 0.8611111111111112}
------------------------------
23


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8777777777777779, 'KNN': 0.8444444444444446, 'SVM': 0.8500000000000001}
------------------------------
24


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8833333333333333, 'KNN': 0.8444444444444446, 'SVM': 0.8444444444444446}
------------------------------
25


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8777777777777779, 'KNN': 0.8555555555555556, 'SVM': 0.8500000000000001}
------------------------------
26


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8722222222222222, 'KNN': 0.8555555555555556, 'SVM': 0.8555555555555556}
------------------------------
27


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8833333333333333, 'KNN': 0.8500000000000001, 'SVM': 0.8555555555555556}
------------------------------
28


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8888888888888888, 'KNN': 0.8444444444444444, 'SVM': 0.8444444444444446}
------------------------------
29


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8888888888888888, 'KNN': 0.8500000000000001, 'SVM': 0.8611111111111112}
------------------------------
30


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8833333333333334, 'KNN': 0.8444444444444444, 'SVM': 0.8500000000000001}
------------------------------
31


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8833333333333333, 'KNN': 0.8444444444444444, 'SVM': 0.8500000000000001}
------------------------------
32


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8888888888888888, 'KNN': 0.838888888888889, 'SVM': 0.8444444444444446}
------------------------------
33


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8722222222222222, 'KNN': 0.838888888888889, 'SVM': 0.8277777777777778}
------------------------------
34


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8777777777777778, 'KNN': 0.838888888888889, 'SVM': 0.8277777777777778}
------------------------------
35


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8944444444444445, 'KNN': 0.8444444444444444, 'SVM': 0.8555555555555556}
------------------------------
36


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8777777777777778, 'KNN': 0.8333333333333335, 'SVM': 0.8555555555555556}
------------------------------
37


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8833333333333334, 'KNN': 0.838888888888889, 'SVM': 0.8555555555555556}
------------------------------
38


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8888888888888888, 'KNN': 0.8222222222222223, 'SVM': 0.8666666666666667}
------------------------------
39


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8777777777777778, 'KNN': 0.8222222222222223, 'SVM': 0.8666666666666667}
------------------------------
40


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8722222222222222, 'KNN': 0.8222222222222223, 'SVM': 0.8555555555555556}
------------------------------
41


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8833333333333334, 'KNN': 0.8166666666666669, 'SVM': 0.8555555555555556}
------------------------------
42


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8666666666666667, 'KNN': 0.8111111111111112, 'SVM': 0.8444444444444446}
------------------------------
43


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8833333333333333, 'KNN': 0.8055555555555557, 'SVM': 0.8500000000000001}
------------------------------
44


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8611111111111112, 'KNN': 0.8055555555555557, 'SVM': 0.8500000000000001}
------------------------------
45


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8666666666666667, 'KNN': 0.8222222222222223, 'SVM': 0.8888888888888888}
------------------------------
46


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8555555555555556, 'KNN': 0.8222222222222223, 'SVM': 0.8888888888888888}
------------------------------
47


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


{'Random Forest': 0.8777777777777779, 'KNN': 0.8222222222222223, 'SVM': 0.8888888888888888}
------------------------------
48


  feature_scores['Total_Score'] = feature_scores.sum(axis=1)


KeyboardInterrupt: ignored

## Wrapper Methods

## Embedded Methods