### First Models

Logistic regression (LogReg): *baseline model*

Other models: *more advanced, but not yet tuned*



In [1]:
import pandas as pd
import numpy as np
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters

# If you need to plot or visualize data later on
import matplotlib.pyplot as plt
import seaborn as sns

# For any data preprocessing or manipulation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Depending on the models you plan to use
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


## Dataset

In [2]:
# Read the training CSV. It has no header row, so pandas assigns integer column labels.
fazeli_mitbih_train_df = pd.read_csv(
    '../data/mitbih_train.csv',
    header=None,
)

In [3]:
# The column at position 187 is the one later used as the target; show how many
# rows fall into each of its values.
label_col_position = 187
column_187 = fazeli_mitbih_train_df.iloc[:, label_col_position]
column_187.value_counts()

0.0    72471
4.0     6431
2.0     5788
1.0     2223
3.0      641
Name: 187, dtype: int64

## Comprehensive Feature Extraction

Features are extracted with the `tsfresh` time-series package, using its comprehensive feature set.

### Next cell takes 15-30 min to run

In [4]:
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters

# Read the training CSV (no header row, so columns carry integer labels).
fazeli_mitbih_train_df = pd.read_csv('../data/mitbih_train.csv', header=None)

# Tag every row with a positional id, then pull out column 187 as the target.
fazeli_mitbih_train_df['id'] = range(len(fazeli_mitbih_train_df))
target_series = fazeli_mitbih_train_df[187]

# Everything except the target column; the id stays so features can be mapped back to rows.
fazeli_mitbih_train_df_features_only = fazeli_mitbih_train_df.drop(columns=[187])

# Wide -> long: one (id, time, amplitude) row per sample point, which is the
# layout handed to extract_features below.
long_df = fazeli_mitbih_train_df_features_only.melt(
    id_vars='id',
    var_name='time',
    value_name='amplitude',
)

# Feature-calculator settings: tsfresh's comprehensive parameter set.
extraction_settings = ComprehensiveFCParameters()

# Work through the ids in fixed-size batches rather than all at once.
unique_ids = long_df['id'].unique()
subset_size = 10000  # ids per batch; adjust based on dataset size and memory constraints
batch_starts = range(0, len(unique_ids), subset_size)

# One feature frame per batch, in id order.
extracted_features_list = [
    extract_features(
        long_df[long_df['id'].isin(unique_ids[start:start + subset_size])],
        column_id='id',
        column_sort='time',
        default_fc_parameters=extraction_settings,
        n_jobs=7,
    )
    for start in batch_starts
]



Feature Extraction: 100%|███████████████████████| 35/35 [01:03<00:00,  1.81s/it]
Feature Extraction: 100%|███████████████████████| 35/35 [01:08<00:00,  1.95s/it]
Feature Extraction: 100%|███████████████████████| 35/35 [01:08<00:00,  1.96s/it]
Feature Extraction: 100%|███████████████████████| 35/35 [01:10<00:00,  2.00s/it]
Feature Extraction: 100%|███████████████████████| 35/35 [01:11<00:00,  2.04s/it]
Feature Extraction: 100%|███████████████████████| 35/35 [01:09<00:00,  2.00s/it]
Feature Extraction: 100%|███████████████████████| 35/35 [01:11<00:00,  2.05s/it]
Feature Extraction: 100%|███████████████████████| 35/35 [01:11<00:00,  2.06s/it]
Feature Extraction: 100%|███████████████████████| 35/35 [00:58<00:00,  1.68s/it]


In [5]:

# Stitch the per-batch feature frames back into a single frame.
extracted_features = pd.concat(extracted_features_list)

# Every original row should have produced exactly one feature row.
assert len(extracted_features) == len(target_series), (
    "feature rows and labels differ in count"
)

# Re-associate the target labels through the frame's index (the row ids).
# One vectorised `.loc` lookup replaces the per-row Python lambda; `.loc` raises
# KeyError if any id is missing from `target_series`, so a broken mapping
# cannot pass silently. `.to_numpy()` makes the assignment positional.
extracted_features['label'] = target_series.loc[extracted_features.index].to_numpy()

# Verify the re-association of labels
print(extracted_features[['label']].head())


   label
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0


## Taking a look at the extracted data

# First rows of the extracted feature table.
extracted_features.head()


# Summary statistics for every column.
extracted_features.describe()

# Pairwise correlations between columns (previously computed twice in a row;
# the duplicate call has been dropped because it only repeated the same work).
extracted_features.corr()

# Pairwise covariances between columns.
extracted_features.cov()

In [6]:
# Class counts after feature extraction; these should match the counts taken
# from the raw file if no rows or labels were lost along the way.
column_187_extracted = extracted_features.loc[:, 'label']
column_187_extracted.value_counts()

0.0    72471
4.0     6431
2.0     5788
1.0     2223
3.0      641
Name: label, dtype: int64

# NOTE(review): leftover interactive help lookup (IPython `?` syntax). It only
# shows the class documentation and produces nothing the cells below use;
# consider deleting it before sharing the notebook.
RandomForestClassifier?

## Set X and y from extracted features, Perform train_test_split

In [7]:
# Separate the target column from the feature matrix.
y = extracted_features['label']
X = extracted_features.drop(columns='label')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes are heavily
# imbalanced. If `stratify=y` is added here, every other cell that re-splits
# the data must be changed in the same way so features and labels stay aligned.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Baseline LogReg Model on PCA-derived Components Arranged in Pipeline

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline: median-impute -> standardise -> PCA (n_components=0.95) -> logistic regression.
baseline_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
]
pipeline = Pipeline(steps=baseline_steps)

# `fit` returns the fitted pipeline, so training and prediction can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Overall accuracy plus per-class precision/recall/F1 on the held-out rows.
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print("Pipeline Model Accuracy:", baseline_accuracy)
print("\nPipeline Classification Report:\n", baseline_report)


Pipeline Model Accuracy: 0.9661355719262178

Pipeline Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.99      0.98     14579
         1.0       0.86      0.58      0.69       426
         2.0       0.90      0.84      0.87      1112
         3.0       0.80      0.70      0.75       145
         4.0       0.97      0.95      0.96      1249

    accuracy                           0.97     17511
   macro avg       0.90      0.81      0.85     17511
weighted avg       0.96      0.97      0.96     17511



## Further Models Trained on PCA-derived Component Features, Arranged in Pipelines

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Same preprocessing as the baseline (impute -> scale -> PCA), with a
# 100-tree random forest as the final estimator.
rf_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    (
        'classifier',
        RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=7),
    ),
]
pipeline_rf = Pipeline(steps=rf_steps)

# Train on the training rows, then predict the held-out rows.
y_pred_rf = pipeline_rf.fit(X_train, y_train).predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Pipeline Model Accuracy:", rf_accuracy)
print("\nRandom Forest Pipeline Classification Report:\n", rf_report)


Random Forest Pipeline Model Accuracy: 0.9567700302666895

Random Forest Pipeline Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      1.00      0.98     14579
         1.0       0.97      0.46      0.62       426
         2.0       0.94      0.74      0.83      1112
         3.0       0.92      0.32      0.47       145
         4.0       1.00      0.91      0.95      1249

    accuracy                           0.96     17511
   macro avg       0.96      0.68      0.77     17511
weighted avg       0.96      0.96      0.95     17511



In [17]:
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Gradient-boosted trees (XGBoost) on top of the shared impute -> scale -> PCA preprocessing.
xgb_classifier = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=7,
)
pipeline_xgb = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('classifier', xgb_classifier),
])

# Train on the training rows, then predict the held-out rows in one chained call.
y_pred_xgb = pipeline_xgb.fit(X_train, y_train).predict(X_test)

# Report overall accuracy and the per-class breakdown.
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb)
print("XGBoost Pipeline Model Accuracy:", xgb_accuracy)
print("\nXGBoost Pipeline Classification Report:\n", xgb_report)


XGBoost Pipeline Model Accuracy: 0.9758437553537777

XGBoost Pipeline Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     14579
         1.0       0.96      0.62      0.75       426
         2.0       0.95      0.88      0.92      1112
         3.0       0.92      0.71      0.80       145
         4.0       0.99      0.97      0.98      1249

    accuracy                           0.98     17511
   macro avg       0.96      0.83      0.89     17511
weighted avg       0.98      0.98      0.97     17511



In [18]:
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# LightGBM classifier behind the shared preprocessing. The PCA step is kept for
# parity with the other pipelines; whether it is needed here is an open question.
lgbm_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('classifier', LGBMClassifier(random_state=42, n_jobs=7)),
]
pipeline_lgbm = Pipeline(steps=lgbm_steps)

# Train on the training rows, then predict the held-out rows.
y_pred_lgbm = pipeline_lgbm.fit(X_train, y_train).predict(X_test)

lgbm_accuracy = accuracy_score(y_test, y_pred_lgbm)
lgbm_report = classification_report(y_test, y_pred_lgbm)
print("LightGBM Pipeline Model Accuracy:", lgbm_accuracy)
print("\nLightGBM Pipeline Classification Report:\n", lgbm_report)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022952 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64260
[LightGBM] [Info] Number of data points in the train set: 70043, number of used features: 252
[LightGBM] [Info] Start training from score -0.190530
[LightGBM] [Info] Start training from score -3.662991
[LightGBM] [Info] Start training from score -2.706666
[LightGBM] [Info] Start training from score -4.950289
[LightGBM] [Info] Start training from score -2.603918
LightGBM Pipeline Model Accuracy: 0.9739592256296042

LightGBM Pipeline Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     14579
         1.0       0.95      0.62      0.75       426
         2.0       0.94      0.88      0.91      1112
         3.0       0.83      0.57      0.68       145
         4.0       0.99      0.97      0.98      1249

    accuracy     

In [19]:
from catboost import CatBoostClassifier

# Number of worker threads passed to CatBoost.
n_jobs = 7

# CatBoost classifier (training log silenced) behind the shared impute -> scale -> PCA steps.
catboost_classifier = CatBoostClassifier(
    silent=True,
    random_state=42,
    thread_count=n_jobs,
)
pipeline_catboost = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('classifier', catboost_classifier),
])

# Train on the training rows, then predict the held-out rows.
y_pred_catboost = pipeline_catboost.fit(X_train, y_train).predict(X_test)

catboost_accuracy = accuracy_score(y_test, y_pred_catboost)
catboost_report = classification_report(y_test, y_pred_catboost)
print("CatBoost Pipeline Model Accuracy:", catboost_accuracy)
print("\nCatBoost Pipeline Classification Report:\n", catboost_report)


CatBoost Pipeline Model Accuracy: 0.9756724344697618

CatBoost Pipeline Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     14579
         1.0       0.94      0.65      0.77       426
         2.0       0.94      0.89      0.91      1112
         3.0       0.94      0.68      0.79       145
         4.0       0.99      0.97      0.98      1249

    accuracy                           0.98     17511
   macro avg       0.96      0.83      0.89     17511
weighted avg       0.98      0.98      0.97     17511



### Having trouble running the TensorFlow neural network

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Every other model cell in this notebook median-imputes before scaling; this
# cell did not. StandardScaler ignores NaNs when fitting but leaves them in the
# transformed output, so any missing feature values would reach the network and
# turn the loss into NaN -- one likely cause of the trouble noted above.
from sklearn.impute import SimpleImputer

# Assuming 'extracted_features' and 'label' preparation is done
X = extracted_features.drop('label', axis=1)
y = extracted_features['label']

# One-hot encoding of labels
y_encoded = to_categorical(y)

# Splitting data. The feature splits get *_nn names so the X_train / X_test
# used by the scikit-learn cells are not rebound by this cell.
X_train_nn, X_test_nn, y_train_encoded, y_test_encoded = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Impute, then scale. Both transformers are fitted on the training rows only
# and then applied unchanged to the test rows.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(imputer.fit_transform(X_train_nn))
X_test_scaled = scaler.transform(imputer.transform(X_test_nn))

# Fail fast if anything non-finite would still be fed to the network.
assert np.isfinite(X_train_scaled).all(), "non-finite values in training features"
assert np.isfinite(X_test_scaled).all(), "non-finite values in test features"

# Neural network architecture: two hidden ReLU layers with dropout, softmax output
# sized to the number of one-hot label columns.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(y_train_encoded.shape[1], activation='softmax')  # Output layer
])

# Compiling the model
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Training the model (20% of the training rows are held back for validation)
history = model.fit(X_train_scaled, y_train_encoded, epochs=100, batch_size=128, validation_split=0.2, verbose=1)

# Evaluating the model
loss, accuracy = model.evaluate(X_test_scaled, y_test_encoded, verbose=0)
print(f"Test Accuracy: {accuracy*100:.2f}%")


In [10]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier on imputed, standardised features.
# Unlike the other pipelines, this one has no PCA step.
svc_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('classifier', SVC(kernel='rbf', random_state=42)),
]
pipeline_svc = Pipeline(steps=svc_steps)

# Train on the training rows, then predict the held-out rows.
y_pred_svc = pipeline_svc.fit(X_train, y_train).predict(X_test)

svc_accuracy = accuracy_score(y_test, y_pred_svc)
print("SVC Model Accuracy:", svc_accuracy)

SVC Model Accuracy: 0.9789846382273999
