In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim

In [51]:
# Load the CSV data
data = pd.read_csv('final_data.csv')

In [52]:
# Split the data into features and labels
features = data.drop(['Person', 'Wav file', 'status', 'Language'], axis=1)
labels = data['status']

In [53]:
# Convert categorical labels to numerical
labels = labels.replace({'Parkinson': 1, 'Non-Parkinson': 0})

In [54]:
# One-hot encode the language column
languages = pd.get_dummies(data['Language'], drop_first=True)
features = pd.concat([features, languages], axis=1)

In [55]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)

In [56]:
# Normalize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [57]:
# Convert data to PyTorch tensors
X_train = torch.from_numpy(X_train).float()
X_test = torch.from_numpy(X_test).float()
y_train = torch.from_numpy(y_train.values).float()
y_test = torch.from_numpy(y_test.values).float()


In [58]:
# PyTorch LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        batch_size = 1  # Assuming a single sequence as input
        h_0 = torch.zeros(batch_size, self.hidden_size)
        c_0 = torch.zeros(batch_size, self.hidden_size)
        output, _ = self.lstm(x.unsqueeze(0), (h_0.unsqueeze(0), c_0.unsqueeze(0)))
        output = self.linear(output.squeeze(0))
        return output

In [67]:
class CNNModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.max_pool = nn.MaxPool1d(2)
        self.fc1 = nn.Linear(320, 64)  # Adjusted input size
        self.fc2 = nn.Linear(64, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension
        x = self.conv1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.max_pool(x)
        x = x.view(x.size(0), -1)  # Flatten tensor
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [68]:
# Train PyTorch models
def train_pytorch_models(X_train, y_train):
    lstm = LSTMModel(X_train.shape[1], 64, 1)
    cnn = CNNModel(X_train.shape[1], 1)
    
    criterion = nn.BCEWithLogitsLoss()
    lstm_optimizer = optim.Adam(lstm.parameters())
    cnn_optimizer = optim.Adam(cnn.parameters())
    
    for epoch in range(10):
        # Train LSTM
        lstm.train()
        lstm_optimizer.zero_grad()
        outputs = lstm(X_train)
        loss = criterion(outputs.squeeze(), y_train)
        loss.backward()
        lstm_optimizer.step()
        
        # Train CNN
        cnn.train()
        cnn_optimizer.zero_grad()
        outputs = cnn(X_train)
        loss = criterion(outputs.squeeze(), y_train)
        loss.backward()
        cnn_optimizer.step()
    
    return lstm, cnn

lstm, cnn = train_pytorch_models(X_train, y_train)

In [69]:

# Evaluate PyTorch models
def evaluate_pytorch_models(models, X, y):
    scores = []
    for model in models:
        model.eval()
        with torch.no_grad():
            outputs = model(X)
            y_pred = torch.round(torch.sigmoid(outputs))
            scores.append((
                accuracy_score(y.numpy(), y_pred.numpy()),
                roc_auc_score(y.numpy(), torch.sigmoid(outputs).numpy()),
                precision_score(y.numpy(), y_pred.numpy()),
                recall_score(y.numpy(), y_pred.numpy()),
                f1_score(y.numpy(), y_pred.numpy())
            ))
    return scores

In [71]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# LightGBM
lgbm = LGBMClassifier(n_estimators=100, random_state=42)
lgbm.fit(X_train, y_train)

# SVM
svm = SVC(kernel='rbf', gamma='auto', random_state=42)
svm.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 465, number of negative: 317
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000226 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4893
[LightGBM] [Info] Number of data points in the train set: 782, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.594629 -> initscore=0.383136
[LightGBM] [Info] Start training from score 0.383136


In [74]:
# models = [rf, lgbm, svm, lstm, cnn]
models = [lstm, cnn]
scores = evaluate_pytorch_models(models, X_train, y_train)
scores.append((
    accuracy_score(y_train.numpy(), rf.predict(X_train)),
    roc_auc_score(y_train.numpy(), rf.predict(X_train)),
    precision_score(y_train.numpy(), rf.predict(X_train)),
    recall_score(y_train.numpy(), rf.predict(X_train)),
    f1_score(y_train.numpy(), rf.predict(X_train))
))
scores.append((
    accuracy_score(y_train.numpy(), lgbm.predict(X_train)),
    roc_auc_score(y_train.numpy(), lgbm.predict(X_train)),
    precision_score(y_train.numpy(), lgbm.predict(X_train)),
    recall_score(y_train.numpy(), lgbm.predict(X_train)),
    f1_score(y_train.numpy(), lgbm.predict(X_train))
))
scores.append((
    accuracy_score(y_train.numpy(), svm.predict(X_train)),
    roc_auc_score(y_train.numpy(), svm.decision_function(X_train)),
    precision_score(y_train.numpy(), svm.predict(X_train)),
    recall_score(y_train.numpy(), svm.predict(X_train)),
    f1_score(y_train.numpy(), svm.predict(X_train))
))
print("Model Evaluation Scores:")
for model_name, score in zip(['Random Forest', 'LightGBM', 'SVM', 'LSTM', 'CNN'], scores):
    print(f"{model_name}: Accuracy={score[0]:.4f}, AUC={score[1]:.4f}, Precision={score[2]:.4f}, Recall={score[3]:.4f}, F1={score[4]:.4f}")


Model Evaluation Scores:
Random Forest: Accuracy=0.6637, AUC=0.7017, Precision=0.7186, Recall=0.7140, F1=0.7163
LightGBM: Accuracy=0.6650, AUC=0.8119, Precision=0.6396, Recall=1.0000, F1=0.7802
SVM: Accuracy=1.0000, AUC=1.0000, Precision=1.0000, Recall=1.0000, F1=1.0000
LSTM: Accuracy=1.0000, AUC=1.0000, Precision=1.0000, Recall=1.0000, F1=1.0000
CNN: Accuracy=0.9450, AUC=0.9825, Precision=0.9396, Recall=0.9699, F1=0.9545


In [75]:
# Ensemble models
ensemble = VotingClassifier(estimators=[('rf', rf), ('lgbm', lgbm), ('svm', svm), ('lstm', lstm), ('cnn', cnn)], voting='soft')
ensemble.fit(X_train.numpy(), y_train.numpy())
y_pred = ensemble.predict(X_test.numpy())
ensemble_score = (
    accuracy_score(y_test.numpy(), y_pred),
    roc_auc_score(y_test.numpy(), ensemble.predict_proba(X_test.numpy())[:, 1]),
    precision_score(y_test.numpy(), y_pred),
    recall_score(y_test.numpy(), y_pred),
    f1_score(y_test.numpy(), y_pred)
)
print("\nEnsemble Model Evaluation Scores:")
print(f"Ensemble: Accuracy={ensemble_score[0]:.4f}, AUC={ensemble_score[1]:.4f}, Precision={ensemble_score[2]:.4f}, Recall={ensemble_score[3]:.4f}, F1={ensemble_score[4]:.4f}")

ValueError: The estimator LSTMModel should be a classifier.