In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pickle

# Load Parkinson’s disease dataset
parkinsons_dataset = pd.read_csv("../datasets/parkinsons.csv")

# Exploratory Data Analysis: first rows, (rows, columns) shape, summary statistics
print(parkinsons_dataset.head())
print(parkinsons_dataset.shape)
print(parkinsons_dataset.describe())

# Per-class feature means. Only numeric columns are selected so that the
# string identifier column ('name') is not passed to mean().
numeric_cols = parkinsons_dataset.select_dtypes(include=['number']).columns
print(parkinsons_dataset.groupby('status')[numeric_cols].mean())


# Splitting features and target.
# 'status' is the label and must exist (a missing label should fail loudly);
# 'name' is only a recording identifier and is dropped if present, so
# errors='ignore' is applied to that column alone.
X_parkinsons = (
    parkinsons_dataset
    .drop(columns=['status'])
    .drop(columns=['name'], errors='ignore')
)
y_parkinsons = parkinsons_dataset['status']

# Splitting dataset into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(X_parkinsons, y_parkinsons, test_size=0.2, random_state=42)

# Training the SVM model (linear kernel, unscaled features)
parkinsons_model = SVC(kernel='linear')
parkinsons_model.fit(X_train, Y_train)

# Making predictions on the held-out test set
y_pred = parkinsons_model.predict(X_test)

# Evaluating model accuracy
print("Parkinson’s Disease Prediction Accuracy:", accuracy_score(Y_test, y_pred))

# Saving model. The context manager guarantees the file is flushed and closed
# even if pickling fails part-way through.
filename = 'parkinsons_disease_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(parkinsons_model, model_file)


             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4           0.00011   0.00655   0.00908     0.01966       0.06425  ...   

   Shimmer:DDA      NHR     HNR  status      RPDE       DFA   spread1  \
0      0.0654

In [5]:
# Emit the header followed by every feature column name, one per line.
print("Feature Columns:", *X_parkinsons.columns, sep="\n")


Feature Columns:
MDVP:Fo(Hz)
MDVP:Fhi(Hz)
MDVP:Flo(Hz)
MDVP:Jitter(%)
MDVP:Jitter(Abs)
MDVP:RAP
MDVP:PPQ
Jitter:DDP
MDVP:Shimmer
MDVP:Shimmer(dB)
Shimmer:APQ3
Shimmer:APQ5
MDVP:APQ
Shimmer:DDA
NHR
HNR
RPDE
DFA
spread1
spread2
D2
PPE


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pickle

# Load Parkinson’s disease dataset
parkinsons_dataset = pd.read_csv("../datasets/parkinsons.csv")

# Exploratory Data Analysis: first rows, (rows, columns) shape, summary statistics
print(parkinsons_dataset.head())
print(parkinsons_dataset.shape)
print(parkinsons_dataset.describe())

# Per-class feature means. Only numeric columns are selected so that the
# string identifier column ('name') is not passed to mean().
numeric_cols = parkinsons_dataset.select_dtypes(include=['number']).columns
print(parkinsons_dataset.groupby('status')[numeric_cols].mean())

# Splitting features and target.
# 'status' is the label and must exist (a missing label should fail loudly);
# 'name' is only a recording identifier and is dropped if present, so
# errors='ignore' is applied to that column alone.
X_parkinsons = (
    parkinsons_dataset
    .drop(columns=['status'])
    .drop(columns=['name'], errors='ignore')
)
y_parkinsons = parkinsons_dataset['status']

# Splitting dataset into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(X_parkinsons, y_parkinsons, test_size=0.2, random_state=42)

# Standardizing the data for models that might benefit from scaling (SVM, Logistic Regression, KNN).
# The scaler is fitted on the training split only so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler next to the models: the SVM, Logistic Regression and
# KNN models below are trained on standardized features, so anyone loading them
# must apply this exact transform to new inputs before calling predict().
with open('parkinsons_scaler.sav', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# List of models to train.
# Random Forest gets a fixed seed so its reported metrics are reproducible,
# matching the seeded train/test split above.
models = {
    'SVM': SVC(kernel='linear'),
    'Logistic Regression': LogisticRegression(max_iter=200),
    'Random Forest': RandomForestClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Models that are trained and evaluated on the standardized features
scaled_model_names = {'SVM', 'Logistic Regression', 'K-Nearest Neighbors'}

# Loop over models to train and evaluate them
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Choose scaled data for models that benefit from scaling; Random Forest uses raw features
    if model_name in scaled_model_names:
        model.fit(X_train_scaled, Y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, Y_train)
        y_pred = model.predict(X_test)

    # Accuracy score
    accuracy = accuracy_score(Y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")

    # Confusion Matrix
    conf_matrix = confusion_matrix(Y_test, y_pred)
    print(f"Confusion Matrix for {model_name}:\n{conf_matrix}\n")

    # Save the model with the prefix 'parkinsons_'. The context manager
    # guarantees each file is flushed and closed before the next iteration.
    filename = f'parkinsons_{model_name.lower().replace(" ", "_")}_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)


             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4           0.00011   0.00655   0.00908     0.01966       0.06425  ...   

   Shimmer:DDA      NHR     HNR  status      RPDE       DFA   spread1  \
0      0.0654