
# Parkinson's Disease Detector Based on Risk Factors

In [None]:
import pandas as pd

# Read the dataset CSV straight from its raw GitHub URL into a pandas DataFrame.
raw_url = "https://raw.githubusercontent.com/sharbt/PDDetector/refs/heads/main/data/pd_dataset.csv"
pdf = pd.read_csv(raw_url)

# Convert to a Spark DataFrame and preview it.
# NOTE(review): `spark` is not defined anywhere in this file; presumably the
# notebook runtime injects a SparkSession under that name — confirm.
df = spark.createDataFrame(pdf)
df.show()
    


In [None]:
# Code generated by Data Wrangler for pandas DataFrame

import pandas as pd

def clean_data(pdf):
    """Return a cleaned version of the raw patient table.

    Every categorical column listed below is replaced, at its original
    position, by the indicator columns that ``pd.get_dummies`` derives from
    it (named ``<column>_<value>``). The ``PatientID`` column is then
    removed and rows that are identical across all remaining columns are
    dropped.

    Args:
        pdf: pandas DataFrame containing at least the columns
            ``SmokingHistory``, ``CoffeeIntake``, ``Gender``,
            ``PhysicalActivity`` and ``PatientID``.

    Returns:
        A new DataFrame with the encoded columns; the caller's frame is
        not modified in place.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Encoding order matches the original hand-written sequence.
    categorical_columns = ('SmokingHistory', 'CoffeeIntake', 'Gender', 'PhysicalActivity')

    for column in categorical_columns:
        position = pdf.columns.get_loc(column)
        before = pdf.iloc[:, :position]
        indicators = pd.get_dummies(pdf.loc[:, [column]])
        after = pdf.iloc[:, position + 1:]
        # Splice the indicator columns in where the source column used to be.
        pdf = pd.concat([before, indicators, after], axis=1)

    # The identifier goes first so that rows differing only by ID count as duplicates.
    return pdf.drop(columns=['PatientID']).drop_duplicates()

# Run the cleaning step on a copy of the loaded frame and render the result.
# NOTE(review): `display` is not imported in this file; presumably supplied by
# the notebook environment — confirm.
pdf_clean = clean_data(pdf.copy())
display(pdf_clean)

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns: age, the one-hot indicator columns created by clean_data,
# and the remaining columns selected as model inputs.
feature_columns = [
    'Age',
    'Gender_Female', 'Gender_Male',
    'SmokingHistory_Current', 'SmokingHistory_Former', 'SmokingHistory_Never',
    'FamilyHistoryPD', 'HeadInjuryHistory', 'PesticideExposure',
    'CoffeeIntake_High', 'CoffeeIntake_Low', 'CoffeeIntake_Moderate',
    'PhysicalActivity_High', 'PhysicalActivity_Low', 'PhysicalActivity_Moderate',
    'OlfactoryLoss', 'Constipation',
    'REMBehaviorDisorder',
]

# Cast explicitly to float: on pandas >= 2.0 get_dummies emits boolean columns,
# and `.values` over mixed bool/int columns silently produces an object-dtype
# array. A float matrix keeps the dtype the same across pandas versions.
X = pdf_clean[feature_columns].to_numpy(dtype=float)
y = pdf_clean['ParkinsonsDiagnosis'].to_numpy()

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)




In [None]:
import mlflow

# Select "pd-classification" as the active MLflow experiment for this notebook.
experiment_name = "pd-classification"
mlflow.set_experiment(experiment_name)


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier inside its own MLflow run.
with mlflow.start_run():
    # Turn on MLflow's scikit-learn autologging before fitting.
    mlflow.sklearn.autolog()

    # C is scikit-learn's inverse regularization strength; 1/0.1 evaluates to 10.0.
    model = LogisticRegression(C=1/0.1, solver="liblinear")
    model.fit(X_train, y_train)

    # Also record the estimator name explicitly as a run parameter.
    mlflow.log_param("estimator", "LogisticRegression")

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score whichever estimator is currently bound to `model` on the held-out split
# and print overall accuracy followed by the per-class report.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random-forest classifier inside its own MLflow run.
with mlflow.start_run():
    # Turn on MLflow's scikit-learn autologging before fitting.
    mlflow.sklearn.autolog()

    # Fixed seed for the forest; note this rebinds the global `model`.
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Also record the estimator name explicitly as a run parameter.
    mlflow.log_param("estimator", "RandomForestClassifier")

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score whichever estimator is currently bound to `model` on the held-out split
# and print overall accuracy followed by the per-class report.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
import mlflow

# Print the name of every experiment returned by MLflow's experiment search.
experiments = mlflow.search_experiments()
for exp in experiments:
    print(exp.name)

In [None]:
# Look up the experiment object by name and print it.
# NOTE(review): MLflow documents get_experiment_by_name as returning None when
# no experiment has this name, in which case later `exp.experiment_id` access
# would fail — confirm the experiment exists before relying on `exp`.
experiment_name = "pd-classification"
exp = mlflow.get_experiment_by_name(experiment_name)
print(exp)

In [None]:
# List the runs recorded under the experiment held in `exp`.
# `search_runs` documents `experiment_ids` as a list of IDs, so the single ID
# is wrapped in a list and passed by keyword rather than as a bare string.
# Kept as the cell's final expression so the notebook renders the result.
mlflow.search_runs(experiment_ids=[exp.experiment_id])