In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier


In [91]:
# Read the Parkinson's UPDRS CSV (path is relative to the notebook) into a DataFrame.
data_path = "./Data/parkinsons_updrs.csv"
df = pd.read_csv(data_path)

In [92]:
# Print a structural summary of the loaded frame: row count, column names,
# non-null counts and dtypes. Used here to check that no column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5875 entries, 0 to 5874
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          5875 non-null   int64  
 1   subject#       5875 non-null   int64  
 2   age            5875 non-null   int64  
 3   sex            5875 non-null   int64  
 4   test_time      5875 non-null   float64
 5   motor_UPDRS    5875 non-null   float64
 6   total_UPDRS    5875 non-null   float64
 7   Jitter(%)      5875 non-null   float64
 8   Jitter(Abs)    5875 non-null   float64
 9   Jitter:RAP     5875 non-null   float64
 10  Jitter:PPQ5    5875 non-null   float64
 11  Jitter:DDP     5875 non-null   float64
 12  Shimmer        5875 non-null   float64
 13  Shimmer(dB)    5875 non-null   float64
 14  Shimmer:APQ3   5875 non-null   float64
 15  Shimmer:APQ5   5875 non-null   float64
 16  Shimmer:APQ11  5875 non-null   float64
 17  Shimmer:DDA    5875 non-null   float64
 18  NHR     

In [93]:
# Target variable: y = total_UPDRS

In [94]:
# Likely Important Features:
# motor_UPDRS: Direct measure of motor impairment, highly relevant to Parkinson's Disease.
# total_UPDRS: Comprehensive measure including both motor and non-motor symptoms.
# Jitter(%): Indicates frequency variation in the voice, often affected in Parkinson's Disease.
# Jitter(Abs): Another measure of frequency variation.
# Jitter:RAP: Relative average perturbation, reflecting frequency stability.
# Jitter:PPQ5: Period perturbation quotient, reflecting short-term variability in pitch.
# Jitter:DDP: Average absolute difference in pitch periods.
# Shimmer: Amplitude variation in the voice.
# Shimmer(dB): Logarithmic measure of amplitude variation.
# Shimmer:APQ3, Shimmer:APQ5, Shimmer:APQ11, Shimmer:DDA: Various measures of amplitude perturbation.
# NHR: Noise-to-harmonics ratio, reflecting the amount of noise in the voice.
# HNR: Harmonics-to-noise ratio, reflecting the harmonic quality of the voice.
# RPDE: Recurrence period density entropy, a measure of complexity in the voice signal.
# DFA: Detrended fluctuation analysis, reflecting fractal scaling properties.
# PPE: Pitch period entropy, capturing randomness in the voice signal.

In [95]:
# Feature matrix: demographic columns, the motor UPDRS score, and the
# jitter / shimmer voice measurements selected from the loaded frame.
feature_columns = [
    "age",
    "sex",
    "motor_UPDRS",
    "Jitter(%)",
    "Jitter(Abs)",
    "Jitter:RAP",
    "Jitter:PPQ5",
    "Jitter:DDP",
    "Shimmer",
    "Shimmer(dB)",
    "Shimmer:APQ3",
]
x = df[feature_columns]


In [96]:
# Display the selected feature frame (pandas truncates the middle rows).
x

Unnamed: 0,age,sex,motor_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3
0,72,0,28.199,0.00662,0.000034,0.00401,0.00317,0.01204,0.02565,0.230,0.01438
1,72,0,28.447,0.00300,0.000017,0.00132,0.00150,0.00395,0.02024,0.179,0.00994
2,72,0,28.695,0.00481,0.000025,0.00205,0.00208,0.00616,0.01675,0.181,0.00734
3,72,0,28.905,0.00528,0.000027,0.00191,0.00264,0.00573,0.02309,0.327,0.01106
4,72,0,29.187,0.00335,0.000020,0.00093,0.00130,0.00278,0.01703,0.176,0.00679
...,...,...,...,...,...,...,...,...,...,...,...
5870,61,0,22.485,0.00406,0.000031,0.00167,0.00168,0.00500,0.01896,0.160,0.00973
5871,61,0,21.988,0.00297,0.000025,0.00119,0.00147,0.00358,0.02315,0.215,0.01052
5872,61,0,21.495,0.00349,0.000025,0.00152,0.00187,0.00456,0.02499,0.244,0.01371
5873,61,0,21.007,0.00281,0.000020,0.00128,0.00151,0.00383,0.01484,0.131,0.00693


In [97]:
# Target series: the continuous total UPDRS score for every recording.
target_column = "total_UPDRS"
y = df[target_column]

In [98]:
# Display the target series to confirm its length and dtype.
y

0       34.398
1       34.894
2       35.389
3       35.810
4       36.375
         ...  
5870    33.485
5871    32.988
5872    32.495
5873    32.007
5874    31.513
Name: total_UPDRS, Length: 5875, dtype: float64

In [99]:
# Re-select the target column and print its distinct values; the output shows
# the score is continuous, not a ready-made class label.
y = df["total_UPDRS"]
distinct_scores = y.unique()
print(distinct_scores)


[34.398 34.894 35.389 ... 35.863 36.961 35.401]


In [100]:
# Binarize the continuous total_UPDRS score so it can serve as a
# classification target:
#   label 1 -> total_UPDRS <  UPDRS_THRESHOLD
#   label 2 -> total_UPDRS >= UPDRS_THRESHOLD
# (np.digitize with right=False assigns bin i when bins[i-1] <= value < bins[i].)
UPDRS_THRESHOLD = 20.0  # cut-off between the two classes; adjust according to your data

# Bin the original column rather than `y` itself so that re-running this cell
# is idempotent: digitizing the already-binned 1/2 labels again would put
# every sample in bin 1, because both labels are below the threshold.
y = np.digitize(df["total_UPDRS"], bins=[-np.inf, UPDRS_THRESHOLD, np.inf])
print(np.unique(y))


[1 2]


In [101]:
# Random forests are stochastic (bootstrap sampling and random feature
# selection per split). Fixing the seed makes the fitted model, and therefore
# every prediction below, reproducible on Restart & Run All.
RANDOM_STATE = 42
model = RandomForestClassifier(random_state=RANDOM_STATE)
model

In [102]:
# Train the classifier on the selected features against the binned target.
# NOTE(review): this fits on the full dataset; there is no held-out split for evaluation.
model.fit(X=x, y=y)

In [103]:
# Classify one hand-written sample (the same values as row 3 of `x`).
# The model was fitted on a DataFrame with named columns, so the sample is
# wrapped in a DataFrame that reuses those column names. This avoids the
# scikit-learn "X does not have valid feature names" warning and makes the
# feature order explicit instead of relying on positions in a bare list.
sample = pd.DataFrame(
    [[72, 0, 28.905, 0.00528, 0.000027, 0.00191, 0.00264, 0.00573, 0.02309, 0.327, 0.01106]],
    columns=x.columns,
)
model.predict(sample)



array([2])