# 1 - Data Cleaning

In [10]:
import numpy as np
import pandas as pd
import csv

In [11]:
# Location of the raw Parkinson's telemonitoring file on the author's machine
raw_data_path = '/Users/lamaayash/Downloads/Data Science Project/parkinsons_updrs.data'

# Load the raw data and preview the first rows
df = pd.read_csv(raw_data_path)
df.head()

Unnamed: 0,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,...,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,1,72,0,5.6431,28.199,34.398,0.00662,3.4e-05,0.00401,0.00317,...,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,1,72,0,12.666,28.447,34.894,0.003,1.7e-05,0.00132,0.0015,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
2,1,72,0,19.681,28.695,35.389,0.00481,2.5e-05,0.00205,0.00208,...,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
3,1,72,0,25.647,28.905,35.81,0.00528,2.7e-05,0.00191,0.00264,...,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
4,1,72,0,33.642,29.187,36.375,0.00335,2e-05,0.00093,0.0013,...,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361


In [12]:
# Summarize the columns: names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5875 entries, 0 to 5874
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   subject#       5875 non-null   int64  
 1   age            5875 non-null   int64  
 2   sex            5875 non-null   int64  
 3   test_time      5875 non-null   float64
 4   motor_UPDRS    5875 non-null   float64
 5   total_UPDRS    5875 non-null   float64
 6   Jitter(%)      5875 non-null   float64
 7   Jitter(Abs)    5875 non-null   float64
 8   Jitter:RAP     5875 non-null   float64
 9   Jitter:PPQ5    5875 non-null   float64
 10  Jitter:DDP     5875 non-null   float64
 11  Shimmer        5875 non-null   float64
 12  Shimmer(dB)    5875 non-null   float64
 13  Shimmer:APQ3   5875 non-null   float64
 14  Shimmer:APQ5   5875 non-null   float64
 15  Shimmer:APQ11  5875 non-null   float64
 16  Shimmer:DDA    5875 non-null   float64
 17  NHR            5875 non-null   float64
 18  HNR     

# let's learn more about our features 




subject# - Integer that uniquely identifies each subject 

age - Subject age 

sex - Subject gender '0' - male, '1' - female 

test_time - Time since recruitment into the trial. The integer part is the number of days since recruitment. 

motor_UPDRS - Clinician's motor UPDRS score, linearly interpolated 

total_UPDRS - Clinician's total UPDRS score, linearly interpolated 

Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP - Several measures of variation in fundamental frequency 

Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA - Several measures of variation in amplitude 

NHR,HNR - Two measures of ratio of noise to tonal components in the voice 

RPDE - A nonlinear dynamical complexity measure 

DFA - Signal fractal scaling exponent 

PPE - A nonlinear measure of fundamental frequency variation 



In [13]:
# Author's note: 42 subjects, each with roughly 150-200 voice recordings.
# Show the (rows, columns) shape of the data.
df.shape

(5875, 22)

In [14]:
# Count how many distinct subjects appear in the data
df['subject#'].nunique()

42

In [15]:
# Columns compared when looking for repeated rows; a row is flagged only when
# it matches an earlier row on every one of these columns.
duplicate_check_columns = [
    'subject#', 'age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS',
    'Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP',
    'Shimmer', 'Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'Shimmer:APQ11',
    'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE',
]

# Boolean Series: True marks a row that repeats an earlier one
copies = df.duplicated(subset=duplicate_check_columns)


In [16]:
# Tally flagged vs. unflagged rows; the output below contains only False,
# i.e. no row was flagged as a duplicate
copies.value_counts()

False    5875
dtype: int64

Fortunately, there are no duplicate rows.

In [17]:
df.columns

Index(['subject#', 'age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS',
       'Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP',
       'Shimmer', 'Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
       'Shimmer:APQ11', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE'],
      dtype='object')

In [18]:
# Drop 'subject#' since we do not need it from here on.
# Reassigning (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without raising a KeyError once the column is already gone.
df = df.drop(columns=['subject#'], errors='ignore')

In [19]:
# Count missing values per column; the output below shows zero for every column
df.isnull().sum()

age              0
sex              0
test_time        0
motor_UPDRS      0
total_UPDRS      0
Jitter(%)        0
Jitter(Abs)      0
Jitter:RAP       0
Jitter:PPQ5      0
Jitter:DDP       0
Shimmer          0
Shimmer(dB)      0
Shimmer:APQ3     0
Shimmer:APQ5     0
Shimmer:APQ11    0
Shimmer:DDA      0
NHR              0
HNR              0
RPDE             0
DFA              0
PPE              0
dtype: int64

In [20]:
# Rename columns containing '(', ')', '%' or ':' to plain underscore names so
# they are easier to type and reference.
# NOTE(review): 'Jitter_Abc' looks like a typo for 'Jitter_Abs'; it is kept
# unchanged because later cells select the column by this exact name.
column_renames = {
    'Jitter(%)': 'Jitter',
    'Jitter(Abs)': 'Jitter_Abc',
    'Jitter:RAP': 'Jitter_RAP',
    'Jitter:PPQ5': 'Jitter_PPQ5',
    'Jitter:DDP': 'Jitter_DDP',
    'Shimmer(dB)': 'Shimmer_dB',
    'Shimmer:APQ3': 'Shimmer_APQ3',
    'Shimmer:APQ5': 'Shimmer_APQ5',
    'Shimmer:APQ11': 'Shimmer_APQ11',
    'Shimmer:DDA': 'Shimmer_DDA',
}
df.rename(columns=column_renames, inplace=True)



In [21]:
# Preview the first two rows to check the renamed columns
df.head(2)

Unnamed: 0,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter,Jitter_Abc,Jitter_RAP,Jitter_PPQ5,Jitter_DDP,...,Shimmer_dB,Shimmer_APQ3,Shimmer_APQ5,Shimmer_APQ11,Shimmer_DDA,NHR,HNR,RPDE,DFA,PPE
0,72,0,5.6431,28.199,34.398,0.00662,3.4e-05,0.00401,0.00317,0.01204,...,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,72,0,12.666,28.447,34.894,0.003,1.7e-05,0.00132,0.0015,0.00395,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081


In [22]:
# Confirm that the columns were renamed; the output below lists the new names
df.columns

Index(['age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS', 'Jitter',
       'Jitter_Abc', 'Jitter_RAP', 'Jitter_PPQ5', 'Jitter_DDP', 'Shimmer',
       'Shimmer_dB', 'Shimmer_APQ3', 'Shimmer_APQ5', 'Shimmer_APQ11',
       'Shimmer_DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE'],
      dtype='object')

Because the dataset has two target variables, we build two separate models and work on each one individually:

First : predict total_UPDRS

Second: predict motor_UPDRS

In [23]:
# Two independent copies of the cleaned frame, one per prediction task:
# df1 for total_UPDRS and df2 for motor_UPDRS
df1, df2 = df.copy(), df.copy()

In [24]:
# Write both working copies to CSV files, leaving out the row index
for frame, target in (
    (df1, '/Users/lamaayash/df1.csv'),
    (df2, '/Users/lamaayash/df2.csv'),
):
    frame.to_csv(target, index=False)

# 2-Feature engineering

In [25]:
# Load the first cleaned dataset back from disk.
# Assign to df1 (not df) so the cells below that use df1 really work on the
# saved file; previously the result went into df and was overwritten by the
# next cell without ever being used.
df1 = pd.read_csv('/Users/lamaayash/df1.csv')

In [26]:
# Load the second cleaned dataset back from disk.
# Assign to df2 (not df) so the cells below that use df2 really work on the
# saved file instead of rebinding the unrelated name df.
df2 = pd.read_csv('/Users/lamaayash/df2.csv')

In [27]:
# Preview the first rows of df1
df1.head()

Unnamed: 0,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter,Jitter_Abc,Jitter_RAP,Jitter_PPQ5,Jitter_DDP,...,Shimmer_dB,Shimmer_APQ3,Shimmer_APQ5,Shimmer_APQ11,Shimmer_DDA,NHR,HNR,RPDE,DFA,PPE
0,72,0,5.6431,28.199,34.398,0.00662,3.4e-05,0.00401,0.00317,0.01204,...,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,72,0,12.666,28.447,34.894,0.003,1.7e-05,0.00132,0.0015,0.00395,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
2,72,0,19.681,28.695,35.389,0.00481,2.5e-05,0.00205,0.00208,0.00616,...,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
3,72,0,25.647,28.905,35.81,0.00528,2.7e-05,0.00191,0.00264,0.00573,...,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
4,72,0,33.642,29.187,36.375,0.00335,2e-05,0.00093,0.0013,0.00278,...,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361


In [28]:
# Preview the first rows of df2
df2.head()

Unnamed: 0,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter,Jitter_Abc,Jitter_RAP,Jitter_PPQ5,Jitter_DDP,...,Shimmer_dB,Shimmer_APQ3,Shimmer_APQ5,Shimmer_APQ11,Shimmer_DDA,NHR,HNR,RPDE,DFA,PPE
0,72,0,5.6431,28.199,34.398,0.00662,3.4e-05,0.00401,0.00317,0.01204,...,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,72,0,12.666,28.447,34.894,0.003,1.7e-05,0.00132,0.0015,0.00395,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
2,72,0,19.681,28.695,35.389,0.00481,2.5e-05,0.00205,0.00208,0.00616,...,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
3,72,0,25.647,28.905,35.81,0.00528,2.7e-05,0.00191,0.00264,0.00573,...,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
4,72,0,33.642,29.187,36.375,0.00335,2e-05,0.00093,0.0013,0.00278,...,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361


# Now let's learn about our first scale in this dataset

UPDRS stands for Unified Parkinson’s Disease Rating Scale.

A score of 199 on the UPDRS scale represents the worst (total disability)

with a score of zero representing (no disability)

In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) of total_UPDRS
df1['total_UPDRS'].describe()

count    5875.000000
mean       29.018942
std        10.700283
min         7.000000
25%        21.371000
50%        27.576000
75%        36.399000
max        54.992000
Name: total_UPDRS, dtype: float64

The second scale in our dataset, motor UPDRS,
assesses motor abnormalities such as rest tremor,

action tremor, rigidity, bradykinesia, gait and posture, and facial masking. 

The total motor UPDRS exam score ranges from 0 to 108.

In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) of motor_UPDRS
df1['motor_UPDRS'].describe()

count    5875.000000
mean       21.296229
std         8.129282
min         5.037700
25%        15.000000
50%        20.871000
75%        27.596500
max        39.511000
Name: motor_UPDRS, dtype: float64

In [31]:
# Voice-measurement columns (jitter, shimmer, noise ratios and nonlinear measures)
voice_feature_columns = [
    'Jitter', 'Jitter_Abc', 'Jitter_RAP', 'Jitter_PPQ5', 'Jitter_DDP',
    'Shimmer', 'Shimmer_dB', 'Shimmer_APQ3', 'Shimmer_APQ5', 'Shimmer_APQ11',
    'Shimmer_DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE',
]

# Summary statistics for every voice feature
df1[voice_feature_columns].describe()

Unnamed: 0,Jitter,Jitter_Abc,Jitter_RAP,Jitter_PPQ5,Jitter_DDP,Shimmer,Shimmer_dB,Shimmer_APQ3,Shimmer_APQ5,Shimmer_APQ11,Shimmer_DDA,NHR,HNR,RPDE,DFA,PPE
count,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0
mean,0.006154,4.4e-05,0.002987,0.003277,0.008962,0.034035,0.31096,0.017156,0.020144,0.027481,0.051467,0.03212,21.679495,0.541473,0.65324,0.219589
std,0.005624,3.6e-05,0.003124,0.003732,0.009371,0.025835,0.230254,0.013237,0.016664,0.019986,0.039711,0.059692,4.291096,0.100986,0.070902,0.091498
min,0.00083,2e-06,0.00033,0.00043,0.00098,0.00306,0.026,0.00161,0.00194,0.00249,0.00484,0.000286,1.659,0.15102,0.51404,0.021983
25%,0.00358,2.2e-05,0.00158,0.00182,0.00473,0.01912,0.175,0.00928,0.01079,0.015665,0.02783,0.010955,19.406,0.469785,0.59618,0.15634
50%,0.0049,3.5e-05,0.00225,0.00249,0.00675,0.02751,0.253,0.0137,0.01594,0.02271,0.04111,0.018448,21.92,0.54225,0.6436,0.2055
75%,0.0068,5.3e-05,0.00329,0.00346,0.00987,0.03975,0.365,0.020575,0.023755,0.032715,0.061735,0.031463,24.444,0.614045,0.711335,0.26449
max,0.09999,0.000446,0.05754,0.06956,0.17263,0.26863,2.107,0.16267,0.16702,0.27546,0.48802,0.74826,37.875,0.96608,0.8656,0.73173


# Here we go: our data is ready for EDA.