## Creating a DataFrame

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from pathlib import Path
from sqlalchemy import create_engine

# Reference path to the SQLite database file
data_path = Path("../SQlite/Parkinson_DB.db")

# Create the engine and read every row of the cleaned table into a DataFrame.
# The connection is opened in a `with` block so it is closed as soon as the
# query has finished instead of staying open for the rest of the session.
engine = create_engine(f"sqlite:///{data_path}")
with engine.connect() as conn:
    data_df = pd.read_sql("SELECT * FROM Cleaned_PD_Data", conn)

# Preview the first rows of the Parkinson's data
data_df.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MoCA,FunctionalAssessment,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis
0,85,0,3,1,19.619878,0,5.108241,1.38066,3.893969,9.283194,...,29.181289,1.572427,1,0,0,0,0,0,0,0
1,75,0,0,2,16.247339,1,6.027648,8.409804,8.513428,5.60247,...,12.332639,4.787551,0,1,0,1,0,1,0,1
2,70,1,0,0,15.368239,0,2.242135,0.213275,6.498805,9.929824,...,29.927783,2.130686,1,0,0,0,1,0,1,1
3,52,0,0,0,15.454557,0,5.997788,1.375045,6.715033,4.196189,...,21.304268,3.391288,1,1,1,0,0,0,1,1
4,87,0,0,1,18.616042,0,9.775243,1.188607,4.657572,9.363925,...,8.336364,3.200969,0,0,0,1,0,1,0,0


## Processing Data for Optimization of Model

In [2]:
# Determine the number of unique values in each column
data_df.nunique()

Age                           40
Gender                         2
Ethnicity                      4
EducationLevel                 4
BMI                         2105
Smoking                        2
AlcoholConsumption          2105
PhysicalActivity            2105
DietQuality                 2105
SleepQuality                2105
FamilyHistoryParkinsons        2
TraumaticBrainInjury           2
Hypertension                   2
Diabetes                       2
Depression                     2
Stroke                         2
SystolicBP                    90
DiastolicBP                   60
CholesterolTotal            2105
CholesterolLDL              2105
CholesterolHDL              2105
CholesterolTriglycerides    2105
UPDRS                       2105
MoCA                        2105
FunctionalAssessment        2105
Tremor                         2
Rigidity                       2
Bradykinesia                   2
PosturalInstability            2
SpeechProblems                 2
SleepDisor

In [3]:
# Summary statistics for 'UPDRS' (count, mean, std, min, quartiles, max),
# used to choose sensible bin edges for the column
data_df["UPDRS"].describe()

count    2105.000000
mean      101.415318
std        56.591448
min         0.028441
25%        53.048148
50%       102.561023
75%       149.831682
max       198.953604
Name: UPDRS, dtype: float64

In [4]:
# Change 'UPDRS' from a continuous score to binned ranges
bins = [0, 25, 50, 75, 100, 125, 150, 175, 200]
labels = ['0-25',
          '25.1-50',
          '50.1-75',
          '75.1-100',
          '100.1-125',
          '125.1-150',
          '150.1-175',
          '175.1-200']

# pd.cut builds right-closed intervals by default, so a score of exactly 0
# would fall outside the first bin and become NaN; include_lowest=True keeps
# it in the '0-25' bin. Every other value is binned exactly as before.
data_df['UPDRS_BINNED'] = pd.cut(data_df['UPDRS'],
                                 bins=bins,
                                 labels=labels,
                                 include_lowest=True)

# Drop 'UPDRS' from the DataFrame so as not to be included in the features
data_df = data_df.drop(columns=["UPDRS"])
data_df.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,FunctionalAssessment,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis,UPDRS_BINNED
0,85,0,3,1,19.619878,0,5.108241,1.38066,3.893969,9.283194,...,1.572427,1,0,0,0,0,0,0,0,0-25
1,75,0,0,2,16.247339,1,6.027648,8.409804,8.513428,5.60247,...,4.787551,0,1,0,1,0,1,0,1,25.1-50
2,70,1,0,0,15.368239,0,2.242135,0.213275,6.498805,9.929824,...,2.130686,1,0,0,0,1,0,1,1,50.1-75
3,52,0,0,0,15.454557,0,5.997788,1.375045,6.715033,4.196189,...,3.391288,1,1,1,0,0,0,1,1,50.1-75
4,87,0,0,1,18.616042,0,9.775243,1.188607,4.657572,9.363925,...,3.200969,0,0,0,1,0,1,0,0,0-25


In [5]:
# Summary statistics for 'MoCA' (count, mean, std, min, quartiles, max),
# used to choose sensible bin edges for the column
data_df["MoCA"].describe()

count    2105.000000
mean       15.094314
std         8.643014
min         0.021191
25%         7.517160
50%        14.963574
75%        22.608362
max        29.970107
Name: MoCA, dtype: float64

In [6]:
# Change 'MoCA' from a continuous score to binned ranges
bins = [0, 5, 10, 15, 20, 25, 30]
labels = ['0-5',
          '5.1-10',
          '10.1-15',
          '15.1-20',
          '20.1-25',
          '25.1-30']

# pd.cut builds right-closed intervals by default, so a score of exactly 0
# would fall outside the first bin and become NaN; include_lowest=True keeps
# it in the '0-5' bin. Every other value is binned exactly as before.
data_df['MoCA_BINNED'] = pd.cut(data_df['MoCA'],
                                bins=bins,
                                labels=labels,
                                include_lowest=True)

# Drop 'MoCA' from the DataFrame so as not to be included in the features
data_df = data_df.drop(columns=["MoCA"])
data_df.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis,UPDRS_BINNED,MoCA_BINNED
0,85,0,3,1,19.619878,0,5.108241,1.38066,3.893969,9.283194,...,1,0,0,0,0,0,0,0,0-25,25.1-30
1,75,0,0,2,16.247339,1,6.027648,8.409804,8.513428,5.60247,...,0,1,0,1,0,1,0,1,25.1-50,10.1-15
2,70,1,0,0,15.368239,0,2.242135,0.213275,6.498805,9.929824,...,1,0,0,0,1,0,1,1,50.1-75,25.1-30
3,52,0,0,0,15.454557,0,5.997788,1.375045,6.715033,4.196189,...,1,1,1,0,0,0,1,1,50.1-75,20.1-25
4,87,0,0,1,18.616042,0,9.775243,1.188607,4.657572,9.363925,...,0,0,0,1,0,1,0,0,0-25,5.1-10


In [7]:
# One-hot encode the two binned columns with `pd.get_dummies`;
# drop_first=True omits the first category of each column.
binned_columns = ['UPDRS_BINNED', 'MoCA_BINNED']
data_dummies = pd.get_dummies(
    data_df,
    columns=binned_columns,
    drop_first=True,
)
data_dummies.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,UPDRS_BINNED_75.1-100,UPDRS_BINNED_100.1-125,UPDRS_BINNED_125.1-150,UPDRS_BINNED_150.1-175,UPDRS_BINNED_175.1-200,MoCA_BINNED_5.1-10,MoCA_BINNED_10.1-15,MoCA_BINNED_15.1-20,MoCA_BINNED_20.1-25,MoCA_BINNED_25.1-30
0,85,0,3,1,19.619878,0,5.108241,1.38066,3.893969,9.283194,...,False,False,False,False,False,False,False,False,False,True
1,75,0,0,2,16.247339,1,6.027648,8.409804,8.513428,5.60247,...,False,False,False,False,False,False,True,False,False,False
2,70,1,0,0,15.368239,0,2.242135,0.213275,6.498805,9.929824,...,False,False,False,False,False,False,False,False,False,True
3,52,0,0,0,15.454557,0,5.997788,1.375045,6.715033,4.196189,...,False,False,False,False,False,False,False,False,True,False
4,87,0,0,1,18.616042,0,9.775243,1.188607,4.657572,9.363925,...,False,False,False,False,False,True,False,False,False,False


In [8]:
# Target array: the 'Diagnosis' column
y = data_dummies["Diagnosis"]

# Feature array: every remaining column (drop returns a new frame, so
# data_dummies itself is left untouched)
X = data_dummies.drop(columns=["Diagnosis"])

In [9]:
# Split the features (X) and target (y) into training and testing sets;
# random_state=1 is passed so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
# Create a StandardScaler instance and fit it on the training features only;
# X_scaler holds the value returned by fit()
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to the training split, then to the testing split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)


## Compile, Train, and Evaluate Model

In [11]:
# Define the model - a deep neural net with two hidden layers and a single
# sigmoid output unit.
number_input_features = X_train.shape[1]  # one input per feature column
hidden_nodes_layer1 = 25
hidden_nodes_layer2 = 10

nn = tf.keras.models.Sequential()

# Input layer: declare the input shape explicitly. Passing `input_dim` to the
# first Dense layer instead makes Keras emit a UserWarning recommending an
# Input object as the first layer of a Sequential model.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
# Compile the model with binary cross-entropy loss, the Adam optimizer,
# and accuracy as the reported metric
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [13]:
# Train the model on the scaled training features for 50 epochs;
# the value returned by fit() is kept in fit_model
fit_model = nn.fit(X_train_scaled,y_train,epochs=50)

Epoch 1/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5945 - loss: 0.6759
Epoch 2/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7000 - loss: 0.5746
Epoch 3/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7724 - loss: 0.5074
Epoch 4/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8356 - loss: 0.4176
Epoch 5/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8591 - loss: 0.3637
Epoch 6/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8579 - loss: 0.3415
Epoch 7/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8940 - loss: 0.2979
Epoch 8/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8969 - loss: 0.2704
Epoch 9/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [14]:
# Score the trained network on the scaled test split and report both metrics
evaluation = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

17/17 - 0s - 18ms/step - accuracy: 0.8824 - loss: 0.3852
Loss: 0.3852328062057495, Accuracy: 0.8823529481887817
