## Creating a DataFrame

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#Import and read the Parkinson's data into a DataFrame
data_df=pd.read_csv("../Resources/Cleaned_PD_Data.csv", index_col=False)
data_df.drop(["Unnamed: 0"], axis=1, inplace=True)
data_df.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MoCA,FunctionalAssessment,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis
0,85,0,3,1,19.619878,0,5.108241,1.38066,3.893969,9.283194,...,29.181289,1.572427,1,0,0,0,0,0,0,0
1,75,0,0,2,16.247339,1,6.027648,8.409804,8.513428,5.60247,...,12.332639,4.787551,0,1,0,1,0,1,0,1
2,70,1,0,0,15.368239,0,2.242135,0.213275,6.498805,9.929824,...,29.927783,2.130686,1,0,0,0,1,0,1,1
3,52,0,0,0,15.454557,0,5.997788,1.375045,6.715033,4.196189,...,21.304268,3.391288,1,1,1,0,0,0,1,1
4,87,0,0,1,18.616042,0,9.775243,1.188607,4.657572,9.363925,...,8.336364,3.200969,0,0,0,1,0,1,0,0


In [2]:
#Drop columns with diagnostic testing scores
data_df.drop(["UPDRS", "MoCA", "FunctionalAssessment"], axis=1, inplace=True)
data_df.columns

Index(['Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI', 'Smoking',
       'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'FamilyHistoryParkinsons', 'TraumaticBrainInjury', 'Hypertension',
       'Diabetes', 'Depression', 'Stroke', 'SystolicBP', 'DiastolicBP',
       'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'Tremor', 'Rigidity', 'Bradykinesia',
       'PosturalInstability', 'SpeechProblems', 'SleepDisorders',
       'Constipation', 'Diagnosis'],
      dtype='object')

In [3]:
#Determinethe number of unique values un each column
data_df.nunique()

Age                           40
Gender                         2
Ethnicity                      4
EducationLevel                 4
BMI                         2105
Smoking                        2
AlcoholConsumption          2105
PhysicalActivity            2105
DietQuality                 2105
SleepQuality                2105
FamilyHistoryParkinsons        2
TraumaticBrainInjury           2
Hypertension                   2
Diabetes                       2
Depression                     2
Stroke                         2
SystolicBP                    90
DiastolicBP                   60
CholesterolTotal            2105
CholesterolLDL              2105
CholesterolHDL              2105
CholesterolTriglycerides    2105
Tremor                         2
Rigidity                       2
Bradykinesia                   2
PosturalInstability            2
SpeechProblems                 2
SleepDisorders                 2
Constipation                   2
Diagnosis                      2
dtype: int

In [4]:
#Split data into features and target arrays
X=data_df.copy()
X.drop("Diagnosis", axis=1, inplace=True)

y=data_df["Diagnosis"]

In [5]:
#Split features and array into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train, and Evaluate Initial Model

In [7]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train.columns)
hidden_nodes_layer1= 20
hidden_nodes_layer2=10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [8]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [9]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=55)

Epoch 1/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.4187 - loss: 0.7860
Epoch 2/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5571 - loss: 0.6812
Epoch 3/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6241 - loss: 0.6492
Epoch 4/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6433 - loss: 0.6372
Epoch 5/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6735 - loss: 0.5980
Epoch 6/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6927 - loss: 0.5923
Epoch 7/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6985 - loss: 0.5697
Epoch 8/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7050 - loss: 0.5630
Epoch 9/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [10]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

17/17 - 0s - 19ms/step - accuracy: 0.6319 - loss: 0.7535
Loss: 0.7535212635993958, Accuracy: 0.6318785548210144


## Process Data to Investigate Optimization

In [11]:
#Determine the statistics for AlcoholConsumption in order to bin data
data_df["AlcoholConsumption"].describe()

count    2105.000000
mean       10.040413
std         5.687014
min         0.002228
25%         5.150278
50%        10.070337
75%        14.829565
max        19.988866
Name: AlcoholConsumption, dtype: float64

In [12]:
#Change 'AlcoholConsumption' from integer to binned ranges
bins= [0, 4, 8, 12, 16, 20]
labels = ['0-4',
          '4.1-8',
          '8.1-12',
          '12.1-16',
          '16.1-20']

data_df['AlcoholConsumption_BINNED'] = pd.cut(data_df['AlcoholConsumption'], bins=bins, labels=labels)

#Drop 'AlcoholConsumption' from the DataFrame so as not to be included in the features
data_df.drop(["AlcoholConsumption"], axis=1, inplace=True)
data_df.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,FamilyHistoryParkinsons,...,CholesterolTriglycerides,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis,AlcoholConsumption_BINNED
0,85,0,3,1,19.619878,0,1.38066,3.893969,9.283194,0,...,337.307114,1,0,0,0,0,0,0,0,4.1-8
1,75,0,0,2,16.247339,1,8.409804,8.513428,5.60247,0,...,264.635521,0,1,0,1,0,1,0,1,4.1-8
2,70,1,0,0,15.368239,0,0.213275,6.498805,9.929824,0,...,395.662649,1,0,0,0,1,0,1,1,0-4
3,52,0,0,0,15.454557,0,1.375045,6.715033,4.196189,0,...,362.189688,1,1,1,0,0,0,1,1,4.1-8
4,87,0,0,1,18.616042,0,1.188607,4.657572,9.363925,0,...,149.956586,0,0,0,1,0,1,0,0,8.1-12


In [13]:
# Convert categorical data to numeric with `pd.get_dummies`
data_dummies=pd.get_dummies(data_df,
                                               columns=['AlcoholConsumption_BINNED'],
                                               drop_first=True)
data_dummies.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,FamilyHistoryParkinsons,...,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis,AlcoholConsumption_BINNED_4.1-8,AlcoholConsumption_BINNED_8.1-12,AlcoholConsumption_BINNED_12.1-16,AlcoholConsumption_BINNED_16.1-20
0,85,0,3,1,19.619878,0,1.38066,3.893969,9.283194,0,...,0,0,0,0,0,0,True,False,False,False
1,75,0,0,2,16.247339,1,8.409804,8.513428,5.60247,0,...,0,1,0,1,0,1,True,False,False,False
2,70,1,0,0,15.368239,0,0.213275,6.498805,9.929824,0,...,0,0,1,0,1,1,False,False,False,False
3,52,0,0,0,15.454557,0,1.375045,6.715033,4.196189,0,...,1,0,0,0,1,1,True,False,False,False
4,87,0,0,1,18.616042,0,1.188607,4.657572,9.363925,0,...,0,1,0,1,0,0,False,True,False,False


In [14]:
#Split data into features and target arrays
X=data_dummies.copy()
X.drop("Diagnosis", axis=1, inplace=True)

y=data_dummies["Diagnosis"]

In [15]:
#Split features and array into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train, and Evaluate New Model

In [17]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train.columns)
hidden_nodes_layer1= 20
hidden_nodes_layer2=10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [19]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=55)

Epoch 1/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.4824 - loss: 0.7154
Epoch 2/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5756 - loss: 0.6691
Epoch 3/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6442 - loss: 0.6343
Epoch 4/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6909 - loss: 0.6038
Epoch 5/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6823 - loss: 0.5928
Epoch 6/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7042 - loss: 0.5785
Epoch 7/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7374 - loss: 0.5598
Epoch 8/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7285 - loss: 0.5462
Epoch 9/55
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [20]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

17/17 - 0s - 21ms/step - accuracy: 0.6528 - loss: 0.7399
Loss: 0.7398568391799927, Accuracy: 0.6527514457702637


## Process Data to Investigate Most Influencial Variables

In [22]:
#Re-import and read the Parkinson's data into a DataFrame
parkinsons_df=pd.read_csv("../Resources/Cleaned_PD_Data.csv", index_col=False)
parkinsons_df.drop(["Unnamed: 0"], axis=1, inplace=True)
parkinsons_df.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MoCA,FunctionalAssessment,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis
0,85,0,3,1,19.619878,0,5.108241,1.38066,3.893969,9.283194,...,29.181289,1.572427,1,0,0,0,0,0,0,0
1,75,0,0,2,16.247339,1,6.027648,8.409804,8.513428,5.60247,...,12.332639,4.787551,0,1,0,1,0,1,0,1
2,70,1,0,0,15.368239,0,2.242135,0.213275,6.498805,9.929824,...,29.927783,2.130686,1,0,0,0,1,0,1,1
3,52,0,0,0,15.454557,0,5.997788,1.375045,6.715033,4.196189,...,21.304268,3.391288,1,1,1,0,0,0,1,1
4,87,0,0,1,18.616042,0,9.775243,1.188607,4.657572,9.363925,...,8.336364,3.200969,0,0,0,1,0,1,0,0


In [23]:
#Drop columns with diagnostic testing scores
parkinsons_df.drop(["UPDRS", "MoCA", "FunctionalAssessment"], axis=1, inplace=True)
parkinsons_df.columns

Index(['Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI', 'Smoking',
       'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'FamilyHistoryParkinsons', 'TraumaticBrainInjury', 'Hypertension',
       'Diabetes', 'Depression', 'Stroke', 'SystolicBP', 'DiastolicBP',
       'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'Tremor', 'Rigidity', 'Bradykinesia',
       'PosturalInstability', 'SpeechProblems', 'SleepDisorders',
       'Constipation', 'Diagnosis'],
      dtype='object')

In [None]:
#Drop ''