In [1]:
import pandas as pd

# Load the CSV file into a DataFrame
embedded_df = pd.read_csv('embedded_review.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
embedded_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,758,759,760,761,762,763,764,765,766,767
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,-0.416409,-0.36404,-0.03606,0.383963,0.176255,-0.147201,-0.243359,-0.541467,0.06216,0.049585
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,-0.279307,-0.419729,-0.389261,0.328398,0.291834,-0.027217,-0.35979,-0.706709,0.047264,-0.017902
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,-0.232733,-0.031823,-0.032784,0.18844,0.162272,0.363399,-0.09665,-0.693634,-0.024901,0.548486
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,-0.310564,-0.599643,-0.375174,0.309915,0.577983,0.051811,-0.184821,-0.710691,0.065533,0.371945
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,-0.247983,-0.438636,-0.037911,-0.030183,0.50878,0.064493,-0.205261,-0.527391,-0.101341,0.039573


In [2]:
len(embedded_df)

11316

In [3]:
#This Neural Net is just going to focus on classifying the Condition Cluster by the Review Embeddings
#This Neural Net is also going to use the dataframe that has not had dimensions reduced

# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [4]:
# I am going to drop the columns that are not embeddings of the review and the condition_cluster

columns_to_drop = ['uniqueID', 'drugName', 'date', 'condition', 'review', 'lengthReview']

embedded_df.drop(columns=columns_to_drop, inplace=True)

In [5]:
#check the dataframe

embedded_df.head()

Unnamed: 0,rating,usefulCount,conditionCluster_label,drugNameCluster_label,0,1,2,3,4,5,...,758,759,760,761,762,763,764,765,766,767
0,8.0,192.0,2.0,4.0,-0.010977,0.010914,0.200967,-0.22949,-0.535286,0.012419,...,-0.416409,-0.36404,-0.03606,0.383963,0.176255,-0.147201,-0.243359,-0.541467,0.06216,0.049585
1,5.0,17.0,9.0,6.0,0.06632,0.189584,0.369006,-0.04692,-0.473988,-0.238288,...,-0.279307,-0.419729,-0.389261,0.328398,0.291834,-0.027217,-0.35979,-0.706709,0.047264,-0.017902
2,8.0,10.0,9.0,4.0,0.084101,-0.019134,0.294494,0.029783,-0.228783,0.170102,...,-0.232733,-0.031823,-0.032784,0.18844,0.162272,0.363399,-0.09665,-0.693634,-0.024901,0.548486
3,9.0,37.0,0.0,2.0,0.00782,0.207558,0.179105,-0.210057,-0.197015,0.104799,...,-0.310564,-0.599643,-0.375174,0.309915,0.577983,0.051811,-0.184821,-0.710691,0.065533,0.371945
4,2.0,43.0,0.0,5.0,-0.193177,0.360585,0.448292,-0.253824,-0.532782,0.085381,...,-0.247983,-0.438636,-0.037911,-0.030183,0.50878,0.064493,-0.205261,-0.527391,-0.101341,0.039573


In [6]:
#checking on why the values of the condition cluster look like a float

condition_value_counts = embedded_df['conditionCluster_label'].value_counts()
print(condition_value_counts)

0.0    4074
8.0    2100
2.0     955
1.0     865
7.0     777
9.0     752
4.0     736
5.0     356
3.0     353
6.0     347
Name: conditionCluster_label, dtype: int64


In [7]:
# When I start training the model I am getting an error messsage that NAN values are present
embedded_df = embedded_df.dropna(subset=['conditionCluster_label'])

In [8]:
# Split our preprocessed data into our features and target arrays
# Choosing randon_state = 30 to be able to re-create attempts later
# There is a lot of data, so I can train on 70%, setting test_size at .3
# This step is necessary for training the neural network to classify 10 categories
# This code is altered by chatgpt after I could not fix it on my own

from keras.utils import to_categorical

# Separate the target variable
target = embedded_df['conditionCluster_label']

# Verify the unique values in the target variable
print(target.unique())

# Extract features (excluding the target variable)
features = embedded_df.drop(columns=['conditionCluster_label'])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=30)

# Convert target labels to one-hot encoded format
y_train_encoded = to_categorical(y_train, num_classes=10)
y_test_encoded = to_categorical(y_test, num_classes=10)

# Check the shapes of the one-hot encoded target labels
print("Shape of y_train_encoded:", y_train_encoded.shape)
print("Shape of y_test_encoded:", y_test_encoded.shape)

[2. 9. 0. 1. 3. 8. 5. 7. 4. 6.]
Shape of y_train_encoded: (7920, 10)
Shape of y_test_encoded: (3395, 10)


In [9]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Check the number of dimensions, make it a variable so it passes into

num_dimensions = embedded_df.shape[1]-1
print(num_dimensions)

771


In [25]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# Most of this is going to be the same as the HW assignment, but there are 10 classifications to predict now
# So the output layer has been adjusted

# Changed the number of nodes in the input layer to macth the dimensionality of the input data.
# Changed the hidden layer nodes to have relu as their activation funciton

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=771, activation="sigmoid", input_dim=num_dimensions))

# Second hidden layer
nn.add(Dense(units=600, activation='relu'))

# Third hidden layer (example of adding an additional hidden layer)
nn.add(Dense(units=500, activation='relu'))

#Fourth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=400, activation='relu'))

#Fifth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=300, activation='relu'))

#Sixth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=200, activation='relu'))

#Seventh hidden layer (example of adding another hidden layer)
nn.add(Dense(units=100, activation='relu'))

#Eighth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=50, activation='relu'))

# Output layer
nn.add(Dense(units=10, activation='softmax'))  # 10 units for 10 classes, softmax activation

# Check the structure of the model
nn.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_45 (Dense)            (None, 771)               595212    
                                                                 
 dense_46 (Dense)            (None, 600)               463200    
                                                                 
 dense_47 (Dense)            (None, 500)               300500    
                                                                 
 dense_48 (Dense)            (None, 400)               200400    
                                                                 
 dense_49 (Dense)            (None, 300)               120300    
                                                                 
 dense_50 (Dense)            (None, 200)               60200     
                                                                 
 dense_51 (Dense)            (None, 100)              

In [26]:
from tensorflow.keras.optimizers import Adam
# Compile the model - the loss function is categorical and not for binary classification
custom_optimizer = Adam(learning_rate=0.001)
nn.compile(loss="categorical_crossentropy", optimizer=custom_optimizer, metrics=["accuracy"])

In [27]:
 # Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train_encoded, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test_encoded)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 1.7096757888793945
Test Accuracy: 0.46804124116897583


In [23]:
#Using L2 Normalization
# regularization strength = 0.001
from tensorflow.keras import regularizers

# Define the model - deep neural net with L2 regularization
nn = Sequential()

# First hidden layer with L2 regularization
nn.add(Dense(units=771, activation="sigmoid", kernel_regularizer=regularizers.l2(0.001), input_dim=num_dimensions))

# Second hidden layer with L2 regularization
nn.add(Dense(units=512, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Third hidden layer with L2 regularization
nn.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Fourth hidden layer with L2 regularization
nn.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Fifth hidden layer with L2 regularization
nn.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Sixth hidden layer with L2 regularization
nn.add(Dense(units=128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Seventh hidden layer with L2 regularization
nn.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Eighth hidden layer with L2 regularization
nn.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Output layer
nn.add(Dense(units=10, activation='softmax'))

# Check the structure of the model
nn.summary()

# Compile the model
nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train_encoded, epochs=200)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_36 (Dense)            (None, 771)               595212    
                                                                 
 dense_37 (Dense)            (None, 512)               395264    
                                                                 
 dense_38 (Dense)            (None, 256)               131328    
                                                                 
 dense_39 (Dense)            (None, 256)               65792     
                                                                 
 dense_40 (Dense)            (None, 256)               65792     
                                                                 
 dense_41 (Dense)            (None, 128)               32896     
                                                                 
 dense_42 (Dense)            (None, 64)               

In [24]:
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test_encoded)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 4.547938823699951
Test Accuracy: 0.42268040776252747
