# Training model with features: rating, usefulCount, review-embedding and condition-dummy-variables.

In [57]:
import pandas as pd

# Load the CSV file into a DataFrame
embedded_df = pd.read_csv('embedded_review.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
embedded_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,758,759,760,761,762,763,764,765,766,767
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,-0.416409,-0.36404,-0.03606,0.383963,0.176255,-0.147201,-0.243359,-0.541467,0.06216,0.049585
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,-0.279307,-0.419729,-0.389261,0.328398,0.291834,-0.027217,-0.35979,-0.706709,0.047264,-0.017902
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,-0.232733,-0.031823,-0.032784,0.18844,0.162272,0.363399,-0.09665,-0.693634,-0.024901,0.548486
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,-0.310564,-0.599643,-0.375174,0.309915,0.577983,0.051811,-0.184821,-0.710691,0.065533,0.371945
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,-0.247983,-0.438636,-0.037911,-0.030183,0.50878,0.064493,-0.205261,-0.527391,-0.101341,0.039573


In [58]:
# Drop non- beneficial columns

columns_to_drop = [ 'uniqueID','date', 'drugName','review','lengthReview', 'conditionCluster_label']

embedded_df.drop(columns=columns_to_drop, inplace=True)

In [59]:
embedded_df.head()

Unnamed: 0,condition,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,...,758,759,760,761,762,763,764,765,766,767
0,ADHD,8.0,192.0,4.0,-0.010977,0.010914,0.200967,-0.22949,-0.535286,0.012419,...,-0.416409,-0.36404,-0.03606,0.383963,0.176255,-0.147201,-0.243359,-0.541467,0.06216,0.049585
1,Birth Control,5.0,17.0,6.0,0.06632,0.189584,0.369006,-0.04692,-0.473988,-0.238288,...,-0.279307,-0.419729,-0.389261,0.328398,0.291834,-0.027217,-0.35979,-0.706709,0.047264,-0.017902
2,Birth Control,8.0,10.0,4.0,0.084101,-0.019134,0.294494,0.029783,-0.228783,0.170102,...,-0.232733,-0.031823,-0.032784,0.18844,0.162272,0.363399,-0.09665,-0.693634,-0.024901,0.548486
3,Opiate Dependence,9.0,37.0,2.0,0.00782,0.207558,0.179105,-0.210057,-0.197015,0.104799,...,-0.310564,-0.599643,-0.375174,0.309915,0.577983,0.051811,-0.184821,-0.710691,0.065533,0.371945
4,Benign Prostatic Hyperplasia,2.0,43.0,5.0,-0.193177,0.360585,0.448292,-0.253824,-0.532782,0.085381,...,-0.247983,-0.438636,-0.037911,-0.030183,0.50878,0.064493,-0.205261,-0.527391,-0.101341,0.039573


In [60]:
# Convert drugname into multiple columns of dummy variables

dummies_drugs = pd.get_dummies(embedded_df['condition'])

# Concatenate dummy variables with the original DataFrame
embedded_df = pd.concat([
    embedded_df.drop(columns=['condition']),
    dummies_drugs
], axis=1)

In [61]:
# review embedded_df
embedded_df.head()

Unnamed: 0,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,6,...,mance Anxiety,min,min saxagliptin,min sitagliptin,mis,moterol,moterol mometasone,t Pac with Cyclobenzaprine cyclobenzaprine,tic mycophenolic acid,zen Shoulde
0,8.0,192.0,4.0,-0.010977,0.010914,0.200967,-0.22949,-0.535286,0.012419,0.57862,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17.0,6.0,0.06632,0.189584,0.369006,-0.04692,-0.473988,-0.238288,0.341089,...,0,0,0,0,0,0,0,0,0,0
2,8.0,10.0,4.0,0.084101,-0.019134,0.294494,0.029783,-0.228783,0.170102,0.185404,...,0,0,0,0,0,0,0,0,0,0
3,9.0,37.0,2.0,0.00782,0.207558,0.179105,-0.210057,-0.197015,0.104799,0.338058,...,0,0,0,0,0,0,0,0,0,0
4,2.0,43.0,5.0,-0.193177,0.360585,0.448292,-0.253824,-0.532782,0.085381,0.607802,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# drop nan column from drugNAmeCluster_label
embedded_df = embedded_df.dropna(subset=['drugNameCluster_label'])

In [63]:

# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

from keras.utils import to_categorical

# Separate the target variable
target = embedded_df['drugNameCluster_label']

# Verify the unique values in the target variable
print(target.unique())

# Extract features (excluding the target variable)
features = embedded_df.drop(columns=['drugNameCluster_label'])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=30)

# Convert target labels to one-hot encoded format
y_train_encoded = to_categorical(y_train, num_classes=10)
y_test_encoded = to_categorical(y_test, num_classes=10)

# Check the shapes of the one-hot encoded target labels
print("Shape of y_train_encoded:", y_train_encoded.shape)
print("Shape of y_test_encoded:", y_test_encoded.shape)


[4. 6. 2. 5. 0. 1. 7. 3. 9. 8.]
Shape of y_train_encoded: (7920, 10)
Shape of y_test_encoded: (3395, 10)


In [64]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [65]:
# Check the number of dimensions, make it a variable so it passes into

num_dimensions = embedded_df.shape[1]-1
print(num_dimensions)

1190


In [66]:
# Define the model - using deep neural

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=num_dimensions, activation="relu", input_dim=num_dimensions))

# Second hidden layer
nn.add(Dense(units=600, activation='leaky_relu'))

# Third hidden layer (example of adding an additional hidden layer)
nn.add(Dense(units=512, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Fourth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Fifth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=256, activation='leaky_relu', kernel_regularizer=regularizers.l2(0.001)))

#Sixth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Seventh hidden layer (example of adding another hidden layer)
nn.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Eighth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=15, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Output layer
nn.add(Dense(units=10, activation='softmax'))  # 10 units for 10 classes, softmax activation

# Check the structure of the model
nn.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 1190)              1417290   
                                                                 
 dense_25 (Dense)            (None, 600)               714600    
                                                                 
 dense_26 (Dense)            (None, 512)               307712    
                                                                 
 dense_27 (Dense)            (None, 256)               131328    
                                                                 
 dense_28 (Dense)            (None, 256)               65792     
                                                                 
 dense_29 (Dense)            (None, 128)               32896     
                                                                 
 dense_30 (Dense)            (None, 64)               

In [67]:
# Compile the model - the loss function is categorical and not for binary classification

nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [68]:
 # Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train_encoded, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [69]:
# Evaluate the model using the test data
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test_encoded)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 2.407886505126953
Test Accuracy: 0.4170839488506317


In [None]:

# Trying hyperparameter in KerasClassifier model to increase accuracy

!pip install scikeras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.optimizers import Adam, RMSprop
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier
from tensorflow.keras import regularizers
import tensorflow as tf


def create_model2(units=1190, activation='sigmoid', kernel_initializer='glorot_uniform', optimizer='adam', learning_rate=0.001, loss='binary_crossentropy'):
    model = Sequential()
    model.add(InputLayer(input_shape=(1190,)))  # Assuming 771 input features, adjust accordingly
    model.add(Dense(units=600, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=512, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=15, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=10, activation='softmax'))


   # Compile the model - the loss function is categorical and not for binary classification
    custom_optimizer = Adam(learning_rate=0.001)
    model.compile(loss="categorical_crossentropy", optimizer=custom_optimizer, metrics=["accuracy"])

    #model.compile(optimizer=adam', loss=loss, metrics=['binary_accuracy'])
    return model

# Define hyperparameters to search
hyperparameters = {
    'units': [32, 64, 128, 256, 512 ],
    'optimizer': ['adam',],
    'learning_rate': [0.001, 0.01, 0.1],

}

# Construct the KerasClassifier with the create_model function and other necessary parameters
model = KerasClassifier(build_fn=create_model2, verbose=0, learning_rate=0.001, units=1190)




In [None]:

# Create GridSearchCV instance
grid = GridSearchCV(estimator=model, param_grid=hyperparameters, scoring='accuracy', error_score='raise')

In [None]:
# Fit the grid search
grid_result = grid.fit(X_train_scaled, y_train_encoded, epochs=10)

  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y =

In [None]:
best_model = grid_result.best_estimator_
best_model

In [None]:
test_accuracy = best_model.score(X_test_scaled, y_test_encoded)
print('Test accuracy:', test_accuracy)

Test accuracy: 0.40706921944035346


# Training model with embedded_review_reduced csv

In [70]:

import pandas as pd

# Load the review embeddings CSV file into a DataFrame
embedded_df = pd.read_csv('embedded_review_reduced.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
embedded_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,320,321,322,323,324,325,326,327,328,329
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,-0.072203,0.050732,-0.066741,-0.018283,-0.092436,-0.059485,-0.002017,0.068087,0.062693,0.019311
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,0.022241,-0.060561,0.042767,0.100795,-0.017772,-0.053921,-0.089212,-0.116708,0.053316,-0.029458
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,0.100959,-0.072072,-0.060204,-0.01296,0.144626,-0.023831,0.137507,-0.030472,-0.074193,0.128502
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,0.117297,0.001229,0.01479,-0.107192,-0.008831,-0.10001,-0.108753,0.033484,0.010719,0.117703
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,0.06463,0.036089,-0.013739,-0.062118,-0.048534,0.005309,-0.077575,0.146462,-0.013111,-0.208725


In [71]:
# Droping  the non-beneficial columns

columns_to_drop = [ 'uniqueID','date', 'drugName','review','lengthReview', 'conditionCluster_label']

embedded_df.drop(columns=columns_to_drop, inplace=True)

In [72]:
#review embedded_df
embedded_df.head()

Unnamed: 0,condition,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,...,320,321,322,323,324,325,326,327,328,329
0,ADHD,8.0,192.0,4.0,1.887339,-0.869075,-2.625546,1.417739,0.745655,-0.63982,...,-0.072203,0.050732,-0.066741,-0.018283,-0.092436,-0.059485,-0.002017,0.068087,0.062693,0.019311
1,Birth Control,5.0,17.0,6.0,-0.000775,0.706566,-0.605744,-1.173229,0.28467,0.259833,...,0.022241,-0.060561,0.042767,0.100795,-0.017772,-0.053921,-0.089212,-0.116708,0.053316,-0.029458
2,Birth Control,8.0,10.0,4.0,0.710067,1.259798,-1.086482,-0.472165,-1.45568,-0.347598,...,0.100959,-0.072072,-0.060204,-0.01296,0.144626,-0.023831,0.137507,-0.030472,-0.074193,0.128502
3,Opiate Dependence,9.0,37.0,2.0,0.987319,-0.213137,-0.683326,-1.648944,-0.1489,0.311274,...,0.117297,0.001229,0.01479,-0.107192,-0.008831,-0.10001,-0.108753,0.033484,0.010719,0.117703
4,Benign Prostatic Hyperplasia,2.0,43.0,5.0,-0.92463,0.702602,0.450114,-0.958159,-0.097065,-1.075369,...,0.06463,0.036089,-0.013739,-0.062118,-0.048534,0.005309,-0.077575,0.146462,-0.013111,-0.208725


In [73]:
# Convert drugname into multiple columns of dummy variables

dummies_drugs = pd.get_dummies(embedded_df['condition'])

# Concatenate dummy variables with the original DataFrame
embedded_df = pd.concat([
    embedded_df.drop(columns=['condition']),
    dummies_drugs
], axis=1)

In [74]:
#review embedded_df
embedded_df.head()

Unnamed: 0,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,6,...,mance Anxiety,min,min saxagliptin,min sitagliptin,mis,moterol,moterol mometasone,t Pac with Cyclobenzaprine cyclobenzaprine,tic mycophenolic acid,zen Shoulde
0,8.0,192.0,4.0,1.887339,-0.869075,-2.625546,1.417739,0.745655,-0.63982,-0.805294,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17.0,6.0,-0.000775,0.706566,-0.605744,-1.173229,0.28467,0.259833,0.353663,...,0,0,0,0,0,0,0,0,0,0
2,8.0,10.0,4.0,0.710067,1.259798,-1.086482,-0.472165,-1.45568,-0.347598,0.39165,...,0,0,0,0,0,0,0,0,0,0
3,9.0,37.0,2.0,0.987319,-0.213137,-0.683326,-1.648944,-0.1489,0.311274,0.131049,...,0,0,0,0,0,0,0,0,0,0
4,2.0,43.0,5.0,-0.92463,0.702602,0.450114,-0.958159,-0.097065,-1.075369,-0.084259,...,0,0,0,0,0,0,0,0,0,0


In [75]:
# Drop NAN column from drug name cluster label
embedded_df = embedded_df.dropna(subset=['drugNameCluster_label'])

In [76]:
#data split step
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

from keras.utils import to_categorical

# Separate the target variable
target = embedded_df['drugNameCluster_label']

# Verify the unique values in the target variable
print(target.unique())

# Extract features (excluding the target variable)
features = embedded_df.drop(columns=['drugNameCluster_label'])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=30)

# Convert target labels to one-hot encoded format
y_train_encoded = to_categorical(y_train, num_classes=10)
y_test_encoded = to_categorical(y_test, num_classes=10)

# Check the shapes of the one-hot encoded target labels
print("Shape of y_train_encoded:", y_train_encoded.shape)
print("Shape of y_test_encoded:", y_test_encoded.shape)

[4. 6. 2. 5. 0. 1. 7. 3. 9. 8.]
Shape of y_train_encoded: (7920, 10)
Shape of y_test_encoded: (3395, 10)


In [77]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [78]:
# Check the number of dimensions, make it a variable so it passes into

num_dimensions = embedded_df.shape[1]-1
print(num_dimensions)

752


In [79]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# Most of this is going to be the same as the HW assignment, but there are 10 classifications to predict now
# So the output layer has been adjusted

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=num_dimensions, activation="relu", input_dim=num_dimensions))

# Second hidden layer
nn.add(Dense(units=600, activation='leaky_relu'))

# Third hidden layer (example of adding an additional hidden layer)
nn.add(Dense(units=512, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Fourth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Fifth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=256, activation='leaky_relu', kernel_regularizer=regularizers.l2(0.001)))

#Sixth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Seventh hidden layer (example of adding another hidden layer)
nn.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Eighth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=15, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Output layer
nn.add(Dense(units=10, activation='softmax'))  # 10 units for 10 classes, softmax activation

# Check the structure of the model
nn.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_33 (Dense)            (None, 752)               566256    
                                                                 
 dense_34 (Dense)            (None, 600)               451800    
                                                                 
 dense_35 (Dense)            (None, 512)               307712    
                                                                 
 dense_36 (Dense)            (None, 256)               131328    
                                                                 
 dense_37 (Dense)            (None, 256)               65792     
                                                                 
 dense_38 (Dense)            (None, 128)               32896     
                                                                 
 dense_39 (Dense)            (None, 64)               

In [80]:
# Compile the model - the loss function is categorical and not for binary classification

nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [81]:
 # Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train_encoded, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [83]:
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test_encoded)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 2.930248498916626
Test Accuracy: 0.40913107991218567


# Trying to predict DrugName using sentiment, review embedding and condition dummy variables

In [84]:
import pandas as pd

# Load the review embeddings CSV file into a DataFrame
embedded_df = pd.read_csv('embedded_review_reduced.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
embedded_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,320,321,322,323,324,325,326,327,328,329
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,-0.072203,0.050732,-0.066741,-0.018283,-0.092436,-0.059485,-0.002017,0.068087,0.062693,0.019311
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,0.022241,-0.060561,0.042767,0.100795,-0.017772,-0.053921,-0.089212,-0.116708,0.053316,-0.029458
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,0.100959,-0.072072,-0.060204,-0.01296,0.144626,-0.023831,0.137507,-0.030472,-0.074193,0.128502
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,0.117297,0.001229,0.01479,-0.107192,-0.008831,-0.10001,-0.108753,0.033484,0.010719,0.117703
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,0.06463,0.036089,-0.013739,-0.062118,-0.048534,0.005309,-0.077575,0.146462,-0.013111,-0.208725


In [85]:
# Read the review_sentiment CSV
sentiment_df = pd.read_csv('reviews_sentiments.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
sentiment_df.head()

Unnamed: 0,uniqueID,review,sentiment
0,95260,My son is halfway through his fourth week of I...,POSITIVE
1,92703,I used to take another oral contraceptive whic...,NEGATIVE
2,138000,This is my first time using any form of birth ...,NEGATIVE
3,35696,Suboxone has completely turned my life around ...,POSITIVE
4,155963,2nd day on 5mg started to work with rock hard ...,NEGATIVE


In [86]:
#update the sentiment labels

sentiment_df['sentiment'].replace('POSITIVE', 0, inplace =True)

sentiment_df['sentiment'].replace('NEGATIVE', 1, inplace =True)

sentiment_df['sentiment'].replace('NEUTRAL', 2, inplace =True)

sentiment_df.head()

Unnamed: 0,uniqueID,review,sentiment
0,95260,My son is halfway through his fourth week of I...,0
1,92703,I used to take another oral contraceptive whic...,1
2,138000,This is my first time using any form of birth ...,1
3,35696,Suboxone has completely turned my life around ...,0
4,155963,2nd day on 5mg started to work with rock hard ...,1


In [87]:
# Records for each sentiments.POstive and negative labels are almost balanced, but very few records with neutral label.
sentiment_df['sentiment'].value_counts()

0    6294
1    4953
2      68
Name: sentiment, dtype: int64

In [88]:
# drop the review column from sentiment_df
sentiment_df.drop('review', axis=1, inplace=True)

In [89]:
sentiment_df.head()

Unnamed: 0,uniqueID,sentiment
0,95260,0
1,92703,1
2,138000,1
3,35696,0
4,155963,1


In [90]:
embedded_sentiment_df = pd.merge(embedded_df, sentiment_df, on='uniqueID')

embedded_sentiment_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,321,322,323,324,325,326,327,328,329,sentiment
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,0.050732,-0.066741,-0.018283,-0.092436,-0.059485,-0.002017,0.068087,0.062693,0.019311,0
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,-0.060561,0.042767,0.100795,-0.017772,-0.053921,-0.089212,-0.116708,0.053316,-0.029458,1
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,-0.072072,-0.060204,-0.01296,0.144626,-0.023831,0.137507,-0.030472,-0.074193,0.128502,1
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,0.001229,0.01479,-0.107192,-0.008831,-0.10001,-0.108753,0.033484,0.010719,0.117703,0
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,0.036089,-0.013739,-0.062118,-0.048534,0.005309,-0.077575,0.146462,-0.013111,-0.208725,1


In [91]:
# Total number of records in the dataframe
len(embedded_sentiment_df)

11315

In [92]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [93]:
embedded_sentiment_df.columns

Index(['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date',
       'usefulCount', 'lengthReview', 'conditionCluster_label',
       'drugNameCluster_label',
       ...
       '321', '322', '323', '324', '325', '326', '327', '328', '329',
       'sentiment'],
      dtype='object', length=341)

In [94]:
# Convert drugname into multiple columns of dummy variables

dummies_drugs = pd.get_dummies(embedded_sentiment_df['condition'])

# Concatenate dummy variables with the original DataFrame
embedded_sentiment_df = pd.concat([
    embedded_sentiment_df.drop(columns=['condition']),
    dummies_drugs
], axis=1)

In [95]:
embedded_sentiment_df.head()

Unnamed: 0,uniqueID,drugName,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,0,...,mance Anxiety,min,min saxagliptin,min sitagliptin,mis,moterol,moterol mometasone,t Pac with Cyclobenzaprine cyclobenzaprine,tic mycophenolic acid,zen Shoulde
0,95260,Guanfacine,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,1.887339,...,0,0,0,0,0,0,0,0,0,0
1,92703,Lybrel,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,-0.000775,...,0,0,0,0,0,0,0,0,0,0
2,138000,Ortho Evra,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,0.710067,...,0,0,0,0,0,0,0,0,0,0
3,35696,Buprenorphine naloxone,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,0.987319,...,0,0,0,0,0,0,0,0,0,0
4,155963,Cialis,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,-0.92463,...,0,0,0,0,0,0,0,0,0,0


In [96]:
# Drop all the columns that are not embeddings of the review and the sentiment labels.

columns_to_drop = ['uniqueID', 'drugName', 'review', 'date', 'lengthReview', 'conditionCluster_label']

embedded_sentiment_df_new = embedded_sentiment_df.drop(columns=columns_to_drop)
embedded_sentiment_df_new.head(5)

Unnamed: 0,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,6,...,mance Anxiety,min,min saxagliptin,min sitagliptin,mis,moterol,moterol mometasone,t Pac with Cyclobenzaprine cyclobenzaprine,tic mycophenolic acid,zen Shoulde
0,8.0,192.0,4.0,1.887339,-0.869075,-2.625546,1.417739,0.745655,-0.63982,-0.805294,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17.0,6.0,-0.000775,0.706566,-0.605744,-1.173229,0.28467,0.259833,0.353663,...,0,0,0,0,0,0,0,0,0,0
2,8.0,10.0,4.0,0.710067,1.259798,-1.086482,-0.472165,-1.45568,-0.347598,0.39165,...,0,0,0,0,0,0,0,0,0,0
3,9.0,37.0,2.0,0.987319,-0.213137,-0.683326,-1.648944,-0.1489,0.311274,0.131049,...,0,0,0,0,0,0,0,0,0,0
4,2.0,43.0,5.0,-0.92463,0.702602,0.450114,-0.958159,-0.097065,-1.075369,-0.084259,...,0,0,0,0,0,0,0,0,0,0


In [97]:
# Split our preprocessed data into our features and target arrays

from keras.utils import to_categorical

# Separate the target variable
target = embedded_sentiment_df_new['drugNameCluster_label']

# Verify the unique values in the target variable
print(target.unique())

# Extract features (excluding the target variable)
features = embedded_sentiment_df_new.drop(columns=['drugNameCluster_label'])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=30)

# Convert target labels to one-hot encoded format
y_train_encoded = to_categorical(y_train, num_classes=10)
y_test_encoded = to_categorical(y_test, num_classes=10)

# Check the shapes of the one-hot encoded target labels
print("Shape of y_train_encoded:", y_train_encoded.shape)
print("Shape of y_test_encoded:", y_test_encoded.shape)

[4. 6. 2. 5. 0. 1. 7. 3. 9. 8.]
Shape of y_train_encoded: (7920, 10)
Shape of y_test_encoded: (3395, 10)


In [98]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [99]:
# Check the number of dimensions, make it a variable so it passes into

num_dimensions = embedded_sentiment_df_new.shape[1]-1
print(num_dimensions)

753


In [100]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# 3 classifications to predict now


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

nn = tf.keras.models.Sequential()

# Input layer
nn.add(tf.keras.layers.Dense(units=768, activation="sigmoid", input_dim=num_dimensions))

# Second hidden layer
nn.add(Dense(units=512, activation='relu'))

# Third hidden layer (example of adding an additional hidden layer)
nn.add(Dense(units=256, activation='relu'))

#Fourth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=256, activation='relu'))

#Fifth hidden layer (example of adding another hidden layer)
# nn.add(Dense(units=100, activation='relu'))

#Sixth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=50, activation='relu'))

#Seventh hidden layer (example of adding another hidden layer)
nn.add(Dense(units=25, activation='relu'))

#Eighth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=15, activation='relu'))

# Output layer
nn.add(Dense(units=10, activation='softmax'))  # 3 units for 3 classes, softmax activation

# Check the structure of the model
nn.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_42 (Dense)            (None, 768)               579072    
                                                                 
 dense_43 (Dense)            (None, 512)               393728    
                                                                 
 dense_44 (Dense)            (None, 256)               131328    
                                                                 
 dense_45 (Dense)            (None, 256)               65792     
                                                                 
 dense_46 (Dense)            (None, 50)                12850     
                                                                 
 dense_47 (Dense)            (None, 25)                1275      
                                                                 
 dense_48 (Dense)            (None, 15)               

In [101]:
#compile the model
nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [103]:
 # Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train_encoded, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [105]:
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test_encoded)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 3.37424898147583
Test Accuracy: 0.4085419774055481


# Predicting DrugName_Cluster_label (target) using review embedding, sentiment, rating, usefulCount, condition dummy variables and drugName dummy variables (features)

In [106]:
import pandas as pd

# Load the review embeddings CSV file into a DataFrame
embedded_df = pd.read_csv('embedded_review_reduced.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
embedded_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,320,321,322,323,324,325,326,327,328,329
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,-0.072203,0.050732,-0.066741,-0.018283,-0.092436,-0.059485,-0.002017,0.068087,0.062693,0.019311
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,0.022241,-0.060561,0.042767,0.100795,-0.017772,-0.053921,-0.089212,-0.116708,0.053316,-0.029458
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,0.100959,-0.072072,-0.060204,-0.01296,0.144626,-0.023831,0.137507,-0.030472,-0.074193,0.128502
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,0.117297,0.001229,0.01479,-0.107192,-0.008831,-0.10001,-0.108753,0.033484,0.010719,0.117703
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,0.06463,0.036089,-0.013739,-0.062118,-0.048534,0.005309,-0.077575,0.146462,-0.013111,-0.208725


In [107]:
# Read the review_sentiment CSV
sentiment_df = pd.read_csv('reviews_sentiments.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
sentiment_df.head()

Unnamed: 0,uniqueID,review,sentiment
0,95260,My son is halfway through his fourth week of I...,POSITIVE
1,92703,I used to take another oral contraceptive whic...,NEGATIVE
2,138000,This is my first time using any form of birth ...,NEGATIVE
3,35696,Suboxone has completely turned my life around ...,POSITIVE
4,155963,2nd day on 5mg started to work with rock hard ...,NEGATIVE


In [108]:
#update the sentiment labels

sentiment_df['sentiment'].replace('POSITIVE', 0, inplace =True)

sentiment_df['sentiment'].replace('NEGATIVE', 1, inplace =True)

sentiment_df['sentiment'].replace('NEUTRAL', 2, inplace =True)

sentiment_df.head()

Unnamed: 0,uniqueID,review,sentiment
0,95260,My son is halfway through his fourth week of I...,0
1,92703,I used to take another oral contraceptive whic...,1
2,138000,This is my first time using any form of birth ...,1
3,35696,Suboxone has completely turned my life around ...,0
4,155963,2nd day on 5mg started to work with rock hard ...,1


In [22]:
# drop the review column from sentiment_df
sentiment_df.drop('review', axis=1, inplace=True)
sentiment_df.head()

Unnamed: 0,uniqueID,sentiment
0,95260,0
1,92703,1
2,138000,1
3,35696,0
4,155963,1


In [109]:
# merge embedded_df and sentiment_df
embedded_sentiment_df = pd.merge(embedded_df, sentiment_df, on='uniqueID')

embedded_sentiment_df.head()

Unnamed: 0,uniqueID,drugName,condition,review_x,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,322,323,324,325,326,327,328,329,review_y,sentiment
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,-0.066741,-0.018283,-0.092436,-0.059485,-0.002017,0.068087,0.062693,0.019311,My son is halfway through his fourth week of I...,0
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,0.042767,0.100795,-0.017772,-0.053921,-0.089212,-0.116708,0.053316,-0.029458,I used to take another oral contraceptive whic...,1
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,-0.060204,-0.01296,0.144626,-0.023831,0.137507,-0.030472,-0.074193,0.128502,This is my first time using any form of birth ...,1
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,0.01479,-0.107192,-0.008831,-0.10001,-0.108753,0.033484,0.010719,0.117703,Suboxone has completely turned my life around ...,0
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,-0.013739,-0.062118,-0.048534,0.005309,-0.077575,0.146462,-0.013111,-0.208725,2nd day on 5mg started to work with rock hard ...,1


In [110]:
# Total number of records in the dataframe
len(embedded_sentiment_df)

11315

In [111]:
#Review embedded_sentiment_df
embedded_sentiment_df.columns

Index(['uniqueID', 'drugName', 'condition', 'review_x', 'rating', 'date',
       'usefulCount', 'lengthReview', 'conditionCluster_label',
       'drugNameCluster_label',
       ...
       '322', '323', '324', '325', '326', '327', '328', '329', 'review_y',
       'sentiment'],
      dtype='object', length=342)

In [112]:
# Convert condition into multiple columns of dummy variables

dummies_condition_sentiment = pd.get_dummies(embedded_sentiment_df['condition'])

# Concatenate dummy variables with the original DataFrame
embedded_sentiment_df = pd.concat([
    embedded_sentiment_df.drop(columns=['condition']),
    dummies_condition_sentiment
], axis=1)

In [113]:
# Convert drugname into multiple columns of dummy variables

dummies_drugs = pd.get_dummies(embedded_sentiment_df['drugName'])

# Concatenate dummy variables with the original DataFrame
embedded_sentiment_df = pd.concat([
    embedded_sentiment_df.drop(columns=['drugName']),
    dummies_drugs
], axis=1)

In [116]:
#Review embedded_sentiment_df
embedded_sentiment_df.head()

Unnamed: 0,uniqueID,review_x,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,0,1,...,Zovirax Ointment,Zyban,Zyclara,Zymine,Zyprexa,Zyprexa Zydis,Zyrtec,Zyvox,ella,femhrt
0,95260,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,1.887339,-0.869075,...,0,0,0,0,0,0,0,0,0,0
1,92703,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,-0.000775,0.706566,...,0,0,0,0,0,0,0,0,0,0
2,138000,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,0.710067,1.259798,...,0,0,0,0,0,0,0,0,0,0
3,35696,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,0.987319,-0.213137,...,0,0,0,0,0,0,0,0,0,0
4,155963,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,-0.92463,0.702602,...,0,0,0,0,0,0,0,0,0,0


In [123]:
# Drop all the non- beneficial columns

columns_to_drop = ['uniqueID', 'review_x', 'review_y', 'date', 'lengthReview', 'conditionCluster_label']

embedded_sentiment_df_new = embedded_sentiment_df.drop(columns=columns_to_drop)
embedded_sentiment_df_new.head(5)

Unnamed: 0,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,6,...,Zovirax Ointment,Zyban,Zyclara,Zymine,Zyprexa,Zyprexa Zydis,Zyrtec,Zyvox,ella,femhrt
0,8.0,192.0,4.0,1.887339,-0.869075,-2.625546,1.417739,0.745655,-0.63982,-0.805294,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17.0,6.0,-0.000775,0.706566,-0.605744,-1.173229,0.28467,0.259833,0.353663,...,0,0,0,0,0,0,0,0,0,0
2,8.0,10.0,4.0,0.710067,1.259798,-1.086482,-0.472165,-1.45568,-0.347598,0.39165,...,0,0,0,0,0,0,0,0,0,0
3,9.0,37.0,2.0,0.987319,-0.213137,-0.683326,-1.648944,-0.1489,0.311274,0.131049,...,0,0,0,0,0,0,0,0,0,0
4,2.0,43.0,5.0,-0.92463,0.702602,0.450114,-0.958159,-0.097065,-1.075369,-0.084259,...,0,0,0,0,0,0,0,0,0,0


In [124]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Split our preprocessed data into our features and target arrays

from keras.utils import to_categorical

# Separate the target variable
target = embedded_sentiment_df_new['drugNameCluster_label']

# Extract features (excluding the target variable)
features = embedded_sentiment_df_new.drop(columns=['drugNameCluster_label'])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=30)

# Convert target labels to one-hot encoded format
y_train_encoded = to_categorical(y_train, num_classes=10)
y_test_encoded = to_categorical(y_test, num_classes=10)

# Verify the unique values in the target variable
print(target.unique())

# Check the shapes of the one-hot encoded target labels
print("Shape of y_train_encoded:", y_train_encoded.shape)
print("Shape of y_test_encoded:", y_test_encoded.shape)

[4. 6. 2. 5. 0. 1. 7. 3. 9. 8.]
Shape of y_train_encoded: (7920, 10)
Shape of y_test_encoded: (3395, 10)


In [125]:
# Review feature columns
features.columns

Index(['rating', 'usefulCount', '0', '1', '2', '3', '4', '5', '6', '7',
       ...
       'Zovirax Ointment', 'Zyban', 'Zyclara', 'Zymine', 'Zyprexa',
       'Zyprexa Zydis', 'Zyrtec', 'Zyvox', 'ella', 'femhrt'],
      dtype='object', length=2175)

In [126]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [127]:
# Check the number of dimensions, make it a variable so it passes into

num_dimensions = embedded_sentiment_df_new.shape[1]-1
print(num_dimensions)

2175


In [128]:
# Import our dependencies
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


nn = tf.keras.models.Sequential()

# Input layer
nn.add(tf.keras.layers.Dense(units=332, activation="sigmoid", input_dim=num_dimensions))

# Second hidden layer
nn.add(Dense(units=250, activation='relu'))

# Third hidden layer (example of adding an additional hidden layer)
nn.add(Dense(units=200, activation='relu'))

#Fourth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=100, activation='relu'))

#Sixth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=50, activation='relu'))

#Seventh hidden layer (example of adding another hidden layer)
nn.add(Dense(units=25, activation='relu'))

#Eighth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=15, activation='relu'))

# Output layer
nn.add(Dense(units=10, activation='softmax'))  # 3 units for 3 classes, softmax activation

# Check the structure of the model
nn.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_50 (Dense)            (None, 332)               722432    
                                                                 
 dense_51 (Dense)            (None, 250)               83250     
                                                                 
 dense_52 (Dense)            (None, 200)               50200     
                                                                 
 dense_53 (Dense)            (None, 100)               20100     
                                                                 
 dense_54 (Dense)            (None, 50)                5050      
                                                                 
 dense_55 (Dense)            (None, 25)                1275      
                                                                 
 dense_56 (Dense)            (None, 15)               

In [129]:
#compile the model
nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [130]:
 # Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train_encoded, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [218]:
# Evaluate the model using the test data
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test_encoded)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 0.3547576367855072
Test Accuracy: 0.9540500640869141


In [219]:
import numpy as np
# Get predicted probabilities for each class
y_pred_probabilities = nn.predict(X_test_scaled)
# Convert probabilities to predicted classes by selecting the class with the highest probability
y_pred = np.argmax(y_pred_probabilities, axis=1)
# Convert one-hot encoded y_test_encoded back to categorical labels
y_test_categorical = np.argmax(y_test_encoded, axis=1)
# Create a DataFrame to compare predicted and actual classes
result_df = pd.DataFrame({'Actual': y_test_categorical, 'Predicted': y_pred})
# Add a column indicating whether the prediction was correct
result_df['Correct'] = result_df['Actual'] == result_df['Predicted']
# Display the DataFrame
print(result_df)
# Calculate accuracy manually
accuracy = result_df['Correct'].mean()
print("Manual Test Accuracy:", accuracy)

      Actual  Predicted  Correct
0          1          1     True
1          3          3     True
2          4          4     True
3          4          1    False
4          4          4     True
...      ...        ...      ...
3390       1          1     True
3391       4          4     True
3392       7          7     True
3393       0          0     True
3394       3          3     True

[3395 rows x 3 columns]
Manual Test Accuracy: 0.9540500736377026


In [222]:

from google.colab import files
# Export result_df to a CSV file
result_df.to_csv('DrugName_prediction_results.csv', index=False)

In [224]:
# Download the CSV file
files.download('DrugName_prediction_results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Training model with rating, review embedding, usefulcounts, condition dummy variables and drug name dummy variables**

In [136]:
import pandas as pd

# Load the review embeddings CSV file into a DataFrame
embedded_df = pd.read_csv('embedded_review_reduced.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
embedded_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,320,321,322,323,324,325,326,327,328,329
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,-0.072203,0.050732,-0.066741,-0.018283,-0.092436,-0.059485,-0.002017,0.068087,0.062693,0.019311
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,0.022241,-0.060561,0.042767,0.100795,-0.017772,-0.053921,-0.089212,-0.116708,0.053316,-0.029458
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,0.100959,-0.072072,-0.060204,-0.01296,0.144626,-0.023831,0.137507,-0.030472,-0.074193,0.128502
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,0.117297,0.001229,0.01479,-0.107192,-0.008831,-0.10001,-0.108753,0.033484,0.010719,0.117703
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,0.06463,0.036089,-0.013739,-0.062118,-0.048534,0.005309,-0.077575,0.146462,-0.013111,-0.208725


In [137]:
# Convert condition into multiple columns of dummy variables

dummies_conditions = pd.get_dummies(embedded_df['condition'])

# Concatenate dummy variables with the original DataFrame
embedded_df = pd.concat([
    embedded_df.drop(columns=['condition']),
    dummies_conditions
], axis=1)

In [138]:
# Convert drugname into multiple columns of dummy variables

dummies_drugs = pd.get_dummies(embedded_df['drugName'])

# Concatenate dummy variables with the original DataFrame
embedded_df = pd.concat([
    embedded_df.drop(columns=['drugName']),
    dummies_drugs
], axis=1)

In [139]:
#Review embedded_sentiment_df
embedded_df.head()

Unnamed: 0,uniqueID,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,0,1,...,Zovirax Ointment,Zyban,Zyclara,Zymine,Zyprexa,Zyprexa Zydis,Zyrtec,Zyvox,ella,femhrt
0,95260,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,1.887339,-0.869075,...,0,0,0,0,0,0,0,0,0,0
1,92703,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,-0.000775,0.706566,...,0,0,0,0,0,0,0,0,0,0
2,138000,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,0.710067,1.259798,...,0,0,0,0,0,0,0,0,0,0
3,35696,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,0.987319,-0.213137,...,0,0,0,0,0,0,0,0,0,0
4,155963,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,-0.92463,0.702602,...,0,0,0,0,0,0,0,0,0,0


In [140]:
# Drop all the non- beneficial columns

columns_to_drop = ['uniqueID', 'review', 'date', 'lengthReview', 'conditionCluster_label']

embedded_df = embedded_df.drop(columns=columns_to_drop)
embedded_df.head(5)

Unnamed: 0,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,6,...,Zovirax Ointment,Zyban,Zyclara,Zymine,Zyprexa,Zyprexa Zydis,Zyrtec,Zyvox,ella,femhrt
0,8.0,192.0,4.0,1.887339,-0.869075,-2.625546,1.417739,0.745655,-0.63982,-0.805294,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17.0,6.0,-0.000775,0.706566,-0.605744,-1.173229,0.28467,0.259833,0.353663,...,0,0,0,0,0,0,0,0,0,0
2,8.0,10.0,4.0,0.710067,1.259798,-1.086482,-0.472165,-1.45568,-0.347598,0.39165,...,0,0,0,0,0,0,0,0,0,0
3,9.0,37.0,2.0,0.987319,-0.213137,-0.683326,-1.648944,-0.1489,0.311274,0.131049,...,0,0,0,0,0,0,0,0,0,0
4,2.0,43.0,5.0,-0.92463,0.702602,0.450114,-0.958159,-0.097065,-1.075369,-0.084259,...,0,0,0,0,0,0,0,0,0,0


In [141]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Split our preprocessed data into our features and target arrays

from keras.utils import to_categorical

# Separate the target variable
target = embedded_df['drugNameCluster_label']

# Extract features (excluding the target variable)
features = embedded_df.drop(columns=['drugNameCluster_label'])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=30)

# Convert target labels to one-hot encoded format
y_train_encoded = to_categorical(y_train, num_classes=10)
y_test_encoded = to_categorical(y_test, num_classes=10)

# Verify the unique values in the target variable
print(target.unique())

# Check the shapes of the one-hot encoded target labels
print("Shape of y_train_encoded:", y_train_encoded.shape)
print("Shape of y_test_encoded:", y_test_encoded.shape)

[4. 6. 2. 5. 0. 1. 7. 3. 9. 8.]
Shape of y_train_encoded: (7920, 10)
Shape of y_test_encoded: (3395, 10)


In [142]:
# Review feature columns
features.columns

Index(['rating', 'usefulCount', '0', '1', '2', '3', '4', '5', '6', '7',
       ...
       'Zovirax Ointment', 'Zyban', 'Zyclara', 'Zymine', 'Zyprexa',
       'Zyprexa Zydis', 'Zyrtec', 'Zyvox', 'ella', 'femhrt'],
      dtype='object', length=2174)

In [143]:
 # Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [144]:
# Check the number of dimensions, make it a variable so it passes into

num_dimensions = embedded_df.shape[1]-1
print(num_dimensions)

2174


In [145]:
# Import our dependencies
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


nn = tf.keras.models.Sequential()

# Input layer
nn.add(tf.keras.layers.Dense(units=332, activation="sigmoid", input_dim=num_dimensions))

# Second hidden layer
nn.add(Dense(units=250, activation='relu'))

# Third hidden layer (example of adding an additional hidden layer)
nn.add(Dense(units=200, activation='relu'))

#Fourth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=100, activation='relu'))

#Sixth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=50, activation='relu'))

#Seventh hidden layer (example of adding another hidden layer)
nn.add(Dense(units=25, activation='relu'))

#Eighth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=15, activation='relu'))

# Output layer
nn.add(Dense(units=10, activation='softmax'))  # 3 units for 3 classes, softmax activation

# Check the structure of the model
nn.summary()



Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_58 (Dense)            (None, 332)               722100    
                                                                 
 dense_59 (Dense)            (None, 250)               83250     
                                                                 
 dense_60 (Dense)            (None, 200)               50200     
                                                                 
 dense_61 (Dense)            (None, 100)               20100     
                                                                 
 dense_62 (Dense)            (None, 50)                5050      
                                                                 
 dense_63 (Dense)            (None, 25)                1275      
                                                                 
 dense_64 (Dense)            (None, 15)               

In [146]:
#compile the model
nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [150]:
 # Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train_encoded, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [151]:
# Evaluate the model using the test data
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test_encoded)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 0.6478509306907654
Test Accuracy: 0.9469808340072632


In [51]:
import numpy as np
# Get predicted probabilities for each class
y_pred_probabilities = nn.predict(X_test_scaled)
# Convert probabilities to predicted classes by selecting the class with the highest probability
y_pred = np.argmax(y_pred_probabilities, axis=1)
# Convert one-hot encoded y_test_encoded back to categorical labels
y_test_categorical = np.argmax(y_test_encoded, axis=1)
# Create a DataFrame to compare predicted and actual classes
result_df = pd.DataFrame({'Actual': y_test_categorical, 'Predicted': y_pred})
# Add a column indicating whether the prediction was correct
result_df['Correct'] = result_df['Actual'] == result_df['Predicted']
# Display the DataFrame
print(result_df)
# Calculate accuracy manually
accuracy = result_df['Correct'].mean()
print("Manual Test Accuracy:", accuracy)

      Actual  Predicted  Correct
0          1          1     True
1          3          3     True
2          4          4     True
3          4          0    False
4          4          4     True
...      ...        ...      ...
3390       1          1     True
3391       4          4     True
3392       7          7     True
3393       0          0     True
3394       3          3     True

[3395 rows x 3 columns]
Manual Test Accuracy: 0.9522827687776141


In [54]:

from google.colab import files
# Export result_df to a CSV file
result_df.to_csv('DrugName_prediction2_results.csv', index=False)

In [56]:
# Download the CSV file
files.download('DrugName_prediction2_results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>