### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, GRU
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
nltk.download('stopwords')

### Load Data

In [None]:
# Colab-only helper module for moving files into the notebook runtime.
from google.colab import files
# NOTE(review): presumably prompts for the spreadsheet so the relative path below resolves - confirm.
files.upload()
# Read the review spreadsheet from the current working directory.
data = pd.read_excel('drug_review.xlsx')
# Wrap the loaded table as `df`, the frame the following cells operate on.
df = pd.DataFrame(data)
# Preview the first rows.
df.head()

### Data Cleaning

In [None]:
# Punctuation that is replaced by a space so neighbouring words stay separated.
# Raw string: the pattern contains `\]`, which is not a valid Python string escape.
REPLACE_BY_SPACE_RE = re.compile(r'[!"#$%&()*+,-./:;<=>?@[\]^_`{|}~]')
# Anything that is not a digit, lowercase letter, space, '#', '+' or '_' is dropped.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Standalone runs of two or more "x" (redaction masks such as "xxxx").
REDACTION_MASK_RE = re.compile(r'\bx{2,}\b')
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """Normalize one piece of free text for tokenization.

    Steps, in order: lowercase, turn punctuation into spaces, drop any
    remaining character outside ``[0-9a-z #+_]``, remove standalone
    "xx..." redaction masks, and drop NLTK English stopwords.

    text: a string (the function calls ``text.lower()``, so non-string
        values are not supported)
    return: the cleaned string, with words separated by single spaces
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # Only whole-token masks are removed. Stripping every "x" character would
    # corrupt ordinary words ("anxiety" -> "aniety", "extremely" -> "etremely").
    text = REDACTION_MASK_RE.sub('', text)
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
# Run the cleaner over every cell of every column, overwriting each column in place.
for column_name in df:
    cleaned_column = df[column_name].apply(clean_text)
    df[column_name] = cleaned_column
# Preview the cleaned frame.
df.head()

In [None]:
# List the distinct label values present in the 'Effectiveness' column.
effectiveness_column = df['Effectiveness']
effectiveness_column.unique()

### Tokenize

In [None]:
# max_words caps the tokenizer vocabulary, max_sequence is the padded sequence
# length, embedding_size is the width of the embedding vectors.
max_words, max_sequence, embedding_size = 500, 250, 100

tokenizer = Tokenizer(
    num_words=max_words,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
)

### Effectiveness/Side Effects: Data/Labels

In [None]:
# Pair each review column with the label column it is used to predict.
df_se = df[['Side_Effects_Review', 'Side_Effects']]
df_eff = df[['Effectiveness_Review', 'Effectiveness']]
df_se_data, df_se_label = df_se['Side_Effects_Review'], df_se['Side_Effects']
df_eff_data, df_eff_label = df_eff['Effectiveness_Review'], df_eff['Effectiveness']

# The tokenizer must learn a vocabulary before it can encode anything. Without
# this call its word index is empty, every review encodes to an empty list and
# the padded inputs are all zeros. One vocabulary is shared by both tasks.
tokenizer.fit_on_texts(list(df_se_data) + list(df_eff_data))

# Reviews -> lists of word indices -> fixed-length integer matrices.
X_se = tokenizer.texts_to_sequences(df_se_data.values)
X_eff = tokenizer.texts_to_sequences(df_eff_data.values)
X_se = pad_sequences(X_se, maxlen=max_sequence)
X_eff = pad_sequences(X_eff, maxlen=max_sequence)

# One-hot encode the labels (one column per distinct label value).
Y_se = pd.get_dummies(df_se_label).values
Y_eff = pd.get_dummies(df_eff_label).values

### Train/Test Split

In [None]:
# Both tasks use the same hold-out fraction and the same seed.
split_options = {'test_size': 0.2, 'random_state': 802}

X_train_se, X_test_se, Y_train_se, Y_test_se = train_test_split(
    X_se, Y_se, **split_options
)
X_train_eff, X_test_eff, Y_train_eff, Y_test_eff = train_test_split(
    X_eff, Y_eff, **split_options
)
# Show the shape of the effectiveness test labels.
Y_test_eff.shape

### RNN Model

In [None]:
# Two networks with the same layer stack: one for side effects, one for effectiveness.
model_se = Sequential()
model_eff = Sequential()

# Each entry builds a fresh layer instance, so the two models never share a layer.
layer_builders = [
    lambda: Embedding(max_words, embedding_size, input_length=250),
    lambda: LSTM(100, activation='relu', kernel_initializer='glorot_uniform', return_sequences=True),
    lambda: Dropout(0.4),
    lambda: LSTM(100, activation='relu', kernel_initializer='glorot_uniform', return_sequences=True),
    lambda: Dropout(0.4),
    lambda: LSTM(50, activation='tanh', dropout=0.3, recurrent_dropout=0.2, return_sequences=True),
    lambda: GRU(70, activation='relu', recurrent_activation='tanh'),
    lambda: Dense(5, activation='softmax'),
]

# Add layer by layer, side-effects model first, then the effectiveness model.
for build_layer in layer_builders:
    model_se.add(build_layer())
    model_eff.add(build_layer())

model_eff.summary()

In [None]:
# Same optimizer, loss and metric for both models.
for network in (model_se, model_eff):
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

In [None]:
# Train the side-effects model, holding out 15% of the training rows for validation.
se_history = model_se.fit(
    X_train_se,
    Y_train_se,
    batch_size=100,
    epochs=10,
    validation_split=0.15,
)

In [None]:
# Train the effectiveness model on the effectiveness split (X_train_eff / Y_train_eff),
# not the side-effects split. 15% of the training rows are held out for validation.
eff_history = model_eff.fit(
    X_train_eff,
    Y_train_eff,
    batch_size=1000,
    epochs=10,
    validation_split=0.15,
)

### Loss/Accuracy Plot: Side Effects

In [None]:
# Top panel of a 3-row grid: validation vs. training loss per epoch.
loss_axes = plt.subplot(3, 1, 1)
loss_axes.plot(se_history.history['val_loss'], '-o', label='val_loss')
loss_axes.plot(se_history.history['loss'], '-o', label='loss')
loss_axes.set_title('Side effects loss')
loss_axes.legend(loc=1)

# Bottom panel: validation vs. training accuracy per epoch (the middle row stays empty).
accuracy_axes = plt.subplot(3, 1, 3)
accuracy_axes.plot(se_history.history['val_accuracy'], '-o', label='val_acc')
accuracy_axes.plot(se_history.history['accuracy'], '-o', label='acc')
accuracy_axes.set_title('Side effects accuracy')
accuracy_axes.legend(loc=4)

### Loss/Accuracy Plot: Effectiveness

In [None]:
# Top panel of a 3-row grid: validation vs. training loss per epoch.
loss_axes = plt.subplot(3, 1, 1)
loss_axes.plot(eff_history.history['val_loss'], '-o', label='val_loss')
loss_axes.plot(eff_history.history['loss'], '-o', label='loss')
loss_axes.set_title('Effectiveness loss')
loss_axes.legend(loc=1)

# Bottom panel: validation vs. training accuracy per epoch (the middle row stays empty).
accuracy_axes = plt.subplot(3, 1, 3)
accuracy_axes.plot(eff_history.history['val_accuracy'], '-o', label='val_acc')
accuracy_axes.plot(eff_history.history['accuracy'], '-o', label='acc')
accuracy_axes.set_title('Effectiveness accuracy')
accuracy_axes.legend(loc=4)

# argmax is 0-based, so add 1 to report the epoch number.
best_train_epoch = np.argmax(eff_history.history['accuracy']) + 1
best_val_epoch = np.argmax(eff_history.history['val_accuracy']) + 1
print('Epoch to get best effectiveness training accuracy:', best_train_epoch)
print('Epoch to get best effectiveness validation accuracy:', best_val_epoch)

### Test: Side Effects

In [None]:
# Class probabilities for the held-out side-effects reviews.
Y_hat_se = model_se.predict(X_test_se)
print(Y_hat_se[:3])

# Collapse probability rows and one-hot label rows to class indices.
Yhat_se = Y_hat_se.argmax(axis=1)
Ytest_se = Y_test_se.argmax(axis=1)

conf_matrix = confusion_matrix(Ytest_se, Yhat_se)
print("Side effects test set confusion matrix:\n", conf_matrix)
print("Side effects test accuracy:", accuracy_score(Ytest_se, Yhat_se))

### Test: Effectiveness

In [None]:
# Class probabilities for the held-out effectiveness reviews.
Y_hat_eff = model_eff.predict(X_test_eff)
print(Y_hat_eff[:3])

# Collapse probability rows and one-hot label rows to class indices.
Yhat_eff = Y_hat_eff.argmax(axis=1)
Ytest_eff = Y_test_eff.argmax(axis=1)

conf_matrix = confusion_matrix(Ytest_eff, Yhat_eff)
print("Effictiveness test set confusion matrix:\n", conf_matrix)
print("Effectiveness test accuracy:", accuracy_score(Ytest_eff, Yhat_eff))

### RNN Model 2

In [None]:
# Second pair of networks: wider recurrent layers plus an extra Dense layer
# before the softmax output.
model_se2, model_eff2 = Sequential(), Sequential()

model_se2.add(Embedding(max_words, embedding_size, input_length = 250))
model_eff2.add(Embedding(max_words, embedding_size, input_length = 250))

model_se2.add(LSTM(1000, activation = 'relu', kernel_initializer = 'glorot_uniform', return_sequences=True))
model_eff2.add(LSTM(1000, activation = 'relu', kernel_initializer = 'glorot_uniform', return_sequences=True))

# Dropout goes on model_eff2. Adding it to model_eff would append a layer to the
# first effectiveness model and leave model_eff2 without this dropout.
model_se2.add(Dropout(0.35))
model_eff2.add(Dropout(0.35))

model_se2.add(LSTM(200, activation = 'relu', kernel_initializer = 'glorot_uniform', return_sequences=True))
model_eff2.add(LSTM(200, activation = 'relu', kernel_initializer = 'glorot_uniform', return_sequences=True))

# NOTE(review): the two models use different widths here (100 vs 50) - confirm this is intended.
model_se2.add(LSTM(100, activation = 'relu', dropout = 0.3, recurrent_dropout = 0.2, return_sequences = True))
model_eff2.add(LSTM(50, activation = 'relu', dropout = 0.3, recurrent_dropout = 0.2, return_sequences = True))

# No return_sequences here, so the GRU passes only its final state to the Dense layers.
model_se2.add(GRU(70, activation = 'relu', recurrent_activation = 'relu'))
model_eff2.add(GRU(70, activation = 'relu', recurrent_activation = 'relu'))

model_se2.add(Dense(20, activation = 'relu'))
model_eff2.add(Dense(20, activation = 'relu'))

# Five-way softmax output.
model_se2.add(Dense(5, activation = 'softmax'))
model_eff2.add(Dense(5, activation = 'softmax'))

model_se2.summary()

In [None]:
# Same optimizer, loss and metric for both second-generation models.
for network in (model_se2, model_eff2):
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

In [None]:
# Train the second side-effects model, holding out 20% of the training rows for validation.
se2_history = model_se2.fit(
    X_train_se,
    Y_train_se,
    batch_size=1000,
    epochs=10,
    validation_split=0.2,
)

In [None]:
# Train the second effectiveness model on the effectiveness split
# (X_train_eff / Y_train_eff), not the side-effects split.
# 15% of the training rows are held out for validation.
eff2_history = model_eff2.fit(
    X_train_eff,
    Y_train_eff,
    batch_size=1000,
    epochs=10,
    validation_split=0.15,
)

In [None]:
# Top panel of a 3-row grid: validation vs. training loss per epoch.
loss_axes = plt.subplot(3, 1, 1)
loss_axes.plot(se2_history.history['val_loss'], '-o', label='val_loss')
loss_axes.plot(se2_history.history['loss'], '-o', label='loss')
loss_axes.set_title('Side effects loss')
loss_axes.legend(loc=1)

# Bottom panel: validation vs. training accuracy per epoch (the middle row stays empty).
accuracy_axes = plt.subplot(3, 1, 3)
accuracy_axes.plot(se2_history.history['val_accuracy'], '-o', label='val_acc')
accuracy_axes.plot(se2_history.history['accuracy'], '-o', label='acc')
accuracy_axes.set_title('Side effects accuracy')
accuracy_axes.legend(loc=4)

In [None]:
# Top panel of a 3-row grid: validation vs. training loss per epoch.
plt.subplot(3, 1, 1)
plt.plot(eff2_history.history['val_loss'], '-o', label='val_loss')
plt.plot(eff2_history.history['loss'], '-o', label='loss')
plt.title('Effectiveness loss')
plt.legend(loc=1)

# Bottom panel: validation vs. training accuracy per epoch (the middle row stays empty).
plt.subplot(3, 1, 3)
plt.plot(eff2_history.history['val_accuracy'], '-o', label='val_acc')
plt.plot(eff2_history.history['accuracy'], '-o', label='acc')
plt.title('Effectiveness accuracy')
plt.legend(loc=4)

# Report the best epochs of the model plotted above (eff2_history); argmax is
# 0-based, hence the +1. Reading eff_history would report the first model instead.
print('Epoch to get best effectiveness training accuracy:', np.argmax(eff2_history.history['accuracy'])+1)
print('Epoch to get best effectiveness validation accuracy:', np.argmax(eff2_history.history['val_accuracy'])+1)

In [None]:
# Class probabilities from the second side-effects model on the held-out reviews.
Y_hat_se2 = model_se2.predict(X_test_se)
print(Y_hat_se2[:3])

# Collapse probability rows and one-hot label rows to class indices.
Yhat_se2 = Y_hat_se2.argmax(axis=1)
Ytest_se2 = Y_test_se.argmax(axis=1)

conf_matrix = confusion_matrix(Ytest_se2, Yhat_se2)
print("Side effects test set confusion matrix:\n", conf_matrix)
print("Side effects test accuracy:", accuracy_score(Ytest_se2, Yhat_se2))

In [None]:
# Class probabilities from the second effectiveness model on the held-out reviews.
Y_hat_eff2 = model_eff2.predict(X_test_eff)
print(Y_hat_eff2[:3])

# Collapse probability rows and one-hot label rows to class indices.
Yhat_eff2 = Y_hat_eff2.argmax(axis=1)
Ytest_eff2 = Y_test_eff.argmax(axis=1)

conf_matrix = confusion_matrix(Ytest_eff2, Yhat_eff2)
print("Effictiveness test set confusion matrix:\n", conf_matrix)
print("Effectiveness test accuracy:", accuracy_score(Ytest_eff2, Yhat_eff2))