In [23]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
from itertools import cycle, islice
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import gc
import os

In [6]:
dtrain_df = pd.read_csv('./drugsComTrain_raw.csv')
dtest_df = pd.read_csv('./drugsComTest_raw.csv')

In [8]:
dtrain_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [9]:
dtest_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,28-Feb-12,22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8,17-May-09,17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9,29-Sep-17,3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,5-Mar-17,35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4


In [11]:
# Creates TF-IDF vectorizer and transforms the corpus
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(dtrain_df.review)

# transforms test reviews to above vectorized format
X_test = vectorizer.transform(dtest_df.review)

In [13]:
y_train = dtrain_df.rating.copy(deep=True)
y_test = dtest_df.rating.copy(deep=True)

def conver_op3(labels):
    for i in range(len(labels)):
        if(labels[i]<3):
            labels[i]=0
        elif(labels[i]<7):
            labels[i]=1
        elif(labels[i]<=10):
            labels[i]=2
    return labels

def conver_op10(labels):
    for i in range(len(labels)):
        if(labels[i] == 1):
            labels[i] = 0
        elif(labels[i] == 2):
            labels[i] = 1
        elif(labels[i] == 3):
            labels[i] = 2
        elif(labels[i] == 4):
            labels[i] = 3
        elif(labels[i] == 5):
            labels[i] = 4
        elif(labels[i] == 6):
            labels[i] = 5
        elif(labels[i] == 7):
            labels[i] = 6
        elif(labels[i] == 8):
            labels[i] = 7
        elif(labels[i] == 9):
            labels[i] = 8
        elif(labels[i] == 10):
            labels[i] = 9
    return labels
    
# y_train = conver_op3(y_train)
# y_test = conver_op3(y_test)

In [14]:
y_tr = conver_op10(y_train)
y_ts = conver_op10(y_test)

y_tr = keras.utils.to_categorical(y_tr)
y_ts = keras.utils.to_categorical(y_ts)

In [24]:
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001)

def lr_schedule(epoch):
    if(epoch <= 5):
        return 1e-2
    elif(epoch <= 10):
        return 1e-3
    elif(epoch <= 15):
        return 1e-4
    else:
        return 1e-5

In [25]:
model = Sequential()

# Dense-1
model.add(Dense(500, input_shape=(X_train.shape[1],)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.5))

# Dense-2
model.add(Dense(300))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.5))

# Dense-3
model.add(Dense(10, activation='sigmoid'))

model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer=Adam(lr=0.01))

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 500)               24950000  
_________________________________________________________________
batch_normalization_5 (Batch (None, 500)               2000      
_________________________________________________________________
activation_5 (Activation)    (None, 500)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 300)               150300    
_________________________________________________________________
batch_normalization_6 (Batch (None, 300)               1200      
_________________________________________________________________
activation_6 (Activation)    (None, 300)              

In [26]:
keras.callbacks.LearningRateScheduler(lr_schedule)
model_history = model.fit(X_train, y_tr, epochs=20, batch_size=256, validation_data=(X_test, y_ts))

Train on 161297 samples, validate on 53766 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [27]:
new_model = Sequential()

# Dense-1
new_model.add(Dense(500, input_shape=(X_train.shape[1],)))
new_model.add(BatchNormalization())
new_model.add(Activation('relu'))
new_model.add(Dropout(0.5))

# Dense-2
new_model.add(Dense(300))
new_model.add(BatchNormalization())
new_model.add(Activation('relu'))
new_model.add(Dropout(0.5))

# Dense-3
new_model.add(Dense(10, activation='sigmoid'))

new_model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer=Adam(lr=0.01))

new_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 500)               24950000  
_________________________________________________________________
batch_normalization_7 (Batch (None, 500)               2000      
_________________________________________________________________
activation_7 (Activation)    (None, 500)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 300)               150300    
_________________________________________________________________
batch_normalization_8 (Batch (None, 300)               1200      
_________________________________________________________________
activation_8 (Activation)    (None, 300)              

In [29]:
new_model_history = new_model.fit(X_train, y_tr, epochs=20, batch_size=256, validation_data=(X_test, y_ts), callbacks=[reduce_lr])

Train on 161297 samples, validate on 53766 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
 23552/161297 [===>..........................] - ETA: 1:18 - loss: 0.1287 - acc: 0.9579

KeyboardInterrupt: 