In [1]:
import pandas as pd
import numpy as np
import review_preprocess
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Flatten, Dense, Dropout,Embedding,SpatialDropout1D,LSTM,Input, concatenate
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.python.keras.layers import CuDNNLSTM
from sklearn import preprocessing

In [2]:
train_file_name = 'drugsComTrain_raw.csv'
test_file_name = 'drugsComTest_raw.csv'
file_path = './data/'

df_test = pd.read_csv(file_path+test_file_name)
df_train = pd.read_csv(file_path+train_file_name)

In [3]:
df_processed_train = review_preprocess.clean_reviews(df_train,is_lstm=True)

Review cleanup Completed...aracters from review column--> 100.0 percentage complete
Removing row with nan values
Percentage of nan rows in dataset--> 0.56 %
Removed 899 rows with na values


In [4]:
df_processed_test = review_preprocess.clean_reviews(df_test,is_lstm=True)

Review cleanup Completed...aracters from review column--> 100.0 percentage complete
Removing row with nan values
Percentage of nan rows in dataset--> 0.55 %
Removed 295 rows with na values


In [5]:
df_processed_train = pd.concat([df_processed_train, df_processed_test])

In [6]:
df_processed_train.describe()

Unnamed: 0,uniqueID,rating,usefulCount
count,213869.0,213869.0,213869.0
mean,116076.924786,6.991149,28.094118
std,67016.705794,3.275792,36.401377
min,0.0,1.0,0.0
25%,58122.0,5.0,6.0
50%,115972.0,8.0,16.0
75%,174018.0,10.0,36.0
max,232291.0,10.0,1291.0


In [7]:
m = 0
for t in df_processed_train.review:
    a = t.split()
    if(len(a)>m):
        m=len(a)
print("Max words in a review inside train set is ",m)
m = 0
for t in df_processed_test.review:
    a = t.split()
    if(len(a)>m):
        m=len(a)
print("Max words in a review inside test set is ",m)

Max words in a review inside train set is  945
Max words in a review inside test set is  540


In [8]:
# number of unique words in review are 
# len(.values.split()))
all_review = ' '.join(df_processed_train['review'])
print("Total words combined in all train review ",len(all_review))
print("Total unique words in all train review ",len(set(all_review.split(" "))))

Total words combined in all train review  60624716
Total unique words in all train review  98456


In [9]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 550
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_processed_train['review'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 98228 unique tokens.


In [10]:
X_tr = tokenizer.texts_to_sequences(df_processed_train['review'].values)
X_tr = pad_sequences(X_tr, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_tr.shape)

Shape of data tensor: (213869, 550)


In [11]:
# from sklearn.feature_extraction.text import HashingVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer

# tfidf_vectorizer = TfidfVectorizer(max_features=2500)#max_features=4000
# x_tfidf = tfidf_vectorizer.fit_transform(df_processed_train.review)

# hash_vectorizer = HashingVectorizer(n_features=1000)#n_features=100
# x_hash = hash_vectorizer.fit_transform(df_processed_train.review)

In [12]:
# print("TFIDF  ",x_tfidf.shape)
# print("hash  ",x_hash.shape)

In [13]:
df_processed_train.rating
y_train = pd.get_dummies(df_processed_train.rating).values


In [14]:
y_train.shape

(213869, 10)

In [15]:
# model1 = Sequential()
# model1.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_tr.shape[1],name="E1"))
# model1.add(SpatialDropout1D(0.2))
# model1.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# model1.add(Dense(10, activation='softmax',name='d1'))
# model1.summary()

In [16]:
# MAX_NB_WORDS = 500
# # Max number of words in each complaint.
# MAX_SEQUENCE_LENGTH = 200
# # This is fixed.
# EMBEDDING_DIM = 128
meta_in = Input(shape=(3,))
x = Dense(5, activation="relu")(meta_in)
x = Model(inputs=meta_in, outputs=x)


model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_tr.shape[1],name="E1"))
model.add(SpatialDropout1D(0.2))
model.add(CuDNNLSTM(250))
model.add(Dense(25, activation='relu',name='d1'))
# model.add(Dense(10, activation='softmax'))
combined = concatenate([x.output, model.output])
prediction = Dense(10, activation='softmax')(combined)
model_meta = Model(inputs=[model.input, meta_in], outputs=prediction)

In [17]:
model_meta.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
E1_input (InputLayer)           [(None, 550)]        0                                            
__________________________________________________________________________________________________
E1 (Embedding)                  (None, 550, 100)     5000000     E1_input[0][0]                   
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 550, 100)     0           E1[0][0]                         
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 3)]          0                                            
____________________________________________________________________________________________

In [18]:
# df_processed_train[["usefulCount","condition","drugName"]]

le = preprocessing.LabelEncoder()
df_processed_train["drugName"] = le.fit_transform(df_processed_train["drugName"])
df_processed_train["condition"] = le.fit_transform(df_processed_train["condition"])
df_processed_train[["usefulCount","condition","drugName"]].values.shape

(213869, 3)

In [19]:
df_processed_train[["usefulCount","condition","drugName"]].values.shape

(213869, 3)

In [20]:
print(X_tr.shape)
print(y_train.shape)
print(df_processed_test[["usefulCount","condition","drugName"]].values.shape)
print(df_processed_train[["usefulCount","condition","drugName"]].values.shape)

(213869, 550)
(213869, 10)
(53471, 3)
(213869, 3)


In [None]:
model_meta.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
 
epochs = 10
batch_size = 256
history = model_meta.\
fit((X_tr,df_processed_train[["usefulCount","condition","drugName"]].values), y_train,\
                         epochs=epochs, batch_size=batch_size,\
                            validation_split=0.2,\
                         callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Train on 171095 samples, validate on 42774 samples
Epoch 1/10