In [50]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, BatchNormalization, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [2]:

# Path to a GloVe vector file. The visible cells do not read this constant;
# the loader cell builds its own path from GLOVE_DIR.
EMBEDDING_FILE = 'rnn/glove.6B/glove.6B.100d.txt'

# Raw training and test tables, read from the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [3]:
# Length of each word vector.
embed_size = 100

# Vocabulary cap: how many distinct words are kept (rows of the embedding matrix).
max_features = 20000

# Fixed number of tokens every sequence is padded/truncated to.
maxlen = 463

In [4]:
# Distinct target labels present in the training table.
train["categories"].unique()

array(['QUERIES FROM PHARMACY', 'NEW APPOINTMENT', 'OTHERS',
       'MEDICATION RELATED',
       'SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)', 'REFILL',
       'PRIOR AUTHORIZATION', 'RESCHEDULING', 'SYMPTOMS', 'LAB RESULTS',
       'FOLLOW UP ON PREVIOUS REQUEST', 'PROVIDER', 'CHANGE OF PROVIDER',
       'SHARING OF LAB RECORDS (FAX, E-MAIL, ETC.)',
       'QUERY ON CURRENT APPOINTMENT', 'RUNNING LATE TO APPOINTMENT',
       'CANCELLATION', 'CHANGE OF PHARMACY', 'QUERIES FROM INSURANCE FIRM',
       'JUNK', 'CHANGE OF HOSPITAL'], dtype=object)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Turn each category string into an integer id. The encoder learns the set of
# distinct labels and assigns ids in the order shown by `le.classes_`.
le = LabelEncoder()
train_labels = le.fit_transform(train['categories'])

# Encoding of the test labels remains disabled:
#test_labels = le.transform(test['categories'])

print(le.classes_)


['CANCELLATION' 'CHANGE OF HOSPITAL' 'CHANGE OF PHARMACY'
 'CHANGE OF PROVIDER' 'FOLLOW UP ON PREVIOUS REQUEST' 'JUNK' 'LAB RESULTS'
 'MEDICATION RELATED' 'NEW APPOINTMENT' 'OTHERS' 'PRIOR AUTHORIZATION'
 'PROVIDER' 'QUERIES FROM INSURANCE FIRM' 'QUERIES FROM PHARMACY'
 'QUERY ON CURRENT APPOINTMENT' 'REFILL' 'RESCHEDULING'
 'RUNNING LATE TO APPOINTMENT'
 'SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)'
 'SHARING OF LAB RECORDS (FAX, E-MAIL, ETC.)' 'SYMPTOMS']


In [10]:
# Class balance of the encoded labels as a (class ids, occurrences) pair.
class_ids_and_counts = np.unique(train_labels, return_counts=True)
print(class_ids_and_counts)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20]), array([ 564,  127,   47,  811,  304,   18, 2253, 9010, 8907, 6253, 1043,
       1677,   91, 1464,  559, 8347, 1383,  590, 3018, 1212, 1021]))


In [15]:
# Wrap the encoded label array in a single-column DataFrame.
train_labels = pd.DataFrame(data=train_labels)

In [16]:
# Peek at the first five encoded labels.
train_labels.head(n=5)

Unnamed: 0,0
0,13
1,8
2,9
3,9
4,7


In [17]:
# Conversation text for both splits; missing entries become the "_na_" token.
train_text = train["converse"].fillna("_na_")
test_text = test["converse"].fillna("_na_")

list_sentences_train = train_text.values
list_sentences_test = test_text.values

# Targets: the encoded training labels.
y = train_labels

In [19]:
# Sanity check: no nulls should remain in the training text after fillna.
pd.isnull(pd.DataFrame(list_sentences_train)).any()

0    False
dtype: bool

In [20]:
# Sanity check: no nulls should remain in the test text after fillna.
pd.isnull(pd.DataFrame(list_sentences_test)).any()

0    False
dtype: bool

In [21]:
from keras.utils import to_categorical

# One-hot encode the integer class ids.
# Encoding from `train_labels` (the object `y` was bound to earlier) rather
# than from `y` itself keeps this cell idempotent: re-running it no longer
# one-hot encodes an already one-hot matrix.
# `num_classes` pins the width to the number of classes the LabelEncoder knows,
# so the columns always line up with `le.classes_`.
y = to_categorical(train_labels, num_classes=len(le.classes_))

In [22]:
# (number of samples, number of classes) of the one-hot target matrix.
np.shape(y)

(48699, 21)

In [23]:
# True if at least one class column of `y` has no positive entry at all.
np.logical_not(y.any(axis=0)).any()

False

array([0])

In [24]:
# Re-check the target matrix shape after the empty-class inspection.
np.shape(y)

(48699, 21)

In [25]:
# Build the vocabulary from the training text only, then reuse it for both splits.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train.tolist())

# Text -> lists of integer word ids.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Pad/truncate every id list to exactly `maxlen` entries.
X_t = pad_sequences(sequences=list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(sequences=list_tokenized_test, maxlen=maxlen)

In [26]:
GLOVE_DIR = 'glove.6B/'

print('Indexing word vectors.')

# Maps each token in the GloVe file to its vector (float32 numpy array).
embeddings_index = {}

# The file is opened with an explicit UTF-8 encoding because it contains
# non-ASCII tokens; relying on the platform default encoding can fail or
# mis-decode them. The `with` block closes the handle even if a line fails
# to parse.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf8') as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ... <vN>", whitespace separated.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [29]:
# Stack every pretrained vector into one 2-D array to get the global mean and
# standard deviation, used below to initialise the embedding matrix.
# np.stack needs a real sequence: a dict view is a non-sequence iterable,
# which NumPy deprecated in 1.16 and rejects in current releases, so it is
# materialised into a list first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
print(emb_mean)

0.00445201


In [30]:
# Build the (vocabulary x embed_size) matrix of initial word vectors.
word_index = tokenizer.word_index

# Keras word ids start at 1 (0 is reserved for padding), so a vocabulary of
# N words needs N + 1 rows. Capped at `max_features`, the tokenizer's limit.
nb_words = min(max_features, len(word_index) + 1)

# Start every row from the GloVe distribution so that words without a
# pretrained vector still get values on the same scale as the rest.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    # Guard on the actual row count rather than `max_features`: when the
    # vocabulary is smaller than the cap, the old check let `i` run past the
    # end of the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
from keras.models import Sequential

# Kept for backward compatibility with any later cell that reads it; tied to
# `embed_size` so the two cannot drift apart.
embedding_size = embed_size

# Embedding -> LSTM -> softmax classifier over the 21 categories.
model = Sequential()
# Initialise the embedding layer from the GloVe-based `embedding_matrix`
# built earlier; previously the matrix was computed but never used, so the
# layer started from random weights. `input_dim` is taken from the matrix so
# the layer and its initial weights always agree in shape. The layer stays
# trainable (Keras default), so the vectors are fine-tuned during training.
model.add(Embedding(embedding_matrix.shape[0],
                    embedding_size,
                    weights=[embedding_matrix],
                    input_length=maxlen))
model.add(LSTM(100))
model.add(Dense(21, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

# Hold out the last 20% of the training rows for validation.
model.fit(X_t, y, validation_split=0.2, epochs=5, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 463, 100)          2000000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_14 (Dense)             (None, 21)                4221      
Total params: 2,165,021
Trainable params: 2,165,021
Non-trainable params: 0
_________________________________________________________________
None
Train on 38959 samples, validate on 9740 samples
Epoch 1/5

In [53]:
# Predicted class id per test row: index of the largest softmax probability.
# Equivalent to Sequential.predict_classes for a multi-class softmax output,
# but does not depend on that method, which is deprecated and removed in
# newer Keras / TensorFlow releases.
test_pred = np.argmax(model.predict(X_te), axis=-1)
test_pred



array([ 7,  8,  7, ...,  7, 19,  8])

In [54]:
# Map the predicted integer ids back to their category strings.
test_pred_classes = le.inverse_transform(y=test_pred)

In [55]:
# Persist the predicted category names, one row per test sample.
df_test_pred_classes = pd.DataFrame(data=test_pred_classes)
df_test_pred_classes.to_csv("manali.csv")