In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv('C:\\Users\\user1\\Documents\\repo\\dialog act RNN\\MRDA\\full_set.csv')
print(data.shape)
data.head(10)

(108201, 6)


Unnamed: 0.1,Unnamed: 0,Speaker,Utterances,Basic,General,Full
0,0,fe016,so um,F,fh,fh
1,1,fe016,i was going to try to get out of here like in ...,S,s,rt
2,2,fe016,um,F,fh,fh
3,3,fe016,because i really appreciate people coming.,S,s,s
4,4,fe016,and the main thing that i was going to ask peo...,S,s,s
5,5,fe016,so anything that transcribers or discourse cod...,S,s,e
6,6,fe016,so we have this um,D,fh,fh
7,7,fe016,i think a starting point is clearly the the ch...,S,s,s
8,8,fe016,which don brought a copy of.,S,s,e
9,9,me011,yeah.,B,b,b


In [3]:
data=data.drop(columns=["Unnamed: 0"])
data.head()

Unnamed: 0,Speaker,Utterances,Basic,General,Full
0,fe016,so um,F,fh,fh
1,fe016,i was going to try to get out of here like in ...,S,s,rt
2,fe016,um,F,fh,fh
3,fe016,because i really appreciate people coming.,S,s,s
4,fe016,and the main thing that i was going to ask peo...,S,s,s


In [4]:
data['Utterances'] = data['Utterances'].apply(lambda x: x.lower())
data['Utterances'] = data['Utterances'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
data['Utterances'].head()

0                                                so um
1    i was going to try to get out of here like in ...
2                                                   um
3            because i really appreciate people coming
4    and the main thing that i was going to ask peo...
Name: Utterances, dtype: object

In [5]:
print((data.groupby('Full')['Full'].agg('count')).size)
data.groupby('Full')['Full'].agg('count')

52


Full
%       3103
2        841
aa      5898
aap      219
am       349
ar       908
arp      150
b      15013
ba      2216
bc        51
bd       387
bh       154
bk      7177
br       236
bs       141
bsc      150
bu      2091
by        11
cc       371
co       674
cs      2662
d       1805
df      3724
e       3200
f        128
fa       259
fe       307
fg      3091
fh      8362
ft       119
fw         6
g         87
h        792
j        463
m        293
na      1112
nd       483
ng       351
no       828
qh       214
qo        74
qr       127
qrr      345
qw       951
qy       669
r        208
rt      3101
s      33472
t        253
t1       198
t3       165
tc       212
Name: Full, dtype: int64

In [6]:
tokenizer = Tokenizer(num_words = 5000, split=" ")
tokenizer.fit_on_texts(data['Utterances'].values)
X = tokenizer.texts_to_sequences(data['Utterances'].values)
X = pad_sequences(X, padding='post')
print(X.shape)
X[:5]

(108201, 77)


array([[  11,   16,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   2,   36,   59,    4,  168,    4,   61,   81,   10,   97,   35,
          15,  438,   86,  659,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,

In [7]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(5000, 256, input_length=X.shape[1]))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.3, recurrent_dropout=0.2)))
model.add(tf.keras.layers.Dense(52, activation='softmax'))

In [8]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 77, 256)           1280000   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 52)                13364     
Total params: 1,687,604
Trainable params: 1,687,604
Non-trainable params: 0
_________________________________________________________________


In [9]:
y = pd.get_dummies(data['Full']).values
[print(data['Full'][i],y[i]) for i in range (0,5)]

fh [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
rt [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
fh [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
s [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
s [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]


[None, None, None, None, None]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
model.fit(X_train, y_train, epochs=3, batch_size=32, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1b71ae88100>

In [12]:
score = model.evaluate(X_test, y_test)



In [13]:
print("Accuracy:", score[1]*100)
print(X_test[1])
print(y_test[1])

Accuracy: 59.06843543052673
[ 15   1 517   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]


In [14]:
a = ["will you go"]
a = tokenizer.texts_to_sequences(a)
a = np.array(a)
a = pad_sequences(a, padding='post', maxlen=77)
prediction = model.predict(np.array(a))
print("Shape of example ", a.shape)
print("Shape of predicted ", prediction.shape)
print("Prediction", prediction)
print("Max value", np.max(prediction))
print("Index of the max value " , prediction.argmax(1))

Shape of example  (1, 77)
Shape of predicted  (1, 52)
Prediction [[7.1221138e-03 5.5972203e-03 2.8162072e-03 6.6223170e-04 9.9269368e-05
  1.3409514e-04 9.0393369e-05 9.3638641e-04 2.0981581e-04 1.0505388e-03
  3.8876518e-04 2.7268802e-04 2.2456307e-02 1.0824620e-03 5.1039975e-04
  6.8743102e-04 6.7253292e-02 2.0213748e-04 2.3870561e-02 4.5463655e-02
  5.9154327e-03 1.3112171e-02 2.0384802e-03 4.9960944e-03 2.2367816e-03
  1.4323568e-04 2.0093452e-03 5.9087463e-03 3.3070071e-04 3.4107273e-03
  1.0595585e-04 2.2188662e-02 4.3964834e-04 1.1821263e-02 2.8931550e-03
  4.5766276e-03 4.2744662e-04 2.4412938e-04 9.8336532e-05 3.5657329e-03
  1.6062617e-03 1.2648373e-03 9.0188354e-05 2.8606199e-03 3.1287685e-01
  9.1986703e-03 3.2964134e-01 3.6706567e-02 8.3281631e-03 9.6608149e-03
  1.5339998e-02 5.0570872e-03]]
Max value 0.32964134
Index of the max value  [46]
