In [None]:
# Mount Google Drive inside the Colab runtime; the dataset files read by the
# following cells are stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

import os
import tensorflow as tf
print(tf.__version__)

import sys
import keras
import keras.backend as K
import json
np.random.seed(42)

 

2.3.0


In [None]:
# Conversation index of the movie-dialogs dataset: every row names the two
# speakers, the movie and the list of line ids that form one conversation.
# The field separator " +++$+++ " is multi-character, so it is passed as a
# regular expression. pandas can only parse regex separators with its Python
# engine and warns when it has to fall back to it implicitly, hence the
# explicit engine="python".
conv_df = pd.read_csv(
    "./drive/My Drive/data/movie_dialogs_dataset/movie_conversations.txt",
    sep=r" \+\+\+\$\+\+\+ ",
    header=None,
    engine="python",
)
conv_df.columns = ["pers1", "pers2", "movie", "replies"]

print(conv_df.head())
len(conv_df)

  """Entry point for launching an IPython kernel.


  pers1 pers2 movie                           replies
0    u0    u2    m0  ['L194', 'L195', 'L196', 'L197']
1    u0    u2    m0                  ['L198', 'L199']
2    u0    u2    m0  ['L200', 'L201', 'L202', 'L203']
3    u0    u2    m0          ['L204', 'L205', 'L206']
4    u0    u2    m0                  ['L207', 'L208']


83097

In [None]:
# Individual dialogue lines: line id, speaker id, movie id, speaker name, text.
# The regex separator requires pandas' Python engine; asking for it explicitly
# avoids the warning emitted on the implicit fallback from the C engine.
lines_df = pd.read_csv(
    "./drive/My Drive/data/movie_dialogs_dataset/movie_lines.txt",
    sep=r" \+\+\+\$\+\+\+ ",
    header=None,
    engine="python",
)
lines_df.columns = ["lineid", "pers", "movie", "pers_name", "line"]
lines_df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,lineid,pers,movie,pers_name,line
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [None]:
import ast

# Build (question, answer) pairs from the conversations.
#
# Within a conversation, a line spoken by `pers1` fills the question slot and a
# line by anybody else fills the answer slot; as soon as both slots are filled
# the pair is stored and the slots are cleared. Slots are reset at the start of
# every conversation.

# Line id -> (speaker id, text). Built once so each lookup is O(1) instead of a
# scan over the whole lines frame. setdefault keeps the FIRST row for an id,
# matching the previous `.values[0]` behaviour.
line_lookup = {}
for _lineid, _pers, _text in zip(lines_df["lineid"], lines_df["pers"], lines_df["line"]):
    line_lookup.setdefault(_lineid, (_pers, _text))

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; appending to a DataFrame inside the loop copies the frame every time.
qa_records = []
for replies, pers1 in zip(conv_df["replies"], conv_df["pers1"]):
    # `replies` is the textual form of a Python list of line ids.
    # ast.literal_eval parses literals only and never executes code from the file.
    pending = {'q': None, 'a': None}
    for lineid in ast.literal_eval(replies):
        entry = line_lookup.get(lineid)
        if entry is None:
            # Line id referenced by the conversation but absent from lines_df.
            continue
        speaker, text = entry
        if speaker == pers1:
            pending['q'] = text
        else:
            pending['a'] = text
        if pending['q'] is not None and pending['a'] is not None:
            qa_records.append(pending)
            pending = {'q': None, 'a': None}
            if len(qa_records) % 100 == 0:
                print("Writing ", len(qa_records), " qa pairs...")

QA_df = pd.DataFrame(qa_records, columns=["q", "a"])
print(len(QA_df))


In [None]:
# Cache the extracted pairs on Drive. index=False keeps the row index out of the
# file; otherwise it comes back as an extra "Unnamed: 0" column when the file is
# read again.
QA_df.to_csv("./drive/My Drive/data/movie_dialogs_dataset/QA.txt", sep=r"©", index=False)

In [None]:
# Reload the cached pairs. The "©" separator is longer than one byte in UTF-8,
# which pandas' C parser does not support; selecting the Python engine
# explicitly avoids the fallback warning.
QA_df = pd.read_csv("./drive/My Drive/data/movie_dialogs_dataset/QA.txt", sep=r"©", engine="python")

  """Entry point for launching an IPython kernel.


In [None]:
# Peek at the first reloaded question/answer pairs.
QA_df.head()

Unnamed: 0.1,Unnamed: 0,q,a
0,0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,1,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
2,2,You're asking me out. That's so cute. What's ...,Forget it.
3,3,"No, no, it's my fault -- we didn't have a prop...",Cameron.
4,4,"The thing is, Cameron -- I'm at the mercy of a...",Seems like she could get a date easy enough...


In [None]:
# Default length bounds (in characters) used by filter_length.
MIN_LEN = 10
MAX_LEN = 100
# Upper bound on the vocabulary size: gen_vocabs keeps the N_VOCAB-3 most
# frequent characters and adds the three special tokens below.
N_VOCAB = 256
# Special tokens. gen_vocabs puts them first in the vocabulary, so they get the
# ids 0 (PAD), 1 (START) and 2 (END).
PAD = "\u0016"
START = "\u0017"
END = "\u0018"

from collections import Counter    

# Filter out pairs that are too short or too long
def filter_length(q, a, minlen=MIN_LEN, maxlen=MAX_LEN):
    """Strip simple markup tags and keep only pairs of acceptable length.

    Args:
        q: sequence of question strings.
        a: sequence of answer strings, aligned with ``q``.
        minlen: minimum allowed length (in characters) after tag removal.
        maxlen: maximum allowed length (in characters) after tag removal.

    Returns:
        ``(q1, a1)``: two lists of equal length with the cleaned questions and
        answers of every pair where BOTH texts satisfy
        ``minlen <= len(text) <= maxlen``.

    Pairs in which either side is not a string (for example a NaN produced when
    an empty field is read back from CSV) are skipped instead of raising.
    """
    # Tags are removed in this fixed order, one after another.
    markup_tags = ("<u>", "</u>", "<b>", "</b>", "<s>", "</s>", "<i>", "</i>")

    def _strip_markup(text):
        # Remove every occurrence of each known tag from ``text``.
        for tag in markup_tags:
            text = text.replace(tag, "")
        return text

    q1 = []
    a1 = []
    for raw_q, raw_a in zip(q, a):
        if not isinstance(raw_q, str) or not isinstance(raw_a, str):
            continue
        qi = _strip_markup(raw_q)
        ai = _strip_markup(raw_a)
        if minlen <= len(qi) <= maxlen and minlen <= len(ai) <= maxlen:
            q1.append(qi)
            a1.append(ai)
    return q1, a1

def gen_vocabs(lines, vocab = None, log_vocab=False):
    """Return the character vocabulary and its inverse mapping.

    Args:
        lines: iterable of strings used to count character frequencies. Only
            consulted when ``vocab`` is None.
        vocab: optional existing ``{char: id}`` mapping. When given it is
            returned unchanged and only the inverse mapping is computed.
        log_vocab: when building a new vocabulary, also write the kept
            characters and their counts to ``./vocab.txt``.

    Returns:
        ``(vocab, vocab_inverse)`` where ``vocab`` maps char -> id and
        ``vocab_inverse`` maps id -> char.

    A new vocabulary consists of PAD, START, END (ids 0, 1, 2) followed by the
    ``N_VOCAB-3`` most frequent characters in sorted order.
    """
    # build the vocabulary if one was not supplied
    if vocab is None:
        counter = Counter()
        for line in lines:
            counter.update(line)
        most_common = counter.most_common(N_VOCAB-3)
        if log_vocab:
            with open("./vocab.txt", "w", encoding="utf-8") as f:
                for k, v in most_common:
                    f.write("%s : %d\n" % (k, v))
        chars = sorted(x for (x, count) in most_common)
        vocab = {w: idx for idx, w in enumerate([PAD, START, END] + chars)}

    # Computed for both branches: previously this was only bound when the
    # vocabulary was built here, so passing ``vocab`` raised UnboundLocalError.
    vocab_inverse = {idx: w for w, idx in vocab.items()}

    return vocab, vocab_inverse

def to_sequence(line):
    """Encode ``line`` character by character using the global ``vocab``.

    Characters missing from ``vocab`` are encoded with the PAD id. The id 2
    (END) is appended after the last character.
    """
    ids = []
    for ch in line:
        if ch in vocab:
            ids.append(vocab[ch])
        else:
            ids.append(vocab[PAD])
    ids.append(2)  # END
    return ids

def gen_sequences(lines):
    """Encode every string in ``lines`` with to_sequence and return the list of id lists."""
    return list(map(to_sequence, lines))


In [None]:
# Raw question / answer columns as arrays.
Q = QA_df['q'].values
A = QA_df['a'].values

# Strip markup tags and keep only pairs where both sides are 10 to 60 characters long.
Q, A = filter_length(Q, A, minlen=10, maxlen=60)

# One character vocabulary shared by questions and answers.
vocab, vocab_inverse = gen_vocabs(Q+A)
# Encode each text as a list of character ids terminated by the END id.
Qt = gen_sequences(Q)
At = gen_sequences(A)

len(vocab), len(Qt), len(At), Qt[:1], At[:1]
# the vocabulary has only 96 entries, even though case is preserved (no .lower() is applied)

(96,
 53020,
 53020,
 [[41,
   79,
   83,
   72,
   15,
   4,
   73,
   70,
   4,
   79,
   78,
   76,
   89,
   4,
   87,
   69,
   4,
   67,
   79,
   85,
   76,
   68,
   4,
   70,
   73,
   78,
   68,
   4,
   45,
   65,
   84,
   4,
   65,
   4,
   66,
   79,
   89,
   70,
   82,
   73,
   69,
   78,
   68,
   17,
   17,
   17,
   2]],
 [[46,
   69,
   84,
   4,
   77,
   69,
   4,
   83,
   69,
   69,
   4,
   87,
   72,
   65,
   84,
   4,
   43,
   4,
   67,
   65,
   78,
   4,
   68,
   79,
   17,
   2]])

In [None]:
# NOTE: rebinds the module-level MAX_LEN, which to_matrix reads at call time.
MAX_LEN = 1000
# Units per direction of the encoder LSTM; the decoder LSTM uses N_LSTM*2.
N_LSTM = 512
# Size of the character embeddings.
N_EMBED = 32
# From here on N_VOCAB is the actual vocabulary size rather than the initial cap.
N_VOCAB = len(vocab)

import keras.layers as L

def build_model():
    """Assemble the character-level encoder/decoder network with attention.

    The encoder is a bidirectional LSTM over the embedded question ids. Its
    final forward and backward states are concatenated and used as the initial
    state of the decoder LSTM, which therefore has ``N_LSTM*2`` units. The
    decoder output attends over the encoder output sequence, and the attention
    result is concatenated with the decoder output before the dense head.

    Returns:
        An uncompiled ``keras.models.Model`` taking ``[encoder_ids, decoder_ids]``
        and producing a softmax over ``N_VOCAB`` ids for every decoder step.
    """
    encoder_ids = L.Input(shape=(None,))
    decoder_ids = L.Input(shape=(None,))
    # Separate embedding tables for both sides; id 0 is treated as padding.
    encoder_emb = L.Embedding(N_VOCAB, N_EMBED, mask_zero=True)(encoder_ids)
    decoder_emb = L.Embedding(N_VOCAB, N_EMBED, mask_zero=True)(decoder_ids)

    # encoder
    encoder_rnn = L.Bidirectional(
        L.LSTM(units=N_LSTM, return_sequences=True, return_state=True, dropout=0.1)
    )
    encoder_seq, fwd_h, fwd_c, bwd_h, bwd_c = encoder_rnn(encoder_emb)
    init_h = L.Concatenate()([fwd_h, bwd_h])
    init_c = L.Concatenate()([fwd_c, bwd_c])

    # decoder
    decoder_rnn = L.LSTM(N_LSTM*2, return_sequences=True, dropout=0.1)
    decoder_seq = decoder_rnn(decoder_emb, initial_state=[init_h, init_c])

    # attention: decoder steps are the queries, encoder steps the values
    context = L.Attention(use_scale=True)([decoder_seq, encoder_seq])
    merged = L.Concatenate()([context, decoder_seq])

    # output layers
    hidden = L.TimeDistributed(L.Dense(2048, activation='relu'))(merged)
    hidden = L.Dropout(0.1)(hidden)
    probs = L.TimeDistributed(L.Dense(N_VOCAB, activation='softmax'))(hidden)
    return keras.models.Model(inputs=[encoder_ids, decoder_ids], outputs=probs)
    
# Build the network and compile it for next-character prediction with integer
# targets. `learning_rate` is the current name of the Adam argument; `lr` is a
# deprecated alias.
model = build_model()
opt = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "functional_23"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_22 (Embedding)        (None, None, 32)     3072        input_23[0][0]                   
__________________________________________________________________________________________________
input_24 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
bidirectional_11 (Bidirectional [(None, None, 1024), 2232320     embedding_22[0][0]               
______________________________________________________________________________________

In [None]:
# Number of question/answer pairs per training batch.
BATCH_SIZE = 32
# Actual vocabulary size (special tokens included).
N_VOCAB = len(vocab)

def to_text(seq):
    """Decode a sequence of ids back into a string via the global ``vocab_inverse``."""
    chars = []
    for token_id in seq:
        chars.append(vocab_inverse[token_id])
    return "".join(chars)

def to_matrix(texts, maxlen=0):
    """Pad/truncate a list of id sequences into a rectangular int32 matrix.

    Args:
        texts: list of id sequences.
        maxlen: target width. ``0`` means "length of the longest sequence,
            capped by the global MAX_LEN".

    Returns:
        The matrix produced by Keras ``pad_sequences``; padding (value 0) and
        truncation are both applied at the end of each sequence.
    """
    if maxlen == 0:
        longest = max(len(s) for s in texts)
        maxlen = min(MAX_LEN, longest)
    return keras.preprocessing.sequence.pad_sequences(
        texts,
        maxlen=maxlen,
        dtype='int32',
        padding='post',
        truncating='post',
        value=0,
    )

import random

def shuffleXY(x, y):
    """Shuffle ``x`` and ``y`` together so that aligned elements stay aligned.

    Returns:
        A ``zip`` object yielding two tuples: the shuffled ``x`` values and the
        correspondingly shuffled ``y`` values.
    """
    pairs = [(xi, yi) for xi, yi in zip(x, y)]
    random.shuffle(pairs)
    return zip(*pairs)

def train_gen(x, y):
    """Endlessly yield training batches ``([encoder_in, decoder_in], target)``.

    Args:
        x: list of encoded questions (id sequences).
        y: list of encoded answers (id sequences), aligned with ``x``.

    Each batch takes BATCH_SIZE consecutive examples. The decoder input is the
    target matrix shifted one step to the right with the START id (1) in the
    first column.
    """
    start = 0
    step = BATCH_SIZE
    while True:
        stop = start + step
        encoder_in = to_matrix(x[start:stop], 0)
        target = to_matrix(y[start:stop], 0)
        # Every column is assigned below, so no zero-initialisation is needed.
        decoder_in = np.empty_like(target)
        decoder_in[:, 0] = 1 # START
        decoder_in[:, 1:] = target[:, :-1]
        yield [encoder_in, decoder_in], target
        start = stop
        # Start over once the next offset reaches the last multiple of BATCH_SIZE.
        if start >= len(x)//BATCH_SIZE*BATCH_SIZE:
            start = 0
            
def _schedule(epoch, lr):
    if epoch < 3:
        return lr
    else:
        return lr * 0.97**epoch

def mend_proba(p):
    """Return a renormalised copy of ``p`` in which entry 0 is suppressed.

    The log-probability of index 0 is forced to -100 and the result is passed
    through a softmax, so the remaining entries keep their relative
    proportions. The input is not modified.
    """
    log_p = np.log(p)
    log_p[0] = -100
    weights = np.exp(log_p)
    return weights / np.sum(weights)

def sample(proba, temp=1.0, mend=False, method='choice'):
    """Draw one index from the distribution ``proba``.

    Args:
        proba: 1-D array of probabilities. It is never modified.
        temp: softmax temperature; values other than 1.0 rescale the
            log-probabilities by ``1/temp`` and renormalise.
        mend: when True, suppress index 0 first (see mend_proba).
        method: ``'choice'`` draws with ``np.random.choice``;
            ``'multinomial'`` draws with ``np.random.multinomial``.

    Returns:
        The sampled index.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    if mend:
        proba = mend_proba(proba)
    if temp != 1.0:
        proba = np.log(proba) / temp
        proba = np.exp(proba)/np.sum(np.exp(proba))
    if method == 'choice':
        return np.random.choice(len(proba), p=proba)
    elif method == 'multinomial':
        # Scale slightly below 1 so rounding cannot push the sum above 1.
        # Work on a new array: an in-place `*=` would silently alter the
        # caller's probabilities.
        scaled = np.asarray(proba) * 0.9999
        return np.argmax(np.random.multinomial(1, scaled, 1))
    raise ValueError("unknown sampling method: %r" % (method,))

# Rebinds MAX_LEN: get_generation stops once the generated sequence reaches this
# length, and to_matrix uses it as the cap on batch width from now on.
MAX_LEN = 100

def get_generation(seed="", temp=1.0, mend=False, method='choice'):
    """Generate a reply to ``seed`` by sampling one character at a time.

    Args:
        seed: the question text fed to the encoder.
        temp, mend, method: forwarded to sample().

    Returns:
        The generated text, without the START token and without the
        terminating PAD/END token.

    Generation stops when PAD (0) or END (2) is sampled, or when the sequence
    (START included) reaches MAX_LEN ids.
    """
    seq = [1] # <START>
    x = np.array([to_sequence(seed)])
    while len(seq) < MAX_LEN:
        d = np.array([seq])
        y = model.predict([x,d])
        p = y[0][-1]  # distribution for the next character
        idx = sample(p, temp=temp, mend=mend, method=method)
        if idx in [0,2]: #<PAD><END>
            break
        seq.append(idx)
    # The terminator is never stored, so only START is stripped here. Slicing
    # off the last element unconditionally would drop a real character
    # whenever the loop ends on the length limit.
    return to_text(seq[1:])

def generate_epoch_end(e, logs):
    """Epoch-end hook: print a sample reply to a fixed question.

    Args:
        e: zero-based epoch index; printed as ``e+1``.
        logs: metrics dictionary supplied by Keras (unused).
    """
    reply = get_generation(seed="Hello! How are you?")
    print("\n\nEpoch = ", e+1, ", Sample generation = -Hello! How are you? -" + reply + "\n")

def train_model(initial_epoch, n_epochs, train_gen):
    """Fit the global ``model`` on batches drawn from ``train_gen``.

    Args:
        initial_epoch: epoch index to start from.
        n_epochs: index of the final epoch (passed to ``fit`` as ``epochs``).
        train_gen: generator yielding ``([encoder_in, decoder_in], target)``.

    One epoch consists of ``len(Qt)//BATCH_SIZE`` steps. After every epoch the
    learning rate is rescheduled, a checkpoint is written to
    ``./model.<epoch>.hdf5`` and a sample reply is printed.
    """
    epoch_callbacks = [
        tf.keras.callbacks.LearningRateScheduler(_schedule),
        tf.keras.callbacks.ModelCheckpoint(filepath='./model.{epoch:03d}.hdf5'),
        tf.keras.callbacks.LambdaCallback(on_epoch_end=generate_epoch_end),
    ]
    model.fit(
        train_gen,
        epochs=n_epochs,
        steps_per_epoch=len(Qt)//BATCH_SIZE,
        initial_epoch=initial_epoch,
        callbacks=epoch_callbacks,
    )

# Sanity check: questions and answers should contain the same number of examples.
len(Qt), len(At)

(53020, 53020)

In [None]:
# Recompile with a fresh Adam optimizer at learning rate 0.001.
# `learning_rate` replaces the deprecated `lr` argument name.
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# First training run: up to 30 epochs, starting from epoch 0.
train_model(0, 30, train_gen(Qt, At))

Epoch 1/30

Epoch =  1 , Sample generation = -Hello! How are you? -I dat son mis?

Epoch 2/30

Epoch =  2 , Sample generation = -Hello! How are you? -I mily Wace ce warey duthe purill, do> men.

Epoch 3/30

Epoch =  3 , Sample generation = -Hello! How are you? -Nag It there intsy sre sis?

Epoch 4/30

Epoch =  4 , Sample generation = -Hello! How are you? -Geathed. Jill there-ne wim timna?

Epoch 5/30

Epoch =  5 , Sample generation = -Hello! How are you? -Ho poull them bossine, ure... So you wear it?

Epoch 6/30

Epoch =  6 , Sample generation = -Hello! How are you? -Jeybcyour Ker forsehing a lithlh ofr ifyer rave <u> redalia...

Epoch 7/30

Epoch =  7 , Sample generation = -Hello! How are you? -Him anoaK madn of courlifat E0

Epoch 8/30

Epoch =  8 , Sample generation = -Hello! How are you? -Foonn, brare.

Epoch 9/30

Epoch =  9 , Sample generation = -Hello! How are you? -It's it.  I'm anyead wark. O-

Epoch 10/30

Epoch =  10 , Sample generation = -Hello! How are you? -Don't crime an

KeyboardInterrupt: ignored

In [None]:
# Continue training with a fresh Adam optimizer at a lower learning rate;
# epoch numbering resumes at 30 and runs up to 50.
# `learning_rate` replaces the deprecated `lr` argument name.
opt = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
train_model(30, 50, train_gen(Qt, At))

Epoch 31/50

Epoch =  31 , Sample generation = -Hello! How are you? -Yesy! Hustenyul!

Epoch 32/50

Epoch =  32 , Sample generation = -Hello! How are you? -How do you know who the man you ton?

Epoch 33/50

Epoch =  33 , Sample generation = -Hello! How are you? -Sigh the clazy?

Epoch 34/50

Epoch =  34 , Sample generation = -Hello! How are you? -Thank you.

Epoch 35/50

Epoch =  35 , Sample generation = -Hello! How are you? -Then found one off of the hands! How did you tell you?

Epoch 36/50
 370/1656 [=====>........................] - ETA: 1:06 - loss: 0.6632 - accuracy: 0.6261

KeyboardInterrupt: ignored

In [None]:
# Demo: sample twenty candidate replies to one fixed question.
model.reset_states()

seed = "Where is the money?"

print("Dialog question: ", seed)
print("Dialog possible answers:")

# Replies are numbered from 1; each call samples independently.
for number in range(1, 21):
    print(str(number)+'.', get_generation(seed))

Dialog question:  Where is the money?
Dialog possible answers:
1. Yes, they dign't even tund to be all right.
2. What's up, Bamem...
3. ...do you think I'm very suspised.
4. It's all digler's around.  Louis?
5. What's the point?
6. Flack in your par.
7. What'll you take any time?  Tele�non, and I can't find it.
8. Oh ...-never hands a maroon car.
9. Yes.  From the bardrocn dick here!
10. I think I'm not going to speak about this brave...
11. Shrapminel? How go you want going to get in?
12. It's Hine.
13. Guess she talked to Maxle Party.
14. You don't know...
15. This is good news...
16. I thought it's the justom, fester. There are ode what?..
17. Well being Mictor?
18. You can't Marter, Nick I come --
19. Who's the river reaw things agend?
20. Will you finally sue doing it?
