In [None]:
# Mount Google Drive so the data files and checkpoints under MyDrive are reachable.
from google.colab import drive
drive.mount("/content/drive")
# Relative Drive root: it only resolves to the mount point above while the
# working directory is /content.
# NOTE(review): HOME is not referenced by any cell visible here - the later
# cells spell out the 'drive/MyDrive/...' prefix by hand.
HOME = 'drive/MyDrive'

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

import nltk
# Tokenizer models used by nltk.word_tokenize.
nltk.download('punkt')
# NOTE(review): newer NLTK releases (3.8.2+ / 3.9) look up the 'punkt_tab'
# resource instead of the pickled 'punkt' models, and word_tokenize raises
# LookupError without it. On older releases this second download is expected to
# be harmless (it reports the problem and returns False) - confirm against the
# NLTK version pinned for this runtime.
nltk.download('punkt_tab')
import random
import pandas as pd
import numpy as np

In [None]:
dataset_quotes = []

import os
# Garak quotes, one per line. Every non-blank line is kept: iterating the file
# (instead of looping on readline() until the first empty string) means a blank
# line in the middle of the file no longer silently truncates the corpus.
with open("drive/MyDrive/Data/PlainSimpleGarak-data.txt", 'r') as f: # open in readonly mode
  for raw_line in f:
    line = raw_line.strip()
    if line:
      dataset_quotes.append(line)

# First token of every Garak quote, lowercased. Later cells draw random
# generation prompts from this list.
dataset_startwords = [nltk.word_tokenize(s)[0].lower() for s in dataset_quotes]

# `error_bad_lines` was removed in pandas 2.0. `on_bad_lines='warn'` (pandas >= 1.3)
# is its replacement for "skip malformed rows but still report them".
dataset_raw = pd.read_csv('drive/MyDrive/Data/TNG.csv', on_bad_lines='warn', engine='python')

# Despite the name, this holds *all* rows whose type is "speech"; the ASCII
# filtering happens in the loop below.
dataset_quotes_nonascii = list(dataset_raw[dataset_raw['type']=="speech"]["text"])

for s in dataset_quotes_nonascii:
  # Missing text cells come back from pandas as float NaN, which has no len().
  if not isinstance(s, str):
    continue
  # Keep pure-ASCII lines only, and drop lines containing "$" or "&".
  if s.isascii() and "$" not in s and "&" not in s:
    dataset_quotes.append(s)

In [None]:
def preprocess_txt(dataset_quotes, append=True):
  """Lowercase every string of ``dataset_quotes`` in place.

  When ``append`` equals True, the literal " END" marker is added after each
  lowercased string. The list is modified in place and nothing is returned, so
  calling this twice on the same list with ``append=True`` adds the marker twice.
  """
  add_marker = (append == True)
  for pos, quote in enumerate(dataset_quotes):
    lowered = quote.lower()
    dataset_quotes[pos] = lowered + " END" if add_marker else lowered

# Quotes are lowercased and tagged with the " END" marker; start words are only lowercased.
preprocess_txt(dataset_quotes, append=True)
preprocess_txt(dataset_startwords, append=False)

# Collapse duplicate start words. Neither list is sorted, so the order of
# dataset_startwords is whatever set iteration yields.
dataset_startwords = list(set(dataset_startwords))

In [None]:
def encode(string, word2index):
  """Tokenize ``string`` with NLTK and map the tokens to vocabulary indices.

  Tokens missing from ``word2index`` fall back to the index stored under the
  'UNKNOWN' key, when that key exists, instead of raising. The string is not
  lowercased here; callers are expected to pass text in the same form the
  vocabulary was built from.

  Args:
    string: text to encode.
    word2index: mapping from token to integer index.

  Returns:
    A ``torch.LongTensor`` of shape (1, number_of_tokens).

  Raises:
    KeyError: a token is missing and ``word2index`` has no 'UNKNOWN' entry.
  """
  unknown_index = word2index.get('UNKNOWN')
  indices = []
  for wd in nltk.word_tokenize(string):
    index = word2index.get(wd, unknown_index)
    if index is None:
      # No fallback available: keep the original failure mode.
      raise KeyError(wd)
    indices.append(index)
  return torch.LongTensor([indices])

def decode(vec, index2word):
  """Translate each index of ``vec`` through ``index2word``.

  Returns a list the same length as ``vec``; an index that has no entry in
  ``index2word`` yields ``None`` at that position.
  """
  lookup = index2word.get
  return list(map(lookup, vec))

class Dataset(torch.utils.data.Dataset):
  """Sliding-window next-token dataset over a flat token list.

  Item ``i`` is the pair ``(window, shifted)``, where ``window`` holds the
  ``seq_len`` token indices starting at position ``i`` and ``shifted`` is the
  same window moved one position to the right.

  Args:
    txt: iterable of tokens; each one must be a key of ``word2index``.
    seq_len: number of tokens per window.
    word2index: mapping from token to integer index.
  """
  def __init__(self, txt, seq_len, word2index):
    self.encoded = [word2index[wd] for wd in txt]
    self.seq_len = seq_len

  def __len__(self):
    # Clamp at zero: a corpus shorter than seq_len would otherwise produce a
    # negative length, which makes len() (and therefore DataLoader) raise.
    return max(0, len(self.encoded) - self.seq_len)

  def __getitem__(self, index):
    start, stop = index, index + self.seq_len
    window = torch.tensor(self.encoded[start:stop])
    shifted = torch.tensor(self.encoded[start + 1:stop + 1])
    return (window, shifted)

# Flatten all quotes into a single token stream.
txt_quotes = nltk.word_tokenize(" ".join(dataset_quotes))

# Alphabetically sorted unique tokens; indices start at 1 because 0 is
# reserved for the 'UNKNOWN' entry added below.
list_words = sorted(set(txt_quotes))

word2index = dict(zip(list_words, range(1, len(list_words) + 1)))
word2index['UNKNOWN'] = 0
# Reverse lookup used when printing model output.
index2word = dict((idx, tkn) for tkn, idx in word2index.items())

In [None]:
class Net_variant(nn.Module):
  """Word-level language model: one shared embedding feeding two parallel GRU heads.

  Each forward pass sends the embedded input through exactly one of the two
  (GRU -> Linear) heads, picked with ``random.randrange(2)``. The pick happens
  regardless of train/eval mode, so repeated calls on the same input can
  return different logits.

  Args:
    embed_size: number of rows in the embedding table and width of the output
      logits (the notebook passes the vocabulary size).
    input_dim: embedding dimension, which is also the GRU input size.
    hidden_dim: GRU hidden size.
    batch_first: forwarded to ``nn.GRU``.
    n_layers: number of stacked GRU layers in each head.
    dropout: GRU inter-layer dropout; only passed on when ``n_layers > 1``.
  """
  def __init__(self, embed_size, input_dim, hidden_dim, batch_first=True, n_layers = 1, dropout = 0.2):
    super().__init__()

    self.n_layers = n_layers
    self.hidden_dim = hidden_dim

    # nn.GRU applies dropout between stacked layers only. With a single layer a
    # non-zero value does nothing except emit a UserWarning, so pass 0 then.
    rnn_dropout = dropout if n_layers > 1 else 0.0

    #shared embedding layer
    self.embedding_layer = nn.Embedding(num_embeddings=embed_size, embedding_dim=input_dim)

    #1
    self.rnn_layer1 = nn.GRU(input_dim, hidden_dim, batch_first=batch_first, num_layers=n_layers, dropout=rnn_dropout)
    self.linear1 = nn.Linear(hidden_dim, embed_size)

    #2
    self.rnn_layer2 = nn.GRU(input_dim, hidden_dim, batch_first=batch_first, num_layers=n_layers, dropout=rnn_dropout)
    self.linear2 = nn.Linear(hidden_dim, embed_size)

  def forward(self, x):
    """Return logits over ``embed_size`` classes for every position of ``x``.

    ``x`` holds token indices. The final hidden state of the GRU is discarded.
    """
    embedded = self.embedding_layer(x)

    # Coin flip between the two heads (also at inference time).
    if random.randrange(2) == 0:
      rnn, head = self.rnn_layer1, self.linear1
    else:
      rnn, head = self.rnn_layer2, self.linear2

    rnn_out, _ = rnn(embedded)
    return head(rnn_out)

In [None]:
# Model dimensions: the vocabulary size (including the 'UNKNOWN' entry) drives
# both the embedding table and the output layer; the other two are the
# embedding width and the GRU hidden width.
vocab_size = len(word2index)
input_size, hidden_size = 128, 256

In [None]:
# Fresh model on the GPU (Module.cuda() moves the parameters and returns the module).
model = Net_variant(
    embed_size=vocab_size,
    input_dim=input_size,
    hidden_dim=hidden_size,
    batch_first=True,
).cuda()

# Token-level cross entropy, Adam with its default settings.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Pre-training data: windows of 6 tokens over the full token stream.
dataset = Dataset(txt=txt_quotes, seq_len=6, word2index=word2index)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

In [None]:
# Epoch whose checkpoint should be restored; 0 means "start from scratch".
epoch_to_load = 0

if epoch_to_load != 0:
  # NOTE: torch.load unpickles the file - only load checkpoints you wrote yourself.
  checkpoint = torch.load(f"drive/MyDrive/Data/Checkpoint1/CPOINT-{epoch_to_load}")
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  # Training resumes with the epoch after the restored one.
  epoch_to_load += 1

In [None]:
def test_model(model, word2index, index2word, string="", maxlen=25, verbose=False):
  """Greedily extend ``string`` with ``model`` and print the generated text.

  The prompt is encoded with ``encode``, then up to ``maxlen`` tokens are
  appended one at a time, each being the arg-max prediction for the last
  position. Generation stops early as soon as the 'END' index occurs anywhere
  in the sequence, the prompt included. The model is left in eval mode.

  Args:
    model: network mapping a (1, seq) index tensor to (1, seq, vocab) logits.
    word2index: token -> index mapping; must contain 'END'.
    index2word: index -> token mapping used for printing.
    string: prompt text; must tokenize to at least one token.
    maxlen: maximum number of generated tokens.
    verbose: when true, print the input and per-position prediction at every step.

  Raises:
    ValueError: the prompt yields no tokens.
  """
  model.eval()

  # Run on whatever device the model lives on instead of assuming CUDA. The
  # CUDA fallback only applies to a parameter-less model.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device("cuda")

  eval_input = encode(string, word2index).to(device)
  print("INITIAL INPUT: " + string)

  if eval_input.size(1) == 0:
    raise ValueError("test_model needs a prompt with at least one token")

  if verbose:
    print("---")

  end_index = word2index['END']

  # Inference only: no autograd graph is needed while generating.
  with torch.no_grad():
    for _ in range(maxlen):
      output = model(eval_input)
      # argmax over the logits directly; softmax is monotonic and would not
      # change which index wins.
      pred = output.argmax(-1)

      if verbose:
        print("INPUT: " + " ".join( decode(eval_input.tolist()[0],index2word)))
        print("OUTPUT: " + " ".join( decode(pred[0].tolist(), index2word)))

      # Append the prediction made for the last position and feed it back in.
      eval_input = torch.cat((eval_input, pred[:, -1].unsqueeze(0)), 1)

      if end_index in eval_input:
        break

  print("GENERATED SEQUENCE: " + " ".join( decode(eval_input.tolist()[0],index2word)))
  print("")

In [None]:
# Pre-training loop: one pass over `dataloader` per epoch, followed by a short
# qualitative report and a checkpoint every fifth epoch.
for epoch in range(epoch_to_load, 126):
  model.train()

  # Named `inputs` rather than `input`, so the builtin input() is not shadowed
  # for the rest of the notebook session.
  for inputs, target in dataloader:
    optimizer.zero_grad()
    output = model(inputs.cuda())
    # CrossEntropyLoss expects the class dimension second, hence the transpose.
    loss = criterion(output.transpose(1, 2), target.cuda())
    loss.backward()
    optimizer.step()

  model.eval()
  # This is the loss of the last mini-batch only, not an epoch average.
  print("Epoch {:02d} / 126 Loss {:.4f}".format(epoch+1, loss.item()))

  # First sample of the last mini-batch: target, input and arg-max prediction.
  # argmax is taken on the logits; softmax would not change the winner.
  pred = output[0].argmax(-1)
  sample_report = "\n".join([
      "TARGET: " + " ".join( decode(target[0].tolist(),index2word)),
      "INPUT: " + " ".join( decode(inputs[0].tolist(),index2word)),
      "PREDICTION: " + " ".join(decode(pred.tolist(),index2word)),
  ])

  print("===========================================================================")
  print(sample_report)
  print("---------------------------------------------------------------------------")
  # NOTE(review): the original printed the very same sample a second time here;
  # the output is kept unchanged, but the second section was presumably meant
  # to show a different sample - confirm and adjust.
  print(sample_report)
  print("===========================================================================")

  randword = dataset_startwords[random.randrange(0,len(dataset_startwords))]

  # Each prompt is generated twice: the model picks one of its two heads at
  # random on every forward pass, so the two runs can differ.
  for prompt in (randword, "i am plain simple garak . " + randword):
    for _ in range(2):
      test_model(model, word2index, index2word, prompt)

  print("===========================================================================")

  if (epoch % 5 == 0):
    # The file name carries the zero-based epoch index.
    torch.save(
        {'model_state_dict': model.state_dict(),
         'optimizer_state_dict': optimizer.state_dict(),},
        'drive/MyDrive/Data/Checkpoint1/' + "CPOINT-" + str(epoch)
               )

  print()

In [None]:
# Fine-tuning corpus: the Garak quotes only. Every non-blank line is kept;
# iterating the file (instead of looping on readline() until the first empty
# string) means a blank line in the middle no longer truncates the corpus.
dataset_quotes = []
with open("drive/MyDrive/Data/PlainSimpleGarak-data.txt", 'r') as f: # open in readonly mode
  for raw_line in f:
    line = raw_line.strip()
    if line:
      dataset_quotes.append(line)

In [None]:
# NOTE(review): this repeats the preprocess_txt definition from an earlier cell.
def preprocess_txt(dataset_quotes, append=True):
  """Lowercase every string of ``dataset_quotes`` in place.

  When ``append`` equals True, the literal " END" marker is added after each
  lowercased string. The list is modified in place and nothing is returned, so
  calling this twice on the same list with ``append=True`` adds the marker twice.
  """
  add_marker = (append == True)
  for pos, quote in enumerate(dataset_quotes):
    lowered = quote.lower()
    dataset_quotes[pos] = lowered + " END" if add_marker else lowered

# The freshly reloaded quotes are lowercased and tagged with " END". The start
# words are lowercased and deduplicated again, reusing the list built by the
# earlier data-loading cell.
preprocess_txt(dataset_quotes, append=True)
preprocess_txt(dataset_startwords, append=False)

# Neither list is sorted, so the start-word order is whatever set iteration yields.
dataset_startwords = list(set(dataset_startwords))

In [None]:
# NOTE(review): this repeats the encode definition from an earlier cell.
def encode(string, word2index):
  """Tokenize ``string`` with NLTK and map the tokens to vocabulary indices.

  Tokens missing from ``word2index`` fall back to the index stored under the
  'UNKNOWN' key, when that key exists, instead of raising. The string is not
  lowercased here; callers are expected to pass text in the same form the
  vocabulary was built from.

  Args:
    string: text to encode.
    word2index: mapping from token to integer index.

  Returns:
    A ``torch.LongTensor`` of shape (1, number_of_tokens).

  Raises:
    KeyError: a token is missing and ``word2index`` has no 'UNKNOWN' entry.
  """
  unknown_index = word2index.get('UNKNOWN')
  indices = []
  for wd in nltk.word_tokenize(string):
    index = word2index.get(wd, unknown_index)
    if index is None:
      # No fallback available: keep the original failure mode.
      raise KeyError(wd)
    indices.append(index)
  return torch.LongTensor([indices])

# NOTE(review): this repeats the decode definition from an earlier cell.
def decode(vec, index2word):
  """Translate each index of ``vec`` through ``index2word``.

  Returns a list the same length as ``vec``; an index that has no entry in
  ``index2word`` yields ``None`` at that position.
  """
  lookup = index2word.get
  return list(map(lookup, vec))

# NOTE(review): this repeats the Dataset definition from an earlier cell.
class Dataset(torch.utils.data.Dataset):
  """Sliding-window next-token dataset over a flat token list.

  Item ``i`` is the pair ``(window, shifted)``, where ``window`` holds the
  ``seq_len`` token indices starting at position ``i`` and ``shifted`` is the
  same window moved one position to the right.

  Args:
    txt: iterable of tokens; each one must be a key of ``word2index``.
    seq_len: number of tokens per window.
    word2index: mapping from token to integer index.
  """
  def __init__(self, txt, seq_len, word2index):
    self.encoded = [word2index[wd] for wd in txt]
    self.seq_len = seq_len

  def __len__(self):
    # Clamp at zero: a corpus shorter than seq_len would otherwise produce a
    # negative length, which makes len() (and therefore DataLoader) raise.
    return max(0, len(self.encoded) - self.seq_len)

  def __getitem__(self, index):
    start, stop = index, index + self.seq_len
    window = torch.tensor(self.encoded[start:stop])
    shifted = torch.tensor(self.encoded[start + 1:stop + 1])
    return (window, shifted)

# Token stream of the Garak-only corpus. The vocabulary (word2index) is the one
# built earlier from the combined corpus and is not rebuilt here.
txt_quotes = nltk.word_tokenize(" ".join(dataset_quotes))

In [None]:
# Fresh model instance on the GPU; its weights are overwritten by the checkpoint
# loaded in the next cell.
model = Net_variant(
    embed_size=vocab_size,
    input_dim=input_size,
    hidden_dim=hidden_size,
    batch_first=True,
).cuda()

# New optimizer for the fine-tuning stage.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Fine-tuning data: windows of 10 tokens over the Garak-only stream, small batches.
dataset = Dataset(txt=txt_quotes, seq_len=10, word2index=word2index)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [None]:
# Start fine-tuning from the weights saved at pre-training epoch 125.
epoch_to_load = 125

if epoch_to_load != 0:
  # NOTE(review): the pre-training loop in this notebook writes files named
  # "CPOINT-<epoch>", while this cell reads "CPOINTB-<epoch>" - confirm that the
  # file exists under this name.
  # NOTE: torch.load unpickles the file - only load checkpoints you wrote yourself.
  checkpoint = torch.load(f"drive/MyDrive/Data/Checkpoint1/CPOINTB-{epoch_to_load}")
  model.load_state_dict(checkpoint['model_state_dict'])
  # Only the model weights are restored; the optimizer keeps its fresh state.
  #optimizer.load_state_dict(x['optimizer_state_dict'])
  epoch_to_load += 1

In [None]:
# Draw a random start word (not used by the call below, which has a fixed prompt).
randword = dataset_startwords[random.randrange(len(dataset_startwords))]

test_model(model, word2index, index2word, string="good doctor")

In [None]:
# Fine-tuning loop: one pass over `dataloader` per epoch, followed by a short
# qualitative report and a checkpoint whenever the epoch index is a multiple of 50.
for epoch in range(epoch_to_load, 201):
  model.train()

  # Named `inputs` rather than `input`, so the builtin input() is not shadowed
  # for the rest of the notebook session.
  for inputs, target in dataloader:
    optimizer.zero_grad()
    output = model(inputs.cuda())
    # CrossEntropyLoss expects the class dimension second, hence the transpose.
    loss = criterion(output.transpose(1, 2), target.cuda())
    loss.backward()
    optimizer.step()

  model.eval()
  # This is the loss of the last mini-batch only, not an epoch average.
  print("Epoch {:02d} / 201 Loss {:.4f}".format(epoch+1, loss.item()))

  # First sample of the last mini-batch: target, input and arg-max prediction.
  # argmax is taken on the logits; softmax would not change the winner.
  pred = output[0].argmax(-1)
  sample_report = "\n".join([
      "TARGET: " + " ".join( decode(target[0].tolist(),index2word)),
      "INPUT: " + " ".join( decode(inputs[0].tolist(),index2word)),
      "PREDICTION: " + " ".join(decode(pred.tolist(),index2word)),
  ])

  print("===========================================================================")
  print(sample_report)
  print("---------------------------------------------------------------------------")
  # NOTE(review): the original printed the very same sample a second time here;
  # the output is kept unchanged, but the second section was presumably meant
  # to show a different sample - confirm and adjust.
  print(sample_report)
  print("===========================================================================")

  randword = dataset_startwords[random.randrange(0,len(dataset_startwords))]

  # Each prompt is generated twice: the model picks one of its two heads at
  # random on every forward pass, so the two runs can differ.
  for prompt in (randword, "i am plain simple garak . " + randword):
    for _ in range(2):
      test_model(model, word2index, index2word, prompt)

  print("===========================================================================")

  if (epoch % 50 == 0):
    # The file name carries the zero-based epoch index.
    torch.save(
        {'model_state_dict': model.state_dict(),
         'optimizer_state_dict': optimizer.state_dict(),},
        'drive/MyDrive/Data/Checkpoint1/' + "CPOINT_FINETUNE-" + str(epoch)
               )

  print()

In [None]:
# Restore the fine-tuned weights for the generation cells below.
# NOTE(review): the fine-tuning loop visible in this notebook stops at epoch 200
# and only saves multiples of 50, so a "CPOINT_FINETUNE-1000" file must come from
# a different run - confirm that it exists.
epoch_to_load = 1000

if epoch_to_load != 0:
  # NOTE: torch.load unpickles the file - only load checkpoints you wrote yourself.
  checkpoint = torch.load(f"drive/MyDrive/Data/Checkpoint1/CPOINT_FINETUNE-{epoch_to_load}")
  model.load_state_dict(checkpoint['model_state_dict'])
  # Only the model weights are restored; the optimizer state is left untouched.
  #optimizer.load_state_dict(x['optimizer_state_dict'])
  epoch_to_load += 1

In [None]:
# Draw a random start word (not used by the call below, which has a fixed prompt).
randword = dataset_startwords[random.randrange(len(dataset_startwords))]

test_model(model, word2index, index2word, string="doctor bashir lying is a skill .")

In [None]:
# Same prompt twice: the model picks one of its two heads at random on every
# forward pass, so the two generations can differ.
for _ in range(2):
  test_model(model, word2index, index2word, "i am plain simple garak .")

In [None]:
# Same prompt twice: the model picks one of its two heads at random on every
# forward pass, so the two generations can differ.
for _ in range(2):
  test_model(model, word2index, index2word, "i'm not a spy doctor .")