<a href="https://colab.research.google.com/github/princoo/P_chatbot/blob/main/P_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Download the corpus and put it in a ``data/`` directory (this notebook
# reads it from ``drive/MyDrive/data``).
#
# After that, let's import some necessities.
#

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import json


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
device  # bare last expression so the notebook displays the selected device

device(type='cpu')

In [3]:
# Location of the corpus: drive/MyDrive/data/movie-corpus (relative to the
# current working directory).
corpus_name = "movie-corpus"
drive_folder = os.path.join("drive", "MyDrive")
data_folder = os.path.join(drive_folder, "data")
corpus = os.path.join(data_folder, corpus_name)
def printLines(file, n=10):
    """Print the first ``n`` lines of ``file`` as raw bytes objects.

    The file is opened in binary mode, so each line is shown as a ``b'...'``
    literal including its trailing newline.

    Args:
        file: Path of the file to preview.
        n: Number of leading lines to print (default 10).
    """
    with open(file, 'rb') as handle:
        head = handle.readlines()[:n]
    for raw in head:
        print(raw)
# Preview the first 10 raw lines (printLines' default) of the utterances file.
printLines(os.path.join(corpus, "utterances.jsonl"))


b'{"id": "L1045", "conversation_id": "L1044", "text": "They do not!", "speaker": "u0", "meta": {"movie_id": "m0", "parsed": [{"rt": 1, "toks": [{"tok": "They", "tag": "PRP", "dep": "nsubj", "up": 1, "dn": []}, {"tok": "do", "tag": "VBP", "dep": "ROOT", "dn": [0, 2, 3]}, {"tok": "not", "tag": "RB", "dep": "neg", "up": 1, "dn": []}, {"tok": "!", "tag": ".", "dep": "punct", "up": 1, "dn": []}]}]}, "reply-to": "L1044", "timestamp": null, "vectors": []}\n'
b'{"id": "L1044", "conversation_id": "L1044", "text": "They do to!", "speaker": "u2", "meta": {"movie_id": "m0", "parsed": [{"rt": 1, "toks": [{"tok": "They", "tag": "PRP", "dep": "nsubj", "up": 1, "dn": []}, {"tok": "do", "tag": "VBP", "dep": "ROOT", "dn": [0, 2, 3]}, {"tok": "to", "tag": "TO", "dep": "dobj", "up": 1, "dn": []}, {"tok": "!", "tag": ".", "dep": "punct", "up": 1, "dn": []}]}]}, "reply-to": null, "timestamp": null, "vectors": []}\n'
b'{"id": "L985", "conversation_id": "L984", "text": "I hope so.", "speaker": "u0", "meta": {

In [4]:
# Parses a JSON-lines utterance file into line and conversation lookups
def loadLinesAndConversations(fileName):
    """Read ``fileName`` (one JSON object per line) into two dictionaries.

    Args:
        fileName: Path to a UTF-8 JSON-lines file whose records carry the
            keys ``id``, ``speaker``, ``text``, ``conversation_id`` and
            ``meta.movie_id``.

    Returns:
        A ``(lines, conversations)`` tuple.  ``lines`` maps a line id to a
        dict with ``lineID``, ``characterID`` and ``text``.  ``conversations``
        maps a conversation id to a dict with ``conversationID``, ``movieID``
        and ``lines`` (a list of the same line dicts).
    """
    lines = {}
    conversations = {}
    with open(fileName, 'r', encoding='utf-8') as f:
        for rawLine in f:
            record = json.loads(rawLine)

            # Line object: only the fields needed downstream
            utterance = {
                "lineID": record["id"],
                "characterID": record["speaker"],
                "text": record["text"],
            }
            lines[utterance["lineID"]] = utterance

            # Conversation object: create on first sight, otherwise extend
            convID = record["conversation_id"]
            known = conversations.get(convID)
            if known is None:
                conversations[convID] = {
                    "conversationID": convID,
                    "movieID": record["meta"]["movie_id"],
                    "lines": [utterance],
                }
            else:
                # Each newly read utterance goes to the FRONT of the list, so
                # the stored order is the reverse of the file order.
                known["lines"].insert(0, utterance)
    return lines, conversations


# Builds (input, reply) sentence pairs from consecutive conversation lines
def extractSentencePairs(conversations):
    """Pair every line of each conversation with the line that follows it.

    Args:
        conversations: Mapping whose values are dicts holding a ``"lines"``
            list; each entry of that list has a ``"text"`` string.

    Returns:
        A list of ``[inputLine, targetLine]`` lists with surrounding
        whitespace stripped.  Pairs where either side is empty after
        stripping are dropped.  The last line of a conversation never
        appears as an input because nothing answers it.
    """
    qa_pairs = []
    for conv in conversations.values():
        turns = conv["lines"]
        # zip stops at the shorter sequence, so the final turn is only a target
        for current, following in zip(turns, turns[1:]):
            question = current["text"].strip()
            answer = following["text"].strip()
            if not question or not answer:
                continue
            qa_pairs.append([question, answer])
    return qa_pairs

In [5]:
# Path of the output file that will hold one sentence pair per line
datafile = os.path.join(corpus, "formatted_movie_lines.txt")

# Field separator used between the two sentences of a pair
delimiter = '\t'
# Decode escape sequences in the delimiter (a real tab is left unchanged)
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Initialize lines dict and conversations dict
lines = {}
conversations = {}
# Load lines and conversations from the raw utterances file
print("\nProcessing corpus into lines and conversations...")
lines, conversations = loadLinesAndConversations(os.path.join(corpus, "utterances.jsonl"))

# Write the pairs with csv.writer, one row per pair. With csv's default
# quoting, fields containing quotes or the delimiter are written quoted.
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)

# Print the first few raw lines of the new file as a sanity check
print("\nSample lines from file:")
printLines(datafile)


Processing corpus into lines and conversations...

Writing newly formatted file...

Sample lines from file:
b'They do to!\tThey do not!\n'
b'She okay?\tI hope so.\n'
b"Wow\tLet's go.\n"
b'"I\'m kidding.  You know how sometimes you just become this ""persona""?  And you don\'t know how to quit?"\tNo\n'
b"No\tOkay -- you're gonna need to learn how to lie.\n"
b'"The ""real yWhat good stuff?ou""."\tLike my fear of wearing pastels?\n'
b'do you listen to this crap?\tWhat crap?\n'
b"What crap?\tMe.  This endless ...blonde babble. I'm like, boring myself.\n"
b"Me.  This endless ...blonde babble. I'm like, boring myself.\tThank God!  If I had to hear one more story about your coiffure...\n"
b'"Then Guillermo says, ""If you go any lighter, you\'re gonna look like an extra on 90210."""\tNo...\n'


In [1]:
PAD_TOKEN = 0 # Used for padding short sentences
SOS_TOKEN = 1 # Start-of-sentence token
EOS_TOKEN = 2 # End-of-sentence token

class Voc:
  """Vocabulary mapping words to integer indexes and back.

  Indexes 0-2 are reserved for the PAD, SOS and EOS tokens; real words are
  numbered from 3 upwards in the order they are first added.

  Attributes:
    name: Label given to this vocabulary.
    trimmed: True once ``trim`` has run (it only ever runs once).
    word2index: word -> index.
    word2count: word -> number of times the word was added.
    index2word: index -> word, including the three reserved tokens.
    num_words: Total number of entries, reserved tokens included.
  """

  def __init__(self,name):
    self.name = name
    self.trimmed = False
    self._reset()

  def _reset(self):
    """(Re)initialise the lookup tables so only the reserved tokens remain."""
    self.word2index = {}
    self.word2count = {}
    self.index2word = {PAD_TOKEN:"PAD",SOS_TOKEN:"SOS",EOS_TOKEN:"EOS"}
    self.num_words = 3 # we are counting SOS,PAD,EOS

  def addSentence(self,sentence):
    """Add every space-separated token of ``sentence`` to the vocabulary."""
    # Split on a single space (not arbitrary whitespace): consecutive spaces
    # or an empty sentence yield '' tokens, which are added like any word.
    for word in sentence.split(' '):
      self.addWord(word)

  def addWord(self,word):
    """Register ``word``: assign the next index if new, else bump its count."""
    if word not in self.word2index:
      self.word2index[word] = self.num_words
      self.word2count[word] = 1
      self.index2word[self.num_words] = word
      self.num_words +=1
    else:
      self.word2count[word] +=1

  def trim(self, min_count):
    """Drop every word that was added fewer than ``min_count`` times.

    Runs at most once per instance; later calls return immediately.  The
    surviving words are re-added to freshly reset tables, so they receive
    new consecutive indexes and their counts restart at 1.

    Args:
      min_count: Minimum count a word needs in order to be kept.
    """
    if self.trimmed:
      return
    self.trimmed = True
    keep_words = [word for word, count in self.word2count.items() if count >= min_count]

    # Guard the ratio: an empty vocabulary would otherwise divide by zero.
    total = len(self.word2index)
    ratio = len(keep_words) / total if total else 0.0
    print('keep_words {} / {} = {:.4f}'.format(len(keep_words), total, ratio))

    # Rebuild the dictionaries so they contain only the words in keep_words
    self._reset()
    for word in keep_words:
      self.addWord(word)




In [None]:
MAX_LENGTH = 10 # maximum sentence length: filterPair keeps only sentences with fewer words than this

# Strips accents from a unicode string
def unicodeToAscii(s):
  """Return ``s`` with combining marks (accents) removed.

  The string is decomposed with NFD normalization and every character in the
  Unicode category ``Mn`` (nonspacing mark) is dropped.  Characters that have
  no such decomposition are left as they are.
  """
  return ''.join(c for c in unicodedata.normalize('NFD',s) if unicodedata.category(c) !='Mn')

def normalizeString(s):
  """Lowercase, strip accents and reduce ``s`` to letters plus ``. ! ?``.

  Steps: lowercase and trim, remove accents, put a space before each of
  ``.``, ``!`` and ``?`` so they become separate tokens, replace every run
  of other non-letter characters with a single space, then collapse
  whitespace.

  Args:
    s: Raw sentence text.

  Returns:
    The normalized sentence (may be the empty string).
  """
  s = unicodeToAscii(s.lower().strip())
  s = re.sub(r"([.!?])",r" \1", s) # adds a space before these punctuation marks
  # Keep letters and the sentence punctuation separated above.  The range
  # must be A-Z: "A-z" would also let through [ \ ] ^ _ and the backtick.
  s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
  s = re.sub(r"\s+",r" ",s).strip() # collapse whitespace runs to a single space and trim
  return s


def readVocs(datafile, corpus_name):
  """Read the formatted pairs file and create an empty vocabulary.

  Args:
    datafile: Path to a UTF-8 text file with one tab-separated sentence pair
      per line.
    corpus_name: Name passed to the new ``Voc``.

  Returns:
    ``(voc, pairs)`` where ``voc`` is a fresh ``Voc`` with no words added yet
    and ``pairs`` is a list of lists of normalized sentences, one per line.
  """
  # Read inside a context manager so the file handle is closed right away
  # instead of being left open until garbage collection.
  with open(datafile, encoding='utf-8') as f:
    lines = f.read().strip().split('\n')
  # Split every line into its tab-separated fields and normalize each one
  pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
  voc = Voc(corpus_name)
  return voc, pairs

def filterPair(pair):
  """Return True when both sentences of ``pair`` have fewer than MAX_LENGTH words.

  Words are counted by splitting on a single space.  The second sentence is
  only inspected when the first one passes.
  """
  if len(pair[0].split(' ')) >= MAX_LENGTH:
    return False
  return len(pair[1].split(' ')) < MAX_LENGTH

def filterpairs(pairs):
  """Return a new list holding only the pairs accepted by ``filterPair``."""
  return list(filter(filterPair, pairs))

# Builds the vocabulary and the filtered list of sentence pairs
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
  """Load sentence pairs from ``datafile``, filter them and count their words.

  Args:
    corpus: Not used by this function.
    corpus_name: Name given to the vocabulary.
    datafile: Path of the formatted pairs file handed to ``readVocs``.
    save_dir: Not used by this function.

  Returns:
    ``(voc, pairs)``: the populated vocabulary and the pairs that passed
    ``filterpairs``.
  """
  print("Start preparing data ...")
  voc, pairs = readVocs(datafile, corpus_name)
  total_read = len(pairs)
  print("Read {!s} sentence pairs".format(total_read))
  pairs = filterpairs(pairs)
  total_kept = len(pairs)
  print("Trimmed to {!s} sentence pairs".format(total_kept))
  print("counting words ...")
  for entry in pairs:
    # Feed the first sentence, then the second, into the vocabulary
    for position in (0, 1):
      voc.addSentence(entry[position])
  print("counted words:", voc.num_words)
  return voc, pairs



