## CORNELL MOVIE CORPUS

In [63]:
import json
import re
import ast
from tqdm import tqdm
import pickle

In [56]:
movie_lines_path = "/Users/rohan/Desktop/projects/python-chatbot/movie_lines.txt"
movie_conversations_path = "/Users/rohan/Desktop/projects/python-chatbot/movie_conversations.txt"
movie_conversation_fields = ['char1ID', 'char2ID', 'movieID', 'utterances']
movie_lines_fields = ['lineId', 'charId', 'movieId', 'charname', 'text']

def splitConversations(max_len=20, fast_preprocessing=True):
    conversations = getconv()
    data = []
    for i, conversation in enumerate(tqdm(conversations)):
        lines = conversation['lines']
        for i in range(len(lines) - 1):
            request = preprocess_text(lines[i])
            reply = preprocess_text(lines[i + 1])
            if 0 < len(request.split()) <= max_len and 0 < len(reply.split()) <= max_len:
                data += [{'request': request,'reply':reply}]
    return data

def getlines():
    with open(movie_lines_path, 'r', encoding='iso-8859-1') as f:
        lines = {}
        for line in f:
            values = line.split(' +++$+++ ')
            lineobj = {}
            for i, field in enumerate(movie_lines_fields):
                lineobj[field] = values[i]
                #lineobj['id'] = int(re.sub('L','',lineobj['lineId']))
            lines[lineobj['lineId']] = lineobj
    return(lines)

def getconv():
    lines = getlines()
    with open(movie_conversations_path, 'r', encoding='iso-8859-1') as f:
        conversations = []
        for line in f:
            values = line.split(' +++$+++ ')
            #print(len(values))
            lineobj = {}
            for i, field in enumerate(movie_conversation_fields):
                lineobj[field] = values[i]
            lineIds = ast.literal_eval(lineobj["utterances"])
            lineobj['lines'] = []
            for lineid in lineIds:
                lineobj['lines'].append(lines[lineid]['text'])
            conversations.append(lineobj)
    return conversations

def preprocess_text(line):
    GOOD_SYMBOLS_RE = re.compile('[^0-9a-z ]')
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#+_]')
    REPLACE_SEVERAL_SPACES = re.compile('\s+')
    line = line.lower()
    line = re.sub('\n','', line)
    line = re.sub('can\'t','cannot', line)
    line = re.sub('won\'t','will not', line)
    line = re.sub('\'ll',' will', line)
    line = re.sub('n\' t',' not',line)
    line = re.sub('\'m',' am', line)
    line = re.sub('\'d',' would', line)
    line = re.sub('\'re',' are', line)
    line = re.sub('\'ve',' have', line)
    line = re.sub('\'s',' is', line)
    line = REPLACE_BY_SPACE_RE.sub(' ', line)
    line = GOOD_SYMBOLS_RE.sub('', line)
    line = REPLACE_SEVERAL_SPACES.sub(' ', line)
    return line.strip()

In [57]:
cornell_data = splitConversations()

100%|██████████| 83097/83097 [00:07<00:00, 10548.63it/s]


In [58]:
cornell_data[1:5]

[{'request': 'not the hacking and gagging and spitting part please',
  'reply': 'okay then how bout we try out some french cuisine saturday night'},
 {'request': 'you are asking me out that is so cute what is your name again',
  'reply': 'forget it'},
 {'request': 'no no it is my fault we didnt have a proper introduction',
  'reply': 'cameron'},
 {'request': 'gosh if only we could find kat a boyfriend',
  'reply': 'let me see what i can do'}]

In [59]:
len(cornell_data)

167497

In [60]:
pickle.dump(cornell_data, open('cornell_data.pkl','wb'))

## -----------------------------------------------------------------------------------------------------
## -----------------------------------------------------------------------------------------------------
## -----------------------------------------------------------------------------------------------------

## OpenSubs

In [200]:
import pprint
import json
import os
import sys
import xml.etree.ElementTree as ET
import datetime
from time import time
import nltk

In [209]:
open_subtitles_path = "/Users/rohan/Desktop/projects/python-chatbot/OpenSubtitles/xml/en/"

In [210]:
files = os.listdir(open_subtitles_path)
xml_files = []
for i in files:
    try:
        filelist = os.listdir(open_subtitles_path+i)
        for file in filelist:
            try:
                finallist = os.listdir(open_subtitles_path+i+"/"+file)
                for x in finallist:
                    if x.endswith('.xml'):
                        xml_files.append(open_subtitles_path+i+"/"+file+"/"+x)
            except NotADirectoryError:
                if file.endswith('.xml'):
                    xml_files.append(open_subtitles_path+i+"/"+file)
    except NotADirectoryError:
        if file.endswith('.xml'):
                xml_files.append(open_subtitles_path+i)

In [211]:
def genList(tree):
    root = tree.getroot()
    timeFormat = '%H:%M:%S'
    maxDelta = datetime.timedelta(seconds=1)
    startTime = datetime.datetime.min
    strbuf = ''
    sentList = []
    for child in root:
        for elem in child:
            if elem.tag == 'time':
                try:
                    elemID = elem.attrib['id']
                    elemVal = elem.attrib['value'][:-4]
                    if elemID[-1] == 'S':
                        startTime = datetime.datetime.strptime(elemVal, timeFormat)
                    else:
                        sentList.append((strbuf.strip(), startTime, datetime.datetime.strptime(elemVal, timeFormat)))
                        strbuf = ''
                except:
                    continue
            else:
                try:
                    strbuf = strbuf + " " + elem.text
                except:
                    pass
    conversations = []
    for idx in range(0, len(sentList) - 1):
        cur = sentList[idx]
        nxt = sentList[idx + 1]
        if nxt[1] - cur[2] <= maxDelta and cur and nxt:
            tmp = {}
            tmp["lines"] = []
            tmp["lines"].append(getLine(cur[0]))
            tmp["lines"].append(getLine(nxt[0]))
            if filterqa(tmp):
                conversations.append(tmp)
    return conversations

def getLine(sentence):
    tag_re = re.compile(r'(<!--.*?-->|<[^>]*>)')
    line = {}
    line["text"] = tag_re.sub('', sentence).replace('\\\'','\'').strip().lower()
    return line


def filterqa(lines):
    # Use the followint to customize filtering of QA pairs
    startwords = ("what", "how", "when", "why", "where", "do", "did", "is", "are", "can", "could", "would", "will")
    question = lines["lines"][0]["text"]
    if not question.endswith('?'):
        return False
    if not question.split(' ')[0] in startwords:
        return False
    return True

In [215]:
def getConversation(files):
    converations = []
    for i,file in enumerate(tqdm(files)):
        converations.extend(genList(ET.parse(file)))
    return converations

In [216]:
opensub_conversations = getConversation(xml_files)

100%|██████████| 2317/2317 [02:10<00:00, 17.73it/s]


In [222]:
for i, pair in enumerate(tqdm(opensub_conversations)):
    questions.append(pair['lines'][0]['text'])
    answers.append(pair['lines'][1]['text'])
#    print(pair['lines'][0])

100%|██████████| 68367/68367 [00:00<00:00, 965488.39it/s]


In [225]:
cornell = pd.DataFrame(columns=['question','answer'])
cornell['question'] = questions
cornell['answer'] = answers

In [226]:
cornell.to_csv("cornell_pairs.csv",index=False)

## -----------------------------------------------------------------------------------------------------
## -----------------------------------------------------------------------------------------------------
## -----------------------------------------------------------------------------------------------------

## N-GRAM MODEL

In [149]:
from collections import *

In [181]:
def nlg_train(questions, answers, order=4):
    lm = defaultdict(Counter)
   # for cur_order in range(order):
    pad = '~' * order
    for i in range(len(questions)):
        data = pad + questions[i] + answers[i]
        for j in range(len(data) - order):
            history, char = data[j:j+order], data[j+order]
            lm[history][char] +=1
    def normalize(counter):
        s = float(sum(counter.values()))
        return [(c,cnt/s) for c,cnt in counter.items()]
    outlm = {hist:normalize(chars) for hist, chars in lm.items()}
    return outlm

In [173]:
from random import random

def gen_letter(lm, history, order):
    history = history[-order:]
    dist = lm[history]
    x = random()
    for c, v in dist:
        x = x - v
        if x<=0: return c

In [187]:
def gen_text(question,lm, order, nletters=60):
    pad = '~' * order
    history = pad + question
    out = []
    for i in range(nletters):
        c = gen_letter(lm, history, order)
        history = history[-order:] + c
        out.append(c)
    return "".join(out)

In [186]:
lm = nlg_train(questions, answers, order=10)
print( gen_text(lm,10))

no no trouble back theyll stay in touchthats more like thatw


In [189]:
print(gen_text('hi there',lm,10))

KeyError: 'rk herehuh'