In [1]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

In [2]:
req = requests.get('https://www.imsdb.com/scripts/Matrix,-The.html')
soup = BeautifulSoup(req.content, 'lxml')

In [5]:
script = soup.find('pre')

In [157]:
script.text[:1000]

"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\t\t\t\tTHE MATRIX\n\n\n\n\t\t\t\t\tWritten by\n\n\t\t\t\tLarry and Andy Wachowski\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\t\t\t\tApril 8, 1996\n\n\n\n\n\n\n\n\tFADE IN ON:\n\n\tCOMPUTER SCREEN\n\n\tSo close it has no boundaries.\n\n\tA blinking cursor pulses in the electric darkness like a\n\theart coursing with phosphorous light, burning beneath\n\tthe derma of black-neon glass.\n\n\tA PHONE begins to RING, we hear it as though we were \n\tmaking the call.  The cursor continues to throb,\n\trelentlessly patient, until --\n\n\t\t\t\t\tMAN (V.O.)\n\t\t\tHello?\n\n\tData now slashes across the screen, information flashing\n\tfaster than we read.\n\n\t\t\t\t\tSCREEN\n\t\t\tCall trans opt:  received.\n\t\t\t2-19-96  13:24:18  REC:Log>\n\n\t\t\t\t\tWOMAN (V.O.)\n\t\t\tI'm inside.  Anything to report?\n\n\tWe listen to the phone conversation as though we were on\n\ta third line.  The man's name is CYPHER.  The woman, \n\tTRINITY.\n\n\t\

names are preceded by 2 \n\n\t\t\t\t\t, dialogue is preceded by \n\t\t\t, stage direction is preceded by \n\n\t.  Problems that immediately come to mind: 
- making sure preceding spoken line is dialogue by someone else and not some transition
- making sure the format is the same between all scripts

With the neat formatting, that's all I can think of.

Leap of faith: conditions listed are going to apply to other scripts.  Unlikely, but let's see how it goes.

In [156]:
possible_characters = set([re.sub('[\n\t]', '', tagged.text) for tagged in script.findAll('b') if 'INT.' not in tagged.text and 'EXT.' not in tagged.text and ':' not in tagged.text])

In [114]:
all_lines = [chunk.split('\n\t\t\t') for chunk in script.text.split('\n\n\t\t\t\t\t')]

In [122]:
all_lines[101]

['NEO',
 'Shitshitshit.\n\n\n\tEXT.  SKYSCRAPER\n\n\tThe downtown office of CorTechs, a software development\n\tcompany.\n\n\n\tINT.  CORTECHS OFFICE\n\n\tThe main offices are along each wall, the windows\n\toverlooking downtown Chicago.\n\n\tRHINEHEART, the ultimate company man, lectures Neo\n\twithout looking at him, typing at his computer\n\tcontinuously.\n\n\tNeo stares at two window cleaners on a scaffolding\n\toutside, dragging their rubber squeegees down across the\n\tsurface of the glass.']

In [160]:
void_directions = lambda line: re.sub('\n\n+\t.*$', '', line, flags=re.S)
response = [(lines[0], ' '.join([void_directions(line) for line in lines[1:]])) for lines in all_lines if 'NEO' in lines[0]]

In [161]:
neo_lines_index = [index - 1 for index, lines in enumerate(all_lines) if 'NEO' in lines[0]]

In [178]:
probable_characters = [(character, len([line for line in all_lines if line[0]==character])) for character in possible_characters]
probable_characters = [character[0] for character in probable_characters if character[1] > 0]

In [163]:
prompts = [(lines[0], ' '.join([void_directions(line) for line in lines[1:]])) for index, lines in enumerate(all_lines) if index in neo_lines_index]

There is so much that will make this data garbage as far as naming it dialogue goes.  Movies will have a lot of stage direction that will influence the conversation.  Looking up a line doesn't guarantee that it's the same interaction by any means.  Right now, I'm just hoping to have grammar show up in the model, realizing how difficult it's going to be.  I don't want to cut off conversations just for the presence of stage direction, but they often signal change in conversation.  Maybe count line breaks in the future.

In [296]:
voidDirections = lambda line: re.sub('\n\n+\t.*$', '', line, flags=re.S)

def displayDialogue(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'lxml')
    script = soup.find('pre')
    all_lines = [chunk.split('\n\t\t\t') for chunk in script.text.split('\n\n\t\t\t\t\t')]

    possible_characters = set([re.sub('[\n\t]', '', tagged.text) for tagged in script.findAll('b') if 'INT.' not in tagged.text and 'EXT.' not in tagged.text and ':' not in tagged.text])
    probable_characters = [(character, len([line for line in all_lines if line[0]==character])) for character in possible_characters]
    probable_characters = [character[0] for character in probable_characters if character[1] > 10]

    dialogues = []
    for character in probable_characters:
        response = [(lines[0], ' '.join([voidDirections(line) for line in lines[1:]])) for lines in all_lines if lines[0] == character]
        char_indicies = [index - 1 for index, lines in enumerate(all_lines) if lines[0] == character]
        prompts = [(lines[0], ' '.join([voidDirections(line) for line in lines[1:]])) for index, lines in enumerate(all_lines) if index in char_indicies]
        char_dialogues = [(prompt[1], response[1]) for prompt, response in zip(prompts, response) 
                             if prompt[1] and response[1] 
                             and prompt[0] != response[0] 
                             and ':' not in ' '.join([prompt[1], response[1]])]
        dialogues.extend(char_dialogues)
    return dialogues

## Attempt 2

Well that's not going to generalize well apparently.

In [241]:
url = 'https://www.imsdb.com/scripts/Alien.html'

In [242]:
req = requests.get(url)
soup = BeautifulSoup(req.content, 'lxml')
script = soup.find('pre')

In [243]:
possible_characters = [re.sub('[\n\t]|\s{2,}', '', tagged.text) for tagged in script.findAll('b') 
                       if 'INT' not in tagged.text 
                       and 'EXT' not in tagged.text 
                       and ':' not in tagged.text]

In [246]:
probable_characters = [(character, len([line for line in possible_characters if line==character])) for character in set(possible_characters)]
probable_characters = [character[0] for character in probable_characters if character[1] > 5]

In [247]:
probable_characters

['BROUSSARD',
 'COMPUTER',
 'ROBY',
 "BROUSSARD (CONT'D)",
 'MELKONIS',
 "ROBY (CONT'D)",
 'STANDARD',
 'HUNTER',
 "STANDARD (CONT'D)",
 'FAUST']

# New site

https://www.weeklyscript.com/ .  No random tabs and spaces to format here, so screw you imsdb. **Edit** Mein Gott 

Notes:
- remove (.*)'s


In [103]:
url = 'https://www.weeklyscript.com/Big+Lebowski,%20The.html'

In [124]:
req = requests.get(url)
soup = BeautifulSoup(req.content, 'lxml')
script = soup.find('center')

In [174]:
len(re.findall('(?<=[^\n])\n{1}(?=[^\n])', script.text))

2790

In [153]:
script1 = re.sub('([A-Z]+\)*)\n', '\\1:\n', script.text)
script1 = re.sub('(?<=[^\n])\n{1}', '#', script1).replace('  ', ' ')

In [154]:
#get rid of 
script1 = re.sub('\([a-z].*\)', '', script1).split('\n')

In [9]:
def lineType(line):
    if re.search('^[^a-z]*:[^a-z]*$', line):
        return '@setting'
    if re.match('^[A-Z.,\'\ ]+:', line):
        return '@speak'
    else:
        return '@scene'

In [10]:
lineType('INT. WELTON ACADAMY DINING HALL - DAY - VARIOUS SHOTS:#')

'@setting'

In [11]:
lineType('He moves off, then stops in front of Charlie Dalton.#')

'@scene'

In [12]:
script_split = [lineType(line) for line in script1]

## Further Processing to be done:
- cut entire list up to very first spoken line
- split by setting lines
- write something to tell whether to split by scene lines

In [14]:
first_spoken_line = np.where([line=='@speak' for line in script_split])[0][0]

In [15]:
setting_lines = np.where([line=='@setting' for line in script_split])[0]

In [16]:
scene_lines = np.where([line=='@scene' for line in script_split])[0]

In [17]:
script1[23]

"CHUCKIE:#Well you know how he loves animals #right? Anyway, last week he's drivin' #home...##"

In [19]:
a = script1[first_spoken_line]

In [20]:
a[:a.index(':#')]

'CHUCKIE'

wondering if scene changes accompanied by scene description is what constitutes change of context.  We'll try this for now.

In [21]:
dialogues = []
dialogue = []
current_type = 'none'
current_speaker = 'none'

for line in script1[first_spoken_line:]:
    previous_type = current_type
    current_type = lineType(line)
    
    if previous_type == '@setting' and current_type == '@scene' and dialogue:
        dialogues.append(dialogue)
        dialogue = []
        current_speaker = 'none'
        
    #at some point u should refactor so that u take care of (cont.) and justify the existence of the if statement
    #also take out the check try except
    if current_type == '@speak':
        try:
            name_break = line.index(':#')
            previous_speaker = current_speaker
            current_speaker = line[:name_break]
            current_line = line[name_break+2:]
            if previous_speaker in current_speaker:
                dialogue[-1] = dialogue[-1] + ' ' + current_line
            else:
                dialogue.append(current_line)
        except:
            print(line)

In [22]:
interactions = []
for dialogue in dialogues:
    if len(dialogue) > 1:
        interactions.extend([(prompt.replace('#', ' '), response.replace('#', ' ')) for prompt, response in zip(dialogue[:-1], dialogue[1:])])

In [23]:
def get_words(line):
    result = re.sub('[^a-z^A-Z\s]', '', line).lower()
    result = re.sub('\s+', ' ', result)
    return result[:-1] # all of these end in new line, this gets rid of the last space

In [24]:
interactions = [(get_words(prompt), get_words(response)) for prompt, response in interactions]

yeah probably get rid of names and stuff.

In [29]:
interactions[399]

('not a chance', 'yup youre lookin at lucky thirteen')

In [25]:
interactions[250][0].split(' ')

['youre', 'in', 'a', 'safe', 'place', 'will']

In [26]:
len(interactions)

955

## Ideas to assist in convergence

Maybe replace all names with pronouns or sir and stuff and introducing yourself.  Leave it in for now, hoping most dialogue isn't just introductions or circle jerks.

For longer bits, it might be worth taking the first sentence as the reply and the last sentence as prompting the next response.

More noise: when two people respond to the same question.

In [152]:
dialogues[17]

['You suck.#',
 'What?#',
 "I've been sitting over there for #forty-five minutes waiting for you #to come talk to me. But I'm just #tired now and I have to go home and #I wasn't going to keep sitting there #waiting for you.#",
 "I'm Will.#",
 "Skylar. And by the way. That guy #over there is a real dick and I just #wanted you to know he didn't come #with us.#",
 'I kind of got that impression.#',
 "Well, look, I have to go. Gotta' get #up early and waste some more money #on my overpriced education.#",
 "I didn't mean you. Listen, maybe...#",
 "Here's my number.# Maybe we could go out for coffee #sometime?#",
 'Great, or maybe we could go somewhere #and just eat a bunch of caramels.#',
 'What?#',
 "When you think about it, it's just #as arbitrary as drinking coffee.#",
 '#Okay, sounds good.#',
 'Five minutes.#',
 'What?#',
 'I was trying to be smooth.##But at twelve-fifteen I was gonna #come over there and talk to you.#',
 "See, it's my life story. Five more #minutes and I would have got

## Download Scripts

In [30]:
path = 'scripts/'

In [270]:
main_url = 'https://www.weeklyscript.com/'

In [250]:
movies_url = 'https://www.weeklyscript.com/movies.htm'

In [251]:
req = requests.get(movies_url)
soup = BeautifulSoup(req.content, 'lxml')

In [256]:
link_tags = soup.find('table').find_all('a')[:-1]

In [259]:
tag = link_tags[0]

In [292]:
tag.parent

<b><a href="movies_A.htm">A</a></b>

In [271]:
alpha_links = [main_url + tag.attrs['href'] for tag in link_tags]

In [281]:
alpha_links[0]

'https://www.weeklyscript.com/movies_A.htm'

In [291]:
def makeScript(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'lxml')
    script = soup.find('center')
    return script

In [299]:
for alpha_link in alpha_links:
    req = requests.get(alpha_link)
    soup = BeautifulSoup(req.content, 'lxml')
    file_names = [tag.attrs['href'] for tag in soup.find_all('a') if tag.contents[0] == '.TXT']

    for file_name in file_names:
        link = main_url + file_name
        script = makeScript(link)
        path = './scripts/' + file_name
        with open(path, 'w') as f:
            f.write(script.text)

what if something goes wrong during the downloading?  gonna find out i guess.  probably jumped the gun on that.

## Barely Cleaned Up

In [5]:
#needs refactor to be way shorter and more readable
import re
import numpy as np

def makeDialogue(script_text):
    num_breaks_regex = getNormalLineBreaks(script_text)
    script1 = re.sub('([A-Z]+\)*)\n', '\\1:\n', script_text)
    script1 = re.sub(num_breaks_regex, '#', script1)#.replace('  ', ' ')
    #replaces those stage direction aside stuffs
    script1 = re.sub('\([a-z].*\)', '', script1).split('\n')
    #not sure this is actually important try pulling the first line another way
    script_split = [lineType(line) for line in script1]
    
    dialogues = []
    dialogue = []
    current_type = 'none'
    current_speaker = 'none'
    first_spoken_line = np.where([line=='@speak' for line in script_split])[0][0]

    for line in script1[first_spoken_line:]:
        previous_type = current_type
        current_type = lineType(line)

        if previous_type == '@setting' and current_type == '@scene' and dialogue:
            dialogues.append(dialogue)
            dialogue = []
            current_speaker = 'none'

        #at some point u should refactor so that u take care of (cont.) and justify the existence of the if statement
        #also take out the check try except
        if current_type == '@speak':
            try:
                name_break = line.index(':#')
                previous_speaker = current_speaker
                current_speaker = line[:name_break]
                current_line = line[name_break+2:]
                if previous_speaker in current_speaker:
                    dialogue[-1] = dialogue[-1] + ' ' + current_line
                else:
                    dialogue.append(current_line)
            except:
                print(line)
    
    interactions = []
    for dialogue in dialogues:
        if len(dialogue) > 1:
            interactions.extend([(prompt.replace('#', ' '), response.replace('#', ' ')) for prompt, response in zip(dialogue[:-1], dialogue[1:])])
    interactions = [(getWords(prompt), getWords(response)) for prompt, response in interactions]
    
    return interactions

def lineType(line):
    if re.search('^[^a-z]*:[^a-z]*$', line):
        return '@setting'
    if re.match('^[A-Z.,\'\ ]+:', line):
        return '@speak'
    else:
        return '@scene'

def getNormalLineBreaks(script_text):
    current_lines = len(re.findall('(?<=[^\n])\n{1}(?=[^\n])', script_text))
    #this will work for the small sample i've seen but maybe consider changing break condition
    for lines_for_break in range(1, 10) :
        expression = r'(?<=[^\n])\n{' + str(lines_for_break+1) + r'}(?=[^\n])'
        next_lines = len(re.findall(expression, script_text))
        if (current_lines - next_lines) > 200 and current_lines > 1000 and next_lines > 1000:
            regex_return = '(?<=[^\n])\n{' + str(lines_for_break) + '}'
            return regex_return
        current_lines = next_lines
    raise RuntimeError('Could not find difference between dialogue line breaks')
            
def getWords(line):
    result = re.sub('[^a-z^A-Z\s]', '', line).lower()
    result = re.sub('\s+', ' ', result)
    result = result[:-1].split() # all of these end in new line, this gets rid of the last space
    return ['BOS'] + result + ['EOS']

In [6]:
with open('./scripts/Aliens.txt', 'r') as f:
    script = f.read()
    interactions = makeDialogue(script)

In [8]:
from getInputs import WordIndex

In [None]:
language = WordIndex(0)