In [1]:
import requests
from bs4 import BeautifulSoup
import re

In [2]:
req = requests.get('https://www.imsdb.com/scripts/Matrix,-The.html')
soup = BeautifulSoup(req.content, 'lxml')

In [5]:
script = soup.find('pre')

In [157]:
script.text[:1000]

"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\t\t\t\tTHE MATRIX\n\n\n\n\t\t\t\t\tWritten by\n\n\t\t\t\tLarry and Andy Wachowski\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\t\t\t\tApril 8, 1996\n\n\n\n\n\n\n\n\tFADE IN ON:\n\n\tCOMPUTER SCREEN\n\n\tSo close it has no boundaries.\n\n\tA blinking cursor pulses in the electric darkness like a\n\theart coursing with phosphorous light, burning beneath\n\tthe derma of black-neon glass.\n\n\tA PHONE begins to RING, we hear it as though we were \n\tmaking the call.  The cursor continues to throb,\n\trelentlessly patient, until --\n\n\t\t\t\t\tMAN (V.O.)\n\t\t\tHello?\n\n\tData now slashes across the screen, information flashing\n\tfaster than we read.\n\n\t\t\t\t\tSCREEN\n\t\t\tCall trans opt:  received.\n\t\t\t2-19-96  13:24:18  REC:Log>\n\n\t\t\t\t\tWOMAN (V.O.)\n\t\t\tI'm inside.  Anything to report?\n\n\tWe listen to the phone conversation as though we were on\n\ta third line.  The man's name is CYPHER.  The woman, \n\tTRINITY.\n\n\t\

names are preceded by 2 \n\n\t\t\t\t\t, dialogue is preceded by \n\t\t\t, stage direction is preceded by \n\n\t.  Problems that immediately come to mind: 
- making sure preceding spoken line is dialogue by someone else and not some transition
- making sure the format is the same between all scripts

With the neat formatting, that's all I can think of.

Leap of faith: conditions listed are going to apply to other scripts.  Unlikely, but let's see how it goes.

In [156]:
possible_characters = set([re.sub('[\n\t]', '', tagged.text) for tagged in script.findAll('b') if 'INT.' not in tagged.text and 'EXT.' not in tagged.text and ':' not in tagged.text])

In [114]:
all_lines = [chunk.split('\n\t\t\t') for chunk in script.text.split('\n\n\t\t\t\t\t')]

In [122]:
all_lines[101]

['NEO',
 'Shitshitshit.\n\n\n\tEXT.  SKYSCRAPER\n\n\tThe downtown office of CorTechs, a software development\n\tcompany.\n\n\n\tINT.  CORTECHS OFFICE\n\n\tThe main offices are along each wall, the windows\n\toverlooking downtown Chicago.\n\n\tRHINEHEART, the ultimate company man, lectures Neo\n\twithout looking at him, typing at his computer\n\tcontinuously.\n\n\tNeo stares at two window cleaners on a scaffolding\n\toutside, dragging their rubber squeegees down across the\n\tsurface of the glass.']

In [160]:
void_directions = lambda line: re.sub('\n\n+\t.*$', '', line, flags=re.S)
response = [(lines[0], ' '.join([void_directions(line) for line in lines[1:]])) for lines in all_lines if 'NEO' in lines[0]]

In [161]:
neo_lines_index = [index - 1 for index, lines in enumerate(all_lines) if 'NEO' in lines[0]]

In [178]:
probable_characters = [(character, len([line for line in all_lines if line[0]==character])) for character in possible_characters]
probable_characters = [character[0] for character in probable_characters if character[1] > 0]

In [163]:
prompts = [(lines[0], ' '.join([void_directions(line) for line in lines[1:]])) for index, lines in enumerate(all_lines) if index in neo_lines_index]

In [180]:
[(prompt, response) for prompt, response in zip(prompts, response) 
     if prompt[1] and response[1] 
     and prompt[0] != response[0] 
     and ':' not in ' '.join([prompt[1], response[1]])]

[(('VOICE (O.S.)', 'Hey, Tommy-boy!  You in there?'),
  ('NEO', 'What do you want, Anthony?')),
 (('ANTHONY',
   'I need your help, man.  Desperate. They got me, man.  The shackles of  fascism.'),
  ('NEO', 'You got the money this time?')),
 (('DUJOUR', "Why don't you come to the party with us?"),
  ('NEO', "I don't know.  I have to work tomorrow.")),
 (('DUJOUR', "Come on.  It'll be fun."),
  ('NEO', "Yeah, yeah.  Sure, I'll go.")),
 (('TRINITY', 'Hello, Neo.'), ('NEO', 'How did you know that --')),
 (('TRINITY',
   "I know a lot about you.  I've been wanting to meet you for some time."),
  ('NEO', 'Who are you?')),
 (('TRINITY', 'My name is Trinity.'),
  ('NEO',
   'Trinity?  The Trinity?  The Trinity that cracked the I.R.S. Kansas City D-Base?')),
 (('TRINITY', 'That was a long time ago.'), ('NEO', 'Gee-zus.')),
 (('TRINITY', 'What?'), ('NEO', 'I just thought... you were a guy.')),
 (('TRINITY', 'Most guys do.'),
  ('NEO', 'Do you want to go sorewhere and talk?')),
 (('TRINITY', "No

There is so much that will make this data garbage as far as naming it dialogue goes.  Movies will have a lot of stage direction that will influence the conversation.  Looking up a line doesn't guarantee that it's the same interaction by any means.  Right now, I'm just hoping to have grammar show up in the model, realizing how difficult it's going to be.  I don't want to cut off conversations just for the presence of stage direction, but they often signal change in conversation.  Maybe count line breaks in the future.

In [296]:
voidDirections = lambda line: re.sub('\n\n+\t.*$', '', line, flags=re.S)

def displayDialogue(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'lxml')
    script = soup.find('pre')
    all_lines = [chunk.split('\n\t\t\t') for chunk in script.text.split('\n\n\t\t\t\t\t')]

    possible_characters = set([re.sub('[\n\t]', '', tagged.text) for tagged in script.findAll('b') if 'INT.' not in tagged.text and 'EXT.' not in tagged.text and ':' not in tagged.text])
    probable_characters = [(character, len([line for line in all_lines if line[0]==character])) for character in possible_characters]
    probable_characters = [character[0] for character in probable_characters if character[1] > 10]

    dialogues = []
    for character in probable_characters:
        response = [(lines[0], ' '.join([voidDirections(line) for line in lines[1:]])) for lines in all_lines if lines[0] == character]
        char_indicies = [index - 1 for index, lines in enumerate(all_lines) if lines[0] == character]
        prompts = [(lines[0], ' '.join([voidDirections(line) for line in lines[1:]])) for index, lines in enumerate(all_lines) if index in char_indicies]
        char_dialogues = [(prompt[1], response[1]) for prompt, response in zip(prompts, response) 
                             if prompt[1] and response[1] 
                             and prompt[0] != response[0] 
                             and ':' not in ' '.join([prompt[1], response[1]])]
        dialogues.extend(char_dialogues)
    return dialogues

In [297]:
display_dialogue('https://www.imsdb.com/scripts/Matrix,-The.html')

[('Did you get anything from the room?',
  'Their next target.  The name is Neo.'),
 ("We'll need a search running.", "It's already begun."),
 ('No!', 'The others were lost.'),
 ("I'm sorry, Tank.",
  'There is a problem.  Reagan has failed to secure the hardware.'),
 ("Never send a human to do a machine's job.",
  "But if Reagan has failed, why haven't they pulled the plug?"),
 ('Perhaps we are asking the wrong questions.', "Or he doesn't know."),
 ('What were you doing?', "You don't know."),
 ('Know what?', "I think they're trying to save him."),
 ('There is no spoon.', 'Lower level --'),
 ('... help.', 'Only human...'),
 ("Sir!  Sir!  There was gunfire -- we've lost communication with the roof!",
  'Remain at your posts.'),
 ('But, sir -- the fire -- we should evacuate!',
  'You will do as you are ordered!'),
 ('The trace was completed.', 'We have their position.'),
 ('The extermination unit is in place.', 'Order the strike.'),
 ("You're the Oracle?",
  "Bingo.  I got to say I love 

In [210]:
display_dialogue('https://www.imsdb.com/scripts/Alien.html')



## Attempt 2

Well that's not going to generalize well apparently.

In [241]:
url = 'https://www.imsdb.com/scripts/Alien.html'

In [242]:
req = requests.get(url)
soup = BeautifulSoup(req.content, 'lxml')
script = soup.find('pre')

In [243]:
possible_characters = [re.sub('[\n\t]|\s{2,}', '', tagged.text) for tagged in script.findAll('b') 
                       if 'INT' not in tagged.text 
                       and 'EXT' not in tagged.text 
                       and ':' not in tagged.text]

In [246]:
probable_characters = [(character, len([line for line in possible_characters if line==character])) for character in set(possible_characters)]
probable_characters = [character[0] for character in probable_characters if character[1] > 5]

In [247]:
probable_characters

['BROUSSARD',
 'COMPUTER',
 'ROBY',
 "BROUSSARD (CONT'D)",
 'MELKONIS',
 "ROBY (CONT'D)",
 'STANDARD',
 'HUNTER',
 "STANDARD (CONT'D)",
 'FAUST']

In [284]:
str(script)



In [294]:
re.findall('\ *ROBY.*(\n.+\n*.*)', script.text)

['\n          Executive Officer.......Cautious but intelligent -- a survivor.\n\n     DELL BROUSSARD,',
 '\n               Oh... God... am I cold... \n\n                              BROUSSARD',
 '\n               I feel like shit... \n\n                              BROUSSARD',
 '\n     seats.\n\n                              BROUSSARD',
 '\n                    (putting down the cat)\n               Cattle ranch!',
 '\n               Right.  Fire up all systems.\n\n     They begin to throw switches, lighting up their consoles.  The control',
 '\n               Where are we?\n\n                              STANDARD',
 '\n               What the hell?\n\n     Standard picks up a microphone.',
 "\n               Chaz, I've got something here on my\n               security alert.  A high priority from",
 '\n                    (punches buttons)\n               Computer, you have signalled a',
 '\n               What?  Why?\n\n                              COMPUTER',
 '\n               Un

In [269]:
print(re.sub('\s{2,}', '', str(script)))

<pre><title>"Alien", early draft, by Dan O'Bannon</title><pre><u>ALIEN</u>(project formerly titled <u>STARBEAST</u>)Story by Dan O'Bannon &amp; Ronald ShusettScreenplay by Dan O'Bannon<b>1976
</b><b>SYNOPSIS
</b>En route back to Earth from a far part of the galaxy, the crew of thestarship SNARK intercepts a transmission in an <u>alien language</u>,originating from a nearby storm-shrouded planet.Mankind has waited centuries to contact another form of intelligentlife in the universe -- they decide to land and investigate.Theirsearch takes them to a wrecked alien spacecraft whose doors gape open-- it is dead and abandoned.Inside they find, among other strangethings, the skeleton of one of the unearthly space travellers.Certain clues in the wrecked ship lead them across the hostile surfaceof the planet to a primitive stone pyramid, the only remnant of avanished civilization.Beneath this pyramid they find an ancient tombfull of fantastic artifacts.Lying dormant in the tomb are centuries-old

# New site

https://www.weeklyscript.com/ .  No random tabs and spaces to format here, so screw you imsdb. **Edit** Mein Gott 

Notes:
- remove (.*)'s


In [4]:
url = 'https://www.weeklyscript.com/Good+Will+Hunting.html'

In [5]:
req = requests.get(url)
soup = BeautifulSoup(req.content, 'lxml')
script = soup.find('center')

In [6]:
script1 = re.sub('([A-Z]+\)*)\n', '\\1:\n', script.text)\
            .replace('\n\n', '#')\
            .replace('  ', ' ')

In [7]:
script1 = re.sub('\([a-z].*\)', '', script1).split('\n')

In [9]:
def lineType(line):
    if re.search('^[^a-z]*:[^a-z]*$', line):
        return '@setting'
    if re.match('^[A-Z.,\'\ ]+:', line):
        return '@speak'
    else:
        return '@scene'

In [10]:
lineType('INT. WELTON ACADAMY DINING HALL - DAY - VARIOUS SHOTS:#')

'@setting'

In [11]:
lineType('He moves off, then stops in front of Charlie Dalton.#')

'@scene'

In [12]:
script_split = [lineType(line) for line in script1]

## Further Processing to be done:
- cut entire list up to very first spoken line
- split by setting lines
- write something to tell whether to split by scene lines

In [13]:
import numpy as np

In [14]:
first_spoken_line = np.where([line=='@speak' for line in script_split])[0][0]

In [15]:
setting_lines = np.where([line=='@setting' for line in script_split])[0]

In [16]:
scene_lines = np.where([line=='@scene' for line in script_split])[0]

In [17]:
script1[23]

"CHUCKIE:#Well you know how he loves animals #right? Anyway, last week he's drivin' #home...##"

In [19]:
a = script1[first_spoken_line]

In [20]:
a[:a.index(':#')]

'CHUCKIE'

wondering if scene changes accompanied by scene description is what constitutes change of context.  We'll try this for now.

In [21]:
dialogues = []
dialogue = []
current_type = 'none'
current_speaker = 'none'

for line in script1[first_spoken_line:]:
    previous_type = current_type
    current_type = lineType(line)
    
    if previous_type == '@setting' and current_type == '@scene' and dialogue:
        dialogues.append(dialogue)
        dialogue = []
        current_speaker = 'none'
        
    #at some point u should refactor so that u take care of (cont.) and justify the existence of the if statement
    #also take out the check try except
    if current_type == '@speak':
        try:
            name_break = line.index(':#')
            previous_speaker = current_speaker
            current_speaker = line[:name_break]
            current_line = line[name_break+2:]
            if previous_speaker in current_speaker:
                dialogue[-1] = dialogue[-1] + ' ' + current_line
            else:
                dialogue.append(current_line)
        except:
            print(line)

In [22]:
interactions = []
for dialogue in dialogues:
    if len(dialogue) > 1:
        interactions.extend([(prompt.replace('#', ' '), response.replace('#', ' ')) for prompt, response in zip(dialogue[:-1], dialogue[1:])])

In [23]:
def get_words(line):
    result = re.sub('[^a-z^A-Z\s]', '', line).lower()
    result = re.sub('\s+', ' ', result)
    return result[:-1] # all of these end in new line, this gets rid of the last space

In [24]:
interactions = [(get_words(prompt), get_words(response)) for prompt, response in interactions]

yeah probably get rid of names and stuff.

In [29]:
interactions[399]

('not a chance', 'yup youre lookin at lucky thirteen')

In [25]:
interactions[250][0].split(' ')

['youre', 'in', 'a', 'safe', 'place', 'will']

In [26]:
len(interactions)

955

## Ideas to assist in convergence

Maybe replace all names with pronouns or sir and stuff and introducing yourself.  Leave it in for now, hoping most dialogue isn't just introductions or circle jerks.

For longer bits, it might be worth taking the first sentence as the reply and the last sentence as prompting the next response.

More noise: when two people respond to the same question.

In [152]:
dialogues[17]

['You suck.#',
 'What?#',
 "I've been sitting over there for #forty-five minutes waiting for you #to come talk to me. But I'm just #tired now and I have to go home and #I wasn't going to keep sitting there #waiting for you.#",
 "I'm Will.#",
 "Skylar. And by the way. That guy #over there is a real dick and I just #wanted you to know he didn't come #with us.#",
 'I kind of got that impression.#',
 "Well, look, I have to go. Gotta' get #up early and waste some more money #on my overpriced education.#",
 "I didn't mean you. Listen, maybe...#",
 "Here's my number.# Maybe we could go out for coffee #sometime?#",
 'Great, or maybe we could go somewhere #and just eat a bunch of caramels.#',
 'What?#',
 "When you think about it, it's just #as arbitrary as drinking coffee.#",
 '#Okay, sounds good.#',
 'Five minutes.#',
 'What?#',
 'I was trying to be smooth.##But at twelve-fifteen I was gonna #come over there and talk to you.#',
 "See, it's my life story. Five more #minutes and I would have got

## Download the scripts

In [30]:
path = 'scripts/'

In [96]:
def grab_dialogues(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'lxml')
    script = soup.find('center')
    script1 = re.sub('([A-Z]+\)*)\n', '\\1:\n', script.text)\
            .replace('\n\n', '#')\
            .replace('  ', ' ')
    script1 = re.sub('\([a-z].*\)', '', script1).split('\n')
    script_split = [lineType(line) for line in script1]
    
    dialogues = []
    dialogue = []
    current_type = 'none'
    current_speaker = 'none'
    first_spoken_line = np.where([line=='@speak' for line in script_split])[0][0]

    for line in script1[first_spoken_line:]:
        previous_type = current_type
        current_type = lineType(line)

        if previous_type == '@setting' and current_type == '@scene' and dialogue:
            dialogues.append(dialogue)
            dialogue = []
            current_speaker = 'none'

        #at some point u should refactor so that u take care of (cont.) and justify the existence of the if statement
        #also take out the check try except
        if current_type == '@speak':
            try:
                name_break = line.index(':#')
                previous_speaker = current_speaker
                current_speaker = line[:name_break]
                current_line = line[name_break+2:]
                if previous_speaker in current_speaker:
                    dialogue[-1] = dialogue[-1] + ' ' + current_line
                else:
                    dialogue.append(current_line)
            except:
                print(line)
    
    interactions = []
    for dialogue in dialogues:
        if len(dialogue) > 1:
            interactions.extend([(prompt.replace('#', ' '), response.replace('#', ' ')) for prompt, response in zip(dialogue[:-1], dialogue[1:])])
    interactions = [(get_words(prompt), get_words(response)) for prompt, response in interactions]
    
    return interactions

In [97]:
test = grab_dialogues('https://www.weeklyscript.com/Antitrust.html')

MONTAGE: Programmers play competitive games at an Outpost #picnic; Toddlers play on computers in an Outpost Day Care #Center; Geeks confer at a diagram-covered whiteboard; #Employees listen/dance to the Seattle band we've been hearing, #on-stage, at the Outpost '98 launch.#
ON MILO: pressing himself into the alcove.#
RANDY: brings up a shot of a Programmer at work, zooms in on #his screen to begin collecting his code.#
MILO: begins to lose his grip as sweat forms on his brow #and, worse, on his hands. They're giving way.#
RANDY: clicks. A printer across the room start making a hard-#copy of the purloined code.#
MILO: stares in horror as the data tower his toes are jammed #on top of starts to shift -- threatening to disengage from #its ports. He tries to lift it back up with heels of his #shoes.#
RANDY: knits his brow as his screen flickers. He keeps typing, #but starts to lower his head: he's going to look under the #table.#
MILO: winces as he presses the tower upward with his heels, #

In [98]:
len(test)

567

In [99]:
test

[('it bums me out when the media say were cultish or whatever why cause we care about each other',
  'love you too bro'),
 ('love you too bro', 'how gary gets this superbad rap'),
 ('how gary gets this superbad rap',
  'a kid working in his garage can create the next outpost the new ibm all it takes is a great idea thats why nobody can have a monopoly in a business built on ideas'),
 ('can we go starting to get nauseated', 'do you mind'),
 ('do you mind', 'did anybody mention the beverages'),
 ('is it over',
  'they still have to give em refreshments laced with mindaltering drugs'),
 ('they still have to give em refreshments laced with mindaltering drugs',
  'you are a fanatic'),
 ('you are a fanatic', 'gonna wait outside'),
 ('make him', ' you know what i mean'),
 ('im just screwed',
  ' you know what hes like he just wants to work on stuff thats cool'),
 (' you know what hes like he just wants to work on stuff thats cool',
  'you dont wanna move do you'),
 ('you dont wanna move do yo