# MCU Scripts

## Configuration

In [1]:
# Load the IPython extension named `autotime` for this kernel session.
# NOTE(review): presumably this is the ipython-autotime package, which reports
# each cell's run time — confirm it is installed in the environment.
%load_ext autotime

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import os

In [3]:
import re

## Analysis

### About the Data

#### Processed Scripts

In [4]:
# Load the already-cleaned dialogue table. The first CSV column is read as the
# index and then discarded in favour of a fresh 0..n-1 index; only the four
# columns used by this notebook are kept.
lines = (
    pd.read_csv('../clean_data/mcu_data_clean_all.csv', index_col=0)
    .reset_index(drop=True)
    .loc[:, ['character', 'line', 'movie', 'year']]
)
print('Entries: ', lines.shape[0])
lines.head(5)

Entries:  18387


Unnamed: 0,character,line,movie,year
0,TONY STARK,"Oh, I get it. You guys aren’t allowed to talk...",Iron Man,2008
1,IRON MAN JIMMY,No. We’re allowed to talk.,Iron Man,2008
2,TONY STARK,Oh. I see. So it’s personal.,Iron Man,2008
3,RAMIREZ,I think they’re intimidated.,Iron Man,2008
4,TONY STARK,"Good God, you’re a woman. I, honestly, I could...",Iron Man,2008


In [5]:
# One row per (movie, year) pair that already has cleaned dialogue:
# take the first line of every group, keep just the two key columns,
# order alphabetically by movie title and renumber the rows.
processed_movies = (
    lines
    .groupby(['movie', 'year'])
    .head(1)
    .loc[:, ['movie', 'year']]
    .sort_values(by=['movie'])
    .reset_index(drop=True)
)
processed_movies

Unnamed: 0,movie,year
0,Ant-Man,2015
1,Ant-Man and the Wasp,2018
2,Avengers: Age of Ultron,2015
3,Avengers: Endgame,2019
4,Avengers: Infinity War,2018
5,Black Panther,2018
6,Captain America: Civil War,2016
7,Captain America: The First Avenger,2011
8,Captain America: The Winter Soldier,2014
9,Captain Marvel,2019


#### Raw Scripts

In [6]:
# Raw transcripts, left-joined against the list of movies that have already
# been processed (raw `title` matched to processed `movie`). Titles without a
# match keep their row and get missing values for `movie` / `year`.
df = pd.merge(
    pd.read_csv('../raw_data/mcu_scipts.csv', index_col=0),
    processed_movies,
    how='left',
    left_on=['title'],
    right_on=['movie'],
)
# Number of characters in each raw script.
df['script_length'] = df['script'].map(len)
print('Entries: ', df.shape[0])
df.head(23)

Entries:  23


Unnamed: 0,title,script,movie,year,script_length
0,Ant-Man,Previous transcript:\n Next transcript:\n\n\n ...,Ant-Man,2015.0,82517
1,Ant-Man and the Wasp,This transcript is not finished!This page does...,Ant-Man and the Wasp,2018.0,67957
2,The Avengers,This transcript isn't tidy!This page's transcr...,The Avengers,2012.0,163543
3,Avengers: Age of Ultron,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Age of Ultron,2015.0,91399
4,Avengers: Endgame,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Endgame,2019.0,138200
5,Avengers: Infinity War,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Infinity War,2018.0,141191
6,Black Panther,This transcript isn't tidy!This page's transcr...,Black Panther,2018.0,201332
7,Captain America: Civil War,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: Civil War,2016.0,127046
8,Captain America: The First Avenger,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: The First Avenger,2011.0,71770
9,Captain America: The Winter Soldier,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: The Winter Soldier,2014.0,98173


In [7]:
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io

# Extract the full text of the screenplay PDF into `script`.
#
# pdfminer streams the text of every page through `converter` into the
# in-memory buffer `fake_file_handle`; the buffer's contents are read back once
# all pages have been processed.
resource_manager = PDFResourceManager()
fake_file_handle = io.StringIO()
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open('../raw_data/incredible20hulk2c20the.pdf', 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)

    # Read the accumulated text before the buffer is closed below.
    script = fake_file_handle.getvalue()
finally:
    # Always release the converter and the buffer, even if opening or parsing
    # the PDF raised part-way through.
    converter.close()
    fake_file_handle.close()

In [8]:
# Row position of this film in the raw-scripts table `df`; used only to look
# up its title.
movie_idx = 19
movie_year = 2008

# `script` is intentionally NOT taken from df.script here: the text extracted
# from the PDF in the previous cell is used instead.
movie_title = df['title'].values[movie_idx]

print(f'{movie_title} {movie_year}')
# Preview the first 200 characters of the extracted text.
print(script[:200])

The Incredible Hulk 2008
TIH

Part 1:  Flight

Edward Norton

First Draft
05-13-07

We have only to follow the thread of the hero 
path, 

and where we had thought to find an 
Abomination, we shall find a God...

And where w


# Cleaning Up Script

## Find Actual Start of Script

In [9]:
# Character offset into `script` from which the text is kept; everything
# before it is dropped.
# NOTE(review): 59 looks hand-picked for this PDF — confirm it still marks the
# real start of the screenplay if the source file changes.
start_script = 59
script_clean = script[slice(start_script, None)]

# Spot-check a window from the middle of the trimmed text.
print(script_clean[10_000:20_000])

get to work...
He sits at his roughly fashioned WORKTABLE. On it seems to be his 
only possessions of note:  rows of various SMALL GLASS BOTTLES 
with liquid in them, neatly self-labeled and arranged in rows.  
One DECENT LOOKING MICROSCOPE.  A SMALL FIELD SATELLITE LINK 
ANTENNA and a LAPTOP COMPUTER.  
He turns on the SATELLITE LINK and the COMPUTER and starts to 
Instant Message:
BANNER’S SCREEN: (Banner’s writing in Green, his collaborator’s in 
Blue)
“Mr. Blue?”
(a pause)
“Mr. Green...hello. Did you find what you were looking for?”
“Yes.”
“And you have my notes on derivation of the inhibitor?”
“Yes.”
“For most cellular exposures a concentration of 50-80 parts per 
million will suffice. Keep me posted.  And good luck :)”
And Banner begins to work:
MONTAGE:  BANNER EXTRACTING A CHEMICAL FROM HIS FLOWERS
Over many hours, Banner works.  Methodically and painstakingly 
extracting tiny amounts of juice from his precious flowers, mixing 
it carefully with chemicals from self-labeled bott

In [10]:
# NOTE(review): `xxx` is an undefined name, so evaluating it raises NameError
# and stops a "Run All" at this cell. Presumably this is a deliberate guard so
# the cells below are not executed automatically — confirm before removing.
xxx

NameError: name 'xxx' is not defined

## Find Characters

In [None]:
# Candidate speaker cues: runs of capital-letter words separated by single
# spaces (with an optional trailing space) that run up to a line break.
#
# The pattern accepts exactly the same strings as '((?:[A-Z]+ ?)+)\n', but is
# written without a quantifier nested inside another quantifier. The nested
# form can backtrack exponentially on a long capital run that is not followed
# by a newline.
cue_names, cue_counts = np.unique(
    re.findall(pattern=r'([A-Z]+(?: [A-Z]+)* ?)\n', string=script_clean),
    return_counts=True,
)

# One row per distinct cue with the number of times it occurs.
characters = pd.DataFrame({'character': cue_names, 'line_count': cue_counts})
print(len(characters))
characters.sort_values(['line_count'], ascending=False)

## Remove Narration

In [None]:
# Strip narration / stage directions from the script text.
#
# All patterns are raw strings so that regex escapes such as \[ and \( are not
# treated as (invalid) Python string escapes.

# 1. Bracketed directions anywhere in a line, from the first '[' to the last
#    ']' on that line, plus one directly following newline if present.
script_lines_only = re.sub(pattern=r'(\[.*\])\n?', repl='', string=script_clean)

# 2. Whole lines that start with '[' or '('. The closing newline is only
#    looked at (look-ahead), not consumed, so it is still available as the
#    leading newline of the next line. Consuming it would leave the second of
#    two consecutive narration lines in place.
script_lines_only = re.sub(pattern=r'\n\[.*(?=\n)', repl='', string=script_lines_only)
script_lines_only = re.sub(pattern=r'\n\(.*(?=\n)', repl='', string=script_lines_only)

# 3. Non-breaking spaces become ordinary spaces.
script_lines_only = script_lines_only.replace('\xa0', ' ')

## Character Line Extractions

In [None]:
def _normalize_character(raw_name):
    """Map a captured speaker label to a canonical character name.

    The exact labels 'Stephen' and 'Strange' become 'Doctor Strange'. Any
    other label is stripped of surrounding whitespace and passed through a
    fixed sequence of literal replacements, applied in the order written.

    NOTE(review): these replacements are specific to Doctor Strange
    characters, while the script loaded earlier in this notebook is printed
    as "The Incredible Hulk" — confirm they are still wanted here.
    """
    if raw_name == 'Stephen' or raw_name == 'Strange':
        return 'Doctor Strange'
    return (
        raw_name.strip()
        .replace('Dr.', 'Doctor')
        .replace('Karl ', '')
        .replace(' Palmer', '')
        .replace('Crhstine', 'Christine')
        .replace('Stephen ', '')
        .replace('Doctor Billy', 'Billy')
    )


# Capture "Name: dialogue" pairs. A name is one or more capitalised words,
# each optionally followed by '-' or '.' and a space; the dialogue is the rest
# of the line after the colon. The pattern is a raw string so the regex
# escapes are not treated as (invalid) Python string escapes.
character_lines = pd.DataFrame(
    re.findall(pattern=r'((?:[A-Z][a-z]+[-.]? ?)+):\s*(.*)', string=script_lines_only),
    columns=['character', 'line'],
)
character_lines['character'] = [
    _normalize_character(x).upper() for x in character_lines['character']
]
character_lines['line'] = [x.strip() for x in character_lines['line']]
print(character_lines['character'].value_counts())
character_lines.head()

## Save Lines as CSV

In [None]:
# Write the extracted (character, line) table to disk without the row index.
# NOTE(review): the target file is named for Doctor Strange, while the script
# loaded earlier in this notebook is printed as "The Incredible Hulk" — confirm
# the path before running, since this overwrites any existing file there.
character_lines.to_csv(path_or_buf='../clean_data/dr_strange.csv', index=False)