# MCU Scripts

## Configuration

In [1]:
%load_ext autotime

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import os

In [3]:
import re

## Analysis

### About the Data

#### Processed Scripts

In [4]:
lines = pd.read_csv('../clean_data/mcu_data_clean_all.csv', index_col=0).reset_index(drop=True)[['character', 'line', 'movie', 'year']]
print('Entries: ', len(lines))
lines.head()

Entries:  17164


Unnamed: 0,character,line,movie,year
0,TONY STARK,"Oh, I get it. You guys aren’t allowed to talk...",Iron Man,2008
1,IRON MAN JIMMY,No. We’re allowed to talk.,Iron Man,2008
2,TONY STARK,Oh. I see. So it’s personal.,Iron Man,2008
3,RAMIREZ,I think they’re intimidated.,Iron Man,2008
4,TONY STARK,"Good God, you’re a woman. I, honestly, I could...",Iron Man,2008


In [5]:
processed_movies = lines.groupby(['movie', 'year']).head(1)[['movie', 'year']].sort_values(['movie']).reset_index(drop=True)
processed_movies

Unnamed: 0,movie,year
0,Ant-Man,2015
1,Ant-Man and the Wasp,2018
2,Avengers: Age of Ultron,2015
3,Avengers: Endgame,2019
4,Avengers: Infinity War,2018
5,Black Panther,2018
6,Captain America: Civil War,2016
7,Captain America: The First Avenger,2011
8,Captain America: The Winter Soldier,2014
9,Captain Marvel,2019


#### Raw Scripts

In [6]:
df = pd.read_csv('../raw_data/mcu_scipts.csv', index_col=0).merge(processed_movies, right_on=['movie'], left_on=['title'], how='left')
df['script_length'] = df['script'].apply(len)
print('Entries: ', len(df))
df.head(23)

Entries:  23


Unnamed: 0,title,script,movie,year,script_length
0,Ant-Man,Previous transcript:\n Next transcript:\n\n\n ...,Ant-Man,2015.0,82517
1,Ant-Man and the Wasp,This transcript is not finished!This page does...,Ant-Man and the Wasp,2018.0,67957
2,The Avengers,This transcript isn't tidy!This page's transcr...,The Avengers,2012.0,163543
3,Avengers: Age of Ultron,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Age of Ultron,2015.0,91399
4,Avengers: Endgame,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Endgame,2019.0,138200
5,Avengers: Infinity War,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Infinity War,2018.0,141191
6,Black Panther,This transcript isn't tidy!This page's transcr...,Black Panther,2018.0,201332
7,Captain America: Civil War,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: Civil War,2016.0,127046
8,Captain America: The First Avenger,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: The First Avenger,2011.0,71770
9,Captain America: The Winter Soldier,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: The Winter Soldier,2014.0,98173


In [8]:
movie_idx=18
script = df.script.values[movie_idx]
movie_year = 2019
movie_title = df.title.values[movie_idx]
print(movie_title, movie_year)
print(script[:100])

Spider-Man: Far From Home 2019
This transcript is not finished!This page does not have the entire transcript.It is recommended to a


# Cleaning Up Script

## Find Actual Start of Script

In [19]:
start_script = 280
script_clean = script[start_script:]
print(script_clean[:300])
print(script_clean[-300:])

[The Columbia Pictures logo plays normally. At the end, the Torch Lady fades into a statue in Ixtenco, Mexico.]
[A car drives up to a destroyed village in that place. Maria Hill and Nick Fury get out]
Maria Hill: Nick, this was a tragedy, but it's not why we're here. What, are we fighting the weathe
s his hands) Everybody, back to work! Who's got my shoes?
[The last message shows a "dedicated to" message for Spider-Man's creators, Stan Lee and Steve Ditko. After that, the movie ends]
[Closing title 1: Sony]
[Closing title 2: Columbia Pictures A Columbia Pictures Release A Sony Company]
The End.


## Find Characters

In [20]:
characters = np.unique(re.findall(string=script_clean, pattern='\n((?:[A-Z][a-z]+\.? ?)+):'), return_counts=True)
characters = pd.DataFrame(zip(characters[0], characters[1]), columns=['character', 'line_count'])
print(len(characters))
characters.sort_values(['line_count'], ascending=False)

48


Unnamed: 0,character,line_count
33,Peter Parker,295
35,Quentin Beck,132
29,Ned Leeds,64
30,Nick Fury,54
19,Happy Hogan,47
26,Mr. Harrington,35
7,Betty Brant,26
3,Aunt May,23
8,Brad Davis,19
23,Maria Hill,19


## Remove Narration

In [21]:
script_lines_only = re.sub(string=script_clean, pattern='(\[.*\])\n?', repl='')
script_lines_only = re.sub(string=script_lines_only, pattern='\n(\[.*\]?)\n', repl='\n')
script_lines_only = re.sub(string=script_lines_only, pattern='\n(\(.*\)?)\n', repl='\n')
script_lines_only = re.sub(string=script_lines_only, pattern='\xa0', repl=' ')

## Charactater Line Extractions

In [22]:
character_lines = re.findall(string=script_lines_only, pattern='((?:[A-Z][a-z]+[\-\.]? ?)+):\s*(.*)')
character_lines = pd.DataFrame(character_lines, columns=['character', 'line'])
character_lines['character'] = ['Doctor Strange' if x =='Stephen' or x == 'Strange'\
                                else x.rstrip().lstrip().replace('Dr.', 'Doctor').replace('Karl ', '').replace(' Palmer', '').replace('Crhstine', 'Christine').replace('Stephen ', '').replace('Doctor Billy', 'Billy') \
                                for x in character_lines['character']]
character_lines['character'] = character_lines['character'].apply(str.upper)
character_lines['line'] = [x.rstrip().lstrip() for x in character_lines['line']]
print(character_lines['character'].value_counts())
character_lines.head()

PETER PARKER            291
QUENTIN BECK            132
NED LEEDS                66
NICK FURY                54
HAPPY HOGAN              46
MR. HARRINGTON           34
BETTY BRANT              24
AUNT MAY                 22
MARIA HILL               19
BRAD DAVIS               19
FLASH THOMPSON           16
MR. DELL                 15
WILLIAM GINTHER RIVA     12
CREW                     12
JASON IONELLO             7
MR. HARRISON              5
WILLIAM                   5
TALOS                     5
STUDENT                   4
JANICE                    4
CHEESE FARMER             4
GUTERMAN                  3
WOMAN                     3
FEMALE AGENT              3
GUARD                     2
HAPPY                     2
BUS DRIVER                2
CLASSMATE                 2
TONY STARK                2
PAT KIERNAN               2
FLIGHT ATTENDANT          2
MYSTERIO                  2
MAN                       2
SHOPKEEPER                2
SOREN                     2
SPIDER-MAN          

Unnamed: 0,character,line
0,MARIA HILL,"Nick, this was a tragedy, but it's not why we'..."
1,NICK FURY,Locals say the cyclone had a face.
2,MARIA HILL,People say things when they're under stress. O...
3,QUENTIN BECK,Who are you?
4,QUENTIN BECK,You don't want any part of this. Betty Brant: ...


## Save Lines as CSV

In [23]:
character_lines.to_csv('../clean_data/spiderman_ffh.csv', index=False)