# MCU Scripts

## Configuration

In [1]:
%load_ext autotime

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import os

In [3]:
import re

## Analysis

### About the Data

#### Processed Scripts

In [4]:
lines = pd.read_csv('../clean_data/mcu_data_clean_all.csv', index_col=0).reset_index(drop=True)[['character', 'line', 'movie', 'year']]
print('Entries: ', len(lines))
lines.head()

Entries:  18387


Unnamed: 0,character,line,movie,year
0,TONY STARK,"Oh, I get it. You guys aren’t allowed to talk...",Iron Man,2008
1,IRON MAN JIMMY,No. We’re allowed to talk.,Iron Man,2008
2,TONY STARK,Oh. I see. So it’s personal.,Iron Man,2008
3,RAMIREZ,I think they’re intimidated.,Iron Man,2008
4,TONY STARK,"Good God, you’re a woman. I, honestly, I could...",Iron Man,2008


In [5]:
processed_movies = lines.groupby(['movie', 'year']).head(1)[['movie', 'year']].sort_values(['movie']).reset_index(drop=True)
processed_movies

Unnamed: 0,movie,year
0,Ant-Man,2015
1,Ant-Man and the Wasp,2018
2,Avengers: Age of Ultron,2015
3,Avengers: Endgame,2019
4,Avengers: Infinity War,2018
5,Black Panther,2018
6,Captain America: Civil War,2016
7,Captain America: The First Avenger,2011
8,Captain America: The Winter Soldier,2014
9,Captain Marvel,2019


#### Raw Scripts

In [6]:
df = pd.read_csv('../raw_data/mcu_scipts.csv', index_col=0).merge(processed_movies, right_on=['movie'], left_on=['title'], how='left')
df['script_length'] = df['script'].apply(len)
print('Entries: ', len(df))
df.head(23)

Entries:  23


Unnamed: 0,title,script,movie,year,script_length
0,Ant-Man,Previous transcript:\n Next transcript:\n\n\n ...,Ant-Man,2015.0,82517
1,Ant-Man and the Wasp,This transcript is not finished!This page does...,Ant-Man and the Wasp,2018.0,67957
2,The Avengers,This transcript isn't tidy!This page's transcr...,The Avengers,2012.0,163543
3,Avengers: Age of Ultron,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Age of Ultron,2015.0,91399
4,Avengers: Endgame,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Endgame,2019.0,138200
5,Avengers: Infinity War,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Infinity War,2018.0,141191
6,Black Panther,This transcript isn't tidy!This page's transcr...,Black Panther,2018.0,201332
7,Captain America: Civil War,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: Civil War,2016.0,127046
8,Captain America: The First Avenger,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: The First Avenger,2011.0,71770
9,Captain America: The Winter Soldier,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: The Winter Soldier,2014.0,98173


In [7]:
movie_idx=12
script = df.script.values[movie_idx]
movie_year = 2017
movie_title = df.title.values[movie_idx]
print(movie_title, movie_year)
print(script[:100])

Guardians of the Galaxy 2017
Previous transcript:
 Next transcript:


 Captain America: The Winter Soldier
 Avengers: Age of Ultr


# Cleaning Up Script

## Find Actual Start of Script

In [8]:
start_script = 303
script_clean = script[start_script:]
print(script_clean[:10000])

 First lines. Earth 1988. Young Peter Quill sits in the waiting room of a hospital, listening to the "Awesome Mix" tape on his Walkman when his grandfather comes over to him


 Grandpa
 Peter, your mama wants to speak with you. Come on, Pete. Take these fool things off. [he takes the mixtape off of Peter, turns off the Walkman, takes Peter to see his mother, who’s lying sick in bed, she notices his eye is bruised]


 Meredith Quill
 Why have you been fighting with the other boys again, baby? [Peter shrugs his shoulders] Peter?


 Young Quill
 They killed a little frog that ain’t done nothin’. Smushed it with a stick.


 Meredith Quill
 You’re so like your daddy. You even look like him. And he was an angel. Composed of pure light [Meredith’s eyes close for a moment]


 Grandpa
 Mer? You got a present there for Peter, don’t you?


 Meredith Quill
 Of course. [she touches the small wrapped present and card, her father takes them and puts them in Peter’s backpack]


 Grandpa
 There. I’ve g

## Find Characters

In [9]:
characters1 = np.unique(re.findall(string=script_clean, pattern='\n\n ((?:[A-Z][a-z]+ ?)+)\n'), return_counts=True)
characters1 = pd.DataFrame(zip(characters1[0], characters1[1]), columns=['character', 'line_count'])
characters1['type'] = 1

characters2 = np.unique(re.findall(string=script_clean, pattern='((?:[A-Z][a-z]+[\-\.]? ?)+):\s*.*'), return_counts=True)
characters2 = pd.DataFrame(zip(characters2[0], characters2[1]), columns=['character', 'line_count'])
characters2['type'] = 2

characters = pd.concat([characters1, characters2]).groupby(['character']).sum().reset_index(drop=False)
del(characters1, characters2)

print(len(characters))
characters.sort_values(['line_count'], ascending=False)

37


Unnamed: 0,character,line_count,type
25,Peter Quill,149,2
29,Rocket,90,2
7,Gamora,88,2
5,Drax,43,2
35,Yondu Udonta,32,2
20,Nebula,23,2
30,Ronan,22,2
3,Corpsman Dey,17,2
33,The Collector,13,2
9,Groot,12,2


## Add New Line if UpperCase follows punctuation without space

In [10]:
script_clean = re.sub(string=script_clean, pattern="([\]\.\!\?])([A-Z])", repl=r"\g<1>\n\g<2>")

## Charactater Line Extractions

In [82]:
character_lines1 = re.findall(string=script_clean, pattern='((?:[A-Z][a-z]+[\-\.]? ?)+):\s*(.*)')
character_lines1 = pd.DataFrame(character_lines1, columns=['character', 'line'])

character_lines2 = re.findall(string=script_clean, pattern='\n\n ((?:[A-Z][a-z]+ ?)+)\n(.*)\n\n')
character_lines2 = pd.DataFrame(character_lines2, columns=['character', 'line'])

character_lines = pd.concat([character_lines1, character_lines2]).drop_duplicates().reset_index(drop=True)

character_lines['character'] = ['Doctor Strange' if x =='Stephen' or x == 'Strange'\
                                else x.rstrip().lstrip().replace('Dr.', 'Doctor').replace('Karl ', '').replace(' Palmer', '').replace('Crhstine', 'Christine').replace('Stephen ', '').replace('Doctor Billy', 'Billy') \
                                for x in character_lines['character']]

character_lines['character'] = character_lines['character'].apply(str.upper)
character_lines['line'] = [x.rstrip().lstrip() for x in character_lines['line']]
print(character_lines['character'].value_counts())
character_lines.head()

PETER QUILL             155
ROCKET                   90
GAMORA                   86
DRAX                     40
YONDU UDONTA             32
NEBULA                   23
RONAN                    21
CORPSMAN DEY             17
THE COLLECTOR            13
KORATH                   12
THE BROKER               11
NOVA PRIME               11
DENARIAN SAAL             9
KRAGLIN                   7
GROOT                     6
CARINA                    5
HEAD RIOT GUARD           5
YOUNG QUILL               5
GRANDPA                   4
BEREET                    3
NOVA ARRESTING PILOT      3
KREE AMBASSADOR           2
PRISON GUARD              2
MEREDITH QUILL            2
RAVAGER PILOT             2
THANOS                    2
XANDARIAN PRISONER        2
HORUZ                     2
KNOWHERE KID              2
KNOWHERE CIVILIAN         1
BAR BOUNCER               1
RAVAGER NAVIGATOR         1
MASKLESS SAKAARAN         1
NOVA PILOT                1
DUCK                      1
NEWS REPORTER       

Unnamed: 0,character,line
0,PETER QUILL,Uh… hey.[Korath instructs his henchmen to grab...
1,KORATH,"Drop it, now!"
2,PETER QUILL,"Hey, cool, man. No problem. [Peter drops the o..."
3,KORATH,How do you know about this?
4,PETER QUILL,I don’t even know what that is. I’m just a jun...


### Remove Narration

In [83]:
character_lines['line'] = [re.sub(string=x, pattern='(\[.*\])', repl='') for x in character_lines['line'].values]

## Save Lines as CSV

In [84]:
character_lines.to_csv('../clean_data/gotg.csv', index=False)