# MCU Scripts

## Configuration

In [1]:
%load_ext autotime

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import re

## Analysis

### About the Data

#### Processed Scripts

In [3]:
lines = pd.read_csv('../clean_data/mcu_data.csv', index_col=0).reset_index(drop=True)[['character', 'line', 'movie', 'year', 'words']]
print('Entries: ', len(lines))
lines.head()
processed_movies = lines.groupby(['movie', 'year']).head(1)[['movie', 'year']].sort_values(['movie']).reset_index(drop=True)
processed_movies

Entries:  6509


Unnamed: 0,movie,year
0,Ant-Man,2015
1,Avengers: Age of Ultron,2015
2,Avengers: Endgame,2019
3,Avengers: Infinity War,2018
4,Captain America: Civil War,2016
5,Captain America: The First Avenger,2011
6,Captain America: The Winter Soldier,2014
7,Captain Marvel,2019
8,Iron Man,2008
9,Iron Man 2,2010


#### Raw Scripts

In [5]:
df = pd.read_csv('../raw_data/mcu_scipts.csv', index_col=0).merge(processed_movies, right_on=['movie'], left_on=['title'], how='left')
df['script_length'] = df['script'].apply(len)
print('Entries: ', len(df))
df.head(23)

Entries:  23


Unnamed: 0,title,script,movie,year,script_length
0,Ant-Man,Previous transcript:\n Next transcript:\n\n\n ...,Ant-Man,2015.0,82517
1,Ant-Man and the Wasp,This transcript is not finished!This page does...,,,67957
2,The Avengers,This transcript isn't tidy!This page's transcr...,The Avengers,2012.0,163543
3,Avengers: Age of Ultron,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Age of Ultron,2015.0,91399
4,Avengers: Endgame,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Endgame,2019.0,138200
5,Avengers: Infinity War,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Infinity War,2018.0,141191
6,Black Panther,This transcript isn't tidy!This page's transcr...,,,201332
7,Captain America: Civil War,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: Civil War,2016.0,127046
8,Captain America: The First Avenger,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: The First Avenger,2011.0,71770
9,Captain America: The Winter Soldier,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: The Winter Soldier,2014.0,98173


In [13]:
movie_idx=1
script = df.script.values[movie_idx]
movie_year = 2018
movie_title = df.title.values[movie_idx]
print(movie_title, movie_year)
print(script[:100])

Ant-Man and the Wasp 2018
This transcript is not finished!This page does not have the entire transcript.It is recommended to a


# Cleaning Up Script

## Find Actual Start of Script

In [14]:
start_script = 292
script_clean = script[start_script:]
print(script_clean[:10000])

Hank: [voiceover] I still think about the night your mother and I had to leave you...
[The camera cuts to a young Hope van Dyne, who is walking out of her bedroom into the hallway. There, Janet van Dyne and Hank Pym are talking to a woman.]
Janet: [to the woman, faintly] Hopefully, it's not for long, but I'll call you when we get settled.When I have a better indication of what's going on.
Young Hope: Mommy?
[Janet and Hank turn around to face their daughter. Janet sighs as Hank gets the bags situated.]
Janet: Oh, Jellybean. [She kneels down to meet eye-level with Hope.] Daddy and I have a last minute business trip, so Rose is going to stay with you for a few days.
Young Hope: No. I don't want you to go.
Janet: Ugh! It's gonna be so boring! I won't be able to keep my... eyes...
[Janet then pretends to have dozed off. Hope giggles as Janet snores. Hank walks up to his daughter.
Hank: Goodbye, sweetheart. Alright, we'll see you soon. [He kisses Hope on the forehead. He then touches Janet'

## Find Characters

In [8]:
characters = np.unique(re.findall(string=script, pattern='\n((?:[A-Z][a-z]+[\-\.]? ?)+):'), return_counts=True)
characters = pd.DataFrame(zip(characters[0], characters[1]), columns=['character', 'line_count'])
print(len(characters))
characters.sort_values(['line_count'], ascending=False)

20


Unnamed: 0,character,line_count
15,Scott,80
8,Hope,53
6,Hank,44
12,Luis,26
2,Cassie,19
17,Sonny,18
9,Janet,10
16,Scott Lang,7
13,Maggie,7
18,Woo,6


## Remove Narration

In [9]:
script_lines_only = re.sub(string=script, pattern='(\[.*\])\n?', repl='')
script_lines_only = re.sub(string=script_lines_only, pattern='\n(\[.*\]?)\n', repl='\n')
script_lines_only = re.sub(string=script_lines_only, pattern='\n(\(.*\)?)\n', repl='\n')
script_lines_only = re.sub(string=script_lines_only, pattern='\xa0', repl=' ')

print(script_lines_only[:1000])

This transcript is not finished!This page does not have the entire transcript.It is recommended to add the complete transcript.

This article is a stub. You can help Transcripts Wiki by  expanding it.     



 Previous transcript:
 Next transcript:


 Avengers: Infinity War
 Captain Marvel

Hank:  I still think about the night your mother and I had to leave you...
Janet:  Hopefully, it's not for long, but I'll call you when we get settled.When I have a better indication of what's going on.
Young Hope: Mommy?
Janet: Oh, Jellybean.  Daddy and I have a last minute business trip, so Rose is going to stay with you for a few days.
Young Hope: No. I don't want you to go.
Janet: Ugh! It's gonna be so boring! I won't be able to keep my... eyes...
Hank: Goodbye, sweetheart. Alright, we'll see you soon.  
Hank:  I wish we could have put down our bags, and tucked you back into your bed, but too many lives were at stake. 
Hank: Oh my god! They've already launched!
Janet: We have to stop it! Come on

## Charactater Line Extractions

In [10]:
character_lines = re.findall(string=script_lines_only, pattern='((?:[A-Z][a-z]+[\-\.]? ?)+):\s*(.*)')
character_lines = pd.DataFrame(character_lines, columns=['character', 'line'])
character_lines['character'] = [x.rstrip().lstrip().replace(' Lang', '').replace('Young ', '') if x != 'Hanks' else 'Hank' for x in character_lines['character']]
character_lines['character'] = character_lines['character'].apply(str.upper)
character_lines['line'] = [x.rstrip().lstrip() for x in character_lines['line']]
print(character_lines['character'].value_counts())
character_lines.head()

SCOTT        88
HOPE         57
HANK         44
LUIS         26
CASSIE       23
SONNY        18
JANET        10
WOO           6
MAGGIE        6
PAXTON        5
KURT          5
DAVE          3
AVA           3
ANT-MAN       2
GHOST         1
VIDEO         1
JIMMY WOO     1
AVENGERS      1
Name: character, dtype: int64


Unnamed: 0,character,line
0,AVENGERS,Infinity War
1,HANK,I still think about the night your mother and ...
2,JANET,"Hopefully, it's not for long, but I'll call yo..."
3,HOPE,Mommy?
4,JANET,"Oh, Jellybean. Daddy and I have a last minute..."


## Save Lines as CSV

In [11]:
character_lines.to_csv('../clean_data/ant_man_and_wasp.csv', index=False)