# MCU Scripts

## Configuration

In [1]:
%load_ext autotime

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import os

In [3]:
import re

## Analysis

### About the Data

#### Processed Scripts

In [4]:
# Load the previously processed dialogue table. The CSV's first column is read
# as the index and then discarded in favour of a fresh 0..n-1 index; only the
# four columns used in this notebook are kept.
processed_lines_path = '../clean_data/mcu_data_clean_all.csv'
processed_lines_raw = pd.read_csv(processed_lines_path, index_col=0)
lines = processed_lines_raw.reset_index(drop=True)[['character', 'line', 'movie', 'year']]
print('Entries: ', len(lines))
lines.head()

Entries:  17164


Unnamed: 0,character,line,movie,year
0,TONY STARK,"Oh, I get it. You guys aren’t allowed to talk...",Iron Man,2008
1,IRON MAN JIMMY,No. We’re allowed to talk.,Iron Man,2008
2,TONY STARK,Oh. I see. So it’s personal.,Iron Man,2008
3,RAMIREZ,I think they’re intimidated.,Iron Man,2008
4,TONY STARK,"Good God, you’re a woman. I, honestly, I could...",Iron Man,2008


In [5]:
# Build one row per (movie, year) pair present in `lines`, ordered by title.
# Taking the first row of each group and keeping only the two key columns
# yields the distinct pairs.
first_line_per_movie = lines.groupby(['movie', 'year']).head(1)
movie_year_pairs = first_line_per_movie[['movie', 'year']]
processed_movies = movie_year_pairs.sort_values(['movie']).reset_index(drop=True)
processed_movies

Unnamed: 0,movie,year
0,Ant-Man,2015
1,Ant-Man and the Wasp,2018
2,Avengers: Age of Ultron,2015
3,Avengers: Endgame,2019
4,Avengers: Infinity War,2018
5,Black Panther,2018
6,Captain America: Civil War,2016
7,Captain America: The First Avenger,2011
8,Captain America: The Winter Soldier,2014
9,Captain Marvel,2019


#### Raw Scripts

In [6]:
# Read the raw transcripts (first CSV column is the index) and left-join the
# (movie, year) pairs from `processed_movies` on the title, so every raw
# transcript row is kept even when it has no match.
raw_scripts = pd.read_csv('../raw_data/mcu_scipts.csv', index_col=0)
df = raw_scripts.merge(processed_movies, how='left', left_on=['title'], right_on=['movie'])
# len() of each value in the `script` column.
df['script_length'] = df['script'].map(len)
print('Entries: ', len(df))
df.head(23)

Entries:  23


Unnamed: 0,title,script,movie,year,script_length
0,Ant-Man,Previous transcript:\n Next transcript:\n\n\n ...,Ant-Man,2015.0,82517
1,Ant-Man and the Wasp,This transcript is not finished!This page does...,Ant-Man and the Wasp,2018.0,67957
2,The Avengers,This transcript isn't tidy!This page's transcr...,The Avengers,2012.0,163543
3,Avengers: Age of Ultron,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Age of Ultron,2015.0,91399
4,Avengers: Endgame,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Endgame,2019.0,138200
5,Avengers: Infinity War,Previous transcript:\n Next transcript:\n\n\n ...,Avengers: Infinity War,2018.0,141191
6,Black Panther,This transcript isn't tidy!This page's transcr...,Black Panther,2018.0,201332
7,Captain America: Civil War,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: Civil War,2016.0,127046
8,Captain America: The First Avenger,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: The First Avenger,2011.0,71770
9,Captain America: The Winter Soldier,Previous transcript:\n Next transcript:\n\n\n ...,Captain America: The Winter Soldier,2014.0,98173


In [7]:
# Pick the transcript to clean by its row position in `df`.
movie_idx = 11
movie_title = df['title'].values[movie_idx]
movie_year = 2016  # entered by hand rather than read from `df`
script = df['script'].values[movie_idx]
print(movie_title, movie_year)
# Peek at the first 100 characters of the raw transcript.
print(script[:100])

Doctor Strange 2016
This transcript isn't tidy!This page's transcript is incomplete for the following reason(s):unfixed/


# Cleaning Up Script

## Find Actual Start of Script

In [8]:
# Discard everything before character offset `start_script` of the raw
# transcript, then print the first 300 characters of what remains so the cut
# point can be checked by eye.
start_script = 351
script_clean = script[slice(start_script, None)]
print(script_clean[:300])

[scene at the temple, sound of bell ringing. Some people are walking around out of the temple’s library. Showing the librarian are putting back a book to its shelves. the leader of ‘the people’ are showing off with a hood, making the librarian pay attention. two of them follow by, walking through th


## Find Characters

In [11]:
# Print the first 10,000 characters of the trimmed transcript for inspection.
preview_chars = 10000
print(script_clean[:preview_chars])

[scene at the temple, sound of bell ringing. Some people are walking around out of the temple’s library. Showing the librarian are putting back a book to its shelves. the leader of ‘the people’ are showing off with a hood, making the librarian pay attention. two of them follow by, walking through the librarian with their leader. All of them showing off. two of them making spell that hold the librarian’s two arms, and two others making spell by a stick that hold his two legs. making him lifted. the librarian grimacing in pain. someone put a jug below his head. the leader walking closely to the librarian. the leader took off his hood. the librarian has a look at the leader. the leader places his hands onto his back and holding a pair of blades as he chops the librarian’s head off which falls into the jug. the leader takes the book that had been placed by the librarian. he opens the book searching for a page then rips it from the book, and throws the book away. he walks from his place and

In [12]:
# Count how often each speaker label occurs in the trimmed transcript.
# A label is matched as: a newline, one or more capitalised words (each
# optionally ending in '.' and optionally followed by one space), then ':'.
# The pattern is a raw string so that `\.` reaches the regex engine as written;
# in a normal string literal it is an invalid escape sequence, which newer
# Python versions warn about. `\n` in a raw pattern is still interpreted by
# `re` as a newline, so the matches are unchanged.
speaker_pattern = r'\n((?:[A-Z][a-z]+\.? ?)+):'
speaker_names, speaker_counts = np.unique(
    re.findall(speaker_pattern, script_clean), return_counts=True
)
# One row per distinct label, in the sorted order returned by np.unique.
characters = pd.DataFrame({'character': speaker_names, 'line_count': speaker_counts})
print(len(characters))
characters.sort_values(['line_count'], ascending=False)

26


Unnamed: 0,character,line_count
11,Dr. Stephen Strange,191
22,The Ancient One,67
7,Doctor Strange,59
19,Mordo,48
2,Christine Palmer,35
1,Christine,30
16,Kaecilius,28
24,Wong,19
9,Dormammu,12
0,Billy,7


## Remove Narration

In [47]:
# Strip narration / stage directions from the transcript.
# All patterns are raw strings: `\[`, `\]`, `\(` are invalid escape sequences
# in a normal string literal and trigger warnings on newer Python versions.

# 1) Bracketed remarks, plus one directly following newline if present.
#    The negated character class stops at the first ']' on the line, so any
#    dialogue between two bracketed remarks on the same line is kept
#    (a greedy `.*` would delete everything from the first '[' to the last ']').
script_lines_only = re.sub(r'\[[^\]\n]*\]\n?', '', script_clean)

# 2) Whole lines that begin with '[' (e.g. a bracket that is not closed on the
#    same line). The trailing newline is only looked at, not consumed, so a
#    narration line directly following another one is also matched.
script_lines_only = re.sub(r'\n\[.*(?=\n)', '', script_lines_only)

# 3) Whole lines that begin with '(' — same approach as step 2.
script_lines_only = re.sub(r'\n\(.*(?=\n)', '', script_lines_only)

# 4) Replace non-breaking spaces with regular spaces.
script_lines_only = script_lines_only.replace('\xa0', ' ')

## Character Line Extraction

In [53]:
# Extract (speaker label, spoken text) pairs from the narration-free script.
# The label part matches one or more capitalised words, each optionally ending
# in '-' or '.' and optionally followed by one space, up to a ':'; the rest of
# the line after optional whitespace is the spoken text.
# Raw string: `\-`, `\.` and `\s` are invalid escapes in a normal literal.
line_pattern = r'((?:[A-Z][a-z]+[\-\.]? ?)+):\s*(.*)'
character_lines = pd.DataFrame(
    re.findall(line_pattern, script_lines_only), columns=['character', 'line']
)

# Normalise speaker labels. Whitespace is stripped *before* the alias check so
# that a label captured with a trailing space (the pattern allows one) is still
# recognised as 'Stephen' / 'Strange'.
speaker = character_lines['character'].str.strip()
is_strange_alias = speaker.isin(['Stephen', 'Strange'])

# Literal substring fixes, applied in this order to every non-alias label.
name_fixes = [
    ('Dr.', 'Doctor'),
    ('Karl ', ''),
    (' Palmer', ''),
    ('Crhstine', 'Christine'),
    ('Stephen ', ''),
    ('Doctor Billy', 'Billy'),
]
for old_text, new_text in name_fixes:
    speaker = speaker.str.replace(old_text, new_text, regex=False)

# Alias labels bypass the substring fixes and map straight to 'Doctor Strange';
# every label is then upper-cased.
character_lines['character'] = speaker.mask(is_strange_alias, 'Doctor Strange').str.upper()
character_lines['line'] = character_lines['line'].str.strip()
print(character_lines['character'].value_counts())
character_lines.head()

DOCTOR STRANGE     255
THE ANCIENT ONE     67
CHRISTINE           64
MORDO               52
KAECILIUS           28
WONG                19
DORMAMMU            12
BILLY                8
PANGBORN             7
DOCTOR WEST          7
THOR                 6
YOUNG DOCTOR         5
ETIENNE              4
JONATHAN             4
SIGN                 1
DOCTOR TWO           1
MASTER               1
DOCTOR ONE           1
DOCTOR               1
Name: character, dtype: int64


Unnamed: 0,character,line
0,THE ANCIENT ONE,Master Kaecilius. That ritual will bring you o...
1,KAECILIUS,Hypocrite!
2,DOCTOR STRANGE,"Challenge round, Billy."
3,DOCTOR STRANGE,"Oh, come on, Billy. You’ve got to be messing w..."
4,BILLY,"Heheh. No, doctor."


## Save Lines as CSV

In [54]:
# Write the cleaned (character, line) table to disk without the index column.
clean_lines_path = '../clean_data/dr_strange.csv'
character_lines.to_csv(clean_lines_path, index=False)