In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## Importing csv's

In [2]:
avengers_endgame = pd.read_csv("./cleaned/avengers_endgame.csv")
avengers = pd.read_csv("./cleaned/avengers.csv")
ragnorak = pd.read_csv("./cleaned/ragnorak.csv")
iron_man = pd.read_csv("./cleaned/iron_man.csv")
guardians2 = pd.read_csv("./cleaned/guardians2.csv")
infinity_war = pd.read_csv("./cleaned/infinity_war.csv")
age_of_ultron = pd.read_csv("./cleaned/age_of_ultron.csv")
iron_man_2 = pd.read_csv("./cleaned/iron_man_2.csv")
iron_man_3 = pd.read_csv("./cleaned/iron_man_3.csv")
captain_america = pd.read_csv("./cleaned/captain_america.csv")
winter_soldier = pd.read_csv("./cleaned/winter_soldier.csv")
civil_war = pd.read_csv("./cleaned/civil_war.csv")
spider_man_homecoming = pd.read_csv("./cleaned/spider_man_homecoming.csv")
captain_marvel = pd.read_csv("./cleaned/captain_marvel.csv")

## Adding a Movie Column

In [3]:
avengers_endgame["movie"] = "Avengers: Endgame"
avengers["movie"] = "The Avengers"
ragnorak["movie"] = "Thor: Ragnarok"
iron_man["movie"] = "Iron Man"
guardians2["movie"] = "Guardians of the Galaxy Vol. 2"
infinity_war["movie"] = "Avengers: Infinity War"
age_of_ultron["movie"] = "Avengers: Age of Ultron"
iron_man_2["movie"] = "Iron Man 2"
iron_man_3["movie"] = "Iron Man 3"
captain_america["movie"] = "Captain America: The First Avenger"
winter_soldier["movie"] = "Captain America: The Winter Soldier"
civil_war["movie"] = "Captain America: Civil War"
spider_man_homecoming["movie"] = "Spider-Man: Homecoming"
captain_marvel["movie"] = "Captain Marvel"

## Adding Author Indicators

Indicator variables (0 = False, 1 = True) for if an author contributed to the screenplay that the line is from.  These names were taken from the Wikipedia articles for these movies.  The names used are listed in the right column in the "Screenplay By"  or "Written By" row.

In [4]:
avengers_endgame["Christopher Markus"] = True
avengers_endgame["Stephen McFeely"] = True

avengers["Joss Whedon"] = True

ragnorak["Eric Pearson"] = True
ragnorak["Craig Kyle"] = True
ragnorak["Christopher Yost"] = True

iron_man["Mark Fergus"] = True
iron_man["Hawk Ostby"] = True
iron_man["Art Marcum"] = True
iron_man["Matt Holloway"] = True

guardians2["James Gunn"] = True

infinity_war["Christopher Markus"] = True
infinity_war["Stephen McFeely"] = True

age_of_ultron["Joss Whedon"] = True

iron_man_2["Justin Theroux"] = True

iron_man_3["Shane Black"] = True
iron_man_3["Drew Pearce"] = True

captain_america["Christopher Markus"] = True
captain_america["Stephen McFeely"] = True

winter_soldier["Christopher Markus"] = True
winter_soldier["Stephen McFeely"] = True

civil_war["Christopher Markus"] = True
civil_war["Stephen McFeely"] = True

spider_man_homecoming["Jonathan Goldstein"] = True
spider_man_homecoming["John Francis Daley"] = True
spider_man_homecoming["Jon Watts"] = True
spider_man_homecoming["Christopher Ford"] = True
spider_man_homecoming["Chris McKenna"] = True
spider_man_homecoming["Erik Sommers"] = True

captain_marvel["Anna Boden"] = True
captain_marvel["Ryan Fleck"] = True
captain_marvel["Geneva Robertson-Dworet"] = True

## Adding a Year Column

In [5]:
avengers_endgame["year"] = 2019
avengers["year"] = 2012
ragnorak["year"] = 2017
iron_man["year"] = 2008
guardians2["year"] = 2017
infinity_war["year"] = 2018
age_of_ultron["year"] = 2015
iron_man_2["year"] = 2010
iron_man_3["year"] = 2013
captain_america["year"] = 2011
winter_soldier["year"] = 2014
civil_war["year"] = 2016
spider_man_homecoming["year"] = 2017
captain_marvel["year"] = 2019

## Specifying Duplicate Names

Some characters happen to have the same name as characters in a different movie, though they are not the same person.  This cell specifies the difference so that their lines aren't aggregated.  This won't really matter since these characters have so few lines anyway.

In [6]:
avengers_endgame['character'] = avengers_endgame['character'].replace(["JIMMY"],'AVENGERS JIMMY')
avengers_endgame['character'] = avengers_endgame['character'].replace(["KID"],'AVENGERS KID')
iron_man['character'] = iron_man['character'].replace(["JIMMY"],'IRON MAN JIMMY')
iron_man['character'] = iron_man['character'].replace(["MAN"],'PHIL COULSON')
iron_man['character'] = iron_man['character'].replace(["KID"],'IRON MAN KID')
spider_man_homecoming['character'] = spider_man_homecoming['character'].replace(["GARY"],'STAN LEE')

## Creating a Combined DataFrame

In [7]:
combined = pd.concat([avengers_endgame, avengers, ragnorak, iron_man, guardians2, infinity_war, age_of_ultron, iron_man_2, 
                      iron_man_3, captain_america, winter_soldier, civil_war, spider_man_homecoming, captain_marvel])

combined.fillna(False, inplace=True)

## Removing Character Name Aliases

Some characters go by different names in different movies.  Also, some characters start out as `MYSTERIOUS MAN` and be revealed to be `EGO`.  This cell fixes that by assigning the same lines to the same name.  Some of these choices are subjective.  For example, I've decided that `HULK` is a different character than `BRUCE BANNER`, and that all reporter characters are named `REPORTER`.

When adding a new movie to this dataset, **be careful** to see that the extra characters (`MAN`, `WOMAN`, `KID` etc.) are being assigned to who they actually are.  For example, in Iron Man, `MAN` is `AGENT COULSON`, but `MAN` could be somebody else in a different movie.

In [8]:
# Iron Man People
combined['character'].replace(['IRON MAN','TONY',"VOICE"],'TONY STARK', inplace=True)
combined['character'].replace(['PEPPER'],'PEPPER POTTS', inplace=True)
combined['character'].replace(["WOMAN’S VOICE"],'CHRISTINE', inplace=True)
combined['character'].replace(["GABRIEL"],'GENERAL GABRIEL', inplace=True)
combined['character'].replace(["HOGAN", "HAPPY"],'HAPPY HOGAN', inplace=True)
combined['character'].replace(["F.R.I.D.A.Y."],'FRIDAY', inplace=True)
combined['character'].replace(["OBADIAH"],'OBADIAH STANE', inplace=True)
combined['character'].replace(["RHODEY", "COLONEL JAMES RHODES"],'JAMES RHODES', inplace=True)
combined['character'].replace(["HOWARD"],'HOWARD STARK', inplace=True)
combined['character'].replace(["IVAN"],'IVAN VANKO', inplace=True)
combined['character'].replace(["JUSTIN", "TV JUSTIN"],'JUSTIN HAMMER', inplace=True)
combined['character'].replace(["STERN"],'SENATOR STERN', inplace=True)
combined['character'].replace(["CHRISTINE"],'CHRISTINE EVERHART', inplace=True)
combined['character'].replace(["MR MUSK"],'ELON MUSK', inplace=True)
combined['character'].replace(["ALRICH KILLIAN"],'ALDRICH KILLIAN', inplace=True)
combined['character'].replace(["CAMERAMAN"],'GARY', inplace=True)
combined['character'].replace(["SAVIN"],'ERIC SAVIN', inplace=True)
combined['character'].replace(["TAGGERT"],'JACK TAGGART', inplace=True)

# SHIELD People
combined['character'].replace(['BARTON'],'CLINT BARTON', inplace=True)
combined['character'].replace(['COULSON', 'AGENT COULSON', 'COULON'],'PHIL COULSON', inplace=True)
combined['character'].replace(['AGENT JASPER SITWELL', 'AGENT SITWELL'],'JAPSER SITWELL', inplace=True)
combined['character'].replace(["FURY"],'NICK FURY', inplace=True)
combined['character'].replace(["NATASHA", "NATALIE", "AGENT ROMANOFF", "NOTARY", "NATALIE/AGENT ROMANOFF"],'NATASHA ROMANOFF', inplace=True)

# Guardians of the Galaxy People
combined['character'].replace(['QUILL'],'PETER QUILL', inplace=True)
combined['character'].replace(['MONSTROUS RAVAGER'],'TASERFACE', inplace=True)
combined['character'].replace(['MYSTERIOUS MAN'],'EGO', inplace=True)
combined['character'].replace(["ADOLESCENT GROOT"],'GROOT', inplace=True)
combined['character'].replace(["MEREDITH"],'MEREDITH QUILL', inplace=True)
combined['character'].replace(["MEMORY GAMORA"],"GAMORA", inplace=True)
combined['character'].replace(["MEMORY NEBULA"],"NEBULA", inplace=True)

# Thor People
combined['character'].replace(['SCRAPPER #142'],'VALKYRIE', inplace=True)
combined['character'].replace(['LOKIE'],'LOKI', inplace=True)
combined['character'].replace(['SELVIG'],'ERIK SELVIG', inplace=True)

# Captain America People
combined['character'].replace(['STEVE','CAPTAIN AMERICA', 'OLD STEVE', "STEVE ROGER"],'STEVE ROGERS', inplace=True)
combined['character'].replace(["HOODED FIGURE", "STONEKEEPER", "JOHAN SCHMIDT", "JOHANN SCHMIDT", "JOHAN SCHMIDT/RED SKULL"],'RED SKULL', inplace=True)
combined['character'].replace(['JAMES BARNES', "WINTER SOLDIER"],'BUCKY BARNES', inplace=True)
combined['character'].replace(['LIST'],'DR. LIST', inplace=True)
combined['character'].replace(['STRUCKER'],'BARON WOLFGANG VON STRUCKER', inplace=True)
combined['character'].replace(['MRS SPENCER'],'MRS. SPENCER', inplace=True)

#Spider Man People
combined['character'].replace(['AARON'],'AARON DAVIS', inplace=True)
combined['character'].replace(['ABE'],'ABRAHAM BROWN', inplace=True)
combined['character'].replace(['BETTY'],'BETTY BRANT', inplace=True)
combined['character'].replace(['BRICE'],'JACKSON BRICE', inplace=True)
combined['character'].replace(['CHARLES'],'CHARLES MURPHY', inplace=True)
combined['character'].replace(['CINDY'],'CINDY MOON', inplace=True)
combined['character'].replace(['DORIS'],'DORIS TOOMES', inplace=True)
combined['character'].replace(['LIZ'],'LIZ TOOMES', inplace=True)
combined['character'].replace(['TOOMES', "VULTURE"],'ADRIAN TOOMES', inplace=True)
combined['character'].replace(['FLASH'],'FLASH THOMPSON', inplace=True)
combined['character'].replace(['GARGAN'],'MAC GARGAN', inplace=True)
combined['character'].replace(['JASON'],'JASON IONELLO', inplace=True)
combined['character'].replace(['MASON'],'PHINEAS MASON', inplace=True)
combined['character'].replace(['MAY'],'MAY PARKER', inplace=True)
combined['character'].replace(['MICHELLE'],'MICHELLE "MJ" JONES', inplace=True)
combined['character'].replace(['NED'],'NED LEEDS', inplace=True)
combined['character'].replace(['PETER', 'PETER PETER'],'PETER PARKER', inplace=True)
combined['character'].replace(['SALLY'],'SALLY AVRIL', inplace=True)
combined['character'].replace(['SCHULTZ'],'HERMAN SCHULTZ', inplace=True)
combined['character'].replace(['TINY'],'BRIAN "TINY" MCKEEVER', inplace=True)
combined['character'].replace(['SUIT LADY'],'KAREN', inplace=True)

# Captain Marvel People
combined['character'].replace(["CAROL", "'VERS", "VERS", "CVERS"],"CAROL DANVERS", inplace=True)
combined['character'].replace(["LAWSON"],"WENDY LAWSON", inplace=True)
combined['character'].replace(["MARIA"],"MARIA RAMBEAU", inplace=True)
combined['character'].replace(["MONICA"],"MONICA RAMBEAU", inplace=True)
combined['character'].replace(["GENERAL TALOS"],"TALOS", inplace=True)

# Other People
combined['character'].replace(['BANNER','SMART HULK', 'ASTRAL BANNER'],'BRUCE BANNER', inplace=True)
combined['character'].replace(['STRANGE', "DOCTOR STRANGE", "DR. STRANGE", "STEVEN STRANGE"],'STEPHEN STRANGE', inplace=True)
combined['character'].replace(["SCHOOL BUS DRIVER, STAN LEE", "LARRY", "FEDEX DRIVER STAN LEE"],'STAN LEE', inplace=True)
combined['character'].replace(["T’CHALLA", "KING T'CHALLA"],"T'CHALLA", inplace=True)
combined['character'].replace(["KING T'CHAKA"],"T'CHAKA", inplace=True)
combined['character'].replace(["WANDA"],"WANDA MAXIMOFF", inplace=True)
combined['character'].replace(["BARTON'S DAUGHTER"],"LILA BARTON", inplace=True)

# Extras
#combined['character'].replace(["REPORTER # 1", "REPORTER #2", "REPORTER #3","TV REPORTER’S VOICE",
#                                                      "MALE REPORTER", "NEWS REPORTER"],'REPORTER', inplace=True)
#combined['character'].replace(["INTERCOM VOICE", "HELICARRIER INTERCOM"],'INTERCOM', inplace=True)
#combined['character'].replace(["JET PILOT", "YOUNG SHIELD PILOT", "FACELESS PILOT"],'PILOT', inplace=True)

In [9]:
unique_names = pd.DataFrame(combined["character"].value_counts())

unique_names.rename(columns={"character": "line count"}, inplace=True)
unique_names.reset_index(inplace=True)
unique_names.rename(columns={"index": "character"}, inplace=True)

unique_names["words"] = unique_names["character"].str.split(" ").str.len()
unique_names.sort_values(by=["words", "character"], inplace=True)

unique_names

Unnamed: 0,character,line count,words
203,ABU,5,1
272,ACCUSER,3,1
213,ADMIRAL,4,1
193,AGENT,5,1
266,AKIHIKO,3,1
283,ALETA,2,1
519,ANNIE,1,1
426,ANNOUNCEMENT,1,1
186,ANNOUNCER,5,1
185,ATT-LASS,5,1


In [10]:
unique_names[unique_names["character"].str.contains("SHIELD", regex=True)]

Unnamed: 0,character,line count,words
113,SHIELD AGENT,12,2
108,SHIELD COMPUTER,13,2
487,SHIELD ENGINEER,1,2
176,SHIELD LIEUTENANT,6,2
481,SHIELD PILOT,1,2
161,SHIELD TECH,6,2
334,SHIELD WORKER,2,2
233,FEMALE SHIELD AGENT,4,3
435,SHIELD AGENT #1,1,3
281,SHIELD AGENT #2,3,3


## Adding a Words Column

Be aware, elipses and dashes aren't guaranteed to be counted as their own word.  For example, `down...bow` from the opening scene of Avengers Endgame might be counted as 1 word.  The same thing can happen with This shouldn't happen.  This might need to be corrected when constructing models.

In [11]:
combined["words"] = combined["line"].str.split(" ").str.len()

## Exporting mcu.csv

In [12]:
column_order = ["character", "line", "movie", "year", "words", 
                "Christopher Markus", "Stephen McFeely", "Joss Whedon", "Eric Pearson", "Craig Kyle", "Christopher Yost",
               "Mark Fergus", "Hawk Ostby", "Art Marcum", "Matt Holloway", "James Gunn"]

combined = combined[column_order]
#combined.astype('int32', copy=False)

combined.to_csv("./mcu.csv")

combined.head()

Unnamed: 0,character,line,movie,year,words,Christopher Markus,Stephen McFeely,Joss Whedon,Eric Pearson,Craig Kyle,Christopher Yost,Mark Fergus,Hawk Ostby,Art Marcum,Matt Holloway,James Gunn
0,CLINT BARTON,"Okay, you see where you’re going? Let’s work on how to get there.",Avengers: Endgame,2019,13,True,True,False,False,False,False,False,False,False,False,False
1,CLINT BARTON,"Okay, good...tip down...bow arm out...three fingers-",Avengers: Endgame,2019,7,True,True,False,False,False,False,False,False,False,False,False
2,LILA BARTON,Why three?,Avengers: Endgame,2019,2,True,True,False,False,False,False,False,False,False,False,False
3,CLINT BARTON,‘Cause two’s not enough and four’s too much-,Avengers: Endgame,2019,8,True,True,False,False,False,False,False,False,False,False,False
4,LAURA BARTON,"You guys want mustard or mayo, or both?",Avengers: Endgame,2019,8,True,True,False,False,False,False,False,False,False,False,False


## Movie Metadata Table

In [13]:
movies = pd.DataFrame()

movies = movies.append(pd.Series({"movie": "Avengers: Endgame", "year": 2019, "is transcript": False, "lines": avengers_endgame.shape[0], "url": "https://www.scriptslug.com/assets/uploads/scripts/avengers-endgame-2019.pdf"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "The Avengers", "year": 2012, "is transcript": False, "lines": avengers.shape[0], "url": "https://www.scriptslug.com/assets/uploads/scripts/the-avengers-2012.pdf"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "Thor: Ragnorak", "year": 2017, "is transcript": False, "lines": ragnorak.shape[0], "url" : "https://www.scriptslug.com/assets/uploads/scripts/thor-ragnorak-2017.pdf"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "Iron Man", "year": 2008, "is transcript": False, "lines": iron_man.shape[0], "url": "https://www.scriptslug.com/assets/uploads/scripts/iron-man-2008.pdf"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "Guardians of the Galaxy Vol. 2", "year": 2017, "is transcript": False, "lines": guardians2.shape[0], "url" : "https://www.scriptslug.com/assets/uploads/scripts/guardians-of-the-galaxy-vol-2-2017.pdf"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "Avengers: Infinity War", "year": 2018, "is transcript": True, "lines": infinity_war.shape[0], "url" : "https://transcripts.fandom.com/wiki/Avengers:_Infinity_War"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "Avengers: Age of Ultron", "year": 2015, "is transcript": True, "lines": age_of_ultron.shape[0], "url" : "https://transcripts.fandom.com/wiki/Avengers:_Age_of_Ultron"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "Iron Man 2", "year": 2010, "is transcript": True, "lines": iron_man_2.shape[0], "url" : "https://transcripts.fandom.com/wiki/Iron_Man_2"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "Iron Man 3", "year": 2013, "is transcript": True, "lines": iron_man_3.shape[0], "url" : "https://transcripts.fandom.com/wiki/Iron_Man_3"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "Captain America: The First Avenger", "year": 2011, "is transcript": True, "lines": captain_america.shape[0], "url" : "https://transcripts.fandom.com/wiki/Captain_America:_The_First_Avenger"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "Captain America: The Winter Soldier", "year": 2014, "is transcript": True, "lines": winter_soldier.shape[0], "url" : "https://transcripts.fandom.com/wiki/Captain_America:_The_Winter_Soldier"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "Captain America: Civil War", "year": 2016, "is transcript": True, "lines": civil_war.shape[0], "url" : "https://transcripts.fandom.com/wiki/Captain_America:_Civil_War"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "Spider-Man: Homecoming", "year": 2017, "is transcript": True, "lines": spider_man_homecoming.shape[0], "url" : "https://transcripts.fandom.com/wiki/Spider-Man:_Homecoming"}), ignore_index=True)
movies = movies.append(pd.Series({"movie": "Captain Marvel", "year": 2019, "is transcript": True, "lines": captain_marvel.shape[0], "url" : "https://transcripts.fandom.com/wiki/Captain_Marvel_(2019)"}), ignore_index=True)

movies.set_index('movie', inplace=True)
movies["is transcript"] = movies['is transcript'].astype('boolean', copy=False)
movies["lines"] = movies['lines'].astype('int64', copy=False)
movies["year"] = movies['year'].astype('int64', copy=False)

movies.sort_values(by="year", inplace=True)

movies.to_csv("./movies.csv")

movies

Unnamed: 0_level_0,is transcript,lines,url,year
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iron Man,False,834,https://www.scriptslug.com/assets/uploads/scripts/iron-man-2008.pdf,2008
Iron Man 2,True,1010,https://transcripts.fandom.com/wiki/Iron_Man_2,2010
Captain America: The First Avenger,True,688,https://transcripts.fandom.com/wiki/Captain_America:_The_First_Avenger,2011
The Avengers,False,1027,https://www.scriptslug.com/assets/uploads/scripts/the-avengers-2012.pdf,2012
Iron Man 3,True,1043,https://transcripts.fandom.com/wiki/Iron_Man_3,2013
Captain America: The Winter Soldier,True,841,https://transcripts.fandom.com/wiki/Captain_America:_The_Winter_Soldier,2014
Avengers: Age of Ultron,True,980,https://transcripts.fandom.com/wiki/Avengers:_Age_of_Ultron,2015
Captain America: Civil War,True,987,https://transcripts.fandom.com/wiki/Captain_America:_Civil_War,2016
Thor: Ragnorak,False,961,https://www.scriptslug.com/assets/uploads/scripts/thor-ragnorak-2017.pdf,2017
Guardians of the Galaxy Vol. 2,False,993,https://www.scriptslug.com/assets/uploads/scripts/guardians-of-the-galaxy-vol-2-2017.pdf,2017
