In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## Importing csv's

In [2]:
avengers_endgame = pd.read_csv("./cleaned/avengers_endgame.csv")
avengers = pd.read_csv("./cleaned/avengers.csv")
ragnorak = pd.read_csv("./cleaned/ragnorak.csv")
iron_man = pd.read_csv("./cleaned/iron_man.csv")
guardians2 = pd.read_csv("./cleaned/guardians2.csv")

## Adding a Movie Column

In [3]:
avengers_endgame["movie"] = "Avengers Endgame"
avengers["movie"] = "The Avengers"
ragnorak["movie"] = "Thor: Ragnarok"
iron_man["movie"] = "Iron Man"
guardians2["movie"] = "Guardians of the Galaxy Vol. 2"

## Adding Author Indicators

Indicator variables (0 = False, 1 = True) for if an author contributed to the screenplay that the line is from.  These names were taken from the Wikipedia articles for these movies.  The names used are listed in the right column in the "Screenplay By"  or "Written By" row.

In [4]:
avengers_endgame["Christopher Markus"] = 1
avengers_endgame["Stephen McFeely"] = 1

avengers["Joss Whedon"] = 1

ragnorak["Eric Pearson"] = 1
ragnorak["Craig Kyle"] = 1
ragnorak["Christopher Yost"] = 1

iron_man["Mark Fergus"] = 1
iron_man["Hawk Ostby"] = 1
iron_man["Art Marcum"] = 1
iron_man["Matt Holloway"] = 1

guardians2["James Gunn"] = 1

## Adding a Year Column

In [5]:
avengers_endgame["year"] = 2019
avengers["year"] = 2012
ragnorak["year"] = 2017
iron_man["year"] = 2008
guardians2["year"] = 2017

## Specifying Duplicate Names

Some characters happen to have the same name as characters in a different movie, though they are not the same person.  This cell specifies the difference so that their lines aren't aggregated.  This won't really matter since these characters have so few lines anyway.

In [6]:
avengers_endgame['character'] = avengers_endgame['character'].replace(["JIMMY"],'AVENGERS JIMMY')
avengers_endgame['character'] = avengers_endgame['character'].replace(["KID"],'AVENGERS KID')
iron_man['character'] = iron_man['character'].replace(["JIMMY"],'IRON MAN JIMMY')
iron_man['character'] = iron_man['character'].replace(["KID"],'IRON MAN KID')

## Creating a Combined DataFrame

In [7]:
combined = pd.concat([avengers_endgame, avengers, ragnorak, iron_man, guardians2])

combined.fillna(0, inplace=True)

## Removing Character Name Aliases

Some characters go by different names in different movies.  Also, some characters start out as `MYSTERIOUS MAN` and be revealed to be `EGO`.  This cell fixes that by assigning the same lines to the same name.  Some of these choices are subjective.  For example, I've decided that `HULK` is a different character than `BRUCE BANNER`, and that all reporter characters are named `REPORTER`.

When adding a new movie to this dataset, **be careful** to see that the extra characters (`MAN`, `WOMAN`, `KID` etc.) are being assigned to who they actually are.  For example, in Iron Man, `MAN` is `AGENT COULSON`, but `MAN` could be somebody else in a different movie.

In [8]:
combined['character'] = combined['character'].replace(['IRON MAN','TONY',"VOICE"],'TONY STARK')
combined['character'] = combined['character'].replace(['BANNER','SMART HULK', 'ASTRAL BANNER'],'BRUCE BANNER')
combined['character'] = combined['character'].replace(['STEVE','CAPTAIN AMERICA', 'OLD STEVE'],'STEVE ROGERS')
combined['character'] = combined['character'].replace(['BARTON'],'CLINT BARTON')
combined['character'] = combined['character'].replace(['PEPPER'],'PEPPER POTTS')
combined['character'] = combined['character'].replace(['COULSON', 'MAN'],'AGENT COULSON')
combined['character'] = combined['character'].replace(['AGENT JASPER SITWELL'],'AGENT SITWELL')
combined['character'] = combined['character'].replace(['STRANGE'],'DR. STRANGE')
combined['character'] = combined['character'].replace(['SCRAPPER #142'],'VALKYRIE')
combined['character'] = combined['character'].replace(['LOKIE'],'LOKI')
combined['character'] = combined['character'].replace(['QUILL'],'PETER QUILL')
combined['character'] = combined['character'].replace(['MONSTROUS RAVAGER'],'TASERFACE')
combined['character'] = combined['character'].replace(['MYSTERIOUS MAN'],'EGO')
combined['character'] = combined['character'].replace(["WOMAN’S VOICE"],'CHRISTINE')
combined['character'] = combined['character'].replace(["HOODED FIGURE"],'RED SKULL')
combined['character'] = combined['character'].replace(["ADOLESCENT GROOT"],'GROOT')
combined['character'] = combined['character'].replace(["GABRIEL"],'GENERAL GABRIEL')
combined['character'] = combined['character'].replace(["HOGAN"],'HAPPY HOGAN')

combined['character'] = combined['character'].replace(["REPORTER # 1", "REPORTER #2", "REPORTER #3","TV REPORTER’S VOICE",
                                                      "MALE REPORTER"],'REPORTER')
combined['character'] = combined['character'].replace(["INTERCOM VOICE", "HELICARRIER INTERCOM"],'INTERCOM')
combined['character'] = combined['character'].replace(["JET PILOT", "YOUNG SHIELD PILOT", "FACELESS PILOT"],'PILOT')

## Adding a Words Column

Be aware, elipses and dashes aren't guaranteed to be counted as their own word.  For example, `down...bow` from the opening scene of Avengers Endgame might be counted as 1 word.  The same thing can happen with This shouldn't happen.  This might need to be corrected when constructing models.

In [9]:
combined["words"] = combined["line"].str.split(" ").str.len()

## Exporting mcu.csv

In [10]:
column_order = ["character", "line", "movie", "year", "words", 
                "Christopher Markus", "Stephen McFeely", "Joss Whedon", "Eric Pearson", "Craig Kyle", "Christopher Yost",
               "Mark Fergus", "Hawk Ostby", "Art Marcum", "Matt Holloway", "James Gunn"]

combined = combined[column_order]

combined.to_csv("./mcu.csv")

combined

Unnamed: 0,character,line,movie,year,words,Christopher Markus,Stephen McFeely,Joss Whedon,Eric Pearson,Craig Kyle,Christopher Yost,Mark Fergus,Hawk Ostby,Art Marcum,Matt Holloway,James Gunn
0,CLINT BARTON,"Okay, you see where you’re going? Let’s work on how to get there.",Avengers Endgame,2019,13,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CLINT BARTON,"Okay, good...tip down...bow arm out...three fingers-",Avengers Endgame,2019,7,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,LILA BARTON,Why three?,Avengers Endgame,2019,2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CLINT BARTON,‘Cause two’s not enough and four’s too much-,Avengers Endgame,2019,8,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,LAURA BARTON,"You guys want mustard or mayo, or both?",Avengers Endgame,2019,8,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,LILA BARTON,Who puts mayo on a hot dog?,Avengers Endgame,2019,7,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,CLINT BARTON,"We’ll both have mustard, hon! Okay. Draw back, deep breath...",Avengers Endgame,2019,12,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,CLINT BARTON,"Good job, Hawkeye. Go get your arrow.",Avengers Endgame,2019,9,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,LAURA BARTON,Enough murder practice! Soup’s on!,Avengers Endgame,2019,6,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,CLINT BARTON,"One sec, babe. Be right there! We’re gonna kill some hot dogs. We’re hungry.",Avengers Endgame,2019,15,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
