In [1]:
import pandas as pd
# Display settings for the whole notebook: passing None lifts pandas' limits,
# so DataFrames are shown with every row and with untruncated cell text.
# Full dialogue lines stay readable, but displaying a large frame prints all of it.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## Script to CSV Process

1. Load the script `.txt` file. If the script is only available as a `.pdf`, first convert it to `.txt` with [PDF Extractor](https://pdfextractor.com/).
2. Remove garbage rows
3. Inspect rows removed
4. Format lines
5. Create `uppercase_unique_form.csv`
6. Manually fill out `uppercase_unique_form.csv` columns `is_character_name` and `is_dialogue` in Excel or other `.csv` editor.
7. Load `uppercase_unique.csv` and filter out rows
8. Inspect rows removed
9. Merge lines into best guesses for columns "character" and "line"
10. Manually clean the "line" column by removing any text that is not dialogue. Expect a lot of non-dialogue text: nearly every cell will need cleaning!


See the Guardians of the Galaxy script at the end of this notebook for an example of this process.

## Remove Garbage Rows

In [2]:
def remove_regex_rows(df,string):
    """Split ``df`` by whether each row's "line" value matches a regex.

    Prints how many rows matched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named "line".
    string : str
        Regular expression, matched case-sensitively anywhere in the line
        (anchor it with ``^``/``$`` to require a whole-line match).

    Returns
    -------
    (kept, removed) : tuple of pandas.DataFrame
        ``kept`` holds the rows that do not match, ``removed`` the rows that do.
        Both keep the original index labels.
    """
    # na=False: a missing line counts as "no match".  Without it the mask
    # contains NaN and both ``~mask`` and boolean indexing raise.
    rows_to_remove = df["line"].str.contains(string,case=True,regex=True,na=False)
    print("Removed",rows_to_remove.sum(),"rows that match regex \"" + string + "\"")

    return df[~rows_to_remove], df[rows_to_remove]

def remove_whitespace_rows(df):
    """Split ``df`` into rows with content and rows that are only whitespace.

    Prints how many whitespace rows were found.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named "line".

    Returns
    -------
    (kept, removed) : tuple of pandas.DataFrame
        ``removed`` holds the rows whose "line" consists solely of whitespace
        characters; ``kept`` holds everything else (including empty strings
        and missing values, which ``str.isspace`` does not flag).
    """
    # .eq(True) turns the mask into strict booleans: missing lines produce NaN
    # from str.isspace(), which would otherwise break ``~mask`` and indexing.
    whitespace_rows = df["line"].str.isspace().eq(True)
    print(whitespace_rows.sum(),"whitespace rows removed")
    return df[~whitespace_rows], df[whitespace_rows]

def remove_movie_text_rows(df):
    """Remove rows that consist entirely of one bracketed annotation.

    Each pattern in ``regex_strings`` is applied in turn through
    ``remove_regex_rows``; rows removed by any pattern are collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named "line".

    Returns
    -------
    (kept, removed) : tuple of pandas.DataFrame
        ``kept`` is ``df`` without the matching rows; ``removed`` holds every
        row dropped by any of the patterns.
    """
    # Raw string so the backslashes reach the regex engine untouched; in a
    # normal literal "\[" and "\]" are invalid escape sequences that current
    # Python versions warn about.
    # Pattern: the whole line is "[...]" with no nested brackets inside.
    regex_strings = [r"^\[[^\[\]]*\]$"]

    # Collect the removed pieces and concatenate once, rather than growing a
    # DataFrame inside the loop starting from an empty frame.
    removed_chunks = []
    for regex_string in regex_strings:
        df, new_rows_to_remove = remove_regex_rows(df,regex_string)
        removed_chunks.append(new_rows_to_remove)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    removed_rows = pd.concat(removed_chunks) if removed_chunks else pd.DataFrame()

    return df, removed_rows


def remove_page_number_rows(df):
    """Split ``df`` into regular rows and rows that contain only a number.

    Lines made up solely of numeric characters are treated as page numbers.
    Prints how many such rows were found.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named "line".

    Returns
    -------
    (kept, removed) : tuple of pandas.DataFrame
        ``removed`` holds the rows for which ``str.isnumeric`` is true;
        ``kept`` holds everything else, including missing values.
    """
    # .eq(True) turns the mask into strict booleans: missing lines produce NaN
    # from str.isnumeric(), which would otherwise break ``~mask`` and indexing.
    page_rows = df["line"].str.isnumeric().eq(True)
    print(page_rows.sum(),"page number rows removed")

    return df[~page_rows], df[page_rows]

def remove_garbage_rows(df):
    """Run every garbage filter over ``df`` and report what was dropped.

    The filters run in a fixed order: whitespace-only rows, page-number rows,
    then bracketed movie-text rows.  Each filter prints its own count, and a
    grand total is printed at the end.

    Returns
    -------
    (kept, garbage_rows) : tuple of pandas.DataFrame
        ``kept`` is the frame after all filters.  ``garbage_rows`` stacks the
        removed rows, with an outer index level naming the filter that removed
        them ("whitespace", "page_numbers" or "movie_text").
    """
    filter_steps = (
        ("whitespace", remove_whitespace_rows),
        ("page_numbers", remove_page_number_rows),
        ("movie_text", remove_movie_text_rows),
    )

    labels = []
    removed_frames = []
    for label, row_filter in filter_steps:
        # Each filter receives the output of the previous one.
        df, removed = row_filter(df)
        labels.append(label)
        removed_frames.append(removed)

    garbage_rows = pd.concat(removed_frames, keys = labels)

    print("-----------------------------------------")
    print(len(garbage_rows),"total rows removed\n")

    return df, garbage_rows

## Format Character Lines

In [3]:
def remove_regex(df,string):
    """Delete every match of a regex from the "line" column, in place.

    Prints the total number of matches found across all rows, then replaces
    each match with an empty string.  ``df`` is modified directly; nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named "line".
    string : str
        Regular expression to strip out (case-sensitive).
    """
    lines = df["line"]
    match_total = lines.str.count(string).sum()
    print("Removed",match_total,"occurences of regex \"" + string + "\"")

    df["line"] = lines.str.replace(string,"",case=True,regex=True)

def remove_leading_trailing_whitespace(df):
    """Strip spaces and tabs from both ends of every "line" value, in place.

    ``df`` is modified directly; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named "line".
    """
    # regex=True must be explicit: pandas 2.x treats the pattern as a literal
    # string by default, in which case "[ \t]+$" never matches and nothing is
    # stripped.  One alternation handles both ends in a single pass.
    df["line"] = df["line"].str.replace(r"^[ \t]+|[ \t]+$","",regex=True)

def format_lines(df):
    """Clean the "line" column of ``df`` in place.

    Steps, in order:
    1. strip leading/trailing spaces and tabs;
    2. delete every bracketed annotation ("[...]") anywhere in a line,
       printing how many were removed;
    3. strip again, since deleting an annotation at the start or end of a
       line can expose new leading/trailing whitespace.

    ``df`` is modified directly; nothing is returned.
    """
    remove_leading_trailing_whitespace(df)

    # Raw string so the backslashes reach the regex engine untouched; in a
    # normal literal "\[" and "\]" are invalid escape sequences that current
    # Python versions warn about.
    # Pattern: "[...]" with no nested brackets inside.
    regex_strings = [r"\[[^\[\]]*\]"]

    for regex_string in regex_strings:
        remove_regex(df, regex_string)

    print("")

    remove_leading_trailing_whitespace(df)

## Create Character / Line Columns

In [4]:
def create_character_line_columns(df):
    """Split "NAME: dialogue" rows into separate character and line columns.

    Everything before the first colon becomes the character name, upper-cased
    (surrounding whitespace is left as is).  Everything after that colon
    becomes the line, with spaces/tabs directly around the colon removed.
    Rows with no colon get missing values in both columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named "line".

    Returns
    -------
    pandas.DataFrame
        New frame with columns "character" and "line", indexed like ``df``.
    """
    raw_text = df.line

    # Capture up to and including the first colon, then drop that colon.
    speaker = (
        raw_text.str.extract(r'(^[^:]*:)')[0]
        .str.replace(":$","",case=True,regex=True)
        .str.upper()
    )

    # Capture from the first colon onwards, then drop the colon and its padding.
    spoken = (
        raw_text.str.extract(r'(:.*$)')[0]
        .str.replace("^[ \t]*:[ \t]*","",case=True,regex=True)
    )

    return pd.DataFrame({"character": speaker, "line": spoken})

## Avengers: Age of Ultron

In [5]:
# Load the raw script with one row per non-empty text line.  The file is read
# directly instead of via pd.read_csv(..., sep="\n") because recent pandas
# releases reject a newline separator with a ValueError.
with open("./script txts/age_of_ultron.txt", encoding="utf-8") as script_file:
    script_lines = [raw_line.rstrip("\r\n") for raw_line in script_file]
age_of_ultron = pd.DataFrame({"line": [text for text in script_lines if text]})

print(age_of_ultron.shape)

age_of_ultron, garbage_rows = remove_garbage_rows(age_of_ultron)
# format_lines edits the frame in place; take an explicit copy so the edit is
# not applied to a filtered slice (avoids pandas' SettingWithCopyWarning).
age_of_ultron = age_of_ultron.copy()
format_lines(age_of_ultron)

age_of_ultron = create_character_line_columns(age_of_ultron)

# Best-guess split, written out for manual cleaning.
age_of_ultron.to_csv("./uncleaned/age_of_ultron_uncleaned.csv", index=False)

# From here on, work with the manually cleaned version of the script.
age_of_ultron = pd.read_csv("./cleaned/age_of_ultron.csv")

# Not the last expression of the cell, so this preview is not displayed.
age_of_ultron.head()

# Sanity check: show cleaned lines that still contain a character outside
# letters, digits, basic punctuation, space, quotes and hyphen.
# Raw string keeps \d, \. and \? intact without invalid-escape warnings;
# na=False treats an empty "line" cell as "no match" instead of failing.
age_of_ultron[age_of_ultron["line"].str.contains(r"""[^a-zA-Z\d\.,\?! '"-]""", regex=True, na=False)]

(1032, 1)
0 whitespace rows removed
0 page number rows removed
Removed 51 rows that match regex "^\[[^\[\]]*\]$"
-----------------------------------------
51 total rows removed

Removed 355 occurences of regex "\[[^\[\]]*\]"



Unnamed: 0,character,line
303,ULTRON,"That was dramatic! I'm sorry, I know you mean well. You just didn't think it through. You want to protect the world, but you don't want it to change. How is humanity saved if it's not allowed to...evolve? With these? These puppets? There's only one path to peace: The Avengers' extinction. I had strings, but now I'm free. There are no strings on me, no strings on me."
454,TONY STARK,"News or footage, keyword: Hulk. Natasha, I could really use a lullaby."
488,LAURA BARTON,She's…Nathaniel.
716,ULTRON,"I wasn't sure you'd wake up. I hoped you would, I wanted to show you something. I don't have anyone else. I think a lot about meteors, the purity of them. Boom! The end, start again. The world made clean for the new man to rebuild. I was meant to be new. I was meant to be beautiful. The world would've looked to the sky and seen hope, seen mercy. Instead they'll look up in horror because of you. You've wounded me. I give you full marks for that. But, like the man said, ""What doesn't kill me… ""…just makes me stronger."""
796,FRIDAY,There's the rest of the Vibranium. Function: still unclear.
816,FRIDAY,"Right now the impact would kill thousands. Once it gets high enough: Global extinction. That building's not clear, Tenth floor."
820,STEVE ROGERS,"Incoming already came in. Stark, you worry about bringing the city back down safely. The rest of us have one job: tear these things apart. You get hurt, hurt 'em back. You get killed, walk it off."
