In [1]:
import pandas as pd
# Show every row and the full text of every cell when a DataFrame is displayed,
# so that kept/removed script lines can be inspected without truncation.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

## Script to CSV Process

1. Load script `.txt` file.  Use [PDF Extractor](https://pdfextractor.com/) to convert any `.pdf`s to `.txt`s.
2. Remove garbage rows
3. Inspect rows removed
4. Format lines
5. Create `uppercase_unique_form.csv`
6. Manually fill out `uppercase_unique_form.csv` columns `is_character_name` and `is_dialogue` in Excel or other `.csv` editor.
7. Load the filled-out `uppercase_unique.csv` and filter out the uppercase rows that are neither character names nor dialogue
8. Inspect rows removed
9. Merge lines into best guesses for columns "character" and "line"
10. Manually clean the "line" column by removing text that is not dialogue.  There will be a lot, so nearly every cell will have to be cleaned!


See the Thor: Ragnarok section at the end of this notebook for an example of this process.

## Remove Garbage Rows

In [2]:
def remove_regex_rows(df, string):
    '''
    Splits df on whether its "line" column matches a regular expression.

    The match is case-sensitive and may occur anywhere in the line.
    Prints how many rows matched, then returns the tuple
    (rows that did not match, rows that matched).
    '''
    matches = df["line"].str.contains(string, case=True, regex=True)
    print(f'Removed {matches.sum()} rows that match regex "{string}"')

    kept_rows = df.loc[~matches]
    matched_rows = df.loc[matches]
    return kept_rows, matched_rows

def remove_whitespace_rows(df):
    '''
    Separates rows whose "line" consists only of whitespace characters.

    Prints the number of such rows and returns the tuple
    (rows with other content, whitespace-only rows).
    '''
    is_blank = df["line"].str.isspace()
    print(f"{is_blank.sum()} whitespace rows removed")
    return df.loc[~is_blank], df.loc[is_blank]

def remove_movie_text_rows(df):
    '''
    Removes rows whose "line" matches any of a fixed list of screenplay
    patterns (scene headings, transitions, parentheticals, draft markers, ...).

    The patterns are applied one after another with remove_regex_rows, so each
    pattern only sees the rows that survived the previous ones and one summary
    line is printed per pattern.

    Returns the tuple (remaining rows, removed rows); the removed rows are
    grouped in the order of the pattern list.
    '''
    # Raw strings keep the regex backslashes literal; "\(" in a normal string is
    # an invalid escape sequence that newer Python versions warn about.
    regex_strings = [r"INTERCUT:",
                     r"CUT TO",
                     r"^THE END",
                     r"^INT\.",
                     r"^EXT\.",
                     r"\(CONTINUED\)",
                     r"CONTINUED: \(.\)",
                     r"^[0-9]+\.$",
                     r"\(.*radio\)",
                     r"\(.*earpiece\)",
                     r"\(.*headset\)",
                     r"\(.*phone\)",
                     r"\(.*cell\)",
                     r"\(to .*\)",
                     r"\(in .*\)",
                     r"\(MORE\)",
                     r"\(angry\)",
                     r"TIME CUT:",
                     r"TITLE:",
                     r"MARVEL FLIP",
                     r"SLAM TO",
                     r"DISSOLVE TO",
                     r"BEST ADAPTED SCREENPLAY",
                     r"FADE TO BLACK",
                     r"CUE MUSIC",
                     r"BLUE DRAFT 05/20/16",
                     r"OMITTED",
                     r"MAIN TITLE",
                     # NOTE(review): unanchored, so it matches A/B/C followed by a digit
                     # anywhere in a line; presumably aimed at scene numbers - confirm.
                     r"[ABC][0-9]+",
                     r"\(.*:\)",
                     r"\(beat\)"]

    # Collect the per-pattern removals and concatenate once, instead of
    # re-concatenating onto a growing (initially empty) DataFrame every iteration.
    removed_chunks = []
    for regex_string in regex_strings:
        df, new_rows_to_remove = remove_regex_rows(df, regex_string)
        removed_chunks.append(new_rows_to_remove)

    removed_rows = pd.concat(removed_chunks)

    return df, removed_rows


def remove_page_number_rows(df):
    '''
    Separates rows whose "line" is made up solely of numeric characters,
    which are treated as page numbers.

    Prints the number of such rows and returns the tuple
    (remaining rows, page number rows).
    '''
    is_page_number = df["line"].str.isnumeric()
    print(f"{is_page_number.sum()} page number rows removed")

    remaining = df.loc[~is_page_number]
    page_numbers = df.loc[is_page_number]
    return remaining, page_numbers

def remove_garbage_rows(df):
    '''
    Runs the three row filters in sequence: whitespace rows, page number rows,
    then screenplay text rows.

    Returns the tuple (remaining rows, removed rows). The removed rows are
    concatenated under the outer index keys "whitespace", "page_numbers" and
    "movie_text" so each one can be traced back to the filter that dropped it.
    A total count is printed after the per-filter messages.
    '''
    stage_labels = ["whitespace", "page_numbers", "movie_text"]

    df, blank = remove_whitespace_rows(df)
    df, numbered = remove_page_number_rows(df)
    df, screenplay = remove_movie_text_rows(df)

    garbage_rows = pd.concat([blank, numbered, screenplay], keys=stage_labels)

    print("-----------------------------------------")
    print(len(garbage_rows), "total rows removed\n")

    return df, garbage_rows

## Format Character Lines

In [3]:
def remove_regex(df, string):
    '''
    Deletes every match of a regular expression from df["line"], in place.

    Prints the total number of matches found before removing them.
    The "line" column of df is overwritten; nothing is returned.
    '''
    lines = df["line"]
    match_total = lines.str.count(string).sum()
    print(f'Removed {match_total} occurences of regex "{string}"')

    df["line"] = lines.str.replace(string, "", case=True, regex=True)

def remove_leading_trailing_whitespace(df):
    '''
    Strips leading and trailing spaces and tabs from every entry of
    df["line"], in place. Nothing is returned.
    '''
    # regex=True must be passed explicitly: without it, pandas 2.x treats the
    # pattern as literal text, so "[ \t]+$" would never match and nothing
    # would be stripped.
    df["line"] = df["line"].str.replace("[ \t]+$", "", regex=True)
    df["line"] = df["line"].str.replace("^[ \t]+", "", regex=True)

def format_lines(df):
    '''
    Cleans df["line"] in place: trims surrounding spaces/tabs, deletes the
    listed parenthetical annotations (e.g. "(V.O.)", "(CONT'D)") wherever they
    occur, then trims again because a deletion can expose new edge whitespace.

    One summary line is printed per pattern by remove_regex.
    Nothing is returned.
    '''
    remove_leading_trailing_whitespace(df)

    # Raw strings keep the regex backslashes literal; "\(" in a normal string is
    # an invalid escape sequence that newer Python versions warn about.
    regex_strings = [r"\(O\.S\)",
                     r"\(V\.O\)",
                     r"\(V\.O\.\)",
                     r"\(CONT’D\)",
                     r"\(CONT'D\)",
                     r"\(O\.S\.\)",
                     r"\(ON SCREEN\)",
                     r"\(OVER COM\)",
                     r"\(ON COM\)",
                     r"\(INTO COM\)",
                     r"\(OVER SPEAKER\)",
                     r"\(INTO PHONE\)",
                     r"\(OVER PHONE\)",
                     r"\(HOLO\)",
                     r"\(ADR\)"]

    for regex_string in regex_strings:
        remove_regex(df, regex_string)

    print("")

    remove_leading_trailing_whitespace(df)

## Remove Non-Dialogue Uppercase Rows

In [4]:
def uppercase_rows(df):
    '''
    Returns the rows of df whose "line" is reported as uppercase by str.isupper.
    '''
    return df.loc[df["line"].str.isupper()]

def create_uppercase_unique_csv(df, script_name):
    '''
    Writes the labelling form for a script to
    "./uppercase forms/<script_name>_uppercase_unique_form.csv".

    The form holds one row per distinct all-uppercase line (column 0), the
    number of space-separated pieces in it ("words"), and two columns
    "is_character_name" / "is_dialogue" pre-filled with 0 for manual labelling.
    Rows are sorted by "words". The labelled result is later used to delete
    uppercase rows that are neither character names nor dialogue.
    '''
    distinct_upper_lines = uppercase_rows(df)["line"].unique()

    form = pd.DataFrame(distinct_upper_lines).assign(
        words=lambda frame: frame[0].str.split(" ").str.len(),
        is_character_name=0,
        is_dialogue=0,
    )

    destination = "./uppercase forms/" + script_name + "_uppercase_unique_form.csv"
    form.sort_values(by=["words"]).to_csv(destination, encoding='utf-8')
    
def remove_non_dialogue_upper_rows(df, upper_rows):
    '''
    Drops the lines of df that the labelled table upper_rows marks as neither
    a character name nor dialogue.

    upper_rows needs the columns "line", "is_character_name" and "is_dialogue";
    the two flag columns are used as boolean masks.

    Prints counts of uppercase lines, character name lines, dialogue lines and
    removed lines, then returns the tuple (remaining rows, removed rows).
    '''
    name_flag = upper_rows["is_character_name"]
    dialogue_flag = upper_rows["is_dialogue"]

    name_hits = df["line"].isin(upper_rows.loc[name_flag, "line"])
    dialogue_hits = df["line"].isin(upper_rows.loc[dialogue_flag, "line"])

    print(df["line"].str.isupper().sum(), "total uppercase lines")
    print(name_hits.sum(), "uppercase character name lines")
    print(dialogue_hits.sum(), "uppercase dialogue lines")

    # Labelled as neither a name nor dialogue -> scheduled for removal.
    discard_texts = upper_rows.loc[~name_flag & ~dialogue_flag, "line"]
    discard_mask = df["line"].isin(discard_texts)
    print(discard_mask.sum(), "uppercase lines removed")

    return df.loc[~discard_mask], df.loc[discard_mask]

## Merge Lines

In [5]:
def is_character_name(string, character_names):
    '''
    Returns True when string is one of the values in character_names["line"].
    '''
    known_names = character_names["line"].tolist()
    return string in known_names
    

def merge_lines(df, character_names):
    '''
    Starts at a character name and merges all lines until the next character's name.
    The result should be the best guess at columns "character" and "line".
    This will merge in some screenplay text that is not dialogue, so the result will
    have to be manually cleaned as the final step.

    df: DataFrame with a "line" column holding the script text, in order.
    character_names: DataFrame whose "line" column lists the character names.

    Returns a new DataFrame with columns "character" and "line", one row per
    occurrence of a character name. Every merged line is followed by a single
    space (so non-empty results end with a space), and a name that is directly
    followed by another name gets an empty "line". Text that appears before
    the first character name is not included.
    '''
    # Build the lookup once instead of materialising the name list for every row.
    known_names = set(character_names["line"])

    # One [character, parts] entry per speech, filled in a single pass.
    speeches = []
    for text in df["line"]:
        if text in known_names:
            speeches.append([text, []])
        elif speeches:
            speeches[-1][1].append(text)
        # else: text before the first character name belongs to nobody.

    records = [(character, "".join(part + " " for part in parts))
               for character, parts in speeches]

    # Construct the frame once; DataFrame.append no longer exists in pandas 2.x
    # and growing a frame row by row copies it on every iteration.
    return pd.DataFrame(records, columns=["character", "line"])

## Thor: Ragnarok

In [6]:
# NOTE: the spelling "ragnorak" is kept in variable and file names so that they
# keep matching the files on disk.

# Load the script as one row per text line. This is read with plain Python
# rather than read_csv(sep="\n"): newer pandas releases reject "\n" as a
# separator, and CSV parsing would also apply quote and NA handling to
# screenplay text. "utf-8-sig" reads plain UTF-8 and drops a byte-order mark
# if one is present. Empty lines are skipped, as read_csv did by default.
with open("./script txts/thor-ragnorak-script-slug.txt", encoding="utf-8-sig") as script_file:
    script_lines = [raw_line.rstrip("\n") for raw_line in script_file]
ragnorak = pd.DataFrame({"line": [text for text in script_lines if text != ""]})

ragnorak, garbage_rows = remove_garbage_rows(ragnorak)
# format_lines edits the frame in place, so give it its own copy instead of a
# filtered view of the frame loaded above.
ragnorak = ragnorak.copy()
format_lines(ragnorak)

# Write the form to be labelled by hand, then load the hand-labelled result.
create_uppercase_unique_csv(ragnorak,"ragnorak")
# NOTE(review): names= is given without header=, so the labelled file is
# presumably saved without a header row - confirm.
filled_out_upper_rows = pd.read_csv("./uppercase results/ragnorak_uppercase_unique.csv",
                                    names=["","line","words","is_character_name","is_dialogue"],
                                    dtype={"line":str, "words":int, "is_character_name":bool, "is_dialogue":bool},
                                    index_col=0)

ragnorak, upper_removed_rows = remove_non_dialogue_upper_rows(ragnorak, filled_out_upper_rows)
# Keep every removed row, keyed by the stage that removed it, for inspection.
removed_rows = pd.concat([garbage_rows,pd.concat([upper_removed_rows],keys=[""])],keys=["garbage","uppercase"])

unique_character_names = pd.DataFrame(filled_out_upper_rows[filled_out_upper_rows["is_character_name"]]["line"])

ragnorak = merge_lines(ragnorak, unique_character_names)

ragnorak.to_csv("./uncleaned/ragnorak_uncleaned.csv", index=False)

# From here on, work with the manually cleaned version of the file above.
ragnorak = pd.read_csv("./cleaned/ragnorak.csv")

ragnorak.head()

129 whitespace rows removed
258 page number rows removed
Removed 0 rows that match regex "INTERCUT:"
Removed 2 rows that match regex "CUT TO"
Removed 1 rows that match regex "^THE END"
Removed 65 rows that match regex "^INT\."
Removed 66 rows that match regex "^EXT\."
Removed 0 rows that match regex "\(CONTINUED\)"
Removed 0 rows that match regex "CONTINUED: \(.\)"
Removed 127 rows that match regex "^[0-9]+\.$"
Removed 0 rows that match regex "\(.*radio\)"
Removed 0 rows that match regex "\(.*earpiece\)"
Removed 0 rows that match regex "\(.*headset\)"
Removed 0 rows that match regex "\(.*phone\)"
Removed 0 rows that match regex "\(.*cell\)"
Removed 15 rows that match regex "\(to .*\)"
Removed 0 rows that match regex "\(in .*\)"
Removed 12 rows that match regex "\(MORE\)"
Removed 0 rows that match regex "\(angry\)"
Removed 2 rows that match regex "TIME CUT:"
Removed 0 rows that match regex "TITLE:"
Removed 0 rows that match regex "MARVEL FLIP"
Removed 0 rows that match regex "SLAM TO"
R

Unnamed: 0,character,line
0,THOR,"Now I know what you’re thinking. Oh no! Thor’s in a cage. How did this happen? Well, sometimes you have to get captured just to get a straight answer out of somebody. It’s a long story but basically I'm a bit of a hero. See, I spent some time on earth, fought some robots, saved the planet a couple of times. Then I went searching through the cosmos for some magic, colorful Infinity Stone things... didn’t find any. That’s when I came across a path of death and destruction which led me all the way here into this cage... where I met you."
1,THOR,How much longer do you think we’ll be here?
2,SURTUR,"Thor, Son of Odin."
3,THOR,"Surtur. Son of a bitch...you’re still alive! I thought my father killed you, like, half a million years ago."
4,SURTUR,I cannot die. Not until I fulfill my destiny and lay waste to your home.
