In [2]:
import pandas as pd
# Show every row and the full text of every cell when a DataFrame is displayed,
# so that long script lines are not truncated during manual inspection.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## Script to CSV Process

1. Load script `.txt` file.  Use [PDF Extractor](https://pdfextractor.com/) to convert any `.pdf`s to `.txt`s.
2. Remove garbage rows
3. Inspect rows removed
4. Format lines
5. Create `uppercase_unique_form.csv`
6. Manually fill out the `is_character_name` and `is_dialogue` columns of `uppercase_unique_form.csv` in Excel or another `.csv` editor, then save the result as `uppercase_unique.csv`.
7. Load the filled-out `uppercase_unique.csv` and remove the uppercase rows that are marked as neither a character name nor dialogue
8. Inspect rows removed
9. Merge lines into best guesses for columns "character" and "line"
10. Manually clean the "line" column by removing text that is not dialogue.  There will be a lot, so nearly every cell will have to be cleaned!


See the Iron Man script at the end of this notebook for an example of this process.

## Remove Garbage Rows

In [3]:
def remove_regex_rows(df,string):
    '''
    Splits df on whether its "line" column matches a regular expression.

    The match is case sensitive and may occur anywhere in the line.
    Prints how many rows matched, then returns a tuple of
    (rows that do not match, rows that match).
    '''
    is_match = df["line"].str.contains(string,case=True,regex=True)

    quoted_regex = "\"" + string + "\""
    print("Removed",is_match.sum(),"rows that match regex " + quoted_regex)

    kept_rows = df[~is_match]
    matched_rows = df[is_match]
    return kept_rows, matched_rows

def remove_whitespace_rows(df):
    '''
    Splits df on whether its "line" column consists only of whitespace.

    Prints how many such rows were found, then returns a tuple of
    (rows with visible text, whitespace-only rows).
    '''
    is_blank = df["line"].str.isspace()
    print(is_blank.sum(),"whitespace rows removed")

    visible_rows = df[~is_blank]
    blank_rows = df[is_blank]
    return visible_rows, blank_rows

def remove_movie_text_rows(df):
    '''
    Removes rows whose "line" matches any known screenplay-formatting pattern
    (scene headings, transitions, parentheticals, draft markers, ...).

    The patterns are applied one after another with remove_regex_rows, which
    prints a count for each pattern.  Because each pattern only sees the rows
    that survived the previous ones, the order of the list affects the
    printed counts.

    Returns a tuple of (remaining rows, all removed rows).
    '''
    # Raw strings keep the regex escapes (\. \( \) \*) intact without relying
    # on Python passing unknown escape sequences through unchanged.
    regex_strings = ["INTERCUT",
                    "CUT TO",
                    "^THE END",
                    r"^INT\.",
                    r"^EXT\.",
                    "CONTINUED",
                    r"^[0-9]+\.$",
                    r"\(.*radio\)",
                    r"\(.*earpiece\)",
                    r"\(.*headset\)",
                    r"\(.*phone\)",
                    r"\(.*cell\)",
                    r"\(to .*\)",
                    r"\(in .*\)",
                    r"\(MORE\)",
                    r"\(angry\)",
                    "TIME CUT:",
                    "TITLE:",
                    "SLAM TO",
                    "DISSOLVE TO",
                    "FADE",
                    "BEST ADAPTED SCREENPLAY",
                    "MARVEL",
                    # Already covered by "FADE" above, so this never matches;
                    # kept so the printed log stays the same.
                    "FADE TO BLACK",
                    "CUE MUSIC",
                    "BLUE DRAFT 05/20/16",
                    "SALMON #2",
                    # The dots are unescaped, so this matches any two
                    # characters, "/", two characters, "/", two characters.
                    # NOTE(review): presumably meant for date stamps such as
                    # 05/20/16 - confirm.
                    "../../..",
                    "OMITTED",
                    "MAIN TITLE",
                    "[A-J][0-9]+",
                    "[0-9]+[A-J]",
                    r"\*",
                    r"\(.*:\)",
                    r"^\(beat\)$"]

    # Collect the removed pieces and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    removed_pieces = []

    for regex_string in regex_strings:
        df, new_rows_to_remove = remove_regex_rows(df,regex_string)
        removed_pieces.append(new_rows_to_remove)

    removed_rows = pd.concat(removed_pieces)

    return df, removed_rows


def remove_page_number_rows(df):
    '''
    Splits df on whether its "line" column is made up of numeric characters only.

    Prints how many such rows were found, then returns a tuple of
    (rows that are not purely numeric, purely numeric rows).
    '''
    is_page_number = df["line"].str.isnumeric()
    print(is_page_number.sum(),"page number rows removed")

    content_rows = df[~is_page_number]
    number_rows = df[is_page_number]
    return content_rows, number_rows

def remove_garbage_rows(df):
    '''
    Runs the three row filters in sequence: whitespace-only rows, page number
    rows, then screenplay-formatting rows.

    Returns a tuple of (remaining rows, removed rows).  The removed rows are
    stacked into one DataFrame whose outer index level names the filter that
    removed them ("whitespace", "page_numbers" or "movie_text").
    '''
    filter_names = ["whitespace","page_numbers","movie_text"]

    df, blank_rows = remove_whitespace_rows(df)
    df, number_rows = remove_page_number_rows(df)
    df, formatting_rows = remove_movie_text_rows(df)

    removed_by_filter = [blank_rows, number_rows, formatting_rows]
    garbage_rows = pd.concat(removed_by_filter, keys = filter_names)

    print("-----------------------------------------")
    print(garbage_rows.shape[0],"total rows removed\n")

    return df, garbage_rows

## Format Character Lines

In [4]:
def remove_regex(df,string):
    '''
    Deletes every match of a regular expression from the "line" column of df.

    The column is reassigned on df itself and nothing is returned.  The
    number of matches is printed before they are deleted.
    '''
    lines = df["line"]

    match_count = lines.str.count(string).sum()
    quoted_regex = "\"" + string + "\""
    print("Removed",match_count,"occurences of regex " + quoted_regex)

    df["line"] = lines.str.replace(string,"",case=True,regex=True)

def remove_leading_trailing_whitespace(df):
    '''
    Strips spaces and tabs from both ends of every value in the "line" column.

    The column is reassigned on df itself and nothing is returned.
    '''
    # regex=True must be explicit: since pandas 2.0 Series.str.replace treats
    # the pattern as a literal string by default, which would make both of
    # these calls silently do nothing.
    df["line"] = df["line"].str.replace(r"[ \t]+$","",regex=True)
    df["line"] = df["line"].str.replace(r"^[ \t]+","",regex=True)

def format_lines(df):
    '''
    Cleans the "line" column of df in place.

    Strips leading and trailing spaces/tabs, deletes a fixed list of
    parenthetical annotations (for example "(V.O.)" or "(CONT'D)") wherever
    they occur in a line, then strips again so that no whitespace is left
    behind where an annotation was deleted.  remove_regex prints a count for
    each annotation.  Nothing is returned.
    '''
    remove_leading_trailing_whitespace(df)

    # Raw strings keep the regex escapes (\( \) \.) intact without relying on
    # Python passing unknown escape sequences through unchanged.
    regex_strings = [r"\(O\.S\)",
                    r"\(V\.O\)",
                     r"\(V\.O\.\)",
                    # The unescaped "." stands in for whichever apostrophe
                    # character the script uses.
                    r"\(CONT.D\)",
                    r"\(cont.d\)",
                    r"\(O\.S\.\)",
                    r"\(ON.*\)",
                    r"\(OVER.*\)",
                    r"\(INTO.*\)",
                    r"\(HOLO\)",
                    r"\(ADR\)",
                    r"\(then\)",
                    r" \(RHODEY’S VOICE\)"]

    for regex_string in regex_strings:
        remove_regex(df, regex_string)

    print("")

    remove_leading_trailing_whitespace(df)

## Remove Non Dialogue Uppercase Rows

In [5]:
def uppercase_rows(df):
    '''
    Returns the rows of df whose "line" value is entirely uppercase
    according to str.isupper.
    '''
    return df[df["line"].str.isupper()]

def create_uppercase_unique_csv(df,script_name):
    '''
    Writes a form listing every distinct all-uppercase line in df so that each
    one can be labeled by hand as a character name, as dialogue, or as neither.
    The labeled table is later used to delete uppercase rows that are neither
    character names nor dialogue.

    The form has the line text in a column named 0, a "words" column with the
    number of space-separated pieces in the line, and "is_character_name" and
    "is_dialogue" columns that both start at 0.  Rows are sorted by "words" and
    the file is written, index included, to
    "./uppercase forms/<script_name>_uppercase_unique_form.csv".
    '''
    form_path = "./uppercase forms/" + script_name + "_uppercase_unique_form.csv"

    form = pd.DataFrame(uppercase_rows(df).line.unique())
    form["words"] = form[0].str.split(" ").str.len()
    form["is_character_name"] = 0
    form["is_dialogue"] = 0

    form.sort_values(by=["words"]).to_csv(form_path,encoding='utf-8')

    print("Created " + form_path + "\n")
    
def remove_non_dialogue_upper_rows(df,upper_rows):
    '''
    Drops the rows of df whose "line" appears in upper_rows with both the
    "is_character_name" and "is_dialogue" flags unset.

    Prints the number of uppercase lines in df, how many of its lines are
    labeled character names, how many are labeled dialogue, and how many
    lines were removed.

    Returns a tuple of (remaining rows, removed rows).
    '''
    is_name = upper_rows["is_character_name"]
    is_spoken = upper_rows["is_dialogue"]

    name_lines = upper_rows[is_name]["line"]
    spoken_lines = upper_rows[is_spoken]["line"]
    discard_lines = upper_rows[~(is_name|is_spoken)]["line"]

    name_rows = df[df["line"].isin(name_lines)]
    spoken_rows = df[df["line"].isin(spoken_lines)]

    print(df["line"].str.isupper().sum(),"total uppercase lines")
    print(name_rows.shape[0],"uppercase character name lines")
    print(spoken_rows.shape[0],"uppercase dialogue lines")

    is_discarded = df["line"].isin(discard_lines)
    print(is_discarded.sum(),"uppercase lines removed")

    return df[~is_discarded], df[is_discarded]

## Merge Lines

In [6]:
def is_character_name(string,character_names):
    '''
    Returns True when string is one of the values in the "line" column
    of character_names, otherwise False.
    '''
    known_names = character_names["line"].tolist()
    return string in known_names
    

def merge_lines(df,character_names):
    '''
    Starts at a character name and merges all lines until the next character's name.
    The result should be the best guess at columns "character" and "line".
    This will merge in some screenplay text that is not dialogue, so the result will
    have to be manually cleaned as the final step.

    df is read through its "line" column, in row order.  character_names
    supplies the known names through its own "line" column.

    Returns a new DataFrame with one row per occurrence of a character name.
    Each merged line is the following rows joined together, every piece
    followed by a single space; it is the empty string when a name is
    immediately followed by another name or by the end of the script.  Rows
    that come before the first character name are dropped.
    '''
    # Set lookup replaces rebuilding and scanning a list for every row.
    known_names = set(character_names["line"].tolist())

    # Single pass: a name opens a new record, any other row is appended to the
    # most recently opened record.  Records are collected in a list and turned
    # into a DataFrame once, because DataFrame.append no longer exists in
    # pandas 2.0 and later.
    records = []
    for text in df["line"].tolist():
        if text in known_names:
            records.append({"character": text,"line": ""})
        elif records:
            records[-1]["line"] += text + " "

    return pd.DataFrame(records,columns=["character","line"])

## Iron Man

In [7]:
# Load the raw script with one row per line of text.  The file is read
# directly because pandas 1.3 and later reject sep="\n" in read_csv.  Empty
# lines are dropped here; whitespace-only lines are removed by
# remove_garbage_rows below.
# NOTE(review): read_csv also applied CSV quote handling to each line, which
# this does not - confirm the script contains no lines that relied on it.
with open("./script txts/iron-man-script-slug.txt",encoding="utf-8") as script_file:
    script_lines = [raw_line.rstrip("\n") for raw_line in script_file]
iron_man = pd.DataFrame({"line": [text for text in script_lines if text != ""]})

iron_man, garbage_rows = remove_garbage_rows(iron_man)
# format_lines assigns to the "line" column, so give it an independent copy
# rather than the filtered slice returned by remove_garbage_rows.
iron_man = iron_man.copy()
format_lines(iron_man)

create_uppercase_unique_csv(iron_man,"iron_man")
# The filled-out form is read from a different folder and file name than the
# form written above.  names= is given without header=, so every row of the
# file is treated as data.
# NOTE(review): presumably the filled-out file has no header row - confirm.
filled_out_upper_rows = pd.read_csv("./uppercase results/iron_man_uppercase_unique.csv",
                                    names=["","line","words","is_character_name","is_dialogue"],
                                    dtype={"line":str, "words":int, "is_character_name":bool, "is_dialogue":bool},
                                    index_col=0)

iron_man, upper_removed_rows = remove_non_dialogue_upper_rows(iron_man, filled_out_upper_rows)
# Keep everything that was removed in one frame, keyed by removal stage, for inspection.
removed_rows = pd.concat([garbage_rows,pd.concat([upper_removed_rows],keys=[""])],keys=["garbage","uppercase"])

unique_character_names = pd.DataFrame(filled_out_upper_rows[filled_out_upper_rows["is_character_name"]]["line"])

iron_man = merge_lines(iron_man, unique_character_names)

iron_man.to_csv("./uncleaned/iron_man_uncleaned.csv", index=False)

# From here on iron_man is the manually cleaned version of the file written above.
iron_man = pd.read_csv("./cleaned/iron_man.csv")

iron_man

180 whitespace rows removed
516 page number rows removed
Removed 13 rows that match regex "INTERCUT"
Removed 80 rows that match regex "CUT TO"
Removed 0 rows that match regex "^THE END"
Removed 99 rows that match regex "^INT\."
Removed 42 rows that match regex "^EXT\."
Removed 216 rows that match regex "CONTINUED"
Removed 86 rows that match regex "^[0-9]+\.$"
Removed 0 rows that match regex "\(.*radio\)"
Removed 0 rows that match regex "\(.*earpiece\)"
Removed 0 rows that match regex "\(.*headset\)"
Removed 0 rows that match regex "\(.*phone\)"
Removed 0 rows that match regex "\(.*cell\)"
Removed 8 rows that match regex "\(to .*\)"
Removed 19 rows that match regex "\(in .*\)"
Removed 10 rows that match regex "\(MORE\)"
Removed 0 rows that match regex "\(angry\)"
Removed 0 rows that match regex "TIME CUT:"
Removed 0 rows that match regex "TITLE:"
Removed 0 rows that match regex "SLAM TO"
Removed 1 rows that match regex "DISSOLVE TO"
Removed 2 rows that match regex "FADE"
Removed 0 rows 

Unnamed: 0,character,line
0,TONY,"Oh, I get it. You guys aren’t allowed to talk. Is that it? Are you not allowed to talk?"
1,JIMMY,No. We’re allowed to talk.
2,TONY,Oh. I see. So it’s personal.
3,RAMIREZ,I think they’re intimidated.
4,TONY,"Good God, you’re a woman."
5,TONY,"I, honestly, I couldn’t have called that. I would apologize, but isn’t that what we’re going for here? I saw you as a soldier first."
6,JIMMY,"I have a question, sir."
7,TONY,Please.
8,JIMMY,Is it true you’re twelve for twelve with last years Maxim cover girls?
9,TONY,"Excellent question. Yes and no. March and I had a schedule conflict but, thankfully, the Christmas cover was twins. Anyone else? You, with the hand up."
