In [23]:
import pandas as pd
# Display settings: passing None lifts the limit on displayed rows and on column width.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## Script to CSV Process

1. Load the script `.txt` file.  If the script is only available as a `.pdf`, convert it to `.txt` first with [PDF Extractor](https://pdfextractor.com/).
2. Remove garbage rows
3. Inspect rows removed
4. Format lines
5. Create `uppercase_unique_form.csv`
6. Manually fill out `uppercase_unique_form.csv` columns `is_character_name` and `is_dialogue` in Excel or other `.csv` editor.
7. Load `uppercase_unique.csv` and filter out rows
8. Inspect rows removed
9. Merge lines into best guesses for columns "character" and "line"
10. Manually clean the "line" column by removing text that is not dialogue.  There will be a lot, so nearly every cell will have to be cleaned!


See the Avengers Endgame script at the end of this notebook for an example of this process.

## Remove Garbage Rows

In [20]:
def remove_whitespace_rows(df):
    '''
    Splits off the rows whose "line" text consists only of whitespace.

    Prints how many rows were dropped and returns the tuple
    (rows kept, rows dropped).
    '''
    is_blank = df["line"].str.isspace()
    print(is_blank.sum(),"whitespace rows removed")

    kept = df[~is_blank]
    dropped = df[is_blank]
    return kept, dropped

def remove_int_ext_rows(df):
    '''
    Splits off scene-heading rows, i.e. rows whose "line" text starts
    with "INT." or "EXT." (case sensitive, no leading whitespace allowed).

    Prints the number of rows matched by each prefix and returns the tuple
    (rows kept, rows removed).
    '''
    # Raw strings: "\." inside a regular string literal is an invalid escape
    # sequence, which current Python versions warn about.
    int_rows = df["line"].str.contains(r"^INT\.",case=True)
    ext_rows = df["line"].str.contains(r"^EXT\.",case=True)

    print(int_rows.sum(),"INT. rows removed")
    print(ext_rows.sum(),"EXT. rows removed")

    removed_rows = int_rows | ext_rows

    return df[~removed_rows], df[removed_rows]

def remove_movie_text_rows(df):
    '''
    Splits off rows that carry screenplay directions rather than dialogue
    (transitions, titles, "THE END", and similar markers).

    Each pattern below is matched case-sensitively as a regular expression
    against the "line" column; a row is removed if any pattern matches.
    Prints one count per pattern and returns the tuple
    (rows kept, rows removed).
    '''
    # (regex pattern, label used in the progress message), in reporting order.
    markers = [
        ("TIME CUT:", "TIME CUT"),
        ("INTERCUT:", "INTERCUT"),
        ("TITLE:", "TITLE"),
        ("MARVEL FLIP", "MARVEL FLIP"),
        ("CUT TO", "CUT TO"),
        ("SLAM TO", "SLAM TO"),
        ("DISSOLVE TO", "DISSOLVE TO"),
        ("^THE END", "THE END"),  # anchored: only lines that start with it
        ("BEST ADAPTED SCREENPLAY", "BEST ADAPTED SCREENPLAY"),
        ("FADE TO BLACK", "FADE TO BLACK"),
        ("CUE MUSIC", "CUE MUSIC"),
    ]

    removed_rows = None
    for pattern, label in markers:
        matches = df["line"].str.contains(pattern,case=True)
        print(matches.sum(),label + " rows removed")
        # Accumulate the union of all matches.
        removed_rows = matches if removed_rows is None else removed_rows | matches

    return df[~removed_rows], df[removed_rows]


def remove_page_number_rows(df):
    '''
    Splits off the rows whose "line" text is made up solely of numeric
    characters, which is how page numbers appear in the extracted script.

    Prints how many rows were dropped and returns the tuple
    (rows kept, rows dropped).
    '''
    numeric_mask = df["line"].str.isnumeric()
    print(numeric_mask.sum(),"page number rows removed")

    kept, dropped = df[~numeric_mask], df[numeric_mask]
    return kept, dropped

def remove_garbage_rows(df):
    '''
    Runs every garbage filter in sequence, each one working on what the
    previous filter kept.

    Returns the tuple (rows kept, garbage rows), where the garbage rows are
    concatenated under an outer index key naming the filter that removed them.
    '''
    # Order matters: each filter only sees rows the earlier ones let through.
    filters = (
        ("int_ext", remove_int_ext_rows),
        ("whitespace", remove_whitespace_rows),
        ("page_numbers", remove_page_number_rows),
        ("movie_text", remove_movie_text_rows),
    )

    remaining = df
    labels = []
    discarded = []
    for label, apply_filter in filters:
        remaining, dropped = apply_filter(remaining)
        labels.append(label)
        discarded.append(dropped)

    garbage_rows = pd.concat(discarded, keys = labels)

    print("-----------------------------------------")
    print(garbage_rows.shape[0],"total rows removed\n")

    return remaining, garbage_rows

## Format Character Lines

In [21]:
def remove_cont_os(df):
    '''
    Deletes the screenplay annotations (CONT'D), (O.S.), (O.S) and
    (ON SCREEN) from the "line" column.

    Matching is literal and case sensitive.  The work is done on a copy, so
    the frame passed in is left untouched and no SettingWithCopyWarning is
    raised when df is a filtered slice of another frame.

    Returns the cleaned copy of df.
    '''
    result = df.copy()

    cleaned = result["line"]
    for marker in ("(CONT'D)","(O.S.)","(O.S)","(ON SCREEN)"):
        cleaned = cleaned.str.replace(marker,"",case=True,regex=False)

    result["line"] = cleaned
    return result

def remove_coms_holo(df):
    '''
    Deletes the communication-device annotations, such as (OVER COM),
    (INTO PHONE) or (HOLO), from the "line" column.

    Matching is literal and case sensitive.  The work is done on a copy, so
    the frame passed in is left untouched and no SettingWithCopyWarning is
    raised when df is a filtered slice of another frame.

    Returns the cleaned copy of df.
    '''
    markers = (
        "(OVER COM)",
        "(ON COM)",
        "(INTO COM)",
        "(OVER SPEAKER)",
        "(INTO PHONE)",
        "(OVER PHONE)",
        "(HOLO)",
    )

    result = df.copy()

    cleaned = result["line"]
    for marker in markers:
        cleaned = cleaned.str.replace(marker,"",case=True,regex=False)

    result["line"] = cleaned
    return result

def remove_leading_trailing_whitespace(df):
    '''
    Strips spaces and tabs from the start and end of every "line" value.

    The work is done on a copy, so the frame passed in is left untouched.
    Returns the cleaned copy of df.
    '''
    result = df.copy()

    # regex=True must be spelled out: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would strip nothing.
    stripped = result["line"].str.replace(r"[ \t]+$","",regex=True)
    stripped = stripped.str.replace(r"^[ \t]+","",regex=True)

    result["line"] = stripped
    return result

def format_lines(df):
    '''
    Normalises the "line" column: trims surrounding whitespace, deletes the
    screenplay annotations, then trims again to remove the whitespace that
    the deleted annotations leave behind.

    Returns the formatted frame.
    '''
    return (
        df.pipe(remove_leading_trailing_whitespace)
          .pipe(remove_cont_os)
          .pipe(remove_coms_holo)
          .pipe(remove_leading_trailing_whitespace)
    )

## Remove Non Dialogue Uppercase Rows

In [22]:
def uppercase_rows(df):
    '''
    Returns only the rows of df whose "line" text is entirely uppercase.
    '''
    return df[df["line"].str.isupper()]

def create_uppercase_unique_csv(df,script_name):
    '''
    Writes the inspection form for a script's all-uppercase lines.

    The form holds one row per distinct uppercase line, its word count, and
    two flag columns ("is_character_name" and "is_dialogue") pre-filled with 0
    for manual labelling.  Rows are sorted by word count.  The labelled form
    is later used to delete rows that are neither character names nor dialogue.

    The file is saved as
    "./data/uppercase forms/<script_name>_uppercase_unique_form.csv".
    '''
    distinct_caps = uppercase_rows(df).line.unique()
    form = pd.DataFrame(distinct_caps)

    # The single unnamed column gets the integer label 0.
    form["words"] = form[0].str.split(" ").str.len()
    form["is_character_name"] = 0
    form["is_dialogue"] = 0

    destination = "./data/uppercase forms/" + script_name + "_uppercase_unique_form.csv"
    form.sort_values(by=["words"]).to_csv(destination,encoding='utf-8')
    
def remove_non_dialogue_upper_rows(df,upper_rows):
    '''
    Removes lines that are in all caps, and that are not dialogue or a character's name.

    df:          frame with a "line" column holding the script lines.
    upper_rows:  the labelled uppercase form, with columns "line",
                 "is_character_name" and "is_dialogue".  The flags may be
                 booleans or 0/1 integers.

    Prints summary counts and returns the tuple (rows kept, rows removed).
    '''
    # The form is created with integer 0 placeholders, so the flags may come
    # back as 0/1 integers.  Coerce them to booleans so they act as row masks
    # instead of being read as column labels.
    is_name = upper_rows["is_character_name"].astype(bool)
    is_dialogue = upper_rows["is_dialogue"].astype(bool)

    unique_character_names = upper_rows.loc[is_name, "line"]
    unique_dialogues = upper_rows.loc[is_dialogue, "line"]

    all_character_name_lines = df[df["line"].isin(unique_character_names)]
    all_dialogue_lines = df[df["line"].isin(unique_dialogues)]

    print(df["line"].str.isupper().sum(),"total uppercase lines")
    print(all_character_name_lines.shape[0],"uppercase character name lines")
    print(all_dialogue_lines.shape[0],"uppercase dialogue lines")

    # Anything labelled neither a name nor dialogue is dropped from the script.
    unique_lines_to_remove = upper_rows.loc[~(is_name | is_dialogue), "line"]
    removed_lines = df["line"].isin(unique_lines_to_remove)
    print(removed_lines.sum(),"uppercase lines removed")

    return df[~removed_lines], df[removed_lines]

## Merge Lines

In [6]:
def is_character_name(string,character_names):
    '''
    Tells whether string exactly equals one of the entries in the "line"
    column of character_names.
    '''
    known_names = character_names["line"].tolist()
    return string in known_names
    

def merge_lines(df,character_names):
    '''
    Starts at a character name and merges all lines until the next character's name.
    The result should be the best guess at columns "character" and "line".
    This will merge in some screenplay text that is not dialogue, so the result will
    have to be manually cleaned as the final step.

    df:               frame with a "line" column holding the script lines in order.
    character_names:  frame whose "line" column lists the known character names.

    Returns a new frame with columns "character" and "line", one row per
    occurrence of a character name.  Every merged line is followed by a single
    space.  Lines that come before the first character name are discarded.
    '''
    # Build the lookup once instead of rebuilding a list for every row.
    known_names = set(character_names["line"].tolist())

    # Collect plain records and build the frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []
    for text in df["line"].tolist():
        if text in known_names:
            # A character name opens a new (initially empty) dialogue entry.
            records.append({"character": text, "line": ""})
        elif records:
            records[-1]["line"] += text + " "

    return pd.DataFrame(records, columns=["character","line"])

## The Avengers

In [None]:
# Read the raw script text with newline as the separator, so that each text
# line becomes one row of a single "line" column (the file has no header row).
# NOTE(review): newer pandas releases appear to reject sep="\n" in read_csv -- confirm against the installed version.
avengers = pd.read_csv("./data/script txts/avengers-script-slug.txt",sep="\n",header=None,names=["line"])