In [1]:
import pandas as pd
# Lift pandas' display limits on row count and column width: a value of None
# disables the corresponding truncation when frames are rendered.
for option_name in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(option_name, None)

## Script to CSV Process

1. Load the script `.txt` file. If the script is only available as a `.pdf`, convert it to `.txt` first with [PDF Extractor](https://pdfextractor.com/).
2. Remove garbage rows
3. Inspect rows removed
4. Format lines
5. Create `uppercase_unique_form.csv`
6. Manually fill out `uppercase_unique_form.csv` columns `is_character_name` and `is_dialogue` in Excel or other `.csv` editor.
7. Load `uppercase_unique.csv` and filter out rows
8. Inspect rows removed
9. Merge lines into best guesses for columns "character" and "line"
10. Manually clean the "line" column by removing text that is not dialogue. Expect a lot of non-dialogue text: nearly every cell will need cleaning!


See the Guardians of the Galaxy script at the end of this notebook for an example of this process.

## Remove Garbage Rows

In [2]:
def remove_regex_rows(df,string):
    """Split ``df`` into the rows whose "line" text does not / does match a regex.

    Prints how many rows matched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "line" column of text.
    string : str
        Regular expression, searched case-sensitively within each "line" value.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(kept, removed)``: the rows that do not match, then the rows that do.
    """
    # na=False: a missing "line" value counts as "no match" and the row is kept.
    # Without it the mask would contain NaN for such rows and could not be
    # negated or used for boolean indexing.
    rows_to_remove = df["line"].str.contains(string,case=True,regex=True,na=False)
    print("Removed",rows_to_remove.sum(),"rows that match regex \"" + string + "\"")
    
    return df[~rows_to_remove], df[rows_to_remove]

def remove_whitespace_rows(df):
    """Split ``df`` into rows whose "line" text is / is not whitespace only.

    Prints how many whitespace-only rows were found.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "line" column of text.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(kept, removed)``: rows that are not whitespace-only, then the
        whitespace-only rows.
    """
    # .eq(True) forces a strict boolean mask: a missing "line" value does not
    # produce True/False from .str.isspace(), and such a mask could not be
    # negated or used for indexing.  Rows with a missing value are kept.
    whitespace_rows = df["line"].str.isspace().eq(True)
    print(whitespace_rows.sum(),"whitespace rows removed")
    return df[~whitespace_rows], df[whitespace_rows]

def remove_movie_text_rows(df):
    """Remove rows whose whole "line" text is a single bracketed span, e.g. ``[text]``.

    Each pattern in the internal list is applied in turn through
    ``remove_regex_rows``, which also prints a per-pattern count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "line" column of text.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(kept, removed)``: the remaining rows, then all rows removed by any
        of the patterns.
    """
    # Raw string: "\[" and "\]" are regex escapes, not valid Python string escapes.
    regex_strings = [r"^\[[^\[\]]*\]$"]
    
    removed_parts = []
    
    for regex_string in regex_strings:
        df, new_rows_to_remove = remove_regex_rows(df,regex_string)
        removed_parts.append(new_rows_to_remove)
    
    # Concatenate once at the end instead of growing a frame inside the loop.
    removed_rows = pd.concat(removed_parts) if removed_parts else pd.DataFrame()
    
    return df, removed_rows


def remove_page_number_rows(df):
    """Split ``df`` into rows whose "line" text is / is not purely numeric.

    Purely numeric rows are treated as page numbers.  Prints how many were found.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "line" column of text.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(kept, removed)``: rows that are not purely numeric, then the
        purely numeric rows.
    """
    # .eq(True) forces a strict boolean mask: a missing "line" value does not
    # produce True/False from .str.isnumeric(), and such a mask could not be
    # negated or used for indexing.  Rows with a missing value are kept.
    page_rows = df["line"].str.isnumeric().eq(True)
    print(page_rows.sum(),"page number rows removed")
    
    return df[~page_rows], df[page_rows]

def remove_garbage_rows(df):
    """Run every garbage filter over ``df`` and collect what each one removed.

    The filters run in this order: whitespace-only rows, page-number rows,
    bracketed movie-text rows.  Each filter prints its own count; a separator
    line and the overall total are printed afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "line" column of text.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(kept, garbage_rows)``.  ``garbage_rows`` stacks the removed rows of
        all filters, with an outer index level naming the filter that removed
        each row ("whitespace", "page_numbers", "movie_text").
    """
    filter_steps = (remove_whitespace_rows, remove_page_number_rows, remove_movie_text_rows)
    
    removed_parts = []
    for filter_step in filter_steps:
        # each step returns the surviving rows plus the rows it dropped
        df, dropped = filter_step(df)
        removed_parts.append(dropped)
    
    garbage_rows = pd.concat(removed_parts, keys=["whitespace","page_numbers","movie_text"])
    
    print("-----------------------------------------")
    print(len(garbage_rows),"total rows removed\n")
    
    return df, garbage_rows

## Format Character Lines

In [3]:
def remove_regex(df,string):
    """Delete every match of a regex from the "line" column of ``df``, in place.

    Prints the total number of matches found before deleting them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "line" column of text; its "line" column is overwritten.
    string : str
        Regular expression (case-sensitive) whose matches are replaced by "".

    Returns
    -------
    None
    """
    lines = df["line"]
    match_total = lines.str.count(string).sum()
    print("Removed",match_total,"occurences of regex \"" + string + "\"")
    df["line"] = lines.str.replace(string,"",case=True,regex=True)

def remove_leading_trailing_whitespace(df):
    """Strip leading and trailing spaces/tabs from the "line" column of ``df``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "line" column of text; its "line" column is overwritten.

    Returns
    -------
    None
    """
    # regex=True must be explicit: pandas 2.0 changed the default of
    # Series.str.replace to regex=False, which would treat these patterns as
    # literal text and strip nothing.
    df["line"] = df["line"].str.replace("[ \t]+$","",regex=True)
    df["line"] = df["line"].str.replace("^[ \t]+","",regex=True)

def format_lines(df):
    """Clean the "line" column of ``df`` in place.

    Steps: strip leading/trailing spaces and tabs, delete every match of each
    pattern in the list below (``remove_regex`` prints a count per pattern),
    print an empty line, then strip leading/trailing spaces and tabs again.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "line" column of text; its "line" column is overwritten.

    Returns
    -------
    None
    """
    remove_leading_trailing_whitespace(df)
    
    # Raw strings: "\[", "\(" and "\." are regex escapes, not valid Python
    # string escapes.
    # The open-ended patterns use [^)]* instead of .* so a match ends at the
    # first closing parenthesis; a greedy .* would run to the LAST ")" on the
    # line and delete any text in between.
    regex_strings = [r"\[[^\[\]]*\]",
                     r"\(O\.S\)",
                     r"\(V\.O\)",
                     r"\(V\.O\.\)",
                     r"\(CONT.D\)",
                     r"\(cont.d\)",
                     r"\(O\.S\.\)",
                     r"\(ON[^)]*\)",
                     r"\(ON TV\)",
                     r"\(OVER[^)]*\)",
                     r"\(INTO[^)]*\)",
                     r"\(HOLO\)",
                     r"\(ADR\)",
                     r"\(then\)",
                     r" \(RHODEY’S VOICE\)",
                     r" \(PRE-LAP\)"]
    
    for regex_string in regex_strings:
        remove_regex(df, regex_string)
    
    print("")
    
    # Deleting matches can leave new whitespace at either end of a line.
    remove_leading_trailing_whitespace(df)

## Create Character / Line Columns

In [4]:
def create_character_line_columns(df):
    """Split each "line" value at a colon into "character" and "line" columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "line" column of text.

    Returns
    -------
    pandas.DataFrame
        Columns "character" (text before the first colon, upper-cased) and
        "line" (text after that colon, with spaces/tabs directly after the
        colon removed).  A value is missing where its pattern finds no match,
        e.g. when the text contains no colon.
    """
    raw_text = df["line"]
    
    # text from the start through the first colon; then drop that colon and upper-case
    speaker = (raw_text.str.extract(r'(^[^:]*:)')[0]
               .str.replace(":$","",case=True,regex=True)
               .str.upper())
    
    # text from a colon through the end; then drop that colon and the spaces/tabs around it
    spoken = (raw_text.str.extract(r'(:.*$)')[0]
              .str.replace("^[ \t]*:[ \t]*","",case=True,regex=True))
    
    return pd.DataFrame({"character": speaker, "line": spoken})

## Iron Man 2

In [5]:
# Read the raw script with one row per line of text.
# `pd.read_csv(..., sep="\n")` is rejected by current pandas (the line
# terminator may not be used as the separator), so the file is read directly.
# The text is taken verbatim: no CSV quote handling and no conversion of
# values such as "NA" or "None" to missing values.
with open("./script txts/iron_man_2.txt", encoding="utf-8") as script_file:
    script_lines = [raw_line.rstrip("\n") for raw_line in script_file]

# Drop empty lines, as read_csv's default skip_blank_lines=True did.
iron_man_2 = pd.DataFrame({"line": [text for text in script_lines if text]})

print(iron_man_2.shape)

iron_man_2, garbage_rows = remove_garbage_rows(iron_man_2)

# format_lines overwrites the "line" column in place; take an independent copy
# so the edit does not target a filtered slice of the original frame.
iron_man_2 = iron_man_2.copy()
format_lines(iron_man_2)

iron_man_2 = create_character_line_columns(iron_man_2)

iron_man_2.to_csv("./uncleaned/iron_man_2_uncleaned.csv", index=False)

# From here on `iron_man_2` holds the contents of ./cleaned/iron_man_2.csv
# (presumably the manually cleaned version of the file written above -- confirm).
iron_man_2 = pd.read_csv("./cleaned/iron_man_2.csv")

iron_man_2.head()

#iron_man_2[iron_man_2["line"].str.contains("[^a-zA-Z\d\.,\?! '’\"-…]", regex=True)]
#iron_man_2[iron_man_2["line"].isnull()]

(1117, 1)
0 whitespace rows removed
0 page number rows removed
Removed 67 rows that match regex "^\[[^\[\]]*\]$"
-----------------------------------------
67 total rows removed

Removed 224 occurences of regex "\[[^\[\]]*\]"
Removed 0 occurences of regex "\(O\.S\)"
Removed 1 occurences of regex "\(V\.O\)"
Removed 5 occurences of regex "\(V\.O\.\)"
Removed 0 occurences of regex "\(CONT.D\)"
Removed 0 occurences of regex "\(cont.d\)"
Removed 0 occurences of regex "\(O\.S\.\)"
Removed 0 occurences of regex "\(ON.*\)"
Removed 0 occurences of regex "\(ON TV\)"
Removed 0 occurences of regex "\(OVER.*\)"
Removed 0 occurences of regex "\(INTO.*\)"
Removed 0 occurences of regex "\(HOLO\)"
Removed 0 occurences of regex "\(ADR\)"
Removed 0 occurences of regex "\(then\)"
Removed 0 occurences of regex " \(RHODEY’S VOICE\)"
Removed 0 occurences of regex " \(PRE-LAP\)"



Unnamed: 0,character,line
0,TONY,Been a while since I was up here in front of you. Maybe I’ll do us all a favour and just stick to the cards. There’s been some speculation that I was somehow involved in the events that occurred on the freeway and on the rooftop…
1,CHRISTINE EVERHART,"Sorry, Mr Stark, do you honestly expect us to believe that that was a bodyguard in a suit that convinently appeared despite the fact that you sorely despise bodyguards?"
2,TONY,Yes
3,CHRISTINE,And this mysterious bodyguard was somehow equipped with an undisclosed Stark high-tech powered battle…
4,TONY,"I know that it’s confusing. It is one thing to question the official story and another thing entirely to make wild accusations, or insinuate that I’m a superhero."
