In [32]:
import pandas as pd
import re
import glob
from tqdm import tqdm
import os

In [33]:
def clean_dialogue(text):

    # Remove text within square brackets or paranthesis (even if they span multiple lines)
    text = re.sub(r'\[.*?\]', '', text, flags=re.DOTALL)
    text = re.sub(r'\(.*?\)', '', text, flags=re.DOTALL)

    # Normalize spaces (convert multiple spaces/newlines into a single space)
    text = re.sub(r'\s+', ' ', text).strip()

    # List of phrases that indicate a dialogue should be removed (case-sensitive)
    phrases_to_remove = [
        "THE END",
        "All times are UTC-05:00",
        "Powered by phpBB® Forum Software © phpBB Limited",
        "MISS",
        "MRS",
    ]

    # Remove specific phrases (case-sensitive)
    for phrase in phrases_to_remove:
        text = text.replace(phrase, '')  # Removing the exact phrase (case-sensitive)
        return text.strip()


In [34]:
def clean_cut_to_and_everything_after(text):
    if pd.isna(text):
        return text
    # This regex finds "CUT TO" and removes it and everything after
    return re.sub(r'(?i)\s*cut\s+to.*$', '', text).strip()

In [35]:
def parse_script(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Extract the season using a regular expression pattern (e.g., '01x01' or '02x22')
        season_match = re.search(r'(\d{2})x\d{2}', file_path)
        season = int(season_match.group(1)) if season_match else None

        episode_match = re.search(r'\d{2}x(\d+)', file_path)
        episode = int(episode_match.group(1)) if episode_match else None

        # Replace abbreviated character names, case-insensitive
        content = re.sub(r'\bGRANDMOTHER\s*:', 'EMILY:', content, flags=re.IGNORECASE)
        content = re.sub(r'\bL\s*:', 'LORELAI:', content, flags=re.IGNORECASE)
        content = re.sub(r'\bR\s*:', 'RORY:', content, flags=re.IGNORECASE)

        # Clean extra spaces or newlines before and after character names and dialogue
        content = re.sub(r'\s*(\w+)\s*:', r'\1:', content)  # Remove spaces before character names
        
        # Split the content by character names (in uppercase followed by a colon)
        parts = re.split(r'([A-Za-z]+:)', content)

        characters = []
        dialogues = []
        current_character = ''
        current_dialogue = ''

        for part in parts:
            if re.match(r'[A-Za-z]+:', part):
                # If we've found a new character, save the previous dialogue (if any)
                if current_character and current_dialogue:
                    # Clean the dialogue text to remove the undesired phrases
                    cleaned_dialogue = clean_dialogue(current_dialogue)
                    characters.append(current_character)
                    dialogues.append(cleaned_dialogue)
                
                # Set the new current character and reset current dialogue
                current_character = part.strip(':').upper()  # Convert to uppercase for consistency
                current_dialogue = ''
            else:
                # Add to the current dialogue, stopping at the marker if present
                dialogue_part = part.split('\\[m//n')[0]
                current_dialogue += dialogue_part.strip()

        # Add the last character-dialogue pair if present
        if current_character and current_dialogue:
            cleaned_dialogue = clean_dialogue(current_dialogue)
            characters.append(current_character)
            dialogues.append(cleaned_dialogue)

        # Create a DataFrame and add the 'Season' column with the extracted season value
        df = pd.DataFrame({
            'Character': characters,
            'Dialogue': dialogues
        })
        df['Season'] = season
        df['Episode'] = episode
        df['Dialogue'] = df['Dialogue'].apply(clean_cut_to_and_everything_after)
        return df
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return pd.DataFrame(columns=['Character', 'Dialogue', 'Season'])

In [36]:
# Print current working directory
print("Current working directory:", os.getcwd())


Current working directory: c:\Users\asus\Desktop\Gilmore_girls_sentiment_analysis\preprocess


In [37]:
folder_path = r"C:\Users\asus\Desktop\Gilmore_girls_sentiment_analysis\TXT"
# Use glob to list all .txt files in that folder
file_paths = glob.glob(os.path.join(folder_path, "*.txt"))

# Parse each file and collect the resulting DataFrames in a list
dataframes = [parse_script(fp) for fp in file_paths]
print(dataframes)

# Concatenate all individual DataFrames into one, resetting the index
script_df = pd.concat(dataframes, ignore_index=True)

[      Character                                           Dialogue  Season  \
0         HTTPS     //transcripts.foreverdreaming.org/ 01x01 - The       1   
1    PILOTHTTPS  //transcripts.foreverdreaming.org/viewtopic.ph...       1   
2       LORELAI              Please, Luke. Please, please, please.       1   
3          LUKE           How many cups have you had this morning?       1   
4       LORELAI                                              None.       1   
..          ...                                                ...     ...   
628        RORY                                     Check, please.       1   
629     LORELAI  No, really, are you embarrassed to bring him h...       1   
630        RORY                               I'm not embarrassed.       1   
631     LORELAI                               Does he talk at all?       1   
632        RORY  No, Mom, he's a mime.  Powered by phpBB® Forum...       1   

     Episode  
0          1  
1          1  
2          1  
3 

In [39]:
script_df.to_csv('script.csv', index=False)