In [183]:
import pandas as pd
import re
import glob

In [184]:
def clean_dialogue(text):
    """Strip stage directions and transcript boilerplate from one dialogue chunk.

    Parameters
    ----------
    text : str
        Raw text attributed to a single speaker; may span several lines.

    Returns
    -------
    str
        The text with ``[...]`` / ``(...)`` spans, ``CUT TO <word>`` and
        ``Page 1 of <word>`` markers, forum boilerplate phrases and stand-alone
        ``MISS`` / ``MRS`` tokens removed, every run of whitespace collapsed
        to a single space, and no leading or trailing whitespace.
    """
    # Remove text within square brackets or parentheses; DOTALL lets a span
    # cross line breaks.
    text = re.sub(r'\[.*?\]', '', text, flags=re.DOTALL)
    text = re.sub(r'\(.*?\)', '', text, flags=re.DOTALL)

    # Collapse whitespace first so the multi-word phrases below still match
    # when the source wrapped them across lines.
    text = re.sub(r'\s+', ' ', text).strip()

    # Drop "CUT TO ..." and "Page 1 of ..." together with the single word
    # that follows them.
    text = re.sub(r'(CUT TO|Page 1 of)\s+\w+', '', text)

    # Boilerplate phrases that are deleted wherever they occur (case-sensitive).
    boilerplate_phrases = (
        "THE END",
        "All times are UTC-05:00",
        "Powered by phpBB® Forum Software © phpBB Limited",
    )
    for phrase in boilerplate_phrases:
        text = text.replace(phrase, '')

    # Delete MISS / MRS only as whole words: a plain substring replace would
    # also eat the middle of words such as "MISSION" or "DISMISSED".
    # Presumably these are leftover titles from speaker labels like
    # "MISS <NAME>:" -- TODO confirm against the transcripts.
    text = re.sub(r'\b(?:MISS|MRS)\b', '', text)

    # The deletions above can leave doubled or trailing spaces behind.
    return re.sub(r'\s+', ' ', text).strip()


In [185]:

def parse_script(file_path):
    """Parse one transcript file into a Character / Dialogue / Season frame.

    The file is read as UTF-8 and split on speaker labels, i.e. runs of
    uppercase ASCII letters followed by a colon (``LUKE:``). The text after
    each label, up to the next label, is passed through ``clean_dialogue``
    and becomes one row.

    Parameters
    ----------
    file_path : str or path-like
        Location of the transcript. The season is the two digits in front of
        the ``x`` in the first ``NNxNN`` pattern found in this path
        (``01x01`` -> 1); ``None`` when the path has no such pattern.

    Returns
    -------
    pandas.DataFrame
        Columns ``Character``, ``Dialogue`` and ``Season``. When the file
        does not exist, an error is printed and an empty frame with those
        three columns is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return pd.DataFrame(columns=['Character', 'Dialogue', 'Season'])

    # Expand the one-letter speaker abbreviations. The \b anchor restricts
    # this to a stand-alone "L:" / "R:"; a plain str.replace would also
    # rewrite the tail of labels such as "MICHEL:" or "TAYLOR:".
    content = re.sub(r'\bL:', 'LORELAI:', content)
    content = re.sub(r'\bR:', 'RORY:', content)

    # Extract the season using a regular expression pattern (e.g. '01x01').
    season_match = re.search(r'(\d{2})x\d{2}', str(file_path))
    season = int(season_match.group(1)) if season_match else None

    # Whitespace in front of labels is deliberately left alone: deleting it
    # would fuse the preceding word onto the label ("OK\nLUKE:" -> "OKLUKE:")
    # and onto any "word:" inside a line. clean_dialogue trims each line.
    #
    # With a capturing group, re.split returns
    #   [preamble, label_1, text_1, label_2, text_2, ...]
    # so labels sit at the odd indices and their text right after them.
    # Anything before the first label (the preamble) is ignored.
    parts = re.split(r'([A-Z]+:)', content)

    characters = []
    dialogues = []
    for label, raw_text in zip(parts[1::2], parts[2::2]):
        # Keep only the text in front of the literal marker, if present.
        raw_text = raw_text.split('\\[m//n')[0]

        # A label with nothing but whitespace after it carries no dialogue.
        if not raw_text.strip():
            continue

        characters.append(label.strip(':'))
        dialogues.append(clean_dialogue(raw_text))

    # Create a DataFrame and add the 'Season' column with the extracted value.
    df = pd.DataFrame({
        'Character': characters,
        'Dialogue': dialogues
    })
    df['Season'] = season
    return df

In [187]:
# Parse the 01x01 transcript into a Character / Dialogue / Season frame.
# NOTE(review): the path is absolute and specific to one machine. When the
# file is missing, parse_script prints an error and returns an empty frame,
# so script_df will be empty elsewhere -- consider a configurable data dir.
script_df = parse_script("C:/Users/asus/Desktop/Gilmore_girls_sentiment_analysis/TXT/01x01.txt")

In [None]:
# Spot-check the cleaned dialogue stored at positional row 630.
script_df['Dialogue'].iloc[630]

"I'm not embarrassed."

In [None]:
# Display the parsed frame.
# NOTE(review): the saved output under this cell has an "Unnamed: 0" column,
# rows for several seasons and "https" as a character, none of which a single
# parse_script call on one file appears able to produce -- presumably stale
# output from another run; re-run the notebook to confirm.
script_df

Unnamed: 0,Character,Dialogue,Season
0,https,//transcripts.foreverdreaming.org/ 01x01 - The...,1
1,https,//transcripts.foreverdreaming.org/viewtopic.ph...,1
2,LORELAI,"Please, Luke. Please, please, please.",1
3,LUKE,How many cups have you had this morning?,1
4,LORELAI,None.,1
...,...,...,...
51654,SOOKIE,"Okay, see you there.",3
51655,LORELAI,"Wait, wait. Look around for a second. Notice?",3
51656,RORY,Notice what?,3
51657,LORELAI,It's not so scary anymore.,3
