In [1]:
import pandas as pd

In [3]:
friends_transcripts = pd.read_csv("friends_transcripts.csv")

In [10]:
# Function to transform episode dialogues into a conversation-like format with improved handling
def format_as_conversation(text):
    # Fix potential encoding issues
    text = text.encode('utf-8', 'ignore').decode('utf-8')
    
    # Split into lines and define a flexible speaker regex pattern
    lines = text.splitlines()
    speaker_pattern = re.compile(r'^\s*([A-Za-z\s\-]+):\s*(.*)')  # Allow multi-word names and hyphens
    
    conversation = []
    for line in lines:
        line = line.strip()
        match = speaker_pattern.match(line)
        if match:
            speaker, dialogue = match.groups()
            conversation.append(f"{speaker.strip()}: {dialogue.strip()}")
        else:
            # Handle lines without a speaker as "Narrator" or skip if empty
            if line:  # Only add if the line is not empty
                conversation.append(f"Narrator: {line}")
    
    # Join the lines into a single formatted string
    return " ".join(conversation)

# Apply the function to format the dialogues in 'quote' column
friends_transcripts['formatted_conversation'] = friends_transcripts['author'] + ": " + friends_transcripts['quote']
formatted_data = friends_transcripts.groupby(['season', 'episode_number'])['formatted_conversation'].apply(' '.join).reset_index()

In [11]:
formatted_data.head(32)

Unnamed: 0,season,episode_number,formatted_conversation
0,1.0,1.0,Monica: There's nothing to tell! He's just som...
1,1.0,2.0,"Monica: What you guys don't understand is, for..."
2,1.0,3.0,"Phoebe: (entering) Hi guys! All: Hey, Pheebs! ..."
3,1.0,4.0,"Monica: Alright. Phoebe? Phoebe: Okay, okay. I..."
4,1.0,5.0,Monica: Would you let it go? It's not that big...
5,1.0,6.0,Rachel: (reading the program) Ooh! Look! Look!...
6,1.0,7.0,"Rachel: Everybody? Shh, shhh. Uhhh... Central ..."
7,1.0,8.0,Chandler: Dehydrated Japanese noodles under fl...
8,1.0,9.0,"Rachel: Terry, I, I, I know that I haven't wor..."
9,1.0,10.0,Ross: Guys? There's a somebody I'd like you to...


In [8]:
formatted_data.to_csv("inputdata.csv",index=False)

In [2]:
# Load the data
file_path = 'model input.csv'
friends_transcripts = pd.read_csv(file_path)

# Display the first few rows of the data to understand its structure
friends_transcripts

Unnamed: 0,season,episode_number,formatted_line
0,1.0,1.0,Monica: There's nothing to tell! He's just som...
1,1.0,2.0,"Monica: What you guys don't understand is, for..."
2,1.0,3.0,"Phoebe: (entering) Hi guys! All: Hey, Pheebs! ..."
3,1.0,4.0,"Monica: Alright. Phoebe? Phoebe: Okay, okay. I..."
4,1.0,5.0,Monica: Would you let it go? It's not that big...
...,...,...,...
221,10.0,13.0,Phoebe: Hi All: Hey! Hi! Rachel: How was the h...
222,10.0,14.0,Mike: (raising his glass) Thank you guys for h...
223,10.0,15.0,Jennifer: Previously on Friends... Chandler: H...
224,10.0,16.0,"Joey: All right, all right, all right, let's p..."


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and model for summarization
tokenizer = AutoTokenizer.from_pretrained("lidiya/bart-large-xsum-samsum")
model = AutoModelForSeq2SeqLM.from_pretrained("lidiya/bart-large-xsum-samsum")

# Function to segment long dialogues
def segment_text(text, max_length=1024):
    words = text.split()
    segments = []
    current_segment = []

    for word in words:
        current_segment.append(word)
        if len(" ".join(current_segment)) > max_length:
            segments.append(" ".join(current_segment))
            current_segment = []
    
    if current_segment:  # Add the last segment if there is any leftover
        segments.append(" ".join(current_segment))
    
    return segments

# Function to summarize text
def summarize_text(text):
    # Tokenize and prepare the input
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    # Generate the summary
    summary_ids = model.generate(
        inputs,
        max_length=150,  # Adjust maximum output length if needed
        min_length=40,   # Adjust minimum output length if needed
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Apply segmentation and summarization to each concatenated dialogue
def summarize_dialogues(text):
    segments = segment_text(text)
    summaries = [summarize_text(segment) for segment in segments]
    # Combine the summaries of each segment
    return " ".join(summaries)

# Apply the function to each row in the DataFrame
friends_transcripts['summary'] = friends_transcripts['formatted_line'].apply(summarize_dialogues)

# Display the updated DataFrame with summaries
friends_transcripts[['season', 'episode_number', 'summary']].head()


Unnamed: 0,season,episode_number,summary
0,1.0,1.0,Monica is going out on a date with her workmat...
1,1.0,2.0,"Monica, Rachel, Phoebe, Chandler and Ross are ..."
2,1.0,3.0,Phoebe had a date. He walked her to the subway...
3,1.0,4.0,"It's Ross' birthday today. Monica, Phoebe, Cha..."
4,1.0,5.0,"Chandler, Monica, Rachel, Phoebe and Tony are ..."


In [19]:
friends_transcripts.to_csv("summaries.csv",index=False)