In [1]:
import pandas as pd
import re
import os

directory = r'D:\github\garage\03_pulp_fiction\data\\'

In [2]:
def parse_srt(file_path):
    with open(file_path, 'r') as file:
        content = file.read()

    # Splitting the file into blocks
    blocks = re.split(r'\n\n', content)

    # Parsing each block
    data = []
    for block in blocks:
        parts = block.split('\n')
        if len(parts) >= 3:
            # Extracting the start and end times
            times = parts[1].split(' --> ')
            start_time = times[0].strip()
            end_time = times[1].strip()

            # Combining the text lines
            text = ' '.join(parts[2:])

            # Adding to the data list
            data.append([start_time, end_time, text])
    
    return pd.DataFrame(data, columns=['Start Time', 'End Time', 'Subtitle'])


file_name = 'pulp_fiction_subtitles.srt'
subtitles_df = parse_srt(directory + file_name)
subtitles_df.head()

Unnamed: 0,Start Time,End Time,Subtitle
0,"00:00:28,340","00:00:32,428",Forget it. It's too risky. I'm through doing t...
1,"00:00:32,508","00:00:35,966",You always say that. The same thing every time.
2,"00:00:36,038","00:00:37,977","""I'm through, never again, too dangerous."""
3,"00:00:38,048","00:00:40,537",I know that's what I always say. I'm always ri...
4,"00:00:40,617","00:00:44,015",- You forget about it in a day or two. - The d...


In [3]:
subtitles_df['Subtitle'] = subtitles_df['Subtitle'].str.strip('- ')

subtitles_df.head(10)

Unnamed: 0,Start Time,End Time,Subtitle
0,"00:00:28,340","00:00:32,428",Forget it. It's too risky. I'm through doing t...
1,"00:00:32,508","00:00:35,966",You always say that. The same thing every time.
2,"00:00:36,038","00:00:37,977","""I'm through, never again, too dangerous."""
3,"00:00:38,048","00:00:40,537",I know that's what I always say. I'm always ri...
4,"00:00:40,617","00:00:44,015",You forget about it in a day or two. - The day...
5,"00:00:44,087","00:00:46,176",The days of me remembering have just begun.
6,"00:00:46,248","00:00:48,417","You know, when you go on like this, what you s..."
7,"00:00:48,488","00:00:51,516",I sound like a sensible fuckin' man. - You sou...
8,"00:00:51,588","00:00:55,046","Quack, quack, quack. - Take heart, 'cause you'..."
9,"00:00:55,127","00:00:58,615","Since I'm never gonna do it again, you're neve..."


In [4]:
subtitles_df['Subtitle'] = subtitles_df['Subtitle'].str.split(' - ')

subtitles_df = subtitles_df.explode('Subtitle')

subtitles_df['Subtitle'] = subtitles_df['Subtitle'].str.strip()

subtitles_df.head(10)

Unnamed: 0,Start Time,End Time,Subtitle
0,"00:00:28,340","00:00:32,428",Forget it. It's too risky. I'm through doing t...
1,"00:00:32,508","00:00:35,966",You always say that. The same thing every time.
2,"00:00:36,038","00:00:37,977","""I'm through, never again, too dangerous."""
3,"00:00:38,048","00:00:40,537",I know that's what I always say. I'm always ri...
4,"00:00:40,617","00:00:44,015",You forget about it in a day or two.
4,"00:00:40,617","00:00:44,015",The days of me forgetting are over.
5,"00:00:44,087","00:00:46,176",The days of me remembering have just begun.
6,"00:00:46,248","00:00:48,417","You know, when you go on like this, what you s..."
7,"00:00:48,488","00:00:51,516",I sound like a sensible fuckin' man.
7,"00:00:48,488","00:00:51,516",You sound like a duck.


In [5]:
subtitles_df[subtitles_df['Subtitle'] == '']

Unnamed: 0,Start Time,End Time,Subtitle


In [6]:
def count_occurrences(word, sentence):
    # Count occurrences of the word in the sentence, case insensitive
    return sentence.lower().count(word.lower())

# Adding a new column 'Fuck Count' to the DataFrame
subtitles_df['Fuck Count'] = subtitles_df['Subtitle'].apply(lambda x: count_occurrences('fuck', x))

# Displaying the DataFrame with the new column
subtitles_df.head()

Unnamed: 0,Start Time,End Time,Subtitle,Fuck Count
0,"00:00:28,340","00:00:32,428",Forget it. It's too risky. I'm through doing t...,0
1,"00:00:32,508","00:00:35,966",You always say that. The same thing every time.,0
2,"00:00:36,038","00:00:37,977","""I'm through, never again, too dangerous.""",0
3,"00:00:38,048","00:00:40,537",I know that's what I always say. I'm always ri...,0
4,"00:00:40,617","00:00:44,015",You forget about it in a day or two.,0


In [7]:
subtitles_df['Fuck Count'].sum()

255

In [8]:
file_name = 'pulp_fiction_dialogue.csv'
script_df = pd.read_csv(directory + file_name)
script_df.head()

Unnamed: 0,Line number,Character (in script),Character (actual),Off screen,Voice-over,Place,Time,Line,Word count
0,1,Young man,Pumpkin,False,False,int. coffee shop,morning,"No, forget it, it's too risky. I'm through doi...",11
1,2,Young woman,Honey Bunny,False,False,int. coffee shop,morning,"You always say that, the same thing every time...",15
2,3,Young man,Pumpkin,False,False,int. coffee shop,morning,I know that's what I always say. I'm always ri...,12
3,4,Young woman,Honey Bunny,False,False,int. coffee shop,morning,– but you forget about it in a day or two -,10
4,5,Young man,Pumpkin,False,False,int. coffee shop,morning,"– yeah, well, the days of me forgittin' are ov...",18


In [9]:
script_df['Fuck Count'] = script_df['Line'].apply(lambda x: count_occurrences('fuck', x))
script_df.head()

Unnamed: 0,Line number,Character (in script),Character (actual),Off screen,Voice-over,Place,Time,Line,Word count,Fuck Count
0,1,Young man,Pumpkin,False,False,int. coffee shop,morning,"No, forget it, it's too risky. I'm through doi...",11,0
1,2,Young woman,Honey Bunny,False,False,int. coffee shop,morning,"You always say that, the same thing every time...",15,0
2,3,Young man,Pumpkin,False,False,int. coffee shop,morning,I know that's what I always say. I'm always ri...,12,0
3,4,Young woman,Honey Bunny,False,False,int. coffee shop,morning,– but you forget about it in a day or two -,10,0
4,5,Young man,Pumpkin,False,False,int. coffee shop,morning,"– yeah, well, the days of me forgittin' are ov...",18,0


In [10]:
script_df['Fuck Count'].sum()

166

In [11]:
file_name = 'tarantino.csv'
tarantino = pd.read_csv(directory + file_name)

tarantino = tarantino[(tarantino['movie'] == 'Pulp Fiction') & (tarantino['type'] == 'word')]
tarantino['Fuck Count 2'] = tarantino['word'].apply(lambda x: count_occurrences('fuck', x))

tarantino.head()

Unnamed: 0,movie,type,word,minutes_in,Fuck Count 2
431,Pulp Fiction,word,shit,0.52,0
432,Pulp Fiction,word,fucking,0.82,1
433,Pulp Fiction,word,fuck,1.37,1
434,Pulp Fiction,word,fucking,1.62,1
435,Pulp Fiction,word,fucking,1.73,1


In [12]:
tarantino['Fuck Count 2'].sum()

267

In [13]:
subtitles_df

Unnamed: 0,Start Time,End Time,Subtitle,Fuck Count
0,"00:00:28,340","00:00:32,428",Forget it. It's too risky. I'm through doing t...,0
1,"00:00:32,508","00:00:35,966",You always say that. The same thing every time.,0
2,"00:00:36,038","00:00:37,977","""I'm through, never again, too dangerous.""",0
3,"00:00:38,048","00:00:40,537",I know that's what I always say. I'm always ri...,0
4,"00:00:40,617","00:00:44,015",You forget about it in a day or two.,0
...,...,...,...,...
2023,"02:28:26,391","02:28:29,320",I'm tryin' real hard...,0
2024,"02:28:29,391","02:28:31,660",to be the shepherd.,0
2025,"02:28:46,980","02:28:48,909",Go.,0
2026,"02:29:06,598","02:29:09,027",I think we should be leaving now.,0


In [14]:
def get_seconds(df, column):
    temp_df = df[column].str.split(':', expand=True)
    temp_df.columns = ['hours', 'minutes', 'seconds']
    return temp_df['hours'].astype(float) * 3600 + temp_df['minutes'].astype(float) * 60 + temp_df['seconds'].str.replace(',', '.').astype(float)


subtitles_df['start_seconds'] = get_seconds(subtitles_df, 'Start Time')
subtitles_df['end_seconds'] = get_seconds(subtitles_df, 'End Time')

subtitles_df.head()

Unnamed: 0,Start Time,End Time,Subtitle,Fuck Count,start_seconds,end_seconds
0,"00:00:28,340","00:00:32,428",Forget it. It's too risky. I'm through doing t...,0,28.34,32.428
1,"00:00:32,508","00:00:35,966",You always say that. The same thing every time.,0,32.508,35.966
2,"00:00:36,038","00:00:37,977","""I'm through, never again, too dangerous.""",0,36.038,37.977
3,"00:00:38,048","00:00:40,537",I know that's what I always say. I'm always ri...,0,38.048,40.537
4,"00:00:40,617","00:00:44,015",You forget about it in a day or two.,0,40.617,44.015


In [15]:
tarantino['seconds_in'] = tarantino['minutes_in'] * 60
tarantino = tarantino.loc[tarantino['Fuck Count 2'] > 0, ['seconds_in', 'Fuck Count 2']]

tarantino.head()

Unnamed: 0,seconds_in,Fuck Count 2
432,49.2,1
433,82.2,1
434,97.2,1
435,103.8,1
436,106.2,1


In [16]:
subtitles_df['index'] = 1
subtitles_df = subtitles_df.set_index('index')

tarantino['index'] = 1
tarantino = tarantino.set_index('index')

subtitles_df = subtitles_df.join(tarantino, how='outer')
subtitles_df.head()

Unnamed: 0_level_0,Start Time,End Time,Subtitle,Fuck Count,start_seconds,end_seconds,seconds_in,Fuck Count 2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,"00:00:28,340","00:00:32,428",Forget it. It's too risky. I'm through doing t...,0,28.34,32.428,49.2,1
1,"00:00:28,340","00:00:32,428",Forget it. It's too risky. I'm through doing t...,0,28.34,32.428,82.2,1
1,"00:00:28,340","00:00:32,428",Forget it. It's too risky. I'm through doing t...,0,28.34,32.428,97.2,1
1,"00:00:28,340","00:00:32,428",Forget it. It's too risky. I'm through doing t...,0,28.34,32.428,103.8,1
1,"00:00:28,340","00:00:32,428",Forget it. It's too risky. I'm through doing t...,0,28.34,32.428,106.2,1


In [17]:
filt = subtitles_df['seconds_in'].between(subtitles_df['start_seconds'], subtitles_df['end_seconds'])
subtitles_df.loc[~filt, 'Fuck Count 2'] = 0

subtitles_df = subtitles_df.groupby(['Start Time', 'End Time', 'Subtitle', 'Fuck Count', 'start_seconds', 
                                     'end_seconds'])['Fuck Count 2'].sum().reset_index()

subtitles_df.head(10)

Unnamed: 0,Start Time,End Time,Subtitle,Fuck Count,start_seconds,end_seconds,Fuck Count 2
0,"00:00:28,340","00:00:32,428",Forget it. It's too risky. I'm through doing t...,0,28.34,32.428,0
1,"00:00:32,508","00:00:35,966",You always say that. The same thing every time.,0,32.508,35.966,0
2,"00:00:36,038","00:00:37,977","""I'm through, never again, too dangerous.""",0,36.038,37.977,0
3,"00:00:38,048","00:00:40,537",I know that's what I always say. I'm always ri...,0,38.048,40.537,0
4,"00:00:40,617","00:00:44,015",The days of me forgetting are over.,0,40.617,44.015,0
5,"00:00:40,617","00:00:44,015",You forget about it in a day or two.,0,40.617,44.015,0
6,"00:00:44,087","00:00:46,176",The days of me remembering have just begun.,0,44.087,46.176,0
7,"00:00:46,248","00:00:48,417","You know, when you go on like this, what you s...",0,46.248,48.417,0
8,"00:00:48,488","00:00:51,516",I sound like a sensible fuckin' man.,1,48.488,51.516,1
9,"00:00:48,488","00:00:51,516",You sound like a duck.,0,48.488,51.516,1


In [18]:
columns = ['Start Time', 'End Time', 'Subtitle', 'Fuck Count', 'Fuck Count 2', 'start_seconds', 'end_seconds']

subtitles_df = subtitles_df[columns]

subtitles_df.to_excel(directory + 'subtitles.xlsx', index=False)