In [1]:
import sqlite3
import numpy as np
import pandas as pd
import io
import zipfile
import re
import warnings
warnings.filterwarnings('ignore')
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from bs4 import BeautifulSoup
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from tqdm import tqdm
from nltk.tokenize import word_tokenize
import os

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yousu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yousu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
conn = sqlite3.connect('eng_subtitles_database.db')
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cursor.fetchall())

cursor.execute("PRAGMA table_info('zipfiles')")
cols = cursor.fetchall()
for col in cols:
    print(col[1])

[('zipfiles',)]
num
name
content


In [5]:
df = pd.read_sql_query("""SELECT * FROM zipfiles""", conn)
df.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


In [10]:
def extract_content(row):
    binary_data = row['content']
    with io.BytesIO(binary_data) as f:
        with zipfile.ZipFile(f, 'r') as zip_file:
            subtitle_content = zip_file.read(zip_file.namelist()[0])
            return subtitle_content.decode('latin-1')  # Assuming the content is latin-1 encoded text

# Apply the function to each row of the DataFrame
df['extracted_content'] = df.apply(extract_content, axis=1)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82498 entries, 0 to 82497
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   num                82498 non-null  int64 
 1   name               82498 non-null  object
 2   content            82498 non-null  object
 3   extracted_content  82498 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.5+ MB


In [12]:
def remove_pattern(content):
    pattern = r'(\r\n)|(ï»¿)'
    return re.sub(pattern, " ", content)
%time df['extracted_content'] = df['extracted_content'].apply(remove_pattern)

CPU times: total: 1min 10s
Wall time: 2min 8s


In [13]:
def remove_series(text):
    return re.sub(r"\d+\s+\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3}\s+", "", text)

%time df['extracted_content'] = df['extracted_content'].apply(remove_series)

CPU times: total: 36.3 s
Wall time: 1min 3s


In [14]:
def clean_subtitle(subtitle):
    # Remove the specified series
    subtitle = re.sub(r"Dialogue:\s+0,0:00:06.00,0:00:12.074.*\n", "", subtitle)
    # Remove the specified series
    subtitle = re.sub(r"\{.*?\}|\\N|\\[a-z]+\d*|{\*.*?}|{\d+\}|\[[^\]]+\]|\([^\)]+\)", "", subtitle)
    # Remove lines starting with [Script Info], [V4+ Styles], and [Events]
    subtitle = re.sub(r"^\[(Script Info|V4\+ Styles|Events)\].*\n", "", subtitle, flags=re.MULTILINE)
    # Remove lines starting with Format:, Style:, and Dialogue:
    subtitle = re.sub(r"^(Format|Style|Dialogue):\s.*\n", "", subtitle, flags=re.MULTILINE)
    # Remove leading and trailing whitespace
    subtitle = subtitle.strip()
    return subtitle

In [15]:
df['extracted_content'] = df['extracted_content'].apply(clean_subtitle)

In [16]:
df.head()

Unnamed: 0,num,name,content,extracted_content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,Watch any video online with Open-SUBTITLES Fre...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...,Ah! There's Princess Dawn and Terry with the ...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...,<i>Yumi's Cells 2</i> <i>Episode 36 Extremely...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...,Watch any video online with Open-SUBTITLES Fre...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...,Watch any video online with Open-SUBTITLES Fre...


In [17]:
pattern = r"^\d+\s+"

def remove_pattern(content):
    return re.sub(pattern, "", content, flags=re.MULTILINE)
df['extracted_content'] = df['extracted_content'].apply(remove_pattern)

In [18]:
lemmatizer = WordNetLemmatizer()
def preprocess(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    text=re.sub("[^A-Za-z0-9./:']"," ",text)
    # Convert to lowercase
    text = text.lower()
    tokens = text.split()

    # Lemmatize each token
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join the lemmatized tokens back into a single string
    cleaned_text = ' '.join(lemmatized_tokens)

    return pd.Series(cleaned_text)

In [68]:
row_index = 16
column_index = -1  # Adjust column index if necessary

# Extract the text from the specified row and column
text_to_preprocess = df.iloc[row_index, column_index]

# Preprocess the text
cleaned_text = preprocess(text_to_preprocess)

cleaned_text[0]

"title: english original script: kadokawapictures original translation: original editing: original timing: synch point: script updated by: update details: scripttype: v4.00 collisions: normal playresx: 1920 playresy: 1080 timer: 0.0000 wrapstyle: 0 format: name fontname fontsize primarycolour secondarycolour outlinecolour backcolour bold italic underline strikeout scalex scaley spacing angle borderstyle outline shadow alignment marginl marginr marginv encoding style: default trebuchet m 72 h00ffffff h000000ff h00000000 h00000000 0 0 0 0 100 100 0 0 1 6 3 2 10 10 18 1 style: defaultitalics trebuchet m 72 h00ffffff h000000ff h00000000 h00000000 0 1 0 0 100 100 0 0 1 6 3 2 10 10 18 1 style: defaulttop trebuchet m 72 h00ffffff h000000ff h00000000 h00000000 0 0 0 0 100 100 0 0 1 6 3 8 10 10 18 1 style: defaultitalicstop trebuchet m 72 h00ffffff h000000ff h00000000 h00000000 0 1 0 0 100 100 0 0 1 6 3 8 10 10 18 1 style: flashback trebuchet m 72 h00ffffff h000000ff h00400000 h00400000 0 0 0 0

In [19]:
def remove_patterns(text):
    return re.sub(r"\{.*?\}|\\N|\\[a-z]+\d*|{\*.*?}|{\d+\}|\([^\)]+\)", "", text)

%time df['extracted_content'] = df['extracted_content'].apply(remove_patterns)

CPU times: total: 2.08 s
Wall time: 4.16 s


In [20]:
df

Unnamed: 0,num,name,content,extracted_content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,Watch any video online with Open-SUBTITLES Fre...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...,Ah! There's Princess Dawn and Terry with the ...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...,<i>Yumi's Cells 2</i> <i>Episode 36 Extremely...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...,Watch any video online with Open-SUBTITLES Fre...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...,Watch any video online with Open-SUBTITLES Fre...
...,...,...,...,...
82493,9521935,the.prophets.game.(2000).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb8\xa6\x...,"God, why are you punishing me? ""With red on h..."
82494,9521937,west.beirut.(1998).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x13\x97\x...,"api.OpenSubtitles.org is deprecated, please im..."
82495,9521938,frankenstein.the.true.story.(1973).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00$\x97\x9aV...,Advertise your product or brand here contact w...
82496,9521940,frankenstein.the.true.story.(1973).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x00\x97\x...,Advertise your product or brand here contact w...


In [21]:
def remove_patterns(text):
    return re.sub(r"(â\x99ª)", "", text)

%time df['extracted_content'] = df['extracted_content'].apply(remove_patterns)

CPU times: total: 688 ms
Wall time: 1.39 s


In [22]:
def remove_patterns(text):
    return re.sub(r"\Dialogue:\s*\d{1},\d{1}:\d{2}:\d{2}\.\d{2},\d{1}:\d{2}:\d{2}\.\d{2},\w+,", "", text)

%time df['extracted_content'] = df['extracted_content'].apply(remove_patterns)

CPU times: total: 17.5 s
Wall time: 34.5 s


In [23]:
def remove_patterns(text):
    return re.sub(r"\Dialogue:\s*\d{1},\d{1}:\d{2}:\d{2}\.\d{2},\d{1}:\d{2}:\d{2}\.\d{2},\w+,", "", text)

# Apply the function to the extracted_content column
%time df['extracted_content'] = df['extracted_content'].apply(remove_patterns)

CPU times: total: 21.9 s
Wall time: 33.8 s


In [24]:
def remove_patterns(text):
    return re.sub(r"Dialogue:\s+\d+,\d{1,2}:\d{2}:\d{2}\.\d{2},\d{1,2}:\d{2}:\d{2}\.\d{2},[^\n]*\n", "", text)
    return re.sub(r"Dialogue:\s+\d+,\d{1,2}:\d{2}:\d{2}\.\d{2},\d{1,2}:\d{2}:\d{2}\.\d{2},[^\n]*\n", "", text)

# Apply the function to the extracted_content column
df['extracted_content'] = df['extracted_content'].apply(remove_patterns)

In [25]:
def clean_subtitle(subtitle):
    # Remove the specified series
    subtitle = re.sub(r"Dialogue:\s+0,0:00:06.00,0:00:12.074.*\n", "", subtitle)
    # Remove lines starting with [Script Info], [V4+ Styles], and [Events]
    subtitle = re.sub(r"^\[(Script Info|V4\+ Styles|Events)\].*\n", "", subtitle, flags=re.MULTILINE)
    # Remove lines starting with Format:, Style:, and Dialogue:
    subtitle = re.sub(r"^(Format|Style|Dialogue):\s.*\n", "", subtitle, flags=re.MULTILINE)
    # Remove leading and trailing whitespace
    subtitle = subtitle.strip()
    return subtitle

In [26]:
df['extracted_content'] = df['extracted_content'].apply(clean_subtitle)

In [27]:
def clean_subtitle(subtitle):
    # Remove the specified series
    subtitle = re.sub(r"\{.*?\}|\\N|\\[a-z]+\d*|{\*.*?}|{\d+\}|\[[^\]]+\]|\([^\)]+\)", "", subtitle)
    # Remove lines starting with [Script Info], [V4+ Styles], and [Events]
    subtitle = re.sub(r"^\[(Script Info|V4\+ Styles|Events)\].*\n", "", subtitle, flags=re.MULTILINE)
    # Remove lines starting with Format:, Style:, and Dialogue:
    subtitle = re.sub(r"^(Format|Style|Dialogue):\s.*\n", "", subtitle, flags=re.MULTILINE)
    # Remove leading and trailing whitespace
    subtitle = subtitle.strip()
    return subtitle

In [28]:
df['extracted_content'] = df['extracted_content'].apply(clean_subtitle)

In [29]:
%time df['clean_content'] = df['extracted_content'].apply(preprocess)

CPU times: total: 9min 8s
Wall time: 17min 22s


In [30]:
df.iloc[190,-1]

"theme song playing... api.opensubtitles.org is deprecated please implement rest api from opensubtitles.com i don't like lady pianists. it is my good friend judge lobbett. very courageous of him to come down for the concert. he's a marked man you know. four murder in his household within the past month and each time he ha miraculously escaped. campion: travelling for his health i take it. like i am. he'll be safe on the boat. i wouldn't bet on it. your police will have to keep him in a steel band box if they are going to look after him. bravo those are his child with him. senora verola lady and gentlemen. and now mystery. audience: ohh i'm pleased to tell you that prince samir the world famous indian illusionist ha very kindly consented to entertain us. oh good. i'm potty about conjurers. emcee: prince samir. my name is barber alistair ferguson barber. i am turkish but i had a scotch ancestor. scottish . yes. i am in the fine art business. picture you know. campion: an expert barber: y

In [31]:
def preprocess_name(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove non-alphanumeric characters except apostrophes
    text = re.sub("[^A-Za-z0-9']", " ", text)
    # Convert to lowercase
    text = text.lower()
    # Remove the substring 'eng 1cd' and strip leading/trailing whitespaces
    text = text.replace('eng 1cd', '').strip()
    tokens = text.split()

    # Lemmatize each token
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join the lemmatized tokens back into a single string
    cleaned_text = ' '.join(lemmatized_tokens)

    return cleaned_text

In [32]:
%time df['name'] = df['name'].apply(preprocess_name)

CPU times: total: 1.64 s
Wall time: 3.38 s


In [33]:
df

Unnamed: 0,num,name,content,extracted_content,clean_content
0,9180533,the message 1976,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...
1,9180583,here come the grump s01 e09 joltin jack in box...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...,Ah! There's Princess Dawn and Terry with the ...,ah there's princess dawn and terry with the bl...
2,9180592,yumis cell s02 e13 episode 2 13 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...,<i>Yumi's Cells 2</i> <i>Episode 36 Extremely...,yumi's cell 2 episode 36 extremely polite yumi...
3,9180594,yumis cell s02 e14 episode 2 14 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...
4,9180600,broker 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...
...,...,...,...,...,...
82493,9521935,the prophet game 2000,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xb8\xa6\x...,"God, why are you punishing me? ""With red on h...",god why are you punishing me with red on his h...
82494,9521937,west beirut 1998,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x13\x97\x...,"api.OpenSubtitles.org is deprecated, please im...",api.opensubtitles.org is deprecated please imp...
82495,9521938,frankenstein the true story 1973,b'PK\x03\x04\x14\x00\x00\x00\x08\x00$\x97\x9aV...,Advertise your product or brand here contact w...,advertise your product or brand here contact w...
82496,9521940,frankenstein the true story 1973,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x00\x97\x...,Advertise your product or brand here contact w...,advertise your product or brand here contact w...


In [34]:
def remove_name(text):
    text = re.sub(r'episode\s\d+\s\d+','',text)
    return text

%time df['name'] = df['name'].apply(remove_name)

CPU times: total: 31.2 ms
Wall time: 51 ms


In [35]:
df.head(10)

Unnamed: 0,num,name,content,extracted_content,clean_content
0,9180533,the message 1976,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...
1,9180583,here come the grump s01 e09 joltin jack in box...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...,Ah! There's Princess Dawn and Terry with the ...,ah there's princess dawn and terry with the bl...
2,9180592,yumis cell s02 e13 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...,<i>Yumi's Cells 2</i> <i>Episode 36 Extremely...,yumi's cell 2 episode 36 extremely polite yumi...
3,9180594,yumis cell s02 e14 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...
4,9180600,broker 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...
5,9180607,the myth 2005,b'PK\x03\x04\x14\x00\x00\x00\x08\x00K\xb9\x99V...,"General, the princess's convoy has entered our...",general the princess's convoy ha entered our r...
6,9180608,the great beauty 2013,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1b\xa9\x...,"api.OpenSubtitles.org is deprecated, please im...",api.opensubtitles.org is deprecated please imp...
7,9180662,rudrabinar obhishaap s02 e01 swaralipir kut ta...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x12\xa9\x...,The mystery of Tansen's tanpura begins with......,the mystery of tansen's tanpura begin with... ...
8,9180684,rudrabinar obhishaap s02 e02 arek naad 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x16\xb9\x...,"Listen to me, Saaz. Where are you going? Try...",listen to me saaz. where are you going try to ...
9,9180694,rudrabinar obhishaap s02 e03 anandagarher akhh...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x16\xa9\x...,So you're assuming that my grandma Mumtaz is ...,so you're assuming that my grandma mumtaz is a...


In [37]:
def remove_space_before_single_quotes(text):
    return text.replace(" '", "'")

# Apply the function to the 'text_column'
df['clean_content'] = df['clean_content'].apply(remove_space_before_single_quotes)

In [38]:
data = df.head(int(len(df) * 0.3))

In [36]:
from nltk.tokenize import word_tokenize
CHUNK_SIZE = 1000  # Adjust this size to control the number of words in each chunk
OVERLAP_SIZE = 200  # Adjust this size to control the overlap between chunks

def chunk_content(content):
    # Tokenize the input content into words
    words = word_tokenize(content)
    
    # Initialize a list to hold the chunks
    chunks = []
    
    # Create chunks based on the specified chunk size and overlap size
    for i in range(0, len(words), CHUNK_SIZE - OVERLAP_SIZE):
        # Calculate the end index of the current chunk
        end_index = i + CHUNK_SIZE
        
        # Create a chunk by slicing the words list
        chunk = words[i:end_index]
        
        # Convert the chunk list back to a single string
        chunk_text = ' '.join(chunk)
        
        # Add the chunk text to the chunks list
        chunks.append(chunk_text)
    
    # Return the list of chunks
    return chunks

In [39]:
%time data['chunk_content'] = data['clean_content'].apply(chunk_content)

CPU times: total: 6min 39s
Wall time: 11min 25s


In [40]:
data

Unnamed: 0,num,name,content,extracted_content,clean_content,chunk_content
0,9180533,the message 1976,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...,[watch any video online with open subtitle fre...
1,9180583,here come the grump s01 e09 joltin jack in box...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...,Ah! There's Princess Dawn and Terry with the ...,ah there's princess dawn and terry with the bl...,[ah there 's princess dawn and terry with the ...
2,9180592,yumis cell s02 e13 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...,<i>Yumi's Cells 2</i> <i>Episode 36 Extremely...,yumi's cell 2 episode 36 extremely polite yumi...,[yumi 's cell 2 episode 36 extremely polite yu...
3,9180594,yumis cell s02 e14 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...,[watch any video online with open subtitle fre...
4,9180600,broker 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...,[watch any video online with open subtitle fre...
...,...,...,...,...,...,...
24744,9279492,scorpion s04 e15 wave goodbye 2018,b'PK\x03\x04\x14\x00\x00\x00\x08\x00W\xa1\x99V...,- WALTER:<i>Previously on</i> Scorpion... - TO...,walter:previously on scorpion... toby: that's ...,[walter : previously on scorpion ... toby : th...
24745,9279493,scorpion s04 e16 nerd wind fire 2018,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xa6\xa1\x...,- WALTER:<i>Previously on</i> Scorpion... - He...,walter:previously on scorpion... hey patty. i ...,[walter : previously on scorpion ... hey patty...
24746,9279494,scorpion s04 e17 dumbster fire 2018,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\\\xa1\x99...,- WALTER:<i>Previously on</i> Scorpion... - Wh...,walter:previously on scorpion... what's that t...,[walter : previously on scorpion ... what 's t...
24747,9279495,scorpion s04 e18 dork day afternoon 2018,b'PK\x03\x04\x14\x00\x00\x00\x08\x00;\xa1\x99V...,"- Kid, we'll run out of time. - We'll make it....",kid we'll run out of time. we'll make it. you'...,[kid we 'll run out of time . we 'll make it ....


In [41]:
def remove_space_before_single_quotes(text):
    return text.replace(" '", "'")

# Apply the function to the 'text_column'
data['clean_content'] = data['clean_content'].apply(remove_space_before_single_quotes)

In [44]:
data.tail(20)

Unnamed: 0,num,name,content,extracted_content,clean_content,chunk_content
24729,9279474,scorpion s03 e25 scorp family robinson 2017,b'PK\x03\x04\x14\x00\x00\x00\x08\x00K\xa1\x99V...,<i>Previously on</i> Scorpion... I know this ...,previously on scorpion... i know this is not a...,[previously on scorpion ... i know this is not...
24730,9279478,scorpion s04 e01 extinction 2017,b'PK\x03\x04\x14\x00\x00\x00\x08\x00M\xa1\x99V...,WALTER:<i>Previously on</i> Scorpion... Mar...,walter:previously on scorpion... mark collins ...,[walter : previously on scorpion ... mark coll...
24731,9279479,scorpion s04 e02 more extinction 2017,b'PK\x03\x04\x14\x00\x00\x00\x08\x00r\xa1\x99V...,WALTER:<i>Previously on</i> Scorpion... Metha...,walter:previously on scorpion... methane ha be...,[walter : previously on scorpion ... methane h...
24732,9279480,scorpion s04 e03 grow a deer a female deer 2017,b'PK\x03\x04\x14\x00\x00\x00\x08\x00Z\xa1\x99V...,WALTER:<i>Previously on</i> Scorpion... Last ...,walter:previously on scorpion... last night wa...,[walter : previously on scorpion ... last nigh...
24733,9279481,scorpion s04 e04 nuke kid on the block 2017,b'PK\x03\x04\x14\x00\x00\x00\x08\x00|\xa1\x99V...,WALTER:<i>Previously on</i> Scorpion... - Who...,walter:previously on scorpion... who are you i...,[walter : previously on scorpion ... who are y...
24734,9279482,scorpion s04 e05 sci hard 2017,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xdd\xa1\x...,- WALTER: Is my tie too short? - PAIGE: It's f...,walter: is my tie too short paige: it's fine. ...,[walter : is my tie too short paige : it 's fi...
24735,9279483,scorpion s04 e06 queen scary 2017,b'PK\x03\x04\x14\x00\x00\x00\x08\x009\xa1\x99V...,WALTER:<i>Previously on</i> Scorpion... We ne...,walter:previously on scorpion... we need to ba...,[walter : previously on scorpion ... we need t...
24736,9279484,scorpion s04 e07 go with the flo rence 2017,b'PK\x03\x04\x14\x00\x00\x00\x08\x00X\xa1\x99V...,WALTER:<i>Previously on</i> Scorpion... Every...,walter:previously on scorpion... every time yo...,[walter : previously on scorpion ... every tim...
24737,9279485,scorpion s04 e08 faire is foul 2017,b'PK\x03\x04\x14\x00\x00\x00\x08\x00:\xa1\x99V...,WALTER:<i>Previously on</i> Scorpion... HAP...,walter:previously on scorpion... happy: i foun...,[walter : previously on scorpion ... happy : i...
24738,9279486,scorpion s04 e09 it raining men of war 2017,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x80\xa1\x...,CABE: <i>It started as a normal night.</i> <i...,cabe: it started a a normal night. i wa gettin...,[cabe : it started a a normal night . i wa get...


In [45]:
data.iloc[190,-1]

["theme song playing ... api.opensubtitles.org is deprecated please implement rest api from opensubtitles.com i do n't like lady pianists . it is my good friend judge lobbett . very courageous of him to come down for the concert . he 's a marked man you know . four murder in his household within the past month and each time he ha miraculously escaped . campion : travelling for his health i take it . like i am . he 'll be safe on the boat . i would n't bet on it . your police will have to keep him in a steel band box if they are going to look after him . bravo those are his child with him . senora verola lady and gentlemen . and now mystery . audience : ohh i 'm pleased to tell you that prince samir the world famous indian illusionist ha very kindly consented to entertain us . oh good . i 'm potty about conjurers . emcee : prince samir . my name is barber alistair ferguson barber . i am turkish but i had a scotch ancestor . scottish . yes . i am in the fine art business . picture you kn

In [33]:
'''new_data = []

# Iterate over each row in the original DataFrame
for index, row in data.iterrows():
    # Iterate over each chunk in the 'chunks' column
    for i, chunk in enumerate(row['chunk_content']):
        new_data.append({
            'num': row['num'],
            'name': row['name'],
            'content': row['content'],
            'extracted_content': row['extracted_content'],
            'clean_content': chunk
        })

# Create a new DataFrame from the list of dictionaries
new_df = pd.DataFrame(new_data)'''

In [46]:
new_data = []

# Iterate over each row in the original DataFrame
for index, row in data.iterrows():
    # Iterate over each chunk in the 'chunks' column
    for i, chunk in enumerate(row['chunk_content']):  # Start enumeration from 1
        new_data.append({
            'num': f"{row['num']}_{i}",  # Append chunk number to original num
            'name': row['name'],
            'content': row['content'],
            'extracted_content': row['extracted_content'],
            'clean_content': chunk
        })

# Create a new DataFrame from the list of dictionaries
new_df = pd.DataFrame(new_data)

In [47]:
new_df

Unnamed: 0,num,name,content,extracted_content,clean_content
0,9180533_0,the message 1976,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...
1,9180533_1,the message 1976,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,Watch any video online with Open-SUBTITLES Fre...,there . three days . i 'm afraid for him on th...
2,9180533_2,the message 1976,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,Watch any video online with Open-SUBTITLES Fre...,with worry . i 'm sorry father . where were yo...
3,9180533_3,the message 1976,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,Watch any video online with Open-SUBTITLES Fre...,break him have you finished with him there is ...
4,9180533_4,the message 1976,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,Watch any video online with Open-SUBTITLES Fre...,u go . we found their track and followed them ...
...,...,...,...,...,...
200008,9279496_4,scorpion s04 e19 gator done 2018,b'PK\x03\x04\x14\x00\x00\x00\x08\x00l\xa1\x99V...,WALTER:<i>Previously on</i> Scorpion... My en...,it a cacciatore or a chicken parmigiana . you ...
200009,9279496_5,scorpion s04 e19 gator done 2018,b'PK\x03\x04\x14\x00\x00\x00\x08\x00l\xa1\x99V...,WALTER:<i>Previously on</i> Scorpion... My en...,is the name of patty 's guidance counselor uh ...
200010,9279496_6,scorpion s04 e19 gator done 2018,b'PK\x03\x04\x14\x00\x00\x00\x08\x00l\xa1\x99V...,WALTER:<i>Previously on</i> Scorpion... My en...,. so you want me to drown you just find someth...
200011,9279496_7,scorpion s04 e19 gator done 2018,b'PK\x03\x04\x14\x00\x00\x00\x08\x00l\xa1\x99V...,WALTER:<i>Previously on</i> Scorpion... My en...,make it buddy . move your foot man . you got t...


In [48]:
new_df.iloc[190,-1]

"i 'm ma xidao in charge the person in charge . yeah yeah yeah the staff i work for he did n't speak a word of english it 's been a bit of a farce ok do you want to eat before you go out yeah sure sure . sure sure well you take your bag and your luggage bao bao . but there 's nothing to eat here here here here here where is all that stuff there . ah is this only a thousand dollar a thousand yeah . oh my god i do n't know how i can travel without money they all need a discount those guy over there are korean gangster right korean gangster everywhere you go there are a lot more korean doing business around here the underworld ha also increased violent crime also give me a headache what the fuck are you looking at underworld hello all the way to vietnam . fuck you it 's a cop . are you sure cops . really you sure are you sure i 'm sure . put it on hey there 's some weird stuff in there and what looked like marijuana those guy are weird what marijuana is n't . i 'm on my way . oh no oh you

In [49]:
def remove_space_before_single_quotes(text):
    return text.replace(" '", "'")

# Apply the function to the 'text_column'
new_df['clean_content'] = new_df['clean_content'].apply(remove_space_before_single_quotes)

In [50]:
new_df.iloc[190,-1]

"i'm ma xidao in charge the person in charge . yeah yeah yeah the staff i work for he did n't speak a word of english it's been a bit of a farce ok do you want to eat before you go out yeah sure sure . sure sure well you take your bag and your luggage bao bao . but there's nothing to eat here here here here here where is all that stuff there . ah is this only a thousand dollar a thousand yeah . oh my god i do n't know how i can travel without money they all need a discount those guy over there are korean gangster right korean gangster everywhere you go there are a lot more korean doing business around here the underworld ha also increased violent crime also give me a headache what the fuck are you looking at underworld hello all the way to vietnam . fuck you it's a cop . are you sure cops . really you sure are you sure i'm sure . put it on hey there's some weird stuff in there and what looked like marijuana those guy are weird what marijuana is n't . i'm on my way . oh no oh you . not 

In [51]:
df1 = new_df[['num', 'name', 'clean_content']]

In [52]:
df1.head(50)

Unnamed: 0,num,name,clean_content
0,9180533_0,the message 1976,watch any video online with open subtitle free...
1,9180533_1,the message 1976,there . three days . i'm afraid for him on the...
2,9180533_2,the message 1976,with worry . i'm sorry father . where were you...
3,9180533_3,the message 1976,break him have you finished with him there is ...
4,9180533_4,the message 1976,u go . we found their track and followed them ...
5,9180533_5,the message 1976,... through abraham noah moses and through jes...
6,9180533_6,the message 1976,in this pledge go now . and if you have no dou...
7,9180533_7,the message 1976,loose and where quaswa sits the prophet stays ...
8,9180533_8,the message 1976,or any old person you may not harm cripple you...
9,9180533_9,the message 1976,to victory god is great there is no god but go...


In [53]:
df1

Unnamed: 0,num,name,clean_content
0,9180533_0,the message 1976,watch any video online with open subtitle free...
1,9180533_1,the message 1976,there . three days . i'm afraid for him on the...
2,9180533_2,the message 1976,with worry . i'm sorry father . where were you...
3,9180533_3,the message 1976,break him have you finished with him there is ...
4,9180533_4,the message 1976,u go . we found their track and followed them ...
...,...,...,...
200008,9279496_4,scorpion s04 e19 gator done 2018,it a cacciatore or a chicken parmigiana . you ...
200009,9279496_5,scorpion s04 e19 gator done 2018,is the name of patty's guidance counselor uh m...
200010,9279496_6,scorpion s04 e19 gator done 2018,. so you want me to drown you just find someth...
200011,9279496_7,scorpion s04 e19 gator done 2018,make it buddy . move your foot man . you got t...


In [54]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('bert-base-nli-mean-tokens')
def generate_embeddings(sentences):
    # Generate embeddings for the sentences
    embeddings = model.encode(sentences)
    return embeddings

In [55]:
df2 = df1.iloc[0:200]

In [56]:
%time df2['chunk_embedding'] = df2['clean_content'].apply(generate_embeddings)

CPU times: total: 1min 22s
Wall time: 17.7 s


In [60]:
df2

Unnamed: 0,num,name,clean_content,chunk_embedding
0,9180533_0,the message 1976,watch any video online with open subtitle free...,"[-0.06122897, 1.2223547, 0.3217321, -0.3316896..."
1,9180533_1,the message 1976,there . three days . i'm afraid for him on the...,"[-0.1526536, 1.5214349, 0.5439601, -0.19433674..."
2,9180533_2,the message 1976,with worry . i'm sorry father . where were you...,"[0.19221133, 1.1043675, 0.75206536, -0.4568424..."
3,9180533_3,the message 1976,break him have you finished with him there is ...,"[-0.44091898, 0.8067595, -0.20584032, -0.14974..."
4,9180533_4,the message 1976,u go . we found their track and followed them ...,"[-0.080335595, 0.5870151, 0.2626214, 0.0968323..."
...,...,...,...,...
195,9181871_7,the roundup 2022,by the police here anyway hello if you tell me...,"[-0.20554946, 0.8416153, 1.0564214, 0.13947293..."
196,9181871_8,the roundup 2022,me yes bring it to me alive or dead okay got i...,"[-0.04149872, 0.36500067, 0.72019565, 0.169848..."
197,9181871_9,the roundup 2022,me to death i said i would come to korea where...,"[-0.4502845, 0.8839978, -0.10003736, 0.0990959..."
198,9181871_10,the roundup 2022,how you start your new life just close your ey...,"[-0.4036696, 0.87375915, 0.83474123, -0.051862..."


In [55]:
# Function to remove space before single quotes
def remove_space_before_single_quotes(text):
    return text.replace(" '", "'")

# Apply the function to the 'text_column'
df2['clean_content'] = df2['clean_content'].apply(remove_space_before_single_quotes)

In [57]:
df2.clean_content[199]

"the worse for u we've passed this intersection four time now this is gon na blow our car out of the water hey tin tao why do n't we grab these two first caught them so quickly that they did not have time to contact jiang haizang why do n't you just call cui chunbai later no we need to make sure choon bai is safe first no it doe matter but what if we do n't get those two no hostage are more important i'm freaking out whatever the fuck mama of mama of the eldest brother we found the hyundai that choi wa driving when we kidnapped her oh yeah choi chun bai i do n't see . let me search the building again call me a soon a you find it well know squad leader dong gyun found the hyundai what i found it . yeah i got it . i got it . i'll follow to the feeling come ha there you go . fuck cui chunbai it feel like it's somewhere around here mama of hello cui chunbai well i guess you're all right mama of up to the uh let's get out of here and get up it's ok well ah almost there . almost there it's a

In [81]:
df2.name[170]

'survivor 2000'

In [82]:
df2.clean_content[170]

"... politically correct here . probst : carl bilancione a dentist and the father of two from winter spring florida . i'm not your typical dentist . most of them are fat and bald headed and divorced . not me . i've completed six marathons . survivor's easy . probst : jessie camacho a deputy sheriff from orlando florida . why do i think i'll make the ultimate survivor i'm a puerto rican female ... the heat the rice bring it on . i'm ready to go to work . see ya probst : and tom buchanan a farmer and the father of one from rich valley virginia . come on we're going to survivor . woo hoo come on doggy here we go let's hit the big show boy we're gon na be a star probst : whether it's on an island in the outback or deep in africa the game remains the same : 16 stranger abandoned in the middle of nowhere forced to create a new society . to succeed you must not only survive the element but each other . loudmouth sue . yeah sean get some b ... grouch b.b . grouch probably me . myself . jeff . 

In [26]:
# df.to_csv('dfn_1.csv', index=False)

In [24]:
# df1 = df.sample(frac=0.4, random_state=42)

In [28]:
#df = pd.read_csv("dfn_1.csv")
df.head()

Unnamed: 0,num,name,content,extracted_content,clean_content,chunk_content
0,9180533,the message 1976,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...,[watch any video online with open subtitle fre...
1,9180583,here come the grump s01 e09 joltin jack in box...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...,Ah! There's Princess Dawn and Terry with the ...,ah there's princess dawn and terry with the bl...,[ah there 's princess dawn and terry with the ...
2,9180592,yumis cell s02 e13 episode 2 13 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...,<i>Yumi's Cells 2</i> <i>Episode 36 Extremely...,yumi's cell 2 episode 36 extremely polite yumi...,[yumi 's cell 2 episode 36 extremely polite yu...
3,9180594,yumis cell s02 e14 episode 2 14 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...,[watch any video online with open subtitle fre...
4,9180600,broker 2022,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...,Watch any video online with Open-SUBTITLES Fre...,watch any video online with open subtitle free...,[watch any video online with open subtitle fre...


In [28]:
df.iloc[6402, 1]

'love island s04 e24  2022'

In [40]:
df1 = df.iloc[20402:25402]

In [None]:
# Constants
CHROMA_DB_FILE = "subtitle_embeddings_11.db"


# Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(tokens)



# Initialize BERT model
bert_model = SentenceTransformer('distilbert-base-nli-mean-tokens') 

# Initialize ChromaDB for storing embeddings
if os.path.exists(CHROMA_DB_FILE):
    os.remove(CHROMA_DB_FILE)
chroma_conn = sqlite3.connect(CHROMA_DB_FILE)
chroma_c = chroma_conn.cursor()

# Create the embeddings table in the database
chroma_c.execute('''
    CREATE TABLE embeddings (
        name TEXT,
        chunk_num INTEGER,
        chunk_content TEXT,
        embedding BLOB
    )
''')

# Process documents and insert embeddings into ChromaDB
for idx, row in tqdm(df1.iterrows(), total=len(df1), desc="Processing documents"):
    name = row['name']
    chunk_contents = row['chunk_content']
    
    # Iterate through each chunk in chunk_contents
    for chunk_num, chunk in enumerate(chunk_contents):
        # Preprocess the chunk
        processed_chunk = preprocess_text(chunk)
        
        # Generate BERT-based embedding
        bert_embedding = bert_model.encode([processed_chunk])
        
        # Convert embedding to bytes
        embedding_bytes = bert_embedding.tobytes()
        
        # Insert the document name, chunk number, chunk content, and embedding into the database
        chroma_c.execute('''
            INSERT INTO embeddings (name, chunk_num, chunk_content, embedding)
            VALUES (?, ?, ?, ?)
        ''', (name, chunk_num, chunk, embedding_bytes))

# Commit and close the connection
chroma_conn.commit()
chroma_conn.close()

Processing documents:  14%|███████▌                                               | 687/5000 [08:09<1:01:36,  1.17it/s]

In [38]:
# Connect to the first ChromaDB file
conn1 = sqlite3.connect('subtitle_embeddings_11.db')
cursor1 = conn1.cursor()

# Connect to the second ChromaDB file
conn2 = sqlite3.connect('subtitle_embeddings_5.db')
cursor2 = conn2.cursor()

# Merge the two databases
cursor2.execute("ATTACH DATABASE 'subtitle_embeddings_11.db' AS db1")
cursor2.execute("INSERT INTO main.embeddings SELECT * FROM db1.embeddings")

# Commit changes and close connections
conn2.commit()
conn1.close()
conn2.close()