# **Setting up the environment**

In [1]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [10]:
!pip install -U sentence-transformers
!pip install tqdm
!pip install beautifulsoup4

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0


In [5]:
!pip install nltk



In [11]:
!pip install torch sentence-transformers[gpu]



In [6]:
!pip install chromadb



# **Part 1: Ingesting Documents**

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/subtitle_multi-qa-minilm-l6-cos-v1_bert_embeddings/pytorch/1/1/subtitle_multi-qa-MiniLM-L6-cos-v1_bert_embeddings_pytorch.pt
/kaggle/input/opensubtitles-sql-database/data/eng_subtitles_database.db
/kaggle/input/opensubtitles-sql-database/data/README.txt
/kaggle/input/opensubtitles-preprocessed-with-bert-embeddings/preprocessed_subtitles_30_with_chunks_embeddings.csv
/kaggle/input/opensubtitles-preprocessed-data/opensubtitles_data_preprocessed.csv
/kaggle/input/opensubtitles-preprocessed-with-chunks-30-percent/preprocessed_subtitles_30_with_chunks.csv


In [5]:
import os
for dirname, _, filenames in os.walk('/kaggle/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/lib/kaggle/gcp.py
/kaggle/input/opensubtitles-preprocessed-data/opensubtitles_data_preprocessed.csv
/kaggle/input/opensubtitles-preprocessed-with-chunks-30-percent/preprocessed_subtitles_30_with_chunks.csv
/kaggle/input/opensubtitles-sql-database/data/eng_subtitles_database.db
/kaggle/input/opensubtitles-sql-database/data/README.txt
/kaggle/input/opensubtitles-preprocessed-with-bert-embeddings/preprocessed_subtitles_30_with_chunks_embeddings.csv


In [6]:
os.path.isfile("/kaggle/working/preprocessed_subtitles_30.csv")

False

### **Decoding and Reading the given data.**

In [2]:
import sqlite3
import zipfile
import io

def read_and_decode_database(file_path):
    # Connect to the SQLite database
    connection = sqlite3.connect(file_path)
    # Read the database into a DataFrame
    data = pd.read_sql_query("SELECT * FROM zipfiles", connection)
    # Close the connection
    connection.close()
    
    # Initialize count variable
    count = 0
    
    # Function to decode binary data from zip files
    def decode_binary_data(binary_data):
        nonlocal count
        count += 1
        try:
            with io.BytesIO(binary_data) as byte_stream:
                with zipfile.ZipFile(byte_stream, 'r') as zip_file:
                    file_name = zip_file.namelist()[0]
                    subtitle_content = zip_file.read(file_name)
                    return subtitle_content.decode('latin-1')
        except Exception as e:
            print(f"Error: {e}")
            return None
    
    # Decode each binary data entry in the DataFrame
    data['file_content'] = data['content'].apply(decode_binary_data)
    
    return data, count

# # Apply the function to the content column
file_path = '/kaggle/input/opensubtitles-sql-database/data/eng_subtitles_database.db'
df, df_count = read_and_decode_database(file_path)

# Display the first few rows of the DataFrame with decoded content
df.head()

Unnamed: 0,num,name,content,file_content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...,"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther..."
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...,"1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'..."
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."


**Selecting the 30% of the given data randomly**

In [3]:
# Selecting 30% of the data randomly
df_sampled = df.sample(frac=0.3, random_state=42)

In [4]:
# Display the first few rows of the sampled DataFrame
df_sampled.head()

Unnamed: 0,num,name,content,file_content
17262,9251120,maybe.this.time.(2014).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x89\x9a\x...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."
7294,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,b'PK\x03\x04\x14\x00\x00\x00\x08\x007\x8f\x99V...,"1\r\n00:00:09,275 --> 00:00:11,876\r\n¶ Oh, I ..."
47707,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x8f\x19\x...,"1\r\n00:00:07,140 --> 00:00:14,220\r\n<i>Timin..."
29914,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00[\xaa\x99V...,"1\r\n00:00:06,133 --> 00:00:08,900\r\n[etherea..."
54266,9408707,battlebots.(2015).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xf4<\x9aV...,"ï»¿1\r\n00:00:01,480 --> 00:00:03,570\r\n[Chri..."


## **Applying the appropriate cleaning steps on subtitle documents**

In [None]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Ensure you have the necessary NLTK data downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def clean_text(text):
    # Remove timestamps
    text = re.sub(r'\d+:\d+:\d+,\d+ --> \d+:\d+:\d+,\d+', '', text)
    # Remove line numbers and HTML tags
    text = re.sub(r'\b\d+\b', '', text)
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # Remove special characters, punctuation, and symbols
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace and convert to lowercase
    text = re.sub(r'\s+', ' ', text).lower()
    # Tokenize the text and remove stopwords
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Reconstruct the cleaned text
    cleaned_text = ' '.join(tokens)
    return cleaned_text

print('\n\nApplying `clean_text` function on the Data')
df_sampled['cleaned_content'] = df_sampled['file_content'].apply(clean_text)

In [7]:
# Display the first few rows of the DataFrame with cleaned content
df_sampled[['name', 'cleaned_content']].head()

Unnamed: 0,name,cleaned_content
0,maybe.this.time.(2014).eng.1cd,ï watch video online opensubtitles free browse...
1,down.the.shore.s01.e10.and.justice.for.all.(19...,oh know getting late dont wan na go home im hu...
2,uncontrollably.fond.s01.e07.heartache.(2016).e...,timing subtitles uncontrollable lovebirds team...
3,screen.two.s13.e04.the.precious.blood.(1996).e...,ethereal music apiopensubtitlesorg deprecated ...
4,battlebots.(2015).eng.1cd,ï chris oh minibots yelling oh leave little bo...


In [None]:
df_sampled[['name', 'cleaned_content']].to_csv('preprocessed_subtitles_30.csv')

The `optimized_document_chunker` function efficiently breaks down a large document into smaller, overlapping chunks. It is particularly useful for natural language processing tasks. Here are the key details:

- **Arguments**:
  - `document` (str): The input subtitle document to be chunked.
  - `max_tokens_per_chunk` (int, optional): Maximum number of tokens allowed per chunk (default is 500).
  - `overlap_tokens` (int, optional): Number of tokens to overlap between adjacent chunks (default is 50).

- **Returns**:
  - A list of chunked segments from the original document.

This function tokenizes the input document, calculates the necessary number of chunks, and creates these chunks efficiently. It's designed to handle large texts while ensuring overlapping segments for context preservation. 📝🚀

In [8]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def optimized_document_chunker(document, max_tokens_per_chunk=500, overlap_tokens=50):
    """
    Chunk a large document into smaller, overlapping chunks efficiently.
    
    Args:
    - document (str): The subtitle document to be chunked.
    - max_tokens_per_chunk (int): Maximum number of tokens allowed per chunk.
    - overlap_tokens (int): Number of tokens to overlap between adjacent chunks.
    
    Returns:
    - List[str]: List of chunked segments from the document.
    """
    # Tokenize the document into individual words or subwords
    tokens = [word for word in document.split() if word not in ENGLISH_STOP_WORDS]
    
    # Calculate the number of chunks needed
    num_chunks = (len(tokens) - overlap_tokens) // (max_tokens_per_chunk - overlap_tokens) + 1
    
    # Create chunks using list comprehension for better performance
    chunks = [
        ' '.join(tokens[i:i + max_tokens_per_chunk])
        for i in range(0, len(tokens), max_tokens_per_chunk - overlap_tokens)
        if i + max_tokens_per_chunk <= len(tokens) or i == (num_chunks - 1) * (max_tokens_per_chunk - overlap_tokens)
    ]
    
    return chunks


In [None]:
# Apply the document chunker to the cleaned content column
df_sampled['chunks'] = df_sampled['cleaned_content'].apply(optimized_document_chunker)

In [9]:
df_sampled['chunks']

0        ['ï watch video online opensubtitles free brow...
1        ['oh know getting late dont wan na home im hur...
2        ['timing subtitles uncontrollable lovebirds te...
3        ['ethereal music apiopensubtitlesorg deprecate...
4        ['ï chris oh minibots yelling oh leave little ...
                               ...                        
24744    ['ïscript info title default file scripttype v...
24745    ['ï come dont know tape helena girls called ca...
24746    ['ï previously heroes tell virus stop primatec...
24747    ['ï hot cleveland recorded live studio audienc...
24748    ['ï apiopensubtitlesorg deprecated implement r...
Name: chunks, Length: 24749, dtype: object

In [None]:
df_sampled[['name', 'cleaned_content', 'chunks']].to_csv('preprocessed_subtitles_30_with_chunks.csv')

In [6]:
df_sampled = pd.read_csv("/kaggle/input/opensubtitles-preprocessed-with-chunks-30-percent/preprocessed_subtitles_30_with_chunks.csv")

## **Experiment with the following to generate text vectors of subtitle documents**

### Generating Text Embeddings with the **TfidfVectorizer** for the keyword-based search engine experimentation.

The **TF-IDF Vectorizer** is used to transform subtitle text data into TF-IDF vectors, which represent the importance of words in the context of the entire document collection. It helps improve interpretability by associating feature names with these vectors. 📊🔤

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def tfidf_vectorize(subtitles):   
    vectorizer = TfidfVectorizer(stop_words='english')
    # Fit the vectorizer to the subtitles and transform the subtitles into TF-IDF vectors
    vectors = vectorizer.fit_transform(subtitles)
    # Return the TF-IDF vectors along with the feature names for better interpretability
    return vectors, vectorizer.get_feature_names_out()

subtitle_tfidf_vectors, subtitle_feature_names1 = tfidf_vectorize(df_sampled['chunks'])

In [21]:
import pickle

with open('subtitle_chunks_tfidf_vectors_final.pkl', 'wb') as f:
    pickle.dump(subtitle_tfidf_vectors, f)

with open('subtitle_chunks_feature_names_final.pkl', 'wb') as f:
    pickle.dump(subtitle_feature_names1, f)

In [23]:
import os
import pickle

# Check if the file exists and is not empty
if os.path.exists('subtitle_chunks_tfidf_vectors_final.pkl') and os.path.getsize('subtitle_chunks_tfidf_vectors_final.pkl') > 0:
    with open('subtitle_chunks_tfidf_vectors_final.pkl', 'rb') as file:
        try:
            subtitle_tfidf_vectors = pickle.load(file)
        except EOFError:
            # Handle the exception if the file is not properly pickled
            subtitle_tfidf_vectors = None
            print("The file is not properly pickled or is empty.")
else:
    print("The file does not exist or is empty.")

# Now you can use subtitle_tfidf_vectors as a normal variable, if it was loaded successfully
if subtitle_tfidf_vectors is not None:
    print(subtitle_tfidf_vectors)

  (0, 542551)	0.0016152756544135388
  (0, 81167)	0.001422704219523802
  (0, 586605)	0.001657147619625029
  (0, 611386)	0.008751832265513317
  (0, 541251)	0.001648835706289278
  (0, 473399)	0.0015370077694319094
  (0, 337807)	0.004856223054457998
  (0, 311249)	0.0035196455045136887
  (0, 527828)	0.007282381687701824
  (0, 217070)	0.00839746094110244
  (0, 524858)	0.004680375517401724
  (0, 524897)	0.012391117023521648
  (0, 246701)	0.0012215867693711154
  (0, 444777)	0.003510333123604347
  (0, 396304)	0.0056135710305065005
  (0, 260012)	0.0015710853277995421
  (0, 262333)	0.004495967706003902
  (0, 425378)	0.01794132166533833
  (0, 282949)	0.00841139141309038
  (0, 235866)	0.004303203654556054
  (0, 524860)	0.004164232691588318
  (0, 23973)	0.0030209681628947365
  (0, 465834)	0.007180210155673248
  (0, 189145)	0.0034985086227181327
  (0, 87263)	0.00489362970385651
  :	:
  (24748, 387123)	0.0380810860048123
  (24748, 595429)	0.005765402356754568
  (24748, 211124)	0.00514655134593789
  (2

### Generating Text Embeddings with the **multi-qa-MiniLM-L6-cos-v1** model

The **multi-qa-MiniLM-L6-cos-v1** model is a **sentence-transformers** model designed for **semantic search**. It maps sentences and paragraphs to a 384-dimensional dense vector space and has been trained on 215 million (question, answer) pairs from diverse sources¹[1].

In [12]:
import torch
import sentence_transformers
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

# Initialize the model outside the function to avoid loading it multiple times
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1') # bert-base-nli-mean-tokens and sentence-transformers/all-MiniLM-L6-v2

def generate_sentence_embeddings(subtitles):
    # Check if CUDA is available and set the device accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # # Join each list of strings into a single string per subtitle entry
    # subtitles_series = subtitles_series.apply(lambda chunks: ' '.join(chunks) if isinstance(chunks, list) else chunks)
    
    # # Convert the pandas Series to a list of strings
    # subtitles_list = subtitles_series.tolist()
#     subtitles = [' '.join(chunk) if isinstance(chunk, list) else chunk for chunk in subtitles]
    
    # Encode the subtitles to get the embeddings
    embeddings = model.encode(subtitles, show_progress_bar=True, convert_to_tensor=True)
    
    # Move embeddings to CPU and convert to numpy array if they are not already
    if torch.cuda.is_available():
        embeddings = embeddings.cpu()
    
    embeddings_numpy = embeddings.numpy()
    return embeddings_numpy

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

**Applying the `generate_sentence_embeddings` function on the generated chunks from the data.**

In [None]:
# Encode the chunks using Sentence Transformers
# df_sampled['chunk_embeddings'] = df_sampled['chunks'].apply(encode_chunks)
subtitle_bert_embeddings = generate_sentence_embeddings(df_sampled['chunks'].tolist())
subtitle_bert_embeddings

**Saving the generated embeddings to numpy array and pytorch file**

In [9]:
# Assuming subtitle_bert_embeddings is a numpy array
np.save('subtitle_embeddings.npy', subtitle_bert_embeddings)

In [10]:
import torch

# Save the embeddings to a file
torch.save(subtitle_bert_embeddings, 'subtitle_embeddings.pt')

In [14]:
import torch

subtitle_embeddings_bert = torch.load('/kaggle/input/subtitle_multi-qa-minilm-l6-cos-v1_bert_embeddings/pytorch/1/1/subtitle_multi-qa-MiniLM-L6-cos-v1_bert_embeddings_pytorch.pt')

In [16]:
subtitle_embeddings_bert.shape

(24749, 384)

In [None]:
# # df_sampled['chunk_embeddings'] = subtitle_bert_embeddings.tolist()
# df_sampled[['name', 'cleaned_content', 'chunks', 'chunk_embeddings']].to_csv('preprocessed_subtitles_30_with_all-MiniLM-L6-v2_embeddings.csv')

## **Storing the generated embeddings in a ChromaDB database.**

In [1]:
import pandas as pd
import numpy as np

df_sampled = pd.read_csv('/kaggle/input/opensubtitles-preprocessed-with-bert-embeddings/preprocessed_subtitles_30_with_chunks_embeddings.csv')

**The below code generates unique IDs and metadata for each document in a DataFrame. 📄🔑**

In [4]:
import uuid

ids = []
metadatas = []
for index, row in df_sampled.iterrows():
        ids.append(str(uuid.uuid4()))
        print({"name": row['name'], "chunk_index": index})
        metadatas.append({"name": row['name'], "index": index})

{'name': 'maybe.this.time.(2014).eng.1cd', 'chunk_index': 0}
{'name': 'down.the.shore.s01.e10.and.justice.for.all.(1992).eng.1cd', 'chunk_index': 1}
{'name': 'uncontrollably.fond.s01.e07.heartache.(2016).eng.1cd', 'chunk_index': 2}
{'name': 'screen.two.s13.e04.the.precious.blood.(1996).eng.1cd', 'chunk_index': 3}
{'name': 'battlebots.(2015).eng.1cd', 'chunk_index': 4}
{'name': 'csi.crime.scene.investigation.s08.e16.two.and.a.half.deaths.(2008).eng.1cd', 'chunk_index': 5}
{'name': 'royal.ashes.().eng.1cd', 'chunk_index': 6}
{'name': 'return.to.seoul.(2022).eng.1cd', 'chunk_index': 7}
{'name': 'idris.elba.king.of.speed.s01.e02.episode.1.2.(2013).eng.1cd', 'chunk_index': 8}
{'name': 'tooth.pari.when.love.bites.s01.e08.episode.1.8.(2023).eng.1cd', 'chunk_index': 9}
{'name': 'studio.one.s08.e30.the.arena.(1956).eng.1cd', 'chunk_index': 10}
{'name': 'love.life.s02.e10.epilogue.(2021).eng.1cd', 'chunk_index': 11}
{'name': 'scrubs.s03.e14.my.screwup.(2004).eng.1cd', 'chunk_index': 12}
{'name':

In [7]:
ids

['ee83d0b3-aef9-4961-a486-5c13f9423ca6',
 '8097ccc9-d300-4c25-b962-b0f1109603e9',
 '4b4ac296-617d-4000-ba66-1b18d538c24b',
 'e4e75b55-7225-422b-ae94-db8fd8c34350',
 'f7f6825e-93a4-4bba-805e-8be7e4bb2b78',
 'a45c0ee8-64dc-49ac-85df-6451209578ff',
 '15291fa6-f79a-4e69-b425-a1bf0e0845a2',
 'd27fe0d5-6fb0-4ee1-8e18-238afd335a55',
 'c22bdf3d-55f9-4248-aaa1-4f220c5c6580',
 '15692559-3e2c-4973-8eda-be24eb3998ee',
 'e9c4a285-3746-4f91-b736-69348292efb8',
 'e59d18fb-6966-4743-a0aa-442203b25ec9',
 '3b7002ef-7bc3-4d09-a0d7-9c68458c4d59',
 '45ac1be0-2ff5-4b62-8d50-a0dfc298c7ad',
 'cc66d8c4-9d6d-42ad-aa96-f838e951af03',
 '27671bc5-4401-402d-9447-2fd9227181b4',
 '7aaacb34-9bb6-44ad-9646-43d63734dbfd',
 '928601c6-0a3c-470d-b804-efc75898224f',
 '4f98bc48-1f03-466c-a345-e9e98a88c5ef',
 'a44a9b7e-2893-456b-b066-ec0b955f3d83',
 '0c216902-2a4b-4b6d-8790-de4d56dac371',
 'f55ff564-c40f-49d9-945c-cc2cee1e6b65',
 'c822078f-e448-47f2-bc86-13ecdfe235c6',
 '0913e824-8277-47d4-83b5-855eecbede2f',
 '40ad9f5d-7a84-

### **Setting up the ChromaDB Database and creating a collection for to store the embeddings**

**This below code snippet initializes a ChromaDB collection named `'subtitle_multi-qa-MiniLM-L6-cos-v1_bert_embeddings_final'` with specified metadata settings. The collection uses cosine similarity for embedding space. The purpose of this collection is likely to store and manage embeddings related to subtitle data. 📊🔍**

In [10]:
import chromadb
from chromadb.config import Settings

CHROMA_DATA_PATH = 'subtitle_chromadb_data/'

# Create a collection named 'subtitle_bert_embeddings'
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)
collection = client.get_or_create_collection(
        name="subtitle_multi-qa-MiniLM-L6-cos-v1_bert_embeddings_final",
        metadata={"hnsw:space": "cosine"},
        embedding_function=None
)

**This below code snippet adds subtitle documents and their BERT embeddings to the `subtitle_multi-qa-MiniLM-L6-cos-v1_bert_embeddings_final` ChromaDB collection, along with associated metadata that we created before. 📊🔑**

In [11]:
document_ids = [str(id) for id in df_sampled['cleaned_content'].index.tolist()]
subtitle_embeddings_bert_list = subtitle_embeddings_bert.tolist()

collection.add(
    ids = ids,
    documents = df_sampled['cleaned_content'].tolist(),
    embeddings = subtitle_embeddings_bert_list, 
    metadatas=metadatas
)

# **Part 2: Retrieving Documents**

These below `preprocess_text` function and `search_query` function performs a document retrieval process based on user search queries:
1. **Preprocessing**: It cleans and preprocesses the user's query by removing timestamps, line numbers, HTML tags, special characters, and stopwords.
2. **Embedding Creation**: It generates a query embedding using sentence embeddings (e.g., BERT).
3. **Cosine Similarity**: The code calculates cosine similarity scores between the query embedding and document embeddings.
4. **Candidate Selection**: The most relevant documents are returned based on these similarity scores.

In summary, this code enables efficient document retrieval using semantic search techniques. 📄🔍

In [20]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    # Remove timestamps
    text = re.sub(r'\d+:\d+:\d+,\d+ --> \d+:\d+:\d+,\d+', '', text)
    # Remove line numbers and HTML tags
    text = re.sub(r'\b\d+\b', '', text)
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # Remove special characters, punctuation, and symbols
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace and convert to lowercase
    text = re.sub(r'\s+', ' ', text).lower()
    # Tokenize the text and remove stopwords
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Reconstruct the cleaned text
    cleaned_text = ' '.join(tokens)
    return cleaned_text

def search_query(query):
    # Preprocess the query
    preprocessed_query = preprocess_text(query)

    query_embedding = generate_sentence_embeddings([preprocessed_query])[0]

    population = collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=2
    )
    return population

In [21]:
search_query('Hello!')

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'ids': [['a90ae7f4-ca61-4e93-b0de-56dcfeeadbad',
   'c7bfc675-c3c3-41c8-b629-8714d9fdecc9']],
 'distances': [[0.4949023723602295, 0.5080372095108032]],
 'metadatas': [[{'index': 22945, 'name': 'chakravyuha.(2016).eng.1cd'},
   {'index': 6194, 'name': 'salam.(2018).eng.1cd'}]],
 'embeddings': None,
 'documents': [['ï bye bye watch video online opensubtitles free browser extension osdblinkext papa way us day come touch feet wont get key mom bless dear bubbling impatient accept loved say yes pray dont fall love someone else want promise love alone keep away others even dreams thats okay hello think happened come dream girl doesnt suit sorry okay loose okay bye dear bye bye come home quickly dont ask reason come home quickly please please come quickly please please come quickly love obsession near peace hi yes slave learnt make coffee wanted taste made thats please come pray god dont belong someone else nuisance dont want anything else even dream keep away others dont mind use phone yes s