In [1]:
import whisper
import os

In [2]:
# Load Whisper's "tiny" checkpoint once; transcribe_audio reads this module-level `model`.
# NOTE(review): a later cell rebinds `model` to the unpickled clustering model, so
# transcribe_audio will break if it is called after that cell has run.
model = whisper.load_model("tiny")

In [3]:
def transcribe_audio(file_path: str):
    """Transcribe one media file with the module-level Whisper ``model``.

    Args:
        file_path: Path of the file handed to ``model.transcribe``.

    Returns:
        The ``"segments"`` entry of the transcription result.
    """
    # fp16=False is forwarded unchanged: half precision is not requested.
    return model.transcribe(file_path, fp16=False)["segments"]

In [4]:
# Paths of the media files to process. os.listdir returns entries in arbitrary
# order, so sort them to keep the file order stable and reproducible across runs.
files = [os.path.join("Video/", file) for file in sorted(os.listdir("Video"))]

In [None]:
# Transcribe every file; one list of Whisper segments per entry of `files`.
results = [transcribe_audio(path) for path in files]

In [5]:
import json

In [6]:
from sentence_transformers import SentenceTransformer
# Cache of already-built SentenceTransformer instances, keyed by model name.
_SENTENCE_MODELS = {}


def vectorize_texts(texts, model_name='all-MiniLM-L6-v2'):
    """Embed ``texts`` with a SentenceTransformer model.

    The model is built once per ``model_name`` and kept in ``_SENTENCE_MODELS``
    so repeated calls (one per file, one per query) do not rebuild it each time.

    Args:
        texts: Input passed straight to ``SentenceTransformer.encode``.
        model_name: Name of the SentenceTransformer model to use.

    Returns:
        The result of ``encode(texts, convert_to_numpy=True)``.
    """
    encoder = _SENTENCE_MODELS.get(model_name)
    if encoder is None:
        encoder = SentenceTransformer(model_name)
        _SENTENCE_MODELS[model_name] = encoder
    return encoder.encode(texts, convert_to_numpy=True)

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# Transcribe each file and collect, per file: its path, the embedding of the
# concatenated transcript, and the per-segment text/id/start/end records.
# NOTE(review): a later cell reads app/data/vectorized.json, but no visible cell
# writes `json_data` to disk — confirm where that file is produced.
json_data = []
for file in files:
    whisper_segments = transcribe_audio(file)
    # Keep only the fields used downstream from each Whisper segment.
    transcribe_data = [
        {"text": seg["text"], "id": seg["id"], "start": seg["start"], "end": seg["end"]}
        for seg in whisper_segments
    ]
    # Segment texts are concatenated as-is (no separator added) before embedding.
    full_text = "".join(seg["text"] for seg in whisper_segments)
    json_data.append({
        "file": file,
        "vectorized": vectorize_texts(full_text).tolist(),
        "result": transcribe_data,
    })

In [21]:
json_data

[{'file': 'Video/1.mp4',
  'vectorized': [-0.09549727290868759,
   -0.12596260011196136,
   -0.028921086341142654,
   -0.03505568951368332,
   0.042137496173381805,
   -0.015229630284011364,
   -0.022178733721375465,
   -0.005358344875276089,
   0.0059755356051027775,
   -0.051506444811820984,
   -0.002381883794441819,
   0.024292007088661194,
   -0.015702521428465843,
   -0.011057798750698566,
   -0.04802440479397774,
   0.012980494648218155,
   -0.07392896711826324,
   0.04332427680492401,
   -0.1471230387687683,
   -0.05909452214837074,
   0.06496739387512207,
   -0.09326478838920593,
   0.014408997260034084,
   0.015588340349495411,
   -0.012856305576860905,
   0.03692637383937836,
   0.029655292630195618,
   0.01451374776661396,
   0.06400568783283234,
   -0.019746968522667885,
   0.07202315330505371,
   -0.002293406752869487,
   0.018678121268749237,
   0.053666047751903534,
   -0.03604087606072426,
   0.003194665303453803,
   -0.07746347039937973,
   0.005761477164924145,
   -0.

In [9]:
# Reload the stored transcripts and embeddings; later cells read everything from `data`.
with open("app/data/vectorized.json", "r") as json_file:
    data = json.loads(json_file.read())

In [10]:
data

[{'file': 'Video/1.mp4',
  'vectorized': [-0.09549727290868759,
   -0.12596260011196136,
   -0.028921086341142654,
   -0.03505568951368332,
   0.042137496173381805,
   -0.015229630284011364,
   -0.022178733721375465,
   -0.005358344875276089,
   0.0059755356051027775,
   -0.051506444811820984,
   -0.002381883794441819,
   0.024292007088661194,
   -0.015702521428465843,
   -0.011057798750698566,
   -0.04802440479397774,
   0.012980494648218155,
   -0.07392896711826324,
   0.04332427680492401,
   -0.1471230387687683,
   -0.05909452214837074,
   0.06496739387512207,
   -0.09326478838920593,
   0.014408997260034084,
   0.015588340349495411,
   -0.012856305576860905,
   0.03692637383937836,
   0.029655292630195618,
   0.01451374776661396,
   0.06400568783283234,
   -0.019746968522667885,
   0.07202315330505371,
   -0.002293406752869487,
   0.018678121268749237,
   0.053666047751903534,
   -0.03604087606072426,
   0.003194665303453803,
   -0.07746347039937973,
   0.005761477164924145,
   -0.

In [11]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [12]:
# Fetch the NLTK resources listed below, in this order.
for nltk_package in (
    'stopwords',
    'wordnet',
    'omw-1.4',
    'punkt_tab',
    'punkt',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_package)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Peanutmonster\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Peanutmonster\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Peanutmonster\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Peanutmonster\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Peanutmonster\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Peanutmonster\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_

True

In [13]:
# Read the extra stop-word file as bytes, decode it as UTF-8, drop carriage
# returns and keep the distinct newline-separated entries in `dictionary`.
with open("app/data/stop_words_english.txt", "rb") as stop_file:
    stop_word = stop_file.read()
dictionary = set(stop_word.decode("utf-8").replace("\r", "").split("\n"))


In [14]:
# Stop words = NLTK's English list plus the custom entries in `dictionary`.
stop_words = set(stopwords.words('english')) | set(dictionary)
# Shared lemmatizer instance used by normalize_text.
lemmatizer = WordNetLemmatizer()

In [15]:
def normalize_text(text):
    """Lower-case, strip punctuation and digits, drop stop words, lemmatize.

    Args:
        text: Raw string to normalize.

    Returns:
        List of the remaining whitespace-separated tokens, each passed through
        ``lemmatizer.lemmatize(..., pos='v')``, in their original order.
    """
    lowered = text.lower()
    without_punct = re.sub(r'[^\w\s]', '', lowered)  # keep word chars and whitespace only
    without_digits = re.sub(r'\d+', '', without_punct)
    return [
        lemmatizer.lemmatize(token, pos='v')
        for token in without_digits.split()
        if token not in stop_words
    ]

In [16]:
import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel
import spacy
# spaCy pipeline used by extract_keywords to obtain the entities of a text.
nlp = spacy.load("en_core_web_lg")

def extract_keywords(text: str):
    """Run the module-level ``nlp`` pipeline on ``text`` and collect its entities.

    Args:
        text: Text to analyze.

    Returns:
        A ``set`` built from the ``ents`` of the processed document.
    """
    return {entity for entity in nlp(text).ents}



In [17]:
def extract_title_with_lda(tokens, random_state=None):
    """Build a short title from the top words of a single-topic LDA model.

    Args:
        tokens: Tokens of one document.
        random_state: Optional seed forwarded to ``LdaModel`` so the result can
            be made reproducible; ``None`` keeps gensim's default behaviour.

    Returns:
        Up to five top topic words joined by single spaces, or ``""`` when
        ``tokens`` is empty.
    """
    tokens = list(tokens)
    if not tokens:
        # Nothing to train on: return an empty title instead of letting LDA
        # fail on an empty vocabulary.
        return ""
    # Local name chosen so it does not shadow the notebook-level `dictionary`.
    vocab = corpora.Dictionary([tokens])
    corpus = [vocab.doc2bow(tokens)]
    lda_model = LdaModel(
        corpus, num_topics=1, id2word=vocab, passes=5, random_state=random_state
    )
    top_words = lda_model.show_topic(0, topn=5)
    return " ".join(word for word, _ in top_words)

In [18]:
# For every transcript: normalize each segment, keep the per-segment normalized
# strings, and extract keywords from the whole normalized transcript.
normalize_data = []
keywords = []
for text in data:
    # One normalized string per segment, in segment order.
    script_time_embeding = [
        " ".join(normalize_text(res["text"])) for res in text["result"]
    ]
    # Join the segments with a space so the last word of one segment is not
    # fused with the first word of the next before entity extraction; segments
    # that normalized to an empty string are skipped to avoid double spaces.
    text_for_keyword = " ".join(part for part in script_time_embeding if part)
    keywords.append(list(extract_keywords(text_for_keyword)))
    normalize_data.append(script_time_embeding)


In [19]:
# Per transcript: the raw segment list, and the full text made of the segment
# texts separated by single spaces with outer whitespace stripped.
segments = [entry["result"] for entry in data]
texts = [
    " ".join(seg["text"] for seg in entry["result"]).strip()
    for entry in data
]

In [20]:
segments

[[{'text': " Let's go to our guys jungle to her from codemy.com and in this video we're gonna start to build out our convolutional neural network for PyTorch and Python",
   'id': 0,
   'start': 0.0,
   'end': 6.54},
  {'text': " Guys like I said in this video we're gonna start to build out our convolutional neural network",
   'id': 1,
   'start': 9.72,
   'end': 13.200000000000001},
  {'text': ' But for a good start if you like this video, I want to see more like it was your two smash like button below subscribe to the channel',
   'id': 2,
   'start': 13.200000000000001,
   'end': 17.86},
  {'text': ' We have thumbs up for the YouTube algorithm and check out codemy.com',
   'id': 3,
   'start': 17.86,
   'end': 20.64},
  {'text': " I've dozens of courses with thousands of videos teach to code is coupon code YouTube 50 to get 50% off lifetime membership",
   'id': 4,
   'start': 20.64,
   'end': 26.54},
  {'text': ' So all my courses videos and books for one time feet, which is insan

In [21]:
import faiss
from collections import defaultdict
import numpy as np

In [22]:
# Stack the stored per-entry embeddings of `data` into one array, each row cast to float32.
vector = np.array(
    [np.array(entry["vectorized"]).astype(np.float32) for entry in data]
)

In [23]:
vector.shape

(15, 384)

In [24]:
import hdbscan
from sklearn.cluster import KMeans

In [25]:
# Fix the seed so the cluster assignment (and everything derived from the
# labels) is reproducible across kernel restarts.
cluster_algo = KMeans(n_clusters=4, random_state=42)

In [26]:
# Embed a sample query with the vectorize_texts helper and cast it to float32.
search_query = vectorize_texts("Machine Learning and Deep Learning").astype(np.float32)

In [27]:
# Reshape the query embedding to a single-row 2-D array.
search_query = search_query.reshape(1, -1)

In [28]:
# HDBSCAN model. In the visible cells its labels are only displayed; the `label`
# array used for grouping comes from the KMeans model.
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=1)

In [29]:
# Fit KMeans on the embedding matrix; the cell output is the label of each row.
cluster_algo.fit_predict(vector)

[WinError 2] The system cannot find the file specified
  File "d:\Personal Projects\Search-Engine\venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\Peanutmonster\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 550, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Peanutmonster\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1028, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\Peanutmonster\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1540, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incomp

array([1, 1, 3, 3, 2, 0, 1, 1, 1, 0, 1, 0, 3, 3, 1])

In [30]:
# Per-row KMeans cluster labels, used below to group the transcripts.
label = cluster_algo.labels_

In [31]:
# Fit HDBSCAN on the same matrix; the returned labels are displayed but not stored.
hdbscan_model.fit_predict(vector)



array([ 1,  1, -1,  2, -1,  0, -1,  1, -1,  0, -1,  0,  2,  2, -1],
      dtype=int64)

In [32]:
# Cluster predicted for the sample query. search_query was already reshaped to
# one row in an earlier cell, so the reshape here does not change its shape.
cluster_algo.predict(search_query.reshape(1, -1))

array([1])

In [33]:
# One record per transcript: text, embedding, cluster label, source file,
# keywords and segments. The file path is read from `data` itself — the same
# source that texts/vector/keywords/segments were built from — instead of the
# directory listing in `files`, whose order and length are not guaranteed to
# match the stored JSON. Rows labelled -1 are skipped.
clustered_data = [
    {"text": t, "vector": v, "cluster": int(l), "file": entry["file"], "keywords": k, "segments": s}
    for t, v, l, entry, k, s in zip(texts, vector, label, data, keywords, segments)
    if l != -1
]

In [34]:
label

array([1, 1, 3, 3, 2, 0, 1, 1, 1, 0, 1, 0, 3, 3, 1])

In [35]:
clustered_data

[{'text': "Let's go to our guys jungle to her from codemy.com and in this video we're gonna start to build out our convolutional neural network for PyTorch and Python  Guys like I said in this video we're gonna start to build out our convolutional neural network  But for a good start if you like this video, I want to see more like it was your two smash like button below subscribe to the channel  We have thumbs up for the YouTube algorithm and check out codemy.com  I've dozens of courses with thousands of videos teach to code is coupon code YouTube 50 to get 50% off lifetime membership  So all my courses videos and books for one time feet, which is insane like cheap okay in the last video  We sort of imported all the stuff that we're gonna need we created a little transform and then we set up our training data and our  Test data so in this video we want to start to build out our convolutional neural network  Now we're not gonna actually create the model in this video instead  We're gonn

In [36]:
# Bucket the records by cluster id: {cluster_id: [record, ...]}.
clusters = defaultdict(list)
for record in clustered_data:
    cluster_key = record["cluster"]
    clusters[cluster_key].append(record)

In [37]:
clusters

defaultdict(list,
            {1: [{'text': "Let's go to our guys jungle to her from codemy.com and in this video we're gonna start to build out our convolutional neural network for PyTorch and Python  Guys like I said in this video we're gonna start to build out our convolutional neural network  But for a good start if you like this video, I want to see more like it was your two smash like button below subscribe to the channel  We have thumbs up for the YouTube algorithm and check out codemy.com  I've dozens of courses with thousands of videos teach to code is coupon code YouTube 50 to get 50% off lifetime membership  So all my courses videos and books for one time feet, which is insane like cheap okay in the last video  We sort of imported all the stuff that we're gonna need we created a little transform and then we set up our training data and our  Test data so in this video we want to start to build out our convolutional neural network  Now we're not gonna actually create the model

In [38]:
# Build one flat L2 FAISS index per cluster and keep it next to that cluster's records.
faiss_indexes = {}
for cluster_id in clusters:
    members = clusters[cluster_id]
    # Fresh float32 copy of the member vectors; it is normalized in place, so
    # the arrays stored inside the records are left untouched.
    member_matrix = np.array([member["vector"] for member in members]).astype("float32")
    faiss.normalize_L2(member_matrix)
    cluster_index = faiss.IndexFlatL2(member_matrix.shape[1])
    cluster_index.add(member_matrix)
    faiss_indexes[cluster_id] = {"index": cluster_index, "data": members}

In [39]:
faiss_indexes

{1: {'index': <faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001C3141509F0> >,
  'data': [{'text': "Let's go to our guys jungle to her from codemy.com and in this video we're gonna start to build out our convolutional neural network for PyTorch and Python  Guys like I said in this video we're gonna start to build out our convolutional neural network  But for a good start if you like this video, I want to see more like it was your two smash like button below subscribe to the channel  We have thumbs up for the YouTube algorithm and check out codemy.com  I've dozens of courses with thousands of videos teach to code is coupon code YouTube 50 to get 50% off lifetime membership  So all my courses videos and books for one time feet, which is insane like cheap okay in the last video  We sort of imported all the stuff that we're gonna need we created a little transform and then we set up our training data and our  Test data so in this video we wa

In [40]:
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
def semantic_faiss_search(query, kmean_model, top_k=5):
    """Search the FAISS index of the cluster that ``query`` is assigned to.

    Args:
        query: Query text, embedded with ``vectorize_texts``.
        kmean_model: Fitted clustering model exposing ``predict``; its predicted
            label selects which entry of ``faiss_indexes`` is searched.
        top_k: Maximum number of hits to return.

    Returns:
        List of dicts with keys ``text``, ``similarity``, ``file``, ``keyword``
        and ``segments``, sorted by descending similarity. Empty when the
        predicted cluster has no index or no records.
    """
    # Use an explicit contiguous float32 row so the in-place normalization
    # below is guaranteed to act on the very array that is searched.
    query_vec = np.ascontiguousarray(vectorize_texts(query), dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(query_vec)

    # Estimate cluster for query
    cluster_id = kmean_model.predict(query_vec)[0]

    if cluster_id not in faiss_indexes:
        return []

    index_data = faiss_indexes[cluster_id]
    index = index_data["index"]
    data = index_data["data"]

    # Never ask for more neighbours than the cluster holds: FAISS pads missing
    # results with label -1, which would otherwise index data[-1] silently.
    k = min(top_k, len(data))
    if k <= 0:
        return []

    D, I = index.search(query_vec, k)

    results = []
    for i, dist in zip(I[0], D[0]):
        if i < 0:
            # Padding entry, not a real hit.
            continue
        results.append({
            "text": data[i]["text"],
            # IndexFlatL2 reports squared L2 distance; for unit-length vectors
            # that equals 2 - 2*cos, so 1 - dist/2 is the cosine similarity.
            "similarity": float(1 - dist / 2),
            "file": data[i]["file"],
            "keyword": data[i]["keywords"],
            "segments": data[i]["segments"]
        })

    return sorted(results, key=lambda x: x["similarity"], reverse=True)

    

In [42]:
semantic_faiss_search(query="Machine Learning and Deep Learning", kmean_model=cluster_algo)

[{'text': "What's going on guys?  John older her from codemute.com and in this video we're going to start to look at PyTorch for  Deep Learning.  Now guys, I could sit in this video we're going to start to look at PyTorch for Deep Learning,  but for we can start if you like this video, I want to see more like if you're  sure to smash like button below, subscribe to the channel, we have thumbs up for the  YouTube algorithm, and check out codemute.com where I've done some of the courses with  thousands of videos to teach you to code.  Use coupon code YouTube and eating at 50% off lifetime membership.  It's all my courses videos and books for one time fee, which is insanely cheap.  Alright, we are starting a new playlist here on the channel, PyTorch for Deep Learning.  So we've already looked at NumPy, we've looked at Pandas, we've looked at a little bit  of psychic learn for machine learning, and this playlist we're going to focus on deep learning.  Now deep learning is a subset of machi

In [57]:
import pickle

In [58]:
# Persist the fitted clustering model; the context manager closes the file
# handle even if dumping fails.
with open("app/models/Clustering.pkl", "wb") as model_file:
    pickle.dump(cluster_algo, model_file)

In [59]:
# Reload the pickled clustering model, closing the file handle when done.
# NOTE(review): this rebinds the global `model`, which transcribe_audio reads as
# the Whisper model — transcription will fail if it runs after this cell.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
with open("app/models/Clustering.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [60]:
# Check that the reloaded clustering model assigns a cluster to a new query embedding.
query_vec = vectorize_texts("Deep Learning Neural Network")
model.predict(query_vec.reshape(1, -1))

array([1])

In [61]:
print(type(faiss_indexes))

<class 'dict'>


In [62]:
# Write each cluster's FAISS index to its own file, named after the cluster id.
for cluster_id in faiss_indexes:
    index_path = f"app/models/index_cluster_{cluster_id}.faiss"
    faiss.write_index(faiss_indexes[cluster_id]['index'], index_path)

In [63]:
# Metadata kept alongside the index files: per cluster, only the records
# (the FAISS index object itself is left out).
serializable_index = {
    cluster_id: {'data': cluster_data['data']}
    for cluster_id, cluster_data in faiss_indexes.items()
}

In [64]:
serializable_index

{1: {'data': [{'text': "Let's go to our guys jungle to her from codemy.com and in this video we're gonna start to build out our convolutional neural network for PyTorch and Python  Guys like I said in this video we're gonna start to build out our convolutional neural network  But for a good start if you like this video, I want to see more like it was your two smash like button below subscribe to the channel  We have thumbs up for the YouTube algorithm and check out codemy.com  I've dozens of courses with thousands of videos teach to code is coupon code YouTube 50 to get 50% off lifetime membership  So all my courses videos and books for one time feet, which is insane like cheap okay in the last video  We sort of imported all the stuff that we're gonna need we created a little transform and then we set up our training data and our  Test data so in this video we want to start to build out our convolutional neural network  Now we're not gonna actually create the model in this video instea

In [65]:
from spacy.tokens import Span, Doc
import numpy as np

def clean_value(value):
    """Recursively convert ``value`` into plain strings, lists and dicts.

    spaCy ``Span``/``Doc`` objects become their ``.text``, NumPy arrays become
    (nested) lists, and dicts/lists are rebuilt with every element cleaned.
    Any other value is returned unchanged.
    """
    if isinstance(value, (Span, Doc)):
        return value.text
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, dict):
        return {key: clean_value(item) for key, item in value.items()}
    if isinstance(value, list):
        return list(map(clean_value, value))
    return value

def make_metadata_serializable(metadata):
    """Return a new mapping that keeps only each cluster's cleaned ``'data'``.

    Args:
        metadata: Mapping of cluster id to a dict that has a ``'data'`` entry.

    Returns:
        ``{cluster_id: {'data': clean_value(entry['data'])}}`` for every cluster.
    """
    cleaned = {}
    for cluster_id, cluster_data in metadata.items():
        cleaned[cluster_id] = {'data': clean_value(cluster_data['data'])}
    return cleaned

In [66]:
# Replace spaCy and NumPy objects in the metadata with plain strings and lists
# before it is pickled in the next cell.
serializable_index = make_metadata_serializable(serializable_index)

In [67]:
serializable_index

{1: {'data': [{'text': "Let's go to our guys jungle to her from codemy.com and in this video we're gonna start to build out our convolutional neural network for PyTorch and Python  Guys like I said in this video we're gonna start to build out our convolutional neural network  But for a good start if you like this video, I want to see more like it was your two smash like button below subscribe to the channel  We have thumbs up for the YouTube algorithm and check out codemy.com  I've dozens of courses with thousands of videos teach to code is coupon code YouTube 50 to get 50% off lifetime membership  So all my courses videos and books for one time feet, which is insane like cheap okay in the last video  We sort of imported all the stuff that we're gonna need we created a little transform and then we set up our training data and our  Test data so in this video we want to start to build out our convolutional neural network  Now we're not gonna actually create the model in this video instea

In [68]:

# Pickle the cleaned per-cluster metadata into the models directory.
with open("app/models/index_metadata.pkl", "wb") as metadata_file:
    pickle.dump(serializable_index, metadata_file)

In [69]:
# Read the metadata back from disk (pickle should only be used on trusted files).
with open("app/models/index_metadata.pkl", "rb") as metadata_file:
    loaded_metadata = pickle.load(metadata_file)

In [70]:
# Re-attach each cluster's FAISS index, read from its own file, to the loaded metadata.
for cluster_id, cluster_entry in loaded_metadata.items():
    cluster_entry['index'] = faiss.read_index(f"app/models/index_cluster_{cluster_id}.faiss")