In [1]:
import os
from pprint import pprint

import pandas as pd
from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

from lib.classify import classifier
from lib.constants import DEM_KEYWORDS, HSE_KEYWORDS, BASE_DATA_PATH
from lib.createdb import VectorDB
from lib.utils import preprocess_text, fix_punctuations, count_tokens
from lib.youtube import YouTube

### Extract relevant video data from YouTube

In [None]:
# Build the YouTube helper for the HSE keyword set, then call its transcript
# step. NOTE(review): presumably this writes the "youtube_HSE" CSV read by the
# preprocessing cell — confirm against lib.youtube.
YT = YouTube(
    keywords=HSE_KEYWORDS,
    filename="youtube_HSE",
)
YT.get_transcript()

### Data Preprocessing

In [None]:
# Load the raw transcripts.
# NOTE(review): this path is relative to the working directory while every
# other cell goes through BASE_DATA_PATH — confirm they point at the same folder.
df = pd.read_csv("data/youtube_HSE.csv")
tqdm.pandas()


def detect_language(text):
    """Return the language code detected for ``text``, or None if undetectable.

    Missing transcripts (NaN from ``read_csv``) and blank strings are skipped
    up front, and ``LangDetectException`` (raised when the text has no usable
    features) is mapped to None, so a single bad row cannot abort the whole
    ``progress_apply`` pass.
    """
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None


# Detect the language; rows whose language cannot be determined get None and
# are dropped by the English-only filter below.
df['lang'] = df['transcript'].progress_apply(detect_language)
# .copy() so the column assignments below write to an independent frame
# instead of a slice of the original (avoids SettingWithCopyWarning).
df = df[df['lang'] == 'en'].copy()

# Preprocess
df['transcript'] = df['transcript'].progress_apply(preprocess_text)
df['transcript'] = df['transcript'].progress_apply(fix_punctuations)

df.to_csv(os.path.join(BASE_DATA_PATH, 'youtube_hse_v1.csv'), index=False)

### Video Classification

In [5]:
# Reload the preprocessed transcripts written by the preprocessing step.
df = pd.read_csv(os.path.join(BASE_DATA_PATH, 'youtube_hse_v1.csv'))

# One decision per row, in row order. A row whose classification fails for any
# reason is logged and recorded as 'N/A' so the list stays aligned with df.
decisions = []
for index, transcript in tqdm_notebook(df['transcript'].items(), total=len(df)):
    try:
        result = classifier(
            text=transcript,
            context="Health, Safety and Environment (HSE)",
        )
        decision = result['output']
    except Exception as e:
        print(f"Issue in index: {index}")
        print(e)
        decision = 'N/A'
    decisions.append(decision)

df['is_related'] = decisions
df['is_related'].value_counts()

  0%|          | 0/1301 [00:00<?, ?it/s]

is_related
yes    863
no     438
Name: count, dtype: int64

In [6]:
# Keep only the rows the classifier marked as related and persist them as v2.
condition = df['is_related'].eq('yes')
df_new = df.loc[condition]

related_csv_path = os.path.join(BASE_DATA_PATH, 'youtube_hse_v2.csv')
df_new.to_csv(related_csv_path, index=False)

### Semantic Chunking

In [8]:
# NOTE(review): this cell reads the DEM file, while the cells above write
# 'youtube_hse_v2.csv'. Presumably the notebook is re-run once per dataset —
# confirm which file is intended before a fresh Run All.
chunk_source_path = os.path.join(BASE_DATA_PATH, 'youtube_dem_v2.csv')
df = pd.read_csv(chunk_source_path)

# Split every transcript into chunks using the VectorDB helper.
VDB = VectorDB(dataframe=df, model="qwen2.5:32b")
chunks = VDB.create_chunks()

100%|██████████| 792/792 [00:00<00:00, 109053.83it/s]
Processing Documents: 100%|██████████| 792/792 [4:34:31<00:00, 20.80s/it]   


In [9]:
# Flatten each chunk into one record: its metadata fields plus the chunk text
# under 'page_content'.
data = [dict(doc.metadata, page_content=doc.page_content) for doc in chunks]
docs_df = pd.DataFrame(data)

chunks_csv_path = os.path.join(BASE_DATA_PATH, "yt_chunks_qwen2.5-32b-dem.csv")
docs_df.to_csv(chunks_csv_path, index=False)
docs_df

Unnamed: 0,video_id,title,url,language,page_content
0,7Iu_lvcxK0w,How Artificial Intelligence Can Help Communiti...,https://www.youtube.com/watch?v=7Iu_lvcxK0w,en,"in a sunny day in july 2017, i drove my car do..."
1,7Iu_lvcxK0w,How Artificial Intelligence Can Help Communiti...,https://www.youtube.com/watch?v=7Iu_lvcxK0w,en,the latest us census data showed that in a spa...
2,7Iu_lvcxK0w,How Artificial Intelligence Can Help Communiti...,https://www.youtube.com/watch?v=7Iu_lvcxK0w,en,california in the 19 years of the past 20 year...
3,7Iu_lvcxK0w,How Artificial Intelligence Can Help Communiti...,https://www.youtube.com/watch?v=7Iu_lvcxK0w,en,"some of you may think: well, my phone. well, t..."
4,k_usYJ7nErM,The Role Of AI In Disaster Management,https://www.youtube.com/watch?v=k_usYJ7nErM,en,hi and Welcome to our Channel. disasters can s...
...,...,...,...,...,...
6197,OFWDK657e5w,AI Presidents Trapped During a Hurricane (AI P...,https://www.youtube.com/watch?v=OFWDK657e5w,en,so you're saying that if I walk away from you ...
6198,nZdnju6_EMU,Fake AI Images During Hurricane Helene | On Th...,https://www.youtube.com/watch?v=nZdnju6_EMU,en,have you seen a picture of a little girl holdi...
6199,nZdnju6_EMU,Fake AI Images During Hurricane Helene | On Th...,https://www.youtube.com/watch?v=nZdnju6_EMU,en,"yeah, that's a fake AI photo. welcome back to ..."
6200,MatKrPBLOsg,A new era in hurricane tracking: how AI drones...,https://www.youtube.com/watch?v=MatKrPBLOsg,en,Would you fly into a hurricane to change the f...


### Creating the Vector Database

In [2]:
# Chunks with fewer tokens than this are dropped before indexing.
MIN_TOKENS = 100

df1 = pd.read_csv(os.path.join(BASE_DATA_PATH, "yt_chunks_qwen2.5-32b-dem.csv"))
df2 = pd.read_csv(os.path.join(BASE_DATA_PATH, "yt_chunks_qwen2.5-32b-hse.csv"))

# Concat two dataframes. ignore_index=True gives the combined frame a fresh,
# unique index; without it both inputs keep their own 0..n-1 labels, so the
# result carries duplicate index labels that break label-based lookups and
# index-aligned assignments.
df = pd.concat([df1, df2], ignore_index=True)

# Token count per chunk, as returned by the project's count_tokens helper.
lengths = count_tokens(df)
df['token_length'] = lengths

# .copy() makes the filtered frame independent of the unfiltered one, so later
# column assignments do not raise SettingWithCopyWarning.
df = df[df['token_length'] >= MIN_TOKENS].copy()

  0%|          | 0/15055 [00:00<?, ?it/s]

In [3]:
# Turn the filtered chunk frame into documents and load them into the vector
# store via the project's VectorDB helper.
DB = VectorDB(
    dataframe=df,
    model="qwen2.5:32b",
)
documents = DB.get_documents()
DB.create_vectordb(documents=documents)

100%|██████████| 10438/10438 [00:00<00:00, 93565.91it/s]


10438 documents added to the vector store.


Create a vector database snapshot

In [None]:
from qdrant_client import QdrantClient

# Open the on-disk Qdrant storage directly.
# NOTE(review): qdrant-client's local (path=) mode may not implement
# snapshots — confirm, or point the client at a Qdrant server instead.
client = QdrantClient(path="/home/lab1/Documents/DEM_HSE/backend/qdrant/qwen-32b")
try:
    snapshot = client.create_snapshot(collection_name="youtube_collection")
finally:
    # Always release the client: the next cell opens the same storage path,
    # and a still-open local client would keep the folder held.
    client.close()

# Show the snapshot description as the cell output.
snapshot

In [None]:
# Re-open the existing on-disk collection with the same embedding model name
# used when the collection was built, in dense retrieval mode.
embeddings = OllamaEmbeddings(model="qwen2.5:32b")
vector_store = QdrantVectorStore.from_existing_collection(
    collection_name="youtube_collection",
    path="/home/lab1/Documents/DEM_HSE/backend/qdrant/qwen-32b",
    embedding=embeddings,
    retrieval_mode=RetrievalMode.DENSE,
)

# Smoke-test retrieval with a sample question and pretty-print the hits.
retriever = vector_store.as_retriever()
query = "What are the trending AI technologies in Disaster Management?"
result = retriever.invoke(query)
pprint(result)

### Topic Modeling