In [1]:
import os
from pprint import pprint
from typing import List

import evaluate
import pandas as pd
from dotenv import load_dotenv
from googleapiclient.discovery import build
from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from setfit import SetFitModel
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    classification_report
)
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

from lib.classify import classifier
from lib.constants import DEM_KEYWORDS, HSE_KEYWORDS, BASE_DATA_PATH
from lib.createdb import VectorDB
from lib.topic_modeling import BertTopic
from lib.train import Train
from lib.utils import preprocess_text, fix_punctuations, count_tokens
from lib.youtube import YouTube

# Pull local secrets from the .env file into the process environment, then
# read the YouTube Data API key from it. API_KEY is None when the variable is
# not defined, so a missing key does not fail here.
load_dotenv(dotenv_path=".env")
API_KEY = os.environ.get("YOUTUBE_API_KEY")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


### Extract relevant video data from YouTube

In [None]:
# Collect videos matching the HSE keyword list and fetch their transcripts.
# NOTE(review): presumably this writes data/youtube_HSE.csv, which the
# preprocessing cell reads next — confirm in lib.youtube.
YT = YouTube(keywords=HSE_KEYWORDS, filename="youtube_HSE")
YT.get_transcript()

### Data Preprocessing

In [None]:
# Load the raw scraped videos and enable `progress_apply` on pandas objects.
df = pd.read_csv("data/youtube_HSE.csv")
tqdm.pandas()

# Rows without a transcript come back from read_csv as NaN; they can be
# neither language-detected nor cleaned, so drop them up front.
df = df.dropna(subset=['transcript'])


def _detect_language(text):
    """Return the language code detected for `text`, or 'unknown'.

    langdetect raises LangDetectException when the text has no usable
    features (empty, digits or punctuation only). One such transcript
    must not abort the whole run, so it is labelled 'unknown' and is
    removed by the English-only filter below.
    """
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'


# Detect the language
df['lang'] = df['transcript'].progress_apply(_detect_language)
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[df['lang'] == 'en'].copy()

# Preprocess
df['transcript'] = df['transcript'].progress_apply(preprocess_text)
df['transcript'] = df['transcript'].progress_apply(fix_punctuations)

# Persist the English-only, cleaned transcripts for the classification step.
df.to_csv(os.path.join(BASE_DATA_PATH, 'youtube_hse_v1.csv'), index=False)

### Video Classification

In [5]:
# Reload the cleaned transcripts so this cell does not depend on kernel state.
df = pd.read_csv(os.path.join(BASE_DATA_PATH, 'youtube_hse_v1.csv'))

# One decision per row, in row order. A failed classification is recorded as
# 'N/A' so the list stays aligned with the DataFrame.
decisions = []
for index, transcript in tqdm_notebook(df['transcript'].items(), total=len(df)):
    try:
        result = classifier(
            text=transcript,
            context="Health, Safety and Environment (HSE)")
        decisions.append(result['output'])
    except Exception as e:
        # Deliberate best-effort: report the failing row and keep going.
        print(f"Issue in index: {index}")
        print(e)
        decisions.append('N/A')

df['is_related'] = decisions
# Last expression of the cell: show how the decisions are distributed.
df['is_related'].value_counts()

  0%|          | 0/1301 [00:00<?, ?it/s]

is_related
yes    863
no     438
Name: count, dtype: int64

In [6]:
# Keep only the videos the classifier marked as related and save them as the
# v2 dataset used by the chunking step.
condition = df['is_related'].eq('yes')
df_new = df.loc[condition]
df_new.to_csv(os.path.join(BASE_DATA_PATH, 'youtube_hse_v2.csv'), index=False)

### Semantic Chunking

In [2]:
# Split every related transcript into chunks with the llama3.3 model.
# This cell is expensive: the recorded run below took roughly 10h52m for 863
# documents, so avoid re-running it unless the v2 dataset has changed.
df = pd.read_csv(os.path.join(BASE_DATA_PATH, 'youtube_hse_v2.csv'))
VDB = VectorDB(model="llama3.3:latest", dataframe=df)
chunks = VDB.create_chunks()

100%|██████████| 863/863 [00:00<00:00, 80419.56it/s]
Processing Documents: 100%|██████████| 863/863 [10:52:38<00:00, 45.37s/it]   


In [3]:
# Flatten each chunk into one record: its metadata fields plus the chunk text
# under 'page_content'.
data = []
for doc in chunks:
    record = dict(doc.metadata)
    record['page_content'] = doc.page_content
    data.append(record)

docs_df = pd.DataFrame(data)

# Save the chunk table so later cells can reload it without re-chunking.
chunks_csv_path = os.path.join(BASE_DATA_PATH, "yt_chunks_llama3.3-70b-hse.csv")
docs_df.to_csv(chunks_csv_path, index=False)
docs_df

Unnamed: 0,video_id,title,url,page_content
0,yYsDLBAp6LM,Emergency Evacuation Planning,https://www.youtube.com/watch?v=yYsDLBAp6LM,"in case of emergency, personnel and the local ..."
1,yYsDLBAp6LM,Emergency Evacuation Planning,https://www.youtube.com/watch?v=yYsDLBAp6LM,we are using the latest location technology to...
2,aicxqzb3DJ4,Emergency Evacuations: Planning for the Whole ...,https://www.youtube.com/watch?v=aicxqzb3DJ4,- [Paul]. Hello everyone and thank you for joi...
3,aicxqzb3DJ4,Emergency Evacuations: Planning for the Whole ...,https://www.youtube.com/watch?v=aicxqzb3DJ4,"Madeline over to you. - [Madeline]. Thank you,..."
4,aicxqzb3DJ4,Emergency Evacuations: Planning for the Whole ...,https://www.youtube.com/watch?v=aicxqzb3DJ4,So in the remainder of this first presentation...
...,...,...,...,...
8848,HpcFczHymBE,"Disaster, Conflict, and Impact Assessment Mak...",https://www.youtube.com/watch?v=HpcFczHymBE,I think the first point could be debated. I've...
8849,HpcFczHymBE,"Disaster, Conflict, and Impact Assessment Mak...",https://www.youtube.com/watch?v=HpcFczHymBE,they're engaged in one side or the other of th...
8850,HpcFczHymBE,"Disaster, Conflict, and Impact Assessment Mak...",https://www.youtube.com/watch?v=HpcFczHymBE,"and if so, what advice would you have for this..."
8851,HpcFczHymBE,"Disaster, Conflict, and Impact Assessment Mak...",https://www.youtube.com/watch?v=HpcFczHymBE,"you want to reduce risk, make it safer than be..."


### Creating the Vector Database

In [2]:
# Load the DEM chunk table that will be indexed in the vector database.
# Only this one file is loaded. To index a second chunk file as well
# (e.g. "yt_chunks_qwen2.5-32b-hse.csv"), read it into its own frame and
# pd.concat the two frames before counting tokens.
dem_chunks_path = os.path.join(BASE_DATA_PATH, "yt_chunks_llama3.3-70b-dem.csv")
df = pd.read_csv(dem_chunks_path)

# Attach the token count of every chunk, then keep only chunks with at least
# 100 tokens.
lengths = count_tokens(df)
df['token_length'] = lengths
df = df.loc[df['token_length'].ge(100)]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/6188 [00:00<?, ?it/s]

In [None]:
# Turn the filtered chunk table into documents and index them.
# NOTE(review): the model passed here ("qwen2.5:32b") matches the embedding
# model used by the retrieval cell below; presumably it is the embedding model
# for the collection — confirm in lib.createdb.
DB = VectorDB(model="qwen2.5:32b", dataframe=df)
documents = DB.get_documents()
DB.create_vectordb(documents=documents)

Create a vector database snapshot

In [4]:
# Connect to the already-populated Qdrant collection on the local server,
# using the same Ollama embedding model name as the indexing cell.
embedding_model = OllamaEmbeddings(model="qwen2.5:32b")
vector_store = QdrantVectorStore.from_existing_collection(
    embedding=embedding_model,
    collection_name="youtube_collection",
    url="http://localhost:6333",
    retrieval_mode=RetrievalMode.DENSE,
)

# Sanity check: run one retrieval query and pretty-print the returned documents.
retriever = vector_store.as_retriever()
question = "What are the trending AI technologies in Disaster Management?"
result = retriever.invoke(question)
pprint(result)

[Document(metadata={'video_id': 'pGJliOHx1cc', 'title': 'Intelligence-enabled work health and safety, Maureen Hassall', 'url': 'https://www.youtube.com/watch?v=pGJliOHx1cc', '_id': 'fb28ec60-7414-4366-acf8-6f26ad6d420b', '_collection_name': 'youtube_collection'}, page_content="- Well, it's actually happening with the guy in the field. - [Chris], Yeah - So, wearing some of the tech, or with the cameras, with the mobile devices, they are filming and talking, They are looking at situations in the field, or they're looking at a piece of equipment, for example, in the field, And they're beaming the vision and having the conversation back to the manufacturer, which could be anywhere in the world, or back to the, say, the expert engineers who are sitting in a corporate office somewhere because they can't fly it in the moment. That's happening now?"),
 Document(metadata={'video_id': 'pfAjb2gCvJs', 'title': 'Safety Management System', 'url': 'https://www.youtube.com/watch?v=pfAjb2gCvJs', '_id':

### Topic Modeling

In [2]:
def _keep_long_chunks(frame, min_tokens=100):
    """Add a 'token_length' column to `frame` and return the long-enough rows.

    The column is written onto `frame` itself; the returned frame contains
    only the rows whose token count is at least `min_tokens`.
    """
    frame['token_length'] = count_tokens(frame)
    return frame[frame['token_length'] >= min_tokens]


# Load both chunk tables first, then apply the same length filter to each.
chunks_dem_all = pd.read_csv("data/yt_chunks_llama3.3-70b-dem.csv")
chunks_hse_all = pd.read_csv("data/yt_chunks_llama3.3-70b-hse.csv")

df_dem = _keep_long_chunks(chunks_dem_all)
df_hse = _keep_long_chunks(chunks_hse_all)

# Plain lists of chunk texts are what the topic model consumes.
paragraphs_dem = df_dem['page_content'].tolist()
paragraphs_hse = df_hse['page_content'].tolist()

  0%|          | 0/6188 [00:00<?, ?it/s]

  0%|          | 0/8853 [00:00<?, ?it/s]

In [3]:
# Fit four topic models: for each corpus (DEM, HSE) one default run and one
# few-shot run. For the few-shot runs the logged output shows an extra
# zero-shot assignment step and far more topics to label, so they take much
# longer than the default runs.
BT = BertTopic()

result_conv_dem = BT.get_topics(
    paragraphs=paragraphs_dem,
    filename="topics_conv_dem",
)
result_fewshot_dem = BT.get_topics(
    paragraphs=paragraphs_dem,
    filename="topics_fewshot_dem",
    is_few_shot=True,
    topics_type="dem",
)
result_conv_hse = BT.get_topics(
    paragraphs=paragraphs_hse,
    filename="topics_conv_hse",
)
result_fewshot_hse = BT.get_topics(
    paragraphs=paragraphs_hse,
    filename="topics_fewshot_hse",
    is_few_shot=True,
    topics_type="hse",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


Chunks:   0%|          | 0/40 [00:00<?, ?it/s]

2024-12-30 14:11:39,429 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-30 14:12:05,579 - BERTopic - Dimensionality - Completed ✓
2024-12-30 14:12:05,580 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-30 14:12:05,783 - BERTopic - Cluster - Completed ✓
2024-12-30 14:12:05,786 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|          | 0/3 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 33%|███▎      | 1/3 [01:23<02:47, 83.85s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 67%|██████▋   | 2/3 [03:08<01:36, 96.08s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 3/3 [04:34<00:00, 91.60s/it]
2024-12-30 14:16:49,869 - BERTopic - Representation - Completed ✓


Model saved successfully.


Chunks:   0%|          | 0/40 [00:00<?, ?it/s]

2024-12-30 14:17:26,508 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-30 14:17:43,091 - BERTopic - Dimensionality - Completed ✓
2024-12-30 14:17:43,092 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2024-12-30 14:17:43,355 - BERTopic - Zeroshot Step 1 - Completed ✓
2024-12-30 14:17:43,378 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-30 14:17:43,485 - BERTopic - Cluster - Completed ✓
2024-12-30 14:17:43,488 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|          | 0/79 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|▏         | 1/79 [01:36<2:05:58, 96.90s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 2/79 [02:52<1:48:33, 84.59s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 3

Model saved successfully.


Chunks:   0%|          | 0/40 [00:00<?, ?it/s]

2024-12-30 15:41:31,906 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-30 15:41:52,651 - BERTopic - Dimensionality - Completed ✓
2024-12-30 15:41:52,653 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-30 15:41:52,960 - BERTopic - Cluster - Completed ✓
2024-12-30 15:41:52,963 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|          | 0/3 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 33%|███▎      | 1/3 [01:10<02:21, 70.70s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 67%|██████▋   | 2/3 [02:17<01:08, 68.62s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 3/3 [03:36<00:00, 72.32s/it]
2024-12-30 15:45:38,283 - BERTopic - Representation - Completed ✓


Model saved successfully.


Chunks:   0%|          | 0/40 [00:00<?, ?it/s]

2024-12-30 15:46:27,025 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-30 15:46:28,457 - BERTopic - Dimensionality - Completed ✓
2024-12-30 15:46:28,459 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2024-12-30 15:46:28,728 - BERTopic - Zeroshot Step 1 - Completed ✓
2024-12-30 15:46:28,764 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-30 15:46:28,903 - BERTopic - Cluster - Completed ✓
2024-12-30 15:46:28,907 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|          | 0/108 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 1/108 [01:14<2:13:35, 74.91s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 2/108 [02:27<2:10:25, 73.83s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         

Model saved successfully.


In [5]:
# Display the few-shot DEM topic table (topic number, keywords, scores, label).
result_fewshot_dem

Unnamed: 0,Number,Keywords,Scores,topics
0,-1,"[disasters, disaster, talk, response, technolo...","[0.66213006, 0.64768475, 0.6150311, 0.6131026,...",Disaster Resilience and Utilization of Artifi...
1,0,"[landsat, floods, sensing, flooding, satellite...","[0.71881, 0.6544526, 0.6510002, 0.64924717, 0....",Environmental impacts of urban flooding and r...
2,1,"[ai, disasters, analytics, technologies, algor...","[0.6929591, 0.61474663, 0.59913355, 0.592487, ...",Artificial Intelligence in Disaster Management
3,2,"[earthquakes, seismic, earthquake, liquefactio...","[0.7056249, 0.68656886, 0.6598463, 0.6497767, ...",Machine Learning Applications in Earth Scienc...
4,3,"[preparedness, emergencies, evacuate, prepare,...","[0.715209, 0.7061695, 0.68774116, 0.6874632, 0...",Emergency Preparedness and Response
...,...,...,...,...
74,73,"[ai, algorithms, actuarial, leveraging, actuar...","[0.69038916, 0.63820773, 0.63776004, 0.6367227...",Environmental and Social Impact of AI in the ...
75,74,"[scotland, conservation, edinburgh, scottish, ...","[0.66746104, 0.65448564, 0.6487241, 0.64365715...",Climate Change Adaptation in Scotland's Cultu...
76,75,"[ai, communications, communication, communicat...","[0.6819091, 0.6579471, 0.6568199, 0.6485001, 0...",Environmental impact of language barriers in ...
77,76,"[populations, conflict, population, iraq, bagh...","[0.64009595, 0.6313702, 0.6119294, 0.6098175, ...",Cities and conflict: the impact of war on hum...


### Fine-Tune Classifier

In [2]:
# Fine-tune a classifier on the labelled dataset, starting from the MiniLM
# sentence-transformer checkpoint.
RUNNER = Train(model_name='sentence-transformers/all-MiniLM-L6-v2',
               dataset_path='data/train/setfit.csv')

# `test` is held back here and only used by the evaluation cells further down.
# NOTE(review): whether split_data() uses a fixed random seed is not visible
# here — confirm in lib.train so that the split is reproducible.
train, validation, test = RUNNER.split_data()
RUNNER.train(train_dataset=train, eval_dataset=validation)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Currently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.


Map:   0%|          | 0/160 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 12880
  Batch size = 16
  Num epochs = 1


Step,Training Loss,Validation Loss
20,0.2207,0.177177
40,0.1662,0.106601
60,0.0827,0.081038
80,0.0387,0.078453
100,0.0192,0.074794
120,0.0098,0.077936
140,0.0064,0.07942
160,0.0054,0.080536
180,0.0055,0.082035
200,0.005,0.082227


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

***** Running evaluation *****


{'accuracy': 0.95,
 'f1': 0.9523809523809523,
 'precision': 0.9090909090909091,
 'recall': 1.0}


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model_head.pkl:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

In [36]:
# Evaluate the published model on the held-out test split. `test` comes from
# the fine-tuning cell, so that cell must have been run first.
model = SetFitModel.from_pretrained('sharukat/adersim-dem-hse')
preds = model.predict(test['text']).tolist()

y_true = test['label']
accuracy = accuracy_score(y_true, preds)
# Precision, recall and F1 all treat the task as binary.
precision, recall, f1 = (
    metric(y_true, preds, average="binary")
    for metric in (precision_score, recall_score, f1_score)
)

for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1", f1),
):
    print(f"{label}: {value}")

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0


In [38]:
# Per-class precision/recall/F1 on the test split; relies on `test` and
# `preds` from the evaluation cell above. print() keeps the report's
# fixed-width text layout.
print(classification_report(test['label'], preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

