# Ted Talks : Extending local LLM capability to extract data using Vector embeddings

local llm installation: https://github.com/Mozilla-Ocho/llamafile?tab=readme-ov-file

Dataset From: https://www.kaggle.com/datasets/ahmadfatani/ted-talks-dataset

In [2]:
import pandas as pd
from huggingface_hub import Collection

df = pd.read_csv('ted_main_v2.csv')

In [8]:
df.head()

Unnamed: 0,speaker_name,title,posted_date,duration,Link,about_speaker,about_talk,views,tags
0,Alex Gendler,The Egyptian myth of the death of Osiris,Jul 2020,3:56,https://www.ted.com/talks/alex_gendler_the_egy...,,"Long jealous of his older brother Osiris, the ...",208703,"education,ancient world,TED-Ed"
1,Shari Davis,What if you could help decide how the governme...,Jul 2020,10:28,https://www.ted.com/talks/shari_davis_what_if_...,As a leader of the Participatory Budgeting Pro...,What if you could help decide how the governme...,425688,"democracy,leadership,community"
2,Nita Mosby Tyler,Want a more just world? Be an unlikely ally,Jul 2020,10:15,https://www.ted.com/talks/nita_mosby_tyler_wan...,Nita Mosby Tyler specializes in the developmen...,A more equal world starts with you. Citing a f...,460269,"activism,inequality,race"
3,Susan Lupack,The race to decode a mysterious language,Jul 2020,4:24,https://www.ted.com/talks/susan_lupack_the_rac...,,"In the early 1900s, archaeologist Sir Arthur E...",350202,"TED-Ed,education,language"
4,Ariel Waldman,The colorful critter world of microbes in Anta...,Jul 2020,5:56,https://www.ted.com/talks/ariel_waldman_the_co...,"An artist who's pivoted to science, Ariel Wald...","In this tour of the microscopic world, explore...",333482,"science,animals,exploration"


In [9]:
df.columns

Index(['speaker_name', 'title', 'posted_date', 'duration', 'Link',
       'about_speaker', 'about_talk', 'views', 'tags'],
      dtype='object')

In [10]:
df = df[df['about_talk'].notna()] # remove any NaN values as it blows up serialization
#data = df.sample(700).to_dict('records') # Get only 700 records. More records will make it slower to index
data = df.to_dict('records')
len(data)

2159

In [11]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [12]:
encoder = SentenceTransformer('all-MiniLM-L6-v2') # Model to create embeddings

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
# create the vector database client
qdrant = QdrantClient(":memory:") # Create in-memory Qdrant instance

In [17]:
ted_talk_collection = "ted_talks"

In [18]:
# Create collection to store wines
if qdrant.collection_exists(ted_talk_collection):
    qdrant.delete_collection(ted_talk_collection)

qdrant.create_collection(
    collection_name=ted_talk_collection,
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model
        distance=models.Distance.COSINE
    )
)

True

In [19]:
# vectorize!
qdrant.upload_points(
    collection_name=ted_talk_collection,
    points=[
        models.PointStruct(
            id=idx,
            vector=encoder.encode(doc["about_talk"]).tolist(),
            payload=doc,
        ) for idx, doc in enumerate(data) # data is the variable holding all the wines
    ]
)

In [20]:
user_prompt = "Suggest me something related to adventure in the nature and wild"

In [22]:
# Search time for ted talks
hits = qdrant.search(
    collection_name=ted_talk_collection,
    query_vector=encoder.encode(user_prompt).tolist(),
    limit=3
)
for hit in hits:
  print(hit.payload, "score:", hit.score)

{'speaker_name': 'Lisa Winer', 'title': 'Can you solve the river crossing riddle?', 'posted_date': 'Nov 2016', 'duration': '3:58', 'Link': 'https://www.ted.com/talks/lisa_winer_can_you_solve_the_river_crossing_riddle?language=en', 'about_speaker': nan, 'about_talk': 'As a wildfire rages through the grasslands, three lions and three wildebeest flee for their lives. To escape the inferno, they must cross over to the left bank of a crocodile-infested river. Can you help them figure out how to get across on the one raft available without losing any lives? Lisa Winer shows how. [Directed by Artrake Studio, narrated by Addison Anderson].', 'views': '4,929,129', 'tags': 'poverty,finance,art'} score: 0.42276408329670906
{'speaker_name': 'Steve Boyes', 'title': "How we're saving one of Earth's last wild places", 'posted_date': 'Jul 2018', 'duration': '9:01', 'Link': 'https://www.ted.com/talks/steve_boyes_how_we_re_saving_one_of_earth_s_last_wild_places?language=en', 'about_speaker': 'Steve Boye

  hits = qdrant.search(


In [23]:
# define a variable to hold the search results
search_results = [hit.payload for hit in hits]

In [25]:
# Now time to connect to the local large language model
# locally ran llm from https://github.com/Mozilla-Ocho/llamafile?tab=readme-ov-file : llava-v1.5-7b-q4.llamafile
from openai import OpenAI
client = OpenAI(
    base_url="http://127.0.0.1:8080/v1", # "http://<Your api-server IP>:port"
    api_key = "sk-no-key-required"
)
completion = client.chat.completions.create(
    model="LLaMA_CPP",
    messages=[
        {"role": "system", "content": "You are chatbot, a video suggestor. Your top priority is to help suggest users into selecting amazing ted talks which fit perfectly with their requests."},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": str(search_results)}
    ]
)
print(completion.choices[0].message)

ChatCompletionMessage(content='Based on your request, I have selected these three Ted Talks that might interest you:\n\n1. Lisa Winer\'s talk "Can you solve the river crossing riddle?" explores the problem of three lions and three wildebeest trying to cross a crocodile-infested river to escape a wildfire. This talk might be of interest if you\'re looking for an adventure in the wild.\n2. Steve Boyes\' talk "How we\'re saving one of Earth\'s last wild places" shares his work studying and conserving the endangered Okavango Delta in Botswana. This talk might be of interest if you\'re looking for an adventure in the nature and wild.\n3. Emma Marris\' talk "Nature is everywhere — we just need to learn to see it" explores the definition of nature and encourages us to consider a new definition of nature that includes not only pristine wilderness but also the untended patches of plants growing in urban spaces. This talk might be of interest if you\'re looking for an adventure in the nature and