In [1]:
import pandas as pd

# Load the CSV file into a Pandas DataFrame
csv_file = './data/Youtube_Video_Dataset.csv'
df = pd.read_csv(csv_file)

# filter out all other categories
only_music = df[df['Category'] == 'Art&Music']


# Combine specified columns into a single text column
only_music['combined_text'] = only_music['Title'] + ' ' + only_music['Description'] + ' ' + only_music['Category']

# convert all values into string type
only_music['combined_text'] = only_music['combined_text'].astype(str)
only_music['Title'] = only_music['Title'].astype(str)
only_music['Description'] = only_music['Description'].astype(str)
only_music['Category'] = only_music['Category'].astype(str)

only_music.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_music['combined_text'] = only_music['Title'] + ' ' + only_music['Description'] + ' ' + only_music['Category']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_music['combined_text'] = only_music['combined_text'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_music['Title'] 

Unnamed: 0,Title,Videourl,Category,Description,combined_text
9446,FINE ART Music and Painting PEACEFUL SELECTION...,/watch?v=13E5azGDK1k,Art&Music,"CALM MELODIES AND BEAUTIFUL PICTURES\nDebussy,...",FINE ART Music and Painting PEACEFUL SELECTION...
9447,Improvised Piano Music and Emotional Art Thera...,/watch?v=5mWjq2BsD9Q,Art&Music,When watching this special episode of The Perf...,Improvised Piano Music and Emotional Art Thera...
9448,babyfirst art and music,/watch?v=rrJbuF6zOIk,Art&Music,,
9449,"Art: music & painting - Van Gogh on Caggiano, ...",/watch?v=1b8xiXKd9Kk,Art&Music,♫ Buy “Art: Music & Painting - Van Gogh on on ...,"Art: music & painting - Van Gogh on Caggiano, ..."
9450,The Great Masterpieces of Art & Music,/watch?v=tsKlRF2Gw1s,Art&Music,Skip the art museum and come experience “Great...,The Great Masterpieces of Art & Music Skip the...


Next, we create a Qdrant collection. We need to instantiate a Qdrant client and connect it to Qdrant's local server running at port 6333. The recreate_collection function takes in a collection_name argument, which is the name you want to give to your collection. Note also the vectors_config argument, where we define the size of vector embeddings (our embedding model will be 384 dimension) and similarity calculation metric, where we use Cosine similarity. One can also use create_collection function but it will throw an error if you call the function again with the same collection name.

In [73]:
from qdrant_client import QdrantClient
from qdrant_client.http import models

client = QdrantClient("localhost", port=6333)

client.recreate_collection(
    collection_name="youtube_music_videos",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
)

True

We also initialise the embeddings model. Here we use the sentence-transformer library and the MiniLM model which is a light weight embeddings model and good enough for common language words.

In [3]:
# Initialize SentenceTransformer model
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# convert pandas dataframe to a dictionary of records for inserting into Qdrant collection
music_videos_dict = only_music.to_dict(orient='records')
music_videos_dict

[{'Title': 'FINE ART Music and Painting PEACEFUL SELECTION (Calm Melodies and Beautiful Pictures)',
  'Videourl': '/watch?v=13E5azGDK1k',
  'Category': 'Art&Music',
  'Description': 'CALM MELODIES AND BEAUTIFUL PICTURES\nDebussy, Milena Stanisic,\nPiano, Flute, Harp,\nFlowers, Sailing, Mediterranean, Lavender,',
  'combined_text': 'FINE ART Music and Painting PEACEFUL SELECTION (Calm Melodies and Beautiful Pictures) CALM MELODIES AND BEAUTIFUL PICTURES\nDebussy, Milena Stanisic,\nPiano, Flute, Harp,\nFlowers, Sailing, Mediterranean, Lavender, Art&Music'},
 {'Title': 'Improvised Piano Music and Emotional Art Therapy - Featuring Erica Orth',
  'Videourl': '/watch?v=5mWjq2BsD9Q',
  'Category': 'Art&Music',
  'Description': 'When watching this special episode of The Perfect Note, keep in mind, every single note heard and stroke of paint seen in this video is completely improvised. Like a conversation between two people, together they are telling a story. \n\nIn this segment, Pianist Virtuo

Finally, we insert the records into the collection, including converting the text in the combined_text columns to embeddings.

In [74]:
# upload the records in the Qdrant collection, including creating the vector embeddings of the combined_text column
for idx, doc in enumerate(music_videos_dict):
    client.upload_records(
    collection_name="youtube_music_videos",
    records=[
        models.Record(
            id=idx, vector=model.encode(doc["Title"]), payload=doc
        )])

  client.upload_records(


In [91]:
# perform semantic search for a given query in the collection 
def search_video(query: str) -> list[dict]:
    collection_name = "youtube_music_videos"
# Convert text query into vector
    vector = model.encode(query).tolist()

    # Use `vector` for search for closest vectors in the collection
    search_results = client.search(
        collection_name=collection_name,
        query_vector=vector,
        query_filter=None,  # If you don't want any other filters
        limit=10,  # get 10 most similar results
    )
    # `search_results` contains found vector ids with similarity scores along with the stored payload
    # In this function we are interested in payload only
    results = []
    for hit in search_results:
        item = {}
        item['score'] = hit.score
        item['Title'] = hit.payload['Title']
        url = hit.payload['Videourl']
        item['URL'] = f'youtube.com{url}'
        results.append(item)
    return results

In [93]:
# query the collection
query = 'dua lipa videos'
search_video(query)

[{'score': 0.8309551,
  'Title': 'Dua Lipa - New Rules (Official Music Video)',
  'URL': 'youtube.com/watch?v=k2qgadSvNyU'},
 {'score': 0.8116781,
  'Title': 'Dua Lipa - IDGAF (Official Music Video)',
  'URL': 'youtube.com/watch?v=Mgfe5tIwOj0'},
 {'score': 0.80936086,
  'Title': 'Dua Lipa - Be The One (Official Music Video)',
  'URL': 'youtube.com/watch?v=-rey3m8SWQI'},
 {'score': 0.55487275,
  'Title': 'Sean Paul - No Lie ft. Dua Lipa (Krajnc Remix) (Baywatch Official Music Video)',
  'URL': 'youtube.com/watch?v=hMiHGkzr3ZQ'},
 {'score': 0.49306965,
  'Title': 'Lana Del Rey - Music To Watch Boys To (Official Music Video)',
  'URL': 'youtube.com/watch?v=5kYsxoWfjCg'},
 {'score': 0.48478898,
  'Title': 'Smash Mouth - All Star (Official Music Video)',
  'URL': 'youtube.com/watch?v=L_jWHffIx5E'},
 {'score': 0.47906196,
  'Title': 'Iggy Azalea - Fancy ft. Charli XCX (Official Music Video)',
  'URL': 'youtube.com/watch?v=O-zpOMYRi0w'},
 {'score': 0.47792414,
  'Title': 'ZAYN - PILLOWTALK (O

In [150]:
def search_video(query: str) -> list[dict]:
    collection_name = "youtube_music_videos"
# Convert text query into vector
    vector = model.encode(query).tolist()

    # Use `vector` for search for closest vectors in the collection
    search_results = client.search(
        collection_name=collection_name,
        query_vector=vector,
        query_filter=None,  # If you don't want any other filters
        limit=10, 
         score_threshold = 0.5 # get 10 most similar results
    )
    # `search_results` contains found vector ids with similarity scores along with the stored payload
    # In this function we are interested in payload only
    results = []
    for hit in search_results:
        item = {}
        item['score'] = hit.score
        item['Title'] = hit.payload['Title']
        url = hit.payload['Videourl']
        item['URL'] = f'youtube.com{url}'
        results.append(item)
    return results

In [151]:
# query the collection
query = 'dua lipa videos'
search_video(query)

[{'Title': 'Dua Lipa - New Rules (Official Music Video)',
  'URL': 'youtube.com/watch?v=k2qgadSvNyU'},
 {'Title': 'Dua Lipa - IDGAF (Official Music Video)',
  'URL': 'youtube.com/watch?v=Mgfe5tIwOj0'},
 {'Title': 'Dua Lipa - Be The One (Official Music Video)',
  'URL': 'youtube.com/watch?v=-rey3m8SWQI'},
 {'Title': 'Sean Paul - No Lie ft. Dua Lipa (Krajnc Remix) (Baywatch Official Music Video)',
  'URL': 'youtube.com/watch?v=hMiHGkzr3ZQ'}]

In [182]:
# specify likes and dislikes as positive and negative queries
negative_1 = 'heavy metal'
positive_1 = 'piano music'

negative_2 = 'rock music'
positive_2 = 'classical music'

# only used when a target query is available
target_embedding = model.encode(query).tolist()

# calculate embeddings for the positive and negative points
positive_embedding_1 = model.encode(positive_1).tolist()
negative_embedding_1= model.encode(negative_1).tolist()

# calculate embeddings for the another pair of positive and negative points
positive_embedding_2 = model.encode(positive_2).tolist()
negative_embedding_2= model.encode(negative_2).tolist()

# create the context example pair
context = [models.ContextExamplePair(positive=positive_embedding_1, negative=negative_embedding_1),
           models.ContextExamplePair(positive=positive_embedding_2, negative=negative_embedding_2)]

# call the discover api
discover = client.discover(
    collection_name = "youtube_music_videos",
        context = context,
        limit=5,

)

# organise the results from the discover api
results = []
for hit in discover:
    item = {}
    item['Title'] = hit.payload['Title']
    url = hit.payload['Videourl']
    item['URL'] = f'youtube.com{url}'
    results.append(item)

display(results)


[{'Title': 'The computer as artist: AI art and music',
  'URL': 'youtube.com/watch?v=ZDcaDv0U8yw'},
 {'Title': 'Arts For Healing: Music and Art Therapy',
  'URL': 'youtube.com/watch?v=6By9oTQIQxQ'},
 {'Title': 'Elephants, Art and Music on the River Kwai',
  'URL': 'youtube.com/watch?v=r1uDNRzcAV0'},
 {'Title': "Art: music & painting - Van Gogh on Caggiano, Floridia, Boito, Mahler and Brahms' music",
  'URL': 'youtube.com/watch?v=1b8xiXKd9Kk'},
 {'Title': 'The Artist Who Paints What She Hears',
  'URL': 'youtube.com/watch?v=zbh7tAnwLCY'}]

In [178]:
music_videos_dict

[{'Title': 'FINE ART Music and Painting PEACEFUL SELECTION (Calm Melodies and Beautiful Pictures)',
  'Videourl': '/watch?v=13E5azGDK1k',
  'Category': 'Art&Music',
  'Description': 'CALM MELODIES AND BEAUTIFUL PICTURES\nDebussy, Milena Stanisic,\nPiano, Flute, Harp,\nFlowers, Sailing, Mediterranean, Lavender,',
  'combined_text': 'FINE ART Music and Painting PEACEFUL SELECTION (Calm Melodies and Beautiful Pictures) CALM MELODIES AND BEAUTIFUL PICTURES\nDebussy, Milena Stanisic,\nPiano, Flute, Harp,\nFlowers, Sailing, Mediterranean, Lavender, Art&Music'},
 {'Title': 'Improvised Piano Music and Emotional Art Therapy - Featuring Erica Orth',
  'Videourl': '/watch?v=5mWjq2BsD9Q',
  'Category': 'Art&Music',
  'Description': 'When watching this special episode of The Perfect Note, keep in mind, every single note heard and stroke of paint seen in this video is completely improvised. Like a conversation between two people, together they are telling a story. \n\nIn this segment, Pianist Virtuo