In [2]:
#Imports
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Initialize embedding model
# "all-MiniLM-L6-v2" is the model name handed to SentenceTransformer. The resulting
# `model` is reused below to encode both the sample texts and the test query, so
# documents and queries are embedded by the same model.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [2]:
# From Chat GPT - "Generate a list of realistic sample text that can serve as sample data for a semantic search tool. Write them as a Python dict"
# Synthetic corpus: a list of 15 records, each a dict with an integer "id", a
# "category" label and the "text" that gets embedded further down.
# The records are ordered so that sample_queries[i] lines up with sample_data[i].
sample_data = [
    {
        "id": 1,
        "category": "Product Description",
        "text": "Experience unparalleled sound quality with the EchoSphere wireless earbuds, featuring noise cancellation, 12-hour battery life, and an ergonomic design perfect for workouts."
    },
    {
        "id": 2,
        "category": "Movie Synopsis",
        "text": "In a world ravaged by climate change, a group of unlikely heroes embarks on a perilous journey to save humanity from extinction."
    },
    {
        "id": 3,
        "category": "News Article",
        "text": "The city council approved the new public transportation plan yesterday, aiming to reduce traffic congestion and lower carbon emissions by 2030."
    },
    {
        "id": 4,
        "category": "Recipe",
        "text": "Preheat the oven to 375°F. Mix flour, sugar, and eggs in a bowl, then fold in fresh blueberries. Bake for 25 minutes or until golden brown."
    },
    {
        "id": 5,
        "category": "Travel Guide",
        "text": "Discover the hidden gems of Kyoto, from tranquil temples to bustling markets, and experience authentic Japanese culture like never before."
    },
    {
        "id": 6,
        "category": "Scientific Abstract",
        "text": "This study investigates the effects of microplastic pollution on marine ecosystems, revealing significant impacts on coral reef health and biodiversity."
    },
    {
        "id": 7,
        "category": "Book Review",
        "text": "An evocative tale of love and loss, 'The Silent Horizon' beautifully captures the complexities of human relationships through vivid prose."
    },
    {
        "id": 8,
        "category": "Job Posting",
        "text": "Looking for a skilled software engineer proficient in Python and cloud computing to join a fast-paced startup focused on AI-driven healthcare solutions."
    },
    {
        "id": 9,
        "category": "User Manual",
        "text": "To reset your device, hold the power button for 10 seconds until the LED indicator flashes. Release the button and wait for the system reboot."
    },
    {
        "id": 10,
        "category": "Historical Event",
        "text": "The Berlin Wall, constructed in 1961, symbolized the Cold War division between East and West Germany until its fall in 1989 sparked reunification."
    },
    {
        "id": 11,
        "category": "Customer Review",
        "text": "The blender exceeded my expectations with its powerful motor and easy-to-clean design. Perfect for smoothies and soups!"
    },
    {
        "id": 12,
        "category": "Health & Fitness",
        "text": "Regular cardio workouts not only improve heart health but also boost mental clarity and reduce stress levels."
    },
    {
        "id": 13,
        "category": "Legal Document",
        "text": "This agreement is entered into by and between Party A and Party B for the purpose of outlining the terms and conditions of service."
    },
    {
        "id": 14,
        "category": "E-commerce FAQ",
        "text": "Q: Does this jacket have waterproof capabilities? A: Yes, it is made with breathable waterproof fabric suitable for heavy rain."
    },
    {
        "id": 15,
        "category": "Educational Content",
        "text": "Photosynthesis is the process by which green plants convert sunlight into chemical energy, producing oxygen as a byproduct."
    }
]

# From Chat GPT - "Write some sample queries that would help evaluate whether the semantic search tool was working properly"
# One query per record, in the same order as sample_data: sample_queries[i]
# paraphrases sample_data[i]["text"], so index i is the expected top hit for query i.
sample_queries = [
    "wireless earbuds with good battery life",
    "a movie about climate change and heroes",
    "latest city plans to reduce traffic and pollution",
    "how to bake blueberry muffins",
    "best places to visit in Kyoto",
    "impact of plastic waste on ocean life",
    "a novel about love and heartbreak",
    "job opening for Python developer in healthcare AI",
    "how to restart a gadget when it freezes",
    "events leading to the fall of the Berlin Wall",
    "reviews for powerful and easy-to-clean blender",
    "tips for improving heart health with exercise",
    "contract terms between two companies",
    "does this jacket keep you dry in rain",
    "process plants use to make energy from sunlight"
]

In [5]:
# Pull out just the "text" field of every record, keeping sample_data's order
# so that position i in sample_texts still refers to sample_data[i].
sample_texts = [record["text"] for record in sample_data]

In [6]:
# Generate text embeddings
# The whole list is encoded in one call; the result is a numpy.ndarray (see the
# type check in the next cell). Later cells use positions in the similarity row
# to index sample_texts, i.e. entry i of `embeddings` is treated as the vector
# for sample_texts[i].
embeddings = model.encode(sample_texts)

In [8]:
type(embeddings)

numpy.ndarray

In [12]:
# Select the query to test with (the first sample query); this cell only picks
# the string - the embedding itself is computed in a later cell.
test_query = sample_queries[0]

In [13]:
test_query  # expected best match: index 0 of the embeddings (the earbuds product description)

'wireless earbuds with good battery life'

In [14]:
# Embed the query string with the same model that produced `embeddings`.
test_query_embedding = model.encode(test_query)

In [None]:
# The query vector is reshaped to a single-row 2-D array before being compared
# against every row of `embeddings`; the result (shown in the next cell) is a
# 1 x 15 array holding one cosine score per sample text.
similarities = cosine_similarity(X = test_query_embedding.reshape(1,-1), Y = embeddings)

In [21]:
similarities

array([[ 0.6872417 , -0.06613445, -0.00151109,  0.0740163 ,  0.09249084,
         0.01730586, -0.05482122, -0.00388624, -0.06451464,  0.03496358,
        -0.00553083,  0.03900235, -0.02225405,  0.03706757,  0.04317591]],
      dtype=float32)

In [50]:
# Find top k results - let k = 3
# Sort the single similarity row ascending, flip it so the best score comes
# first, keep the first three positions and return them as a plain list.
np.argsort(similarities[0])[::-1][:3].tolist()

[0, 4, 3]

In [51]:
sample_data[0]

{'id': 1,
 'category': 'Product Description',
 'text': 'Experience unparalleled sound quality with the EchoSphere wireless earbuds, featuring noise cancellation, 12-hour battery life, and an ergonomic design perfect for workouts.'}

In [3]:
# Modify sample_data to mirror metadata format in Document class:
# every field of a record except its "text" is kept as metadata.
#
# The list is derived from sample_data rather than typed out a second time, so
# the two structures cannot drift apart when a record is added or edited.
# Each entry is a new dict (not a reference into sample_data), in the same
# order as sample_data, e.g. {"id": 1, "category": "Product Description"}.
sample_metadata = [
    {field: value for field, value in record.items() if field != "text"}
    for record in sample_data
]

In [60]:
# Test filtering and combining text and metadata
# Positions of the three highest-scoring texts, best match first.
top_idxs = np.argsort(similarities[0])[::-1][:3].tolist()
# Look up the text and the metadata record for each of those positions.
top_texts = [sample_texts[i] for i in top_idxs]
top_metadata = [sample_metadata[i] for i in top_idxs]


In [61]:
top_texts

['Experience unparalleled sound quality with the EchoSphere wireless earbuds, featuring noise cancellation, 12-hour battery life, and an ergonomic design perfect for workouts.',
 'Discover the hidden gems of Kyoto, from tranquil temples to bustling markets, and experience authentic Japanese culture like never before.',
 'Preheat the oven to 375°F. Mix flour, sugar, and eggs in a bowl, then fold in fresh blueberries. Bake for 25 minutes or until golden brown.']

In [62]:
top_metadata

[{'id': 1, 'category': 'Product Description'},
 {'id': 5, 'category': 'Travel Guide'},
 {'id': 4, 'category': 'Recipe'}]

In [67]:
# Combine each hit's metadata with its text into one record per result.
#
# New dicts are built instead of writing into the existing ones: the entries of
# top_metadata are the very same dict objects stored in sample_metadata, so
# assigning dictionary["text"] in place would silently add a "text" key to the
# global metadata as well. Pairing the two lists with zip() also avoids
# recovering the loop position through list.index(), which is a linear
# equality search per item.
top_metadata = [
    {**metadata, "text": text}
    for metadata, text in zip(top_metadata, top_texts)
]

top_metadata

[{'id': 1,
  'category': 'Product Description',
  'text': 'Experience unparalleled sound quality with the EchoSphere wireless earbuds, featuring noise cancellation, 12-hour battery life, and an ergonomic design perfect for workouts.'},
 {'id': 5,
  'category': 'Travel Guide',
  'text': 'Discover the hidden gems of Kyoto, from tranquil temples to bustling markets, and experience authentic Japanese culture like never before.'},
 {'id': 4,
  'category': 'Recipe',
  'text': 'Preheat the oven to 375°F. Mix flour, sugar, and eggs in a bowl, then fold in fresh blueberries. Bake for 25 minutes or until golden brown.'}]

In [68]:
import pprint

In [69]:
# pprint
# Formatting option 1: pretty-print the list of result dicts.
# Note that the keys come out alphabetically (category, id, text) rather than in
# insertion order, because pprint sorts dict keys by default; passing
# sort_dicts=False (Python 3.8+) would keep the original order.
pprint.pprint(top_metadata)

[{'category': 'Product Description',
  'id': 1,
  'text': 'Experience unparalleled sound quality with the EchoSphere wireless '
          'earbuds, featuring noise cancellation, 12-hour battery life, and an '
          'ergonomic design perfect for workouts.'},
 {'category': 'Travel Guide',
  'id': 5,
  'text': 'Discover the hidden gems of Kyoto, from tranquil temples to '
          'bustling markets, and experience authentic Japanese culture like '
          'never before.'},
 {'category': 'Recipe',
  'id': 4,
  'text': 'Preheat the oven to 375°F. Mix flour, sugar, and eggs in a bowl, '
          'then fold in fresh blueberries. Bake for 25 minutes or until golden '
          'brown.'}]


In [70]:
import tabulate

In [74]:
# tabulate
# Formatting option 2: render the results as a grid table.
# Column headers are the keys of the first record; each record then supplies
# one row made of its values.
headers = top_metadata[0].keys()
rows = [record.values() for record in top_metadata]

print(tabulate.tabulate(tabular_data=rows, headers=headers, tablefmt="grid"))

+------+---------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|   id | category            | text                                                                                                                                                                          |
|    1 | Product Description | Experience unparalleled sound quality with the EchoSphere wireless earbuds, featuring noise cancellation, 12-hour battery life, and an ergonomic design perfect for workouts. |
+------+---------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|    5 | Travel Guide        | Discover the hidden gems of Kyoto, from tranquil temples to bustling markets, and experience authentic Japanese culture like never before.   

In [84]:
# F-string
# Formatting option 3: one output line per record, built from "key: value | "
# segments and finished with a single trailing space.
for record in top_metadata:
    print("".join(f"{key}: {value} | " for key, value in record.items()) + " ")

id: 1 | category: Product Description | text: Experience unparalleled sound quality with the EchoSphere wireless earbuds, featuring noise cancellation, 12-hour battery life, and an ergonomic design perfect for workouts. |  
id: 5 | category: Travel Guide | text: Discover the hidden gems of Kyoto, from tranquil temples to bustling markets, and experience authentic Japanese culture like never before. |  
id: 4 | category: Recipe | text: Preheat the oven to 375°F. Mix flour, sugar, and eggs in a bowl, then fold in fresh blueberries. Bake for 25 minutes or until golden brown. |  


In [93]:
# Names of the three output formats tried in the cells above.
options = ["pprint", "tabulate", "f-string"]

option = "pprint"

# Guard clause: an unknown option is rejected with the list of valid choices;
# a valid one is simply echoed.
if option not in options:
    raise ValueError(f"expected one of {', '.join(options)}")

print(option)


pprint


In [94]:
# Turn sample data into df for export to data folder
# Each dict in sample_data becomes one row, with columns id, category and text.
# NOTE(review): this import sits mid-notebook; it would normally live in the
# imports cell at the top so a fresh "Run All" surfaces missing dependencies early.
import pandas as pd
sample_df = pd.DataFrame(sample_data)

Unnamed: 0,id,category,text
0,1,Product Description,Experience unparalleled sound quality with the...
1,2,Movie Synopsis,"In a world ravaged by climate change, a group ..."
2,3,News Article,The city council approved the new public trans...
3,4,Recipe,"Preheat the oven to 375°F. Mix flour, sugar, a..."
4,5,Travel Guide,"Discover the hidden gems of Kyoto, from tranqu..."
5,6,Scientific Abstract,This study investigates the effects of micropl...
6,7,Book Review,"An evocative tale of love and loss, 'The Silen..."
7,8,Job Posting,Looking for a skilled software engineer profic...
8,9,User Manual,"To reset your device, hold the power button fo..."
9,10,Historical Event,"The Berlin Wall, constructed in 1961, symboliz..."


In [95]:
sample_df

Unnamed: 0,id,category,text
0,1,Product Description,Experience unparalleled sound quality with the...
1,2,Movie Synopsis,"In a world ravaged by climate change, a group ..."
2,3,News Article,The city council approved the new public trans...
3,4,Recipe,"Preheat the oven to 375°F. Mix flour, sugar, a..."
4,5,Travel Guide,"Discover the hidden gems of Kyoto, from tranqu..."
5,6,Scientific Abstract,This study investigates the effects of micropl...
6,7,Book Review,"An evocative tale of love and loss, 'The Silen..."
7,8,Job Posting,Looking for a skilled software engineer profic...
8,9,User Manual,"To reset your device, hold the power button fo..."
9,10,Historical Event,"The Berlin Wall, constructed in 1961, symboliz..."


In [97]:

# Export the synthetic sample data as CSV into the data folder.
# index=False keeps the auto-generated 0..n-1 row index out of the file: it
# carries no information (every record already has an "id") and would otherwise
# be written as a nameless first column that comes back as "Unnamed: 0" when
# the file is read again.
# NOTE(review): if an existing loader reads this file with index_col=0, update
# it together with this change.
sample_df.to_csv(path_or_buf="../data/synthetic_data.csv", sep=",", index=False)

In [4]:
sample_metadata

[{'id': 1, 'category': 'Product Description'},
 {'id': 2, 'category': 'Movie Synopsis'},
 {'id': 3, 'category': 'News Article'},
 {'id': 4, 'category': 'Recipe'},
 {'id': 5, 'category': 'Travel Guide'},
 {'id': 6, 'category': 'Scientific Abstract'},
 {'id': 7, 'category': 'Book Review'},
 {'id': 8, 'category': 'Job Posting'},
 {'id': 9, 'category': 'User Manual'},
 {'id': 10, 'category': 'Historical Event'},
 {'id': 11, 'category': 'Customer Review'},
 {'id': 12, 'category': 'Health & Fitness'},
 {'id': 13, 'category': 'Legal Document'},
 {'id': 14, 'category': 'E-commerce FAQ'},
 {'id': 15, 'category': 'Educational Content'}]

In [12]:
# List the metadata field names, taken from the keys of the first metadata record
# (iterating a dict yields its keys in insertion order).
print("List of metadata fields: " + ", ".join(sample_metadata[0]))

List of metadata fields: id, category
