In [1]:
import json
from tqdm.auto import tqdm

In [2]:
def load_from_json(filename):
    """Load and parse a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file cannot be opened,
        cannot be decoded as UTF-8, or does not contain valid JSON. A message
        describing the problem is printed in that case.
    """
    try:
        # Be explicit about the encoding instead of depending on the platform default.
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError, UnicodeDecodeError) as e:
        # Malformed content is reported the same way as an unreadable file,
        # so callers only ever have to check for None.
        print(f"An error occurred while loading data from {filename}: {e}")
        return None
    print(f"Data successfully loaded from {filename}")
    return data

In [3]:
# Raw catalogue as read from disk. load_from_json returns None when the file
# cannot be read, in which case the flattening cell below will fail.
lego_doc = load_from_json("sets_reviews.json")

Data successfully loaded from sets_reviews.json


In [4]:
# Collapse the per-key lists of lego_doc into a single flat list of set records.
# (The keys are presumably years; only the values are needed here.)
flattened_lego_doc = [
    record
    for records_in_group in lego_doc.values()
    for record in records_in_group
]

print(flattened_lego_doc[0])

{'setID': 49041, 'number': '10359', 'numberVariant': 1, 'name': '{?}', 'year': 2025, 'theme': 'Icons', 'themeGroup': 'Model making', 'category': 'Normal', 'released': False, 'image': {}, 'bricksetURL': 'https://brickset.com/sets/10359-1', 'collection': {}, 'collections': {'wantedBy': 135}, 'LEGOCom': {'US': {}, 'UK': {}, 'CA': {}, 'DE': {}}, 'rating': 0.0, 'reviewCount': 0, 'packagingType': '{Not specified}', 'availability': '{Not specified}', 'instructionsCount': 0, 'additionalImageCount': 0, 'ageRange': {}, 'dimensions': {}, 'barcode': {}, 'extendedData': {}, 'lastUpdated': '2024-03-04T15:05:35.09Z'}


In [12]:

print(flattened_lego_doc[2000])

{'setID': 33706, 'number': '60335', 'numberVariant': 1, 'name': 'Train Station', 'year': 2022, 'theme': 'City', 'themeGroup': 'Modern day', 'subtheme': 'Trains', 'category': 'Normal', 'released': True, 'pieces': 907, 'minifigs': 6, 'image': {'thumbnailURL': 'https://images.brickset.com/sets/small/60335-1.jpg', 'imageURL': 'https://images.brickset.com/sets/images/60335-1.jpg'}, 'bricksetURL': 'https://brickset.com/sets/60335-1', 'collection': {}, 'collections': {'ownedBy': 3028, 'wantedBy': 1933}, 'LEGOCom': {'US': {'retailPrice': 99.99, 'dateFirstAvailable': '2022-07-26T00:00:00Z'}, 'UK': {'retailPrice': 69.99, 'dateFirstAvailable': '2022-06-01T00:00:00Z'}, 'CA': {'retailPrice': 129.99, 'dateFirstAvailable': '2022-07-26T00:00:00Z'}, 'DE': {'retailPrice': 79.99, 'dateFirstAvailable': '2022-08-04T00:00:00Z'}}, 'rating': 4.3, 'reviewCount': 1, 'packagingType': 'Box', 'availability': 'Retail', 'instructionsCount': 7, 'additionalImageCount': 10, 'ageRange': {'min': 7}, 'dimensions': {'heigh

In [5]:
from openai import OpenAI

In [6]:
# Chat client pointed at a local Ollama server through its OpenAI-compatible endpoint.
# To use the hosted OpenAI API instead, create the client with `OpenAI()`, which
# picks the key up from the OPENAI_API_KEY environment variable (never hardcode it).
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    # Must be passed by keyword: a positional argument after `base_url=` is a
    # SyntaxError. The client requires some key; Ollama does not check its value.
    api_key='ollama',
)

In [27]:
# Prompt used by generate_questions() to turn one catalog record into customer questions.
# Placeholders: name, year, theme, subtheme, description, tags, reviews.
# The output example lists exactly three items so that it agrees with the
# "Formulate 3 questions" instruction; an open-ended "..." invites extra questions.
prompt_template = """
You emulate a lego customer who's talking to shop asistant because you are searching for a lego set.
Formulate 3 questions based on a catalog record. The record
should contain name year in which lego set was produced and theme of the set. 
Optionaly it can also contain subtheme, description, tags and reviews. 
Tags is a string with number of tags separated by a space. 
Reviews is a string that can contain multiple reviews separeted by a # 
The questions should be complete and not too short.

If possible, use as fewer words as possible from the record. 
Assume that customer doesn't know details from the record and describing what they want in less details.

The record:

name: {name}
year: {year}
theme: {theme}
subtheme: {subtheme}
description: {description}
tags: {tags}
reviews: {reviews}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", "question3"]
""".strip()

In [20]:
def generate_questions(doc):
    """Ask the chat model for customer-style questions about one catalog record.

    Args:
        doc: A set record (dict). ``name``, ``year``, ``theme`` and ``subtheme``
            are read from the top level, ``tags`` and ``description`` from the
            nested ``extendedData`` object, and the review texts from
            ``reviews``. Any missing field is sent as an empty string.

    Returns:
        The raw text of the model's reply. The prompt asks for a JSON list,
        but the reply is returned unparsed.
    """
    # tags/description live inside extendedData, not on the record itself.
    extended = doc.get("extendedData") or {}

    # The prompt describes tags as one space-separated string.
    tags = extended.get("tags") or ""
    if not isinstance(tags, str):
        tags = ' '.join(str(tag) for tag in tags)

    # The prompt describes reviews as one string with '#' between reviews.
    reviews = '#'.join(
        (review.get("review") or "") for review in (doc.get("reviews") or [])
    )

    prompt = prompt_template.format(
        name=doc.get("name", ""),
        year=doc.get("year", ""),
        theme=doc.get("theme", ""),
        subtheme=doc.get("subtheme", ""),
        description=extended.get("description") or "",
        tags=tags,
        reviews=reviews,
    )

    response = client.chat.completions.create(
        model='phi3',
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [32]:
# Generated questions keyed by setID. Kept in its own cell so that re-running the
# generation loop continues from what is already stored instead of starting over.
results = {}

In [37]:
import random

# Evaluation sample. A dedicated, seeded generator makes the draw identical on every
# Restart & Run All, and the size is capped so a small catalogue does not raise ValueError.
SAMPLE_SIZE = 170
SAMPLE_SEED = 42
test_data = random.Random(SAMPLE_SEED).sample(
    flattened_lego_doc, min(SAMPLE_SIZE, len(flattened_lego_doc))
)

In [34]:
len(test_data)

150

In [38]:
# Generate questions set by set. Sets whose setID is already in `results` are skipped,
# so the cell can be re-run after an interruption without repeating model calls.
for doc in tqdm(test_data):
    doc_id = doc['setID']
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)

  0%|          | 0/170 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [31]:
results

{9562: '[\n  "Could you tell me more about Stephanie\'s outfit and how it looks like on her? I believe she is supposed to portray a ballerina.", \n  "I am curious about the curtain mechanism. How exactly do we use this stage for ballet practice with my friends, including using sticker placement in tricky reflective areas, as seen from reviews.", \n  "Could you expand on which features of the set were highlighted by reviewers? I\'m particularly interested to know what they thought about combining music performances and dance moves based off their experiences." , \n  "I saw some mentions regarding parts. Specifically in regards to musical instruments accessory, could it perhaps have been better designed or fit within this theme more naturally?",\n  "Finally as a collector of LEGO Friends sets I\'m always on the lookout for unique baseplates and tags but never heard about tan ones before. From your review was there anything you liked uniquely from my collected assortment?"\n]'}

In [36]:
# Prompt for the answering step. It has its own name so that it does not replace the
# question-generation `prompt_template` that generate_questions() reads.
rag_prompt_template = """
You're a lego shop assistant. Answer the QUESTION based on the CONTEXT from Lego sets database.
Use only the facts from the CONTEXT when answering the QUESTION. Return brief description of the set bsaed on the sample data use bricksetURL to provde it's link

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

# Top-level fields copied into the context, in output order. Each is labelled with its
# field name so the model can find e.g. the `bricksetURL` the prompt refers to.
_CONTEXT_FIELDS = ("name", "bricksetURL", "year", "theme", "themeGroup", "category", "minifigs")


def _format_review(review):
    """Render one review dict as a labelled text block."""
    return (
        f"Author: {review.get('author', '')}\n"
        f"Date: {review.get('datePosted', '')}\n"
        f"Title: {review.get('title', '')}\n"
        f"Review: {review.get('review', '')}"
    )


def _format_set(doc):
    """Render one set record as labelled lines, followed by its reviews if any."""
    lines = [
        f"{field}: {doc[field]}"
        for field in _CONTEXT_FIELDS
        if doc.get(field) is not None
    ]
    reviews = doc.get("reviews")
    if reviews:
        lines.append("reviews:")
        lines.append("\n\n".join(_format_review(review) for review in reviews))
    return "\n".join(lines)


def build_prompt(query, search_results):
    """Build the answering prompt from a question and the retrieved set records.

    Args:
        query: The customer's question.
        search_results: Iterable of set records (dicts). Fields that are
            missing from a record are left out of the context.

    Returns:
        The filled-in prompt. Records are separated by a blank line so that
        every value stays attached to the set it belongs to.
    """
    context = "\n\n".join(_format_set(doc) for doc in search_results)
    return rag_prompt_template.format(question=query, context=context).strip()

In [8]:
from elasticsearch import Elasticsearch

In [12]:
# Client for the Elasticsearch node expected at http://localhost:9200;
# shared by the indexing, search and backup cells below.
es_client = Elasticsearch('http://localhost:9200') 

In [13]:
# Define the mapping for the index
def _lego_com_market_mapping():
    """Return a fresh mapping for one LEGO.com market (price and availability dates)."""
    return {
        "properties": {
            "retailPrice": {"type": "float"},
            "dateFirstAvailable": {"type": "date"},
            "dateLastAvailable": {"type": "date"}
        }
    }


mapping = {
    "properties": {
        "setID": {"type": "long"},
        "number": {"type": "keyword"},
        "numberVariant": {"type": "integer"},
        "name": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        "year": {"type": "integer", "null_value": 0},
        "theme": {"type": "keyword",  "null_value": ""},
        "themeGroup": {"type": "keyword",  "null_value": ""},
        # Mapped explicitly because elastic_search() queries this field.
        "subtheme": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        "category": {"type": "keyword", "null_value": ""},
        "released": {"type": "boolean"},
        "pieces": {"type": "integer"},
        "minifigs": {"type": "integer", "null_value": 0},
        "image": {
            "properties": {
                "thumbnailURL": {"type": "keyword"},
                "imageURL": {"type": "keyword"}
            }
        },
        "bricksetURL": {"type": "keyword"},
        "collection": {"type": "object"},
        "collections": {
            "properties": {
                "ownedBy": {"type": "integer"},
                "wantedBy": {"type": "integer"}
            }
        },
        "LEGOCom": {
            # All four markets carry the same fields, so they share one definition.
            "properties": {
                market: _lego_com_market_mapping()
                for market in ("US", "UK", "CA", "DE")
            }
        },
        "rating": {"type": "float", "null_value": 0.0},
        "reviewCount": {"type": "integer"},
        "packagingType": {"type": "keyword"},
        "availability": {"type": "keyword"},
        "instructionsCount": {"type": "integer"},
        "additionalImageCount": {"type": "integer"},
        "ageRange": {
            "properties": {
                "min": {"type": "integer"},
                "max": {"type": "integer"}
            }
        },
        "dimensions": {
            "properties": {
                "height": {"type": "float"},
                "width": {"type": "float"},
                "depth": {"type": "float"},
                "weight": {"type": "float"}
            }
        },
        "barcode": {
            "properties": {
                "EAN": {"type": "keyword"},
                "UPC": {"type": "keyword"}
            }
        },
        "extendedData": {
            "properties": {
                "tags": {"type": "keyword"},
                "description": {"type": "text"}
            }
        },
        "lastUpdated": {"type": "date"},
        # Nested so that each review keeps its own author/date/rating together.
        "reviews": {
            "type": "nested",
            "properties": {
                "author": {"type": "keyword"},
                "datePosted": {"type": "date"},
                "rating": {
                    "properties": {
                        "overall": {"type": "float"},
                        "parts": {"type": "float"},
                        "buildingExperience": {"type": "float"},
                        "playability": {"type": "float"},
                        "valueForMoney": {"type": "float"}
                    }
                },
                "title": {"type": "text"},
                "review": {"type": "text"},
                "HTML": {"type": "boolean"}
            }
        },
    }
}


index_name = "lego_sets"

# Start from a clean index: delete any existing one (400/404 responses are ignored,
# so a missing index does not raise) and recreate it with the explicit mapping.
# Every run of this cell therefore discards whatever was indexed before.
es_client.options(ignore_status=[400,404]).indices.delete(index=index_name)
es_client.indices.create(index=index_name, body={"mappings": mapping})

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'lego_sets'})

In [51]:
from elasticsearch.helpers import bulk

# Function to read documents from the backup file
def read_documents(filename):
    """Yield the documents stored in a JSON-lines file, one per non-empty line.

    Args:
        filename: Path of a file holding one JSON document per line.

    Yields:
        The decoded document for each line. Blank lines are skipped.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A stray empty line would otherwise make json.loads raise.
                continue
            yield json.loads(line)

# Function to prepare documents for bulk indexing
def doc_generator(documents):
    """Wrap documents into bulk-helper actions targeting ``index_name``.

    Args:
        documents: Iterable of set records (dicts).

    Yields:
        One action dict per document. When the record has a ``setID`` it is
        used as the document ``_id``, so indexing the same set again replaces
        the stored copy instead of adding a duplicate. Records without a
        ``setID`` get an id generated by Elasticsearch.
    """
    for doc in documents:
        action = {
            "_index": index_name,
            "_source": doc
        }
        # Assumes setID is unique per set; the question-generation loop keys on it too.
        set_id = doc.get("setID")
        if set_id is not None:
            action["_id"] = str(set_id)
        yield action
# Read documents from the backup file
# NOTE(review): 'lego_sets_backup.json' is written by the backup cell at the end of this
# notebook, so on a top-to-bottom run it has to exist already from an earlier session.
documents = read_documents('lego_sets_backup.json')

# Bulk index the documents
# read_documents is a generator, so tqdm cannot know the total and only shows a counter.
# With stats_only=True the helper is expected to return plain success/failure counts,
# which is how the two values are used in the message below.
success, failed = bulk(es_client, doc_generator(tqdm(documents, desc="Indexing documents")), stats_only=True)

print(f"Indexed {success} documents successfully. {failed} documents failed.")

Indexing documents: 0it [00:00, ?it/s]

Indexed 20930 documents successfully. 0 documents failed.


In [14]:
# Index every set under its setID so that re-running this cell overwrites the stored
# document instead of adding a second copy. Records without a setID fall back to an
# id generated by Elasticsearch. Assumes setID is unique per set.
for doc in tqdm(flattened_lego_doc):
    set_id = doc.get("setID")
    es_client.index(
        index=index_name,
        id=None if set_id is None else str(set_id),
        document=doc,
    )

  0%|          | 0/20930 [00:00<?, ?it/s]

In [29]:
import re

def elastic_search(user_input):
    """Search the set index for records matching a free-text query.

    Runs a fuzzy ``multi_match`` over ``name`` (boosted), ``theme`` and
    ``subtheme`` and returns at most five hits.

    Args:
        user_input: The text to search for.

    Returns:
        A list with the ``_source`` of every hit. If the search fails with a
        connection or request error, a string describing the error is printed
        and returned instead of a list.
    """
    # Imported here because the notebook only imports the client class. The bare name
    # `ConnectionError` would otherwise be the unrelated Python builtin, and
    # `RequestError` would not be defined at all.
    from elasticsearch import ConnectionError as ESConnectionError, RequestError

    es_query = {
        "query": {
            "multi_match": {
                "query": user_input,
                "fields": ["name^3", "theme", "subtheme"],
                "fuzziness": 2
            }
        },
        "size": 5
    }

    try:
        response = es_client.search(index=index_name, body=es_query)
    except ESConnectionError as e:
        error_info = f"ConnectionError during search: {e}"
        print(error_info)
        return error_info
    except RequestError as e:
        error_info = f"RequestError during search: {e}"
        print(error_info)
        print("This might be due to an invalid query structure or non-existent fields.")
        return error_info

    return [hit['_source'] for hit in response['hits']['hits']]

In [16]:
def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn prompt to the chat model and return its reply text.

    Args:
        prompt: The full prompt, sent as one user message.
        model: Name of the model to use. NOTE(review): `client` is configured
            for a local Ollama endpoint elsewhere in this notebook; confirm
            that the default model is actually served there, or pass the name
            of one that is.

    Returns:
        The text of the first choice, or an empty string when the reply
        carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    content = response.choices[0].message.content
    # content may be None; normalise so callers can always treat the result as a string.
    return content if content is not None else ""

In [26]:
def rag(query):
    """Answer a question with retrieval-augmented generation.

    Retrieves matching sets with elastic_search, builds the prompt with
    build_prompt and asks the model through llm.

    Args:
        query: The customer's question.

    Returns:
        The model's answer with surrounding whitespace removed.

    Raises:
        RuntimeError: If the search step did not return a list of records.
    """
    search_results = elastic_search(query)
    # elastic_search hands back an error description instead of a list when the search
    # fails; stop here with a clear message rather than failing inside build_prompt.
    if not isinstance(search_results, list):
        raise RuntimeError(f"Search failed, cannot build a prompt: {search_results}")
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return (answer or "").strip()

In [40]:
# Sample customer request used to exercise both the retrieval step and the full RAG call below.
query = "I want monster truck lego with best reviews and a lot of parts"

In [22]:
# Retrieval-only check. Stored under its own name so that the `results` dict of
# generated questions built earlier in the notebook is not overwritten.
search_hits = elastic_search(query)
display(len(search_hits))

10

In [41]:
rag(query)

"The best-reviewed monster truck LEGO set with a significant number of parts is the **LEGO Monster Trucks** set. This set is particularly fun for kids due to its diverse parts, which promote imaginative free builds, despite having less than ideal building instructions. The set includes several interesting little pieces that allow for various creative scenarios, making it highly playable. \n\nFor more details, you can visit the set's page [here](https://brickset.com/sets/10655-12013)."

In [49]:
from elasticsearch.helpers import scan

# Function to handle JSON serialization of special types
def json_serial(obj):
    """``default`` hook for json.dump: encode dates and datetimes as ISO-8601 strings.

    Args:
        obj: The object json could not serialize on its own.

    Returns:
        ``obj.isoformat()`` for ``datetime`` and ``date`` instances.

    Raises:
        TypeError: For any other type, as the json module expects from a
            ``default`` hook.
    """
    # Imported locally: the notebook never imports these names at the top level.
    from datetime import date, datetime

    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")

# Scan and scroll through the index
# The hits get their own name: `results` is used earlier in the notebook for other
# data, and after the loop below this iterator has been consumed.
backup_hits = scan(es_client, index=index_name, query={"query": {"match_all": {}}})

# Save the documents to a file, one JSON document per line
with open('lego_sets_backup.json', 'w', encoding='utf-8') as f:
    for item in tqdm(backup_hits, desc="Saving documents"):
        json.dump(item['_source'], f, default=json_serial)
        f.write('\n')

print("Index backup completed and saved to 'lego_sets_backup.json'")

Saving documents: 0it [00:00, ?it/s]

Index backup completed and saved to 'lego_sets_backup.json'
