In [8]:
import json
from tqdm.auto import tqdm

In [9]:
def load_from_json(filename):
    try:
        with open(filename, 'r') as f:
            data = json.load(f)
        print(f"Data successfully loaded from {filename}")
        return data
    except IOError as e:
        print(f"An error occurred while loading data from {filename}: {e}")
        return None

In [10]:
lego_doc = load_from_json("sets_reviews_rag4000.json")

Data successfully loaded from sets_reviews_rag4000.json


In [4]:
# Flattening the structure
flattened_lego_doc = []

for year, sets in lego_doc.items():
    flattened_lego_doc.extend(sets)

print(flattened_lego_doc[0])

{'setID': 49041, 'number': '10359', 'numberVariant': 1, 'name': '{?}', 'year': 2025, 'theme': 'Icons', 'themeGroup': 'Model making', 'category': 'Normal', 'released': False, 'image': {}, 'bricksetURL': 'https://brickset.com/sets/10359-1', 'collection': {}, 'collections': {'wantedBy': 135}, 'LEGOCom': {'US': {}, 'UK': {}, 'CA': {}, 'DE': {}}, 'rating': 0.0, 'reviewCount': 0, 'packagingType': '{Not specified}', 'availability': '{Not specified}', 'instructionsCount': 0, 'additionalImageCount': 0, 'ageRange': {}, 'dimensions': {}, 'barcode': {}, 'extendedData': {}, 'lastUpdated': '2024-03-04T15:05:35.09Z'}


In [12]:

print(flattened_lego_doc[2000])

{'setID': 33706, 'number': '60335', 'numberVariant': 1, 'name': 'Train Station', 'year': 2022, 'theme': 'City', 'themeGroup': 'Modern day', 'subtheme': 'Trains', 'category': 'Normal', 'released': True, 'pieces': 907, 'minifigs': 6, 'image': {'thumbnailURL': 'https://images.brickset.com/sets/small/60335-1.jpg', 'imageURL': 'https://images.brickset.com/sets/images/60335-1.jpg'}, 'bricksetURL': 'https://brickset.com/sets/60335-1', 'collection': {}, 'collections': {'ownedBy': 3028, 'wantedBy': 1933}, 'LEGOCom': {'US': {'retailPrice': 99.99, 'dateFirstAvailable': '2022-07-26T00:00:00Z'}, 'UK': {'retailPrice': 69.99, 'dateFirstAvailable': '2022-06-01T00:00:00Z'}, 'CA': {'retailPrice': 129.99, 'dateFirstAvailable': '2022-07-26T00:00:00Z'}, 'DE': {'retailPrice': 79.99, 'dateFirstAvailable': '2022-08-04T00:00:00Z'}}, 'rating': 4.3, 'reviewCount': 1, 'packagingType': 'Box', 'availability': 'Retail', 'instructionsCount': 7, 'additionalImageCount': 10, 'ageRange': {'min': 7}, 'dimensions': {'heigh

In [5]:
from openai import OpenAI

In [115]:
client = OpenAI(api_key="")

In [7]:
prompt_template = """
I'm building a RAG system for lego sets 
here is a sample of a lego set item 

SAMPLE: {sample}

you are a customer, create a short user's query that matches parameters from this sample 
reply only with user input without additional phrases. While makin a query pay a bit more attention on properties such as name, theme, subtheme, rating, year, price short reviews if available
""".strip()

rag_lego_doc = []
i = 1
for doc in tqdm(flattened_lego_doc[10374:]):
    prompt = prompt_template.format(sample=doc).strip()
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    description = response.choices[0].message.content

    doc["rag_description"] = description

    rag_lego_doc.append(doc)

    if i % 1000 == 0:
        name = "sets_reviews_rag{batch}.json".format(batch=i)
        try:
            with open(name, 'w') as f:
                json.dump(rag_lego_doc, f, indent=4)
            print(f"Data successfully saved to {"sets_reviews_rag"}")
        except IOError as e:
            print(f"An error occurred while saving data to {"sets_reviews_rag"}: {e}")
    i = i + 1

  0%|          | 0/10556 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [1]:
len(rag_lego_doc)

NameError: name 'rag_lego_doc' is not defined

In [110]:
prompt_template = """
You're a lego shop assistant. Answer the QUESTION based on the CONTEXT from Lego sets database.
Use only the facts from the CONTEXT when answering the QUESTION. Return brief description of the set bsaed on the sample data use bricksetURL to provde it's link

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

def build_prompt(query, search_results):
   

    context = ""
    #+ ''.join(doc.get("extendedData", "")) + ''.join(doc.get("reviews", "")) 
    for doc in search_results:
        #context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        context = context + doc["name"] + doc["bricksetURL"] + str(doc["year"]) + str(doc["rating"]) + doc["theme"] + doc["category"] + doc["rag_description"] + f"\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [84]:
from elasticsearch import Elasticsearch

In [85]:
es_client = Elasticsearch('http://localhost:9200') 

In [86]:
# Define the mapping for the index
mapping = {
    "properties": {
        "setID": {"type": "long"},
        "number": {"type": "keyword"},
        "numberVariant": {"type": "integer"},
        "name": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        "year": {"type": "integer"},
        "theme": {"type": "keyword"},
        "themeGroup": {"type": "keyword"},
        "category": {"type": "keyword"},
        "released": {"type": "boolean"},
        "pieces": {"type": "integer"},
        "minifigs": {"type": "integer"},
        "image": {
            "properties": {
                "thumbnailURL": {"type": "keyword"},
                "imageURL": {"type": "keyword"}
            }
        },
        "bricksetURL": {"type": "keyword"},
        "collection": {"type": "object"},
        "collections": {
            "properties": {
                "ownedBy": {"type": "integer"},
                "wantedBy": {"type": "integer"}
            }
        },
        "LEGOCom": {
            "properties": {
                "US": {
                    "properties": {
                        "retailPrice": {"type": "float"},
                        "dateFirstAvailable": {"type": "date"},
                        "dateLastAvailable": {"type": "date"}
                    }
                },
                "UK": {
                    "properties": {
                        "retailPrice": {"type": "float"},
                        "dateFirstAvailable": {"type": "date"},
                        "dateLastAvailable": {"type": "date"}
                    }
                },
                "CA": {
                    "properties": {
                        "retailPrice": {"type": "float"},
                        "dateFirstAvailable": {"type": "date"},
                        "dateLastAvailable": {"type": "date"}
                    }
                },
                "DE": {"type": "object"}
            }
        },
        "rating": {"type": "float"},
        "reviewCount": {"type": "integer"},
        "packagingType": {"type": "keyword"},
        "availability": {"type": "keyword"},
        "instructionsCount": {"type": "integer"},
        "additionalImageCount": {"type": "integer"},
        "ageRange": {
            "properties": {
                "min": {"type": "integer"},
                "max": {"type": "integer"}
            }
        },
        "dimensions": {
            "properties": {
                "height": {"type": "float"},
                "width": {"type": "float"},
                "depth": {"type": "float"},
                "weight": {"type": "float"}
            }
        },
        "barcode": {
            "properties": {
                "EAN": {"type": "keyword"},
                "UPC": {"type": "keyword"}
            }
        },
        "extendedData": {
            "properties": {
                "tags": {"type": "keyword"},
                "description": {"type": "text"}
            }
        },
        "lastUpdated": {"type": "date"},
        "reviews": {
            "type": "nested",
            "properties": {
                "author": {"type": "keyword"},
                "datePosted": {"type": "date"},
                "rating": {
                    "properties": {
                        "overall": {"type": "float"},
                        "parts": {"type": "float"},
                        "buildingExperience": {"type": "float"},
                        "playability": {"type": "float"},
                        "valueForMoney": {"type": "float"}
                    }
                },
                "title": {"type": "text"},
                "review": {"type": "text"},
                "HTML": {"type": "boolean"}
            }
        },
        "rag_description": {"type": "text"}
    }
}


index_name = "lego_sets"

es_client.options(ignore_status=[400,404]).indices.delete(index=index_name)
es_client.indices.create(index=index_name, body={"mappings": mapping})

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'lego_sets'})

In [87]:
for doc in tqdm(lego_doc):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/4000 [00:00<?, ?it/s]

In [101]:
import re
# try:
#         es = Elasticsearch("http://localhost:9200")
#         if not es.ping():
#             raise ConnectionError("Failed to connect to Elasticsearch")
# except ConnectionError as e:
#     print(f"ConnectionError: {e}")
#     print("Please check if Elasticsearch is running and the connection details are correct.")
    
def elastic_search(user_input):
    # Connect to Elasticsearch

    # Index name
    index_name = "lego_sets"

    # Convert the extracted information into an Elasticsearch query
    es_query = {
        "query": {
            "multi_match": {
                "query": user_input,
                "fields": ["rag_description^2","name^3", "theme", "subtheme"],
                "fuzziness": "AUTO"
            }
        },
        "size": 10
    }

    # Execute the search query
    try:
        response = es_client.search(index=index_name, body=es_query)
    except ConnectionError as e:
        print(f"ConnectionError during search: {e}")
        return []
    except RequestError as e:
        print(f"RequestError during search: {e}")
        print("This might be due to an invalid query structure or non-existent fields.")
        return []
    
    result_docs = []

    # Print the search results
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [21]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [22]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [112]:
query = "What disney sets with minifigures and good reviews can you recomend for a girl"

In [113]:
results = elastic_search(query)
display(results)

[{'setID': 10188,
  'number': '71000',
  'numberVariant': 14,
  'name': 'Mr. Good and Evil',
  'year': 2013,
  'theme': 'Collectable Minifigures',
  'themeGroup': 'Miscellaneous',
  'subtheme': 'Series 9',
  'category': 'Normal',
  'released': True,
  'pieces': 6,
  'minifigs': 1,
  'image': {'thumbnailURL': 'https://images.brickset.com/sets/small/71000-14.jpg',
   'imageURL': 'https://images.brickset.com/sets/images/71000-14.jpg'},
  'bricksetURL': 'https://brickset.com/sets/71000-14',
  'collection': {},
  'collections': {'ownedBy': 8365, 'wantedBy': 2047},
  'LEGOCom': {'US': {}, 'UK': {}, 'CA': {}, 'DE': {}},
  'rating': 3.9,
  'reviewCount': 6,
  'packagingType': 'Foil pack',
  'availability': 'Retail',
  'instructionsCount': 0,
  'additionalImageCount': 0,
  'ageRange': {'min': 5},
  'dimensions': {'height': 1.0, 'width': 8.8, 'depth': 11.7},
  'barcode': {'EAN': '5702014973497'},
  'extendedData': {'tags': ['Flask', 'Science', 'Spooky', 'Villain']},
  'lastUpdated': '2021-04-02T

In [114]:
rag(query)

'I recommend the **Disney Castle** set. Released in 2016, it features classic Disney characters like Mickey Mouse and Tinker Bell. With 4080 pieces and a rating of 4.5, it offers a great building experience. This set is ideal for a girl who loves Disney. You can find more information about it [here](https://brickset.com/sets/71040-120164).'