In [1]:
import json
from tqdm.auto import tqdm

In [2]:
def load_from_json(filename):
    try:
        with open(filename, 'r') as f:
            data = json.load(f)
        print(f"Data successfully loaded from {filename}")
        return data
    except IOError as e:
        print(f"An error occurred while loading data from {filename}: {e}")
        return None

In [3]:
lego_doc = load_from_json("sets_reviews.json")

Data successfully loaded from sets_reviews.json


In [4]:
# Flattening the structure
flattened_lego_doc = []

for year, sets in lego_doc.items():
    flattened_lego_doc.extend(sets)

print(flattened_lego_doc[0])

{'setID': 49041, 'number': '10359', 'numberVariant': 1, 'name': '{?}', 'year': 2025, 'theme': 'Icons', 'themeGroup': 'Model making', 'category': 'Normal', 'released': False, 'image': {}, 'bricksetURL': 'https://brickset.com/sets/10359-1', 'collection': {}, 'collections': {'wantedBy': 135}, 'LEGOCom': {'US': {}, 'UK': {}, 'CA': {}, 'DE': {}}, 'rating': 0.0, 'reviewCount': 0, 'packagingType': '{Not specified}', 'availability': '{Not specified}', 'instructionsCount': 0, 'additionalImageCount': 0, 'ageRange': {}, 'dimensions': {}, 'barcode': {}, 'extendedData': {}, 'lastUpdated': '2024-03-04T15:05:35.09Z'}


In [5]:

print(flattened_lego_doc[2000])

{'setID': 33706, 'number': '60335', 'numberVariant': 1, 'name': 'Train Station', 'year': 2022, 'theme': 'City', 'themeGroup': 'Modern day', 'subtheme': 'Trains', 'category': 'Normal', 'released': True, 'pieces': 907, 'minifigs': 6, 'image': {'thumbnailURL': 'https://images.brickset.com/sets/small/60335-1.jpg', 'imageURL': 'https://images.brickset.com/sets/images/60335-1.jpg'}, 'bricksetURL': 'https://brickset.com/sets/60335-1', 'collection': {}, 'collections': {'ownedBy': 3028, 'wantedBy': 1933}, 'LEGOCom': {'US': {'retailPrice': 99.99, 'dateFirstAvailable': '2022-07-26T00:00:00Z'}, 'UK': {'retailPrice': 69.99, 'dateFirstAvailable': '2022-06-01T00:00:00Z'}, 'CA': {'retailPrice': 129.99, 'dateFirstAvailable': '2022-07-26T00:00:00Z'}, 'DE': {'retailPrice': 79.99, 'dateFirstAvailable': '2022-08-04T00:00:00Z'}}, 'rating': 4.3, 'reviewCount': 1, 'packagingType': 'Box', 'availability': 'Retail', 'instructionsCount': 7, 'additionalImageCount': 10, 'ageRange': {'min': 7}, 'dimensions': {'heigh

In [6]:
from openai import OpenAI

In [7]:
client = OpenAI()

# client = OpenAI(
#     base_url='http://localhost:11434/v1/',
#     'ollama',
# )

In [129]:
# prompt_template = """
# Based on a record genearate 5 questions from 5 different customers who are looking 
# for a lego set recomedation. Note that customers don't always know exact lego set name

# The record:

# name: {name}
# year: {year}
# theme: {theme}
# description: {description}

# Provide the output in parsable JSON without using code blocks:

# ["question1", "question2", ..., "question5"]
# """.strip()

prompt_template = """

use context only from provided record. genearate 5 different user prompts that users might enter when searching for a lego set. 
user might not always remember exact set features 
The record:

name: {name}
year: {year}
theme: {theme}
description: {description}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [124]:
def generate_questions(doc):
    name = doc["name"]
    year = ""
    if "year" in doc:
        year = doc["year"]
    theme = doc["theme"]
    #subtheme = "" 
    # if "subtheme" in doc:
    #     subtheme = doc["subtheme"]
    #tags = "" 
    description = "" 
    if "extendedData" in doc:
        ext = doc["extendedData"]
        # if "tags" in doc:
        #     tags = ''.join(doc["tags"])
        if "description" in ext:
            description = ext["description"]
    # reviews = ""
    # if "reviews" in doc:
    #     reviews_arr = map(lambda x: x["review"], doc["reviews"])
    #     reviews = '#'.join(reviews_arr)
        
    prompt = prompt_template.format(name = name, year = year, theme = theme, description = description)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [130]:
generate_questions(flattened_lego_doc[10319])

'[\n    "What is the name of the LEGO set with collectible minifigures from 2012?",\n    "Can anyone tell me about the LEGO Series 6 minifigure box?",\n    "I\'m looking for a 2012 LEGO set that has multiple collectible figures.",\n    "What LEGO collectible minifigures were released in Series 6?",\n    "Where can I find a sealed box of LEGO Minifigures Series 6?" \n]'

In [131]:
results = {}

In [132]:
import random

test_data = random.sample(flattened_lego_doc, 1000)

In [133]:
len(test_data)

1000

In [134]:
for doc in tqdm(test_data): 
    doc_id = doc['setID']
    if doc_id in results:
        continue
    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/1000 [00:00<?, ?it/s]

In [33]:
len(results)

1000

In [135]:
# filename = 'groundTruth_3.json'
# try:
#     with open(filename, 'w') as f:
#         json.dump(results, f, indent=4)
#     print(f"Data successfully saved to {filename}")
# except IOError as e:
#     print(f"An error occurred while saving data to {filename}: {e}")

Data successfully saved to groundTruth_3.json


In [8]:
prompt_template = """
You're a lego shop assistant. Answer the QUESTION based on the CONTEXT from Lego sets database.
Use only the facts from the CONTEXT when answering the QUESTION. Return brief description of the set bsaed on the sample data use bricksetURL to provde it's link

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

def build_prompt(query, search_results):
   

    context = ""
    #+ ''.join(doc.get("extendedData", "")) + ''.join(doc.get("reviews", "")) 
    for doc in search_results:
        #context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        context = context + doc["name"] + doc["bricksetURL"] + str(doc["year"]) + doc["theme"] + doc["category"] + f"\n\n"
        if "minifigs" in doc:
            context = context + str(doc["minifigs"])
        if "themeGroup" in doc:
            context = context + doc["themeGroup"]
        if "reviews" in doc:
            reviews_str = "\n\n".join(
                f"Author: {review['author']}\nDate: {review['datePosted']}\nTitle: {review['title']}\nReview: {review['review']}"
                for review in doc["reviews"]
            )
            context = context + reviews_str
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

vector based search 

In [9]:
from sentence_transformers import SentenceTransformer

In [10]:
model_name = 'all-mpnet-base-v2'#multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [11]:
from elasticsearch import Elasticsearch

In [12]:
es_client = Elasticsearch('http://localhost:9200') 

In [13]:
# Define the mapping for the index
mapping = {
    "properties": {
        "name_vector": {
            "type": "dense_vector",
            "dims": 768,
            "index": True,
            "similarity": "cosine",
        },
        "subtheme_vector": {
            "type": "dense_vector",
            "dims": 768,
            "index": True,
            "similarity": "cosine",
        },
        "category_vector": {
            "type": "dense_vector",
            "dims": 768,
            "index": True,
            "similarity": "cosine",
        },
        "theme_vector": {
            "type": "dense_vector",
            "dims": 768,
            "index": True,
            "similarity": "cosine",
        },
        "tags_vector": {
            "type": "dense_vector",
            "dims": 768,
            "index": True,
            "similarity": "cosine",
        },
        "description_vector": {
            "type": "dense_vector",
            "dims": 768,
            "index": True,
            "similarity": "cosine",
        },
        "review_vector": {
            "type": "dense_vector",
            "dims": 768,
            "index": True,
            "similarity": "cosine",
        },
        "year_vector": {
            "type": "dense_vector",
            "dims": 768,
            "index": True,
            "similarity": "cosine",
        },
        "setID": {"type": "long"},
        "number": {"type": "keyword"},
        "numberVariant": {"type": "integer"},
        "name": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        "year": {"type": "integer", "null_value": 0},
        "theme": {"type": "keyword",  "null_value": ""},
        "themeGroup": {"type": "keyword",  "null_value": ""},
        "category": {"type": "keyword", "null_value": ""},
        "released": {"type": "boolean"},
        "pieces": {"type": "integer"},
        "minifigs": {"type": "integer", "null_value": 0},
        "image": {
            "properties": {
                "thumbnailURL": {"type": "keyword"},
                "imageURL": {"type": "keyword"}
            }
        },
        "bricksetURL": {"type": "keyword"},
        "collection": {"type": "object"},
        "collections": {
            "properties": {
                "ownedBy": {"type": "integer"},
                "wantedBy": {"type": "integer"}
            }
        },
        "LEGOCom": {
            "properties": {
                "US": {
                    "properties": {
                        "retailPrice": {"type": "float"},
                        "dateFirstAvailable": {"type": "date"},
                        "dateLastAvailable": {"type": "date"}
                    }
                },
                "UK": {
                    "properties": {
                        "retailPrice": {"type": "float"},
                        "dateFirstAvailable": {"type": "date"},
                        "dateLastAvailable": {"type": "date"}
                    }
                },
                "CA": {
                    "properties": {
                        "retailPrice": {"type": "float"},
                        "dateFirstAvailable": {"type": "date"},
                        "dateLastAvailable": {"type": "date"}
                    }
                },
                "DE": {"type": "object"}
            }
        },
        "rating": {"type": "float", "null_value": 0.0},
        "reviewCount": {"type": "integer"},
        "packagingType": {"type": "keyword"},
        "availability": {"type": "keyword"},
        "instructionsCount": {"type": "integer"},
        "additionalImageCount": {"type": "integer"},
        "ageRange": {
            "properties": {
                "min": {"type": "integer"},
                "max": {"type": "integer"}
            }
        },
        "dimensions": {
            "properties": {
                "height": {"type": "float"},
                "width": {"type": "float"},
                "depth": {"type": "float"},
                "weight": {"type": "float"}
            }
        },
        "barcode": {
            "properties": {
                "EAN": {"type": "keyword"},
                "UPC": {"type": "keyword"}
            }
        },
        "extendedData": {
            "properties": {
                "tags": {"type": "keyword"},
                "description": {"type": "text"}
            }
        },
        "lastUpdated": {"type": "date"},
        "reviews": {
            "type": "nested",
            "properties": {
                "author": {"type": "keyword"},
                "datePosted": {"type": "date"},
                "rating": {
                    "properties": {
                        "overall": {"type": "float"},
                        "parts": {"type": "float"},
                        "buildingExperience": {"type": "float"},
                        "playability": {"type": "float"},
                        "valueForMoney": {"type": "float"}
                    }
                },
                "title": {"type": "text"},
                "review": {"type": "text"},
                "HTML": {"type": "boolean"}
            }
        },
    }
}


index_name = "lego_sets"

es_client.options(ignore_status=[400,404]).indices.delete(index=index_name)
es_client.indices.create(index=index_name, body={"mappings": mapping})

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'lego_sets'})

In [14]:
def load_from_json(filename):
    try:
        with open(filename, 'r') as f:
            data = json.load(f)
        print(f"Data successfully loaded from {filename}")
        return data
    except IOError as e:
        print(f"An error occurred while loading data from {filename}: {e}")
        return None
    
ground_truth_json = load_from_json('groundTruth_3.json')

Data successfully loaded from groundTruth_3.json


In [15]:
truth_keys = list(ground_truth_json.keys())


In [16]:
flattened_lego_doc_filtered = list(filter(lambda set: str(set['setID']) in truth_keys, flattened_lego_doc))

In [17]:
for doc in tqdm(flattened_lego_doc_filtered):
    if "name" in doc:
        doc['name_vector'] = model.encode(doc["name"])
    
    if "theme" in doc:
        doc['theme_vector'] = model.encode(doc["theme"])

    if "subtheme" in doc:
        doc['subtheme_vector'] = model.encode(doc["subtheme"])
    
    if "category" in doc:
        doc['category_vector'] = model.encode(doc["category"])

    if "extendedData" in doc:
        if "tags" in doc:
            doc['tags_vector'] = model.encode(''.join(doc["tags"]))
        if "description" in doc:
            doc['description_vector'] = model.encode(doc["description"])
 
    if "reviews" in doc:
        reviews_arr = map(lambda x: x["review"], doc["reviews"])
        doc['review_vector'] = model.encode('#'.join(reviews_arr))
    
    if 'year' in doc:
        doc['year_vector'] = model.encode(str(doc['year']))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [51]:
# from elasticsearch.helpers import bulk

# # Function to read documents from the backup file
# def read_documents(filename):
#     with open(filename, 'r') as f:
#         for line in f:
#             yield json.loads(line.strip())

# # Function to prepare documents for bulk indexing
# def doc_generator(documents):
#     for doc in documents:
#         yield {
#             "_index": index_name,
#             "_source": doc
#         }
# # Read documents from the backup file
# documents = read_documents('lego_sets_backup.json')

# # Bulk index the documents
# success, failed = bulk(es_client, doc_generator(tqdm(documents, desc="Indexing documents")), stats_only=True)

# print(f"Indexed {success} documents successfully. {failed} documents failed.")

Indexing documents: 0it [00:00, ?it/s]

Indexed 20930 documents successfully. 0 documents failed.


In [18]:
for doc in tqdm(flattened_lego_doc_filtered):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [19]:
import spacy

In [20]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [21]:
def extract_entities(query):
    doc = nlp(query)
    entities = {
        "name": [],
        "keywords": [],
    }
    
    # Extract named entities
    for ent in doc.ents:
        if ent.label_ == "PERSON" or ent.label_ == "PRODUCT":
            entities["name"].append(ent.text)
    
    # Extract themes and tags (this is a simplistic approach, you might need to refine this)
    for token in doc:
        if token.pos_ == "NOUN":
            entities["keywords"].append(token.text.lower())
    
    return entities

In [22]:
import re
from datetime import datetime

def extract_search_parameters(query):
    params = {
        'user_query': query,
        'year_from': None,
        'year_to': None,
        'pieces_min': None,
        'pieces_max': None
    }
    
    # Extract years
    year_pattern = r'\b(19\d{2}|20\d{2})\b'
    years = re.findall(year_pattern, query)
    if years:
        year_ints = [int(y) for y in years]
        params['year_from'] = min(year_ints)
        params['year_to'] = max(year_ints)
        query = re.sub(year_pattern, '', query)
    
    # Extract date ranges
    date_patterns = [
        (r'\b(?:from|since|after)\s+(19\d{2}|20\d{2})\b', 'year_from'),
        (r'\b(?:to|until|before)\s+(19\d{2}|20\d{2})\b', 'year_to'),
        (r'\b(?:in|during|around)\s+(19\d{2}|20\d{2})\b', 'year_specific')
    ]
    for pattern, param in date_patterns:
        match = re.search(pattern, query, re.IGNORECASE)
        if match:
            year = int(match.group(1))
            if param == 'year_specific':
                params['year_from'] = year
                params['year_to'] = year
            else:
                params[param] = year
            query = re.sub(pattern, '', query, flags=re.IGNORECASE)
    
    # Handle cases where only one year is specified
    current_year = datetime.now().year
    if params['year_from'] and not params['year_to']:
        params['year_to'] = min(params['year_from'] + 5, current_year)
    elif params['year_to'] and not params['year_from']:
        params['year_from'] = max(1949, params['year_to'] - 5)
    
    # Extract piece count
    piece_pattern = r'\b(\d+)(?:\s*-\s*(\d+))?\s*pieces?\b'
    piece_match = re.search(piece_pattern, query, re.IGNORECASE)
    if piece_match:
        params['pieces_min'] = int(piece_match.group(1))
        params['pieces_max'] = int(piece_match.group(2)) if piece_match.group(2) else params['pieces_min']
        query = re.sub(piece_pattern, '', query, flags=re.IGNORECASE)
    
    # Clean up the query
    params['user_query'] = ' '.join(query.split())
    
    return params

In [44]:
def build_elasticsearch_query(params):
    input_embedding = model.encode(params['user_query'])
    query = {
        "query": {
            "bool": {
                "must": [
                    {
                        "multi_match": {
                            "query": params['user_query'],
                            "fields": ["name", "theme", "subtheme", "category"],
                            "type": "best_fields",
                            "fuzziness": "2",
                            "operator": "or"
                        }
                    },
                    {
                        "script_score": {
                        "query": {"match_all": {}},
                        "script": {
                            "source": """
                            double score = 0.0;
                            double vector_score = 0.0;
                            int vector_count = 0;
                            
                            if (doc.containsKey('name_vector') && !doc['name_vector'].empty) {
                                vector_score += cosineSimilarity(params.query_vector, 'name_vector');
                                vector_count++;
                            }
                            if (doc.containsKey('theme_vector') && !doc['theme_vector'].empty) {
                                vector_score += cosineSimilarity(params.query_vector, 'theme_vector');
                                vector_count++;
                            }
                            if (doc.containsKey('subtheme_vector') && !doc['subtheme_vector'].empty) {
                                vector_score += cosineSimilarity(params.query_vector, 'subtheme_vector');
                                vector_count++;
                            }
                            if (doc.containsKey('description_vector') && !doc['description_vector'].empty) {
                                vector_score += cosineSimilarity(params.query_vector, 'description_vector');
                                vector_count++;
                            }

                            if (doc.containsKey('review_vector') && !doc['review_vector'].empty) {
                                vector_score += cosineSimilarity(params.query_vector, 'review_vector');
                                vector_count++;
                            }
                            
                            if (vector_count > 0) {
                                score = vector_score + 1;
                            }

                            return score;
                            """,
                            "params": {
                                "query_vector": input_embedding,
                            }
                    }
                    }
                    }
                ],
                "filter": [
                ]
            },
        },
        "aggs": {
            "themes": {
                "terms": {
                    "field": "theme.keyword",
                    "size": 10
                }
            },
            "years": {
                "date_histogram": {
                    "field": "year",
                    "calendar_interval": "year"
                }
            },
        },
        "size": 15
    }

    # Add year filter if specified
    if params['year_from'] is not None or params['year_to'] is not None:
        year_filter = {"range": {"year": {}}}
        if params['year_from'] is not None:
            year_filter["range"]["year"]["gte"] = params['year_from']
        if params['year_to'] is not None:
            year_filter["range"]["year"]["lte"] = params['year_to']
        query["query"]["bool"]["filter"].append(year_filter)

    # Add piece count filter if specified
    if params['pieces_min'] is not None or params['pieces_max'] is not None:
        piece_filter = {"range": {"pieces": {}}}
        if params['pieces_min'] is not None:
            piece_filter["range"]["pieces"]["gte"] = params['pieces_min']
        if params['pieces_max'] is not None:
            piece_filter["range"]["pieces"]["lte"] = params['pieces_max']
        query["query"]["bool"]["filter"].append(piece_filter)

    return query

In [24]:
from elasticsearch.exceptions import ConnectionError, RequestError

def elastic_search_vector(user_input):
    doc = nlp(user_input)
    cleaned_query = []
    for ent in doc:
        if ent.is_stop == False:
            cleaned_query.append(ent.text)
    user_input = ' '.join(cleaned_query)
    params = extract_search_parameters(user_input) 
    es_query = build_elasticsearch_query(params)

    # Execute the search query
    try:
        response = es_client.search(index=index_name, body=es_query)
    except ConnectionError as e:
        print(f"ConnectionError during search: {e}")
        return str(e)
    except RequestError as e:
        print(f"RequestError during search: {e}")
        print("This might be due to an invalid query structure or non-existent fields.")
        return f"RequestError during search: {e}"

    result_docs = [hit['_source'] for hit in response['hits']['hits']]
    return result_docs

In [36]:
import re

def elastic_search(user_input):
    # Connect to Elasticsearch
#"fields": ["name^3", "theme", "subtheme", "year", "rating", "themeGroup", "minifigs"],
    # Convert the extracted information into an Elasticsearch query
    es_query = {
        "query": {
            "multi_match": {
                "query": user_input,
                "fields": ["name", "theme", "subtheme", "theme"],
                "fuzziness": 2
            }
        },
        "size": 5
    }

    # Execute the search query
    error_info = ""
    try:
        response = es_client.search(index=index_name, body=es_query)
    except ConnectionError as e:
        print(f"ConnectionError during search: {e}")
        error_info = e
        return error_info
    except RequestError as e:
        print(f"RequestError during search: {e}")
        print("This might be due to an invalid query structure or non-existent fields.")
        error_info = f"RequestError during search: {e}"
        return error_info
    
    result_docs = []

    # Print the search results
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [25]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [26]:
def rag(query):
    search_results = elastic_search_vector(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer.strip()

In [27]:
query = "What LEGO gear item is a key light with a momentary switch"

In [28]:
search_params = extract_search_parameters(query)
search_entities = extract_entities(query)

In [29]:
print(f'params = {search_params} \n\n\n entities = {search_entities}')

params = {'user_query': 'What LEGO gear item is a key light with a momentary switch', 'year_from': None, 'year_to': None, 'pieces_min': None, 'pieces_max': None} 


 entities = {'name': [], 'keywords': ['gear', 'item', 'light', 'switch']}


In [30]:
doc = nlp(query)

In [31]:
cleaned_query = []
for ent in doc:
    if ent.is_stop == False:
        cleaned_query.append(ent.text)

In [32]:
query = ' '.join(cleaned_query)
print(query)

LEGO gear item key light momentary switch


In [45]:
results = elastic_search_vector(query)
names = map(lambda x: f'{x["setID"]} {x["name"]}', results)
display(list(names))

['24026 LEGO Friends Emma Key Light',
 '23106 Santa Key Light',
 '23042 2x4 Brick Key Light (Red)',
 '8398 Automatic Right Electric Switch',
 '27217 LEGO House Boy Key Chain',
 '10098 Darth Vader Flashlight',
 '4571 Light Sensor',
 '673 Ultra-Light',
 '2709 Head Wear',
 '134 Road Burner',
 '3395 Aero Hawk',
 '343 Airport Security Squad',
 '5406 1x1 Stud Light Grey',
 '24492 LEGO Ice Brick Tray Red',
 '6692 Shadow Trooper']

In [63]:
rag(query)

'I recommend the **Castle Building Set**. This set features a mini castle that comes with a drawbridge, a minifigure, and some charming elements that would make an ideal backdrop for a rabbit or other whimsical creatures. While there are no specific trees included in the set, it encourages creativity, allowing you to build your own scenes.\n\nYou can find more details about this set here: [Castle Building Set](https://brickset.com/sets/6193-12009)'

In [49]:
# from elasticsearch.helpers import scan

# # Function to handle JSON serialization of special types
# def json_serial(obj):
#     if isinstance(obj, (datetime, date)):
#         return obj.isoformat()
#     raise TypeError(f"Type {type(obj)} not serializable")

# # Scan and scroll through the index
# results = scan(es_client, index=index_name, query={"query": {"match_all": {}}})

# # Save the documents to a file
# with open('lego_sets_backup.json', 'w') as f:
#     for item in tqdm(results, desc="Saving documents"):
#         json.dump(item['_source'], f, default=json_serial)
#         f.write('\n')

# print("Index backup completed and saved to 'lego_sets_backup.json'")

Saving documents: 0it [00:00, ?it/s]

Index backup completed and saved to 'lego_sets_backup.json'


Evaluate Vector

In [34]:
def fixJSON(input_dict):
    """
    Fixes a dictionary containing JSON-like strings and returns a dictionary with properly parsed JSON.
    
    Args:
    input_dict (dict): A dictionary where values are JSON strings representing lists of questions.
    
    Returns:
    dict: A dictionary with the same keys, but values parsed into lists of questions.
    """
    def parse_value(value):
        if isinstance(value, str):
            try:
                # Try to parse the string as JSON
                parsed = json.loads(value)
                # Ensure the parsed result is a list
                if isinstance(parsed, list):
                    return parsed
                else:
                    return [value]  # If not a list, wrap the original string in a list
            except json.JSONDecodeError:
                # If it's not valid JSON, return the original string wrapped in a list
                return [value]
        else:
            # For any other type, return as is
            return value

    return {key: parse_value(value) for key, value in input_dict.items()}

In [35]:
ground_truth = fixJSON(ground_truth_json)

In [36]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [37]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [38]:
def evaluate(ground_truth):
    relevance_total = []

    for setID, questions in tqdm(ground_truth.items()):
        for question in questions:
            results = elastic_search_vector(question)
            ids = map(lambda x: x["setID"], results)
            relevance = [str(d['setID']) == setID for d in results]
            relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total)
    }

In [46]:
evaluate(ground_truth)

  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.676, 'mrr': 0.5139198257298251}