In [16]:
import os
import re

import unicodedata
import time
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
import torch
from torch import py_int
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoModel
from transformers import logging as transformers_logging
from sentence_transformers import SentenceTransformer
from tabulate import tabulate
transformers_logging.set_verbosity_error()
import json

import openpyxl

# Silencing TqdmWarning
import warnings
warnings.filterwarnings('ignore')

In [17]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words_to_keep = ["what", "when", "where", "which", "while", "who", "whom", "why", "with", "how", "before", "after","same"]
stop_words = set([s for s in stopwords.words('english') if s not in stop_words_to_keep])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kevinbrundler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
class NERParser:
    def __init__(self, model_name: str = "dslim/bert-base-NER", lowercase: bool = False):
        """
        Initialize the NER parser with a model and optionally configure the lowercase preprocessing.
        """
        self.model_name = model_name
        self.lowercase = lowercase
        self.device = self.get_device()
        
        # Load the tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, do_lower_case=self.lowercase)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        
        # Set up the NER pipeline
        self.nlp_pipeline = pipeline(
            "ner", 
            model=self.model, 
            tokenizer=self.tokenizer, 
            device=self.device, 
            aggregation_strategy="simple"
        )

    def get_device(self):
        """
        Determines whether to use MPS, CUDA, or CPU depending on the available hardware.
        """
        if torch.backends.mps.is_available():
            print("MPS device found, using MPS backend.\n")
            return torch.device("mps")
        elif torch.cuda.is_available():
            print(f"CUDA device found, using CUDA backend. Device: {torch.cuda.get_device_name(0)}\n")
            return torch.device("cuda")
        else:
            print("Neither MPS nor CUDA found, using CPU.\n")
            return torch.device("cpu")

    
    def parse_ner_results(self, ner_results: list):
        """
        Parse the NER results and extract entities related to 'PER' (persons) and 'MISC' (potential movie titles).
        """
        per_entities, misc_entities = [], []
        
        for entity in ner_results:
            # Extraction of all Persons
            if entity['entity_group'] == 'PER':
                per_entities.append(entity['word'])
            # Extraction of all Misc that could indicate movies
            elif entity['entity_group'] == 'MISC':
                misc_entities.append(entity['word'])
        
        return per_entities, misc_entities

    
    def process_query(self, query: str):
        """
        Processes a text query, runs NER, and returns the extracted actors and movie names.
        """
        # Optionally lowercase the input if configured
        if self.lowercase:
            query = query.lower()
        
        # Run the NER pipeline
        ner_results = self.nlp_pipeline(query)

        # Parse the results to extract actors and movies
        per_entities, misc_entities = self.parse_ner_results(ner_results)
        
        return per_entities, misc_entities


In [19]:
class DataBase:
    """Handles the extraction of context data for given people and movies from a database, with fuzzy matching for names."""

    def __init__(self):
        self.db = pd.read_pickle(os.path.join(os.getcwd(), r"exports/extended_graph_triples.pkl"))
        
        with open(r"exports/entity_db.json", encoding="utf-8") as f: 
            self.entities = json.load(f)
            self.entity_list = list(subject.lower() for subject, types in self.entities.values())
        
        # otherwise exact matching will fail
        self.db['subject_id'] = self.db['subject_id'].astype(str).str.strip()

    @staticmethod
    def normalize_string(s):
        """Cleans the input entity to a uniform naming convention, by removing non ascii characters, encoding it to utf, setting it to lowercase, and removing redundant spaces"""
        s = s.lower()
        s = unicodedata.normalize('NFKD', s)
        s = s.encode('ascii', 'ignore').decode('utf-8')
        s = re.sub(r'[^\w\s]', '', s)
        s = ' '.join(s.split())
        return s
    
    def fetch(self, entity_lst, search_column):
        
        relevant = self.db[self.db[search_column].isin(entity_lst)].dropna(axis=1)
        
        if relevant.empty:
            return pd.DataFrame()          
        
        pivot_df = relevant.pivot_table(
            index='subject_id',
            columns='predicate_label',
            values='object_label',
            aggfunc=lambda x: ' | '.join(x.astype(str))
        )
    
        pivot_df.reset_index(inplace=True)
    
        return pivot_df

In [20]:
class QueryEmbedder:
    
    def __init__(self):
        self._glove_embeddings = self._load_glove_embeddings("exports/glove.6B/glove.6B.300d.txt")
        
    def _load_glove_embeddings(self, file_path):
        embeddings = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word] = vector
        return embeddings
    
    def embed_phrase(self, phrase):
        words = phrase.split()
        word_vectors = [self._glove_embeddings[word.lower()] for word in words if word.lower() in self._glove_embeddings]
        
        if len(word_vectors) == 0:
            return np.zeros(300)
        
        return np.mean(word_vectors, axis=0)
    

In [21]:
class QueryEmbedderContextualized:
    def __init__(self, model_name='sentence-transformers/all-mpnet-base-v2', device=None):
        """
        Initializes the QueryEmbedder with a SentenceTransformer model for sentence embeddings.
        """
        self.device = self.get_device()
        self.model = SentenceTransformer(model_name, device=self.device)
        self.cache = {}
    
    
    def get_device(self):
        """
        Determines whether to use MPS, CUDA, or CPU depending on the available hardware.
        """
        if torch.backends.mps.is_available():
            print("MPS device found, using MPS backend.\n")
            return torch.device("mps")
        elif torch.cuda.is_available():
            print(f"CUDA device found, using CUDA backend. Device: {torch.cuda.get_device_name(0)}\n")
            return torch.device("cuda")
        else:
            print("Neither MPS nor CUDA found, using CPU.\n")
            return torch.device("cpu")
    
    def embed_phrase(self, phrases):
        """
        Generates embeddings for given phrases using SentenceTransformer.

        Args:
            phrases (str or List[str]): The input phrase(s) to embed.

        Returns:
            np.ndarray: The embedding vector(s) for the phrase(s).
        """
        if isinstance(phrases, str):
            phrases = [phrases]
        elif not isinstance(phrases, list):
            raise TypeError("Input must be a string or a list of strings.")

        embeddings = []
        phrases_to_compute = []
        indices_to_compute = []

        for idx, phrase in enumerate(phrases):
            if phrase in self.cache:
                embeddings.append(self.cache[phrase])
            else:
                embeddings.append(None)
                phrases_to_compute.append(phrase)
                indices_to_compute.append(idx)
        
        if phrases_to_compute:
            new_embeddings = self.model.encode(
                phrases_to_compute, 
                show_progress_bar=False, 
                convert_to_numpy=True, 
                normalize_embeddings=True
            )
            for idx, emb in zip(indices_to_compute, new_embeddings):
                embeddings[idx] = emb
                self.cache[phrases[idx]] = emb
        
        # Return embeddings
        if len(embeddings) == 1:
            return embeddings[0]
        else:
            return np.array(embeddings)

In [22]:
class LLM():
    
    def __init__(self):
        self.qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", top_k=1)
    
    def query(self, query, context_df):
        
        top_columns = context_df.columns
        
        context = ""
        for index, row in context_df.iterrows():
            node_label = row.get("node label", "")
            
            row_context = f"This text is about \"{node_label}\":\n"
            
            for col in context_df[top_columns].columns:
                if col == "node label":
                    continue
                
                values = row[col]
                values_lst = str(values).split(",")
                
                if len(values_lst) > 5:
                    row_context += f"{col}: {', '.join(values_lst[:5])}"
                else:
                    row_context += f"{col}: {', '.join(values_lst)}"

            context += row_context + "\n\n"
        
        output = self.qa_model(question=query, context=context)
        
        answer_str = str()
        if isinstance(output, list) and output:
            answer_str = ", ".join([result['answer'] for result in output])
            
        elif isinstance(output, dict):
            answer_str = output['answer']
        
        if not answer_str:
            answer_str = "No answer found."
        
        return answer_str

In [30]:
def cosine_sim(vec1, vec2):
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0 
    
    return np.dot(vec1, vec2) / (norm_vec1 * norm_vec2)


def rescale_probabilities(similarities):
    """
    Rescales the similarity scores so that they sum to 1, turning them into a probability distribution.
    
    Args:
        similarities (List[float]): List of similarity scores.
        
    Returns:
        List[float]: Rescaled probabilities.
    """
    similarity_sum = sum(similarities)
    if similarity_sum == 0:
        return [0] * len(similarities)  # Avoid division by zero
    
    return [sim / similarity_sum for sim in similarities]

def find_closest_columns(query_embeddings, column_embeddings, high_threshold=0.4, top_n=10, rescaled_threshold=0.11):
    """
    Returns columns based on cosine similarity with a two-tiered strategy and rescaled probabilities.
    - If a column has similarity above 'high_threshold', return that column immediately.
    - Otherwise, return all columns with a similarity greater than 'low_threshold'.
    - Rescale the top N column similarities into probabilities and return columns with a rescaled probability greater than rescaled_threshold.
    
    Args:
        query_embeddings (List[np.ndarray]): Embeddings for query words.
        column_embeddings (Dict[str, np.ndarray]): Precomputed embeddings for columns.
        low_threshold (float): Minimum similarity threshold (default: 0.27).
        high_threshold (float): Confidence threshold to return immediately (default: 0.35).
        top_n (int): Number of top columns to consider for rescaling (default: 10).
        rescaled_threshold (float): Minimum rescaled probability threshold (default: 0.1).
    
    Returns:
        List[str]: The selected column names.
    """
    column_similarities = {}

    for col, col_vec in column_embeddings.items():
        similarities = [cosine_sim(col_vec, q_vec) for q_vec in query_embeddings if np.linalg.norm(q_vec) > 0]
        column_similarities[col] = np.mean(similarities) if similarities else -1

    sorted_columns = sorted(column_similarities.items(), key=lambda item: item[1], reverse=True)
    top_columns = sorted_columns[:top_n]
    
    column_names, similarities = zip(*top_columns)
    
    rescaled_probs = rescale_probabilities(similarities)
    
    selected_columns = []
    
    for col, sim in zip(column_names, similarities):
        if sim >= high_threshold:
            print(f"High confidence match found: {col} with similarity {sim: .4f}")
            return [col]
    
    for col, rescaled_prob in zip(column_names, rescaled_probs):
        if rescaled_prob >= rescaled_threshold:
            print(f"Column {col} has similarity {rescaled_prob: .4f}")
            selected_columns.append(col)
    
    return selected_columns

In [31]:
db = DataBase()

#qe = QueryEmbedder()

ner_parser = NERParser(lowercase=False)

qe = QueryEmbedderContextualized()

llm = LLM()

MPS device found, using MPS backend.

MPS device found, using MPS backend.



In [32]:
def filter_query(query, node_label):
                
    if not len(query):
        return []
    
    relevant = []
    for word in query.replace(". ", " ").lower().split(" "):
        cleaned_word = re.sub(r'[^A-Za-z]', '', word)
        if cleaned_word in stop_words or cleaned_word in node_label.lower().replace(" ", "") or cleaned_word == "":
            continue
        
        relevant.append(cleaned_word)
        
    return " ".join(relevant)

In [33]:
def fuzzy_match(query_str, comparison_list, threshold=30, prioritize_exact=True):
    matches = process.extract(query_str, comparison_list, scorer=fuzz.partial_ratio, limit=50)
        
    id_name_score = []
    
    if prioritize_exact and query_str in comparison_list:
        matched_id = next(key for key, value in db.entities.items() if value[0] == query_str)
        id_name_score.append((matched_id, query_str, 100))
    
    for match in matches:
        name = match[0]
        score = match[1]
        matched_id = next(key for key, value in db.entities.items() if value[0] == name)
        
        length_diff = abs(len(name) - len(query_str)) / len(query_str)
        adjusted_score = score * (1 - length_diff)
        
        id_name_score.append((matched_id, name, adjusted_score))
    
    return [id for id, _, score in id_name_score if score >= threshold]

In [34]:
def get_top_matches(df, normalized_query, top_n=2):
    concatenated_rows = df.apply(lambda row: ' '.join(row.astype(str)), axis=1).tolist()
    
    exact_matches = [i for i, row in enumerate(concatenated_rows) if normalized_query == row]
    
    if len(exact_matches) < top_n:
        remaining_slots = top_n - len(exact_matches)
        fuzzy_matches = process.extract(normalized_query, concatenated_rows, scorer=fuzz.partial_ratio, limit=remaining_slots)
        fuzzy_indices = [match[2] for match in fuzzy_matches]
    else:
        fuzzy_indices = []
    
    top_indices = exact_matches + fuzzy_indices
    
    return df.iloc[top_indices]

In [35]:
import time
from functools import wraps

# Decorator to measure execution time
def measure_time(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()  # Record the start time
        result = func(*args, **kwargs)  # Call the function
        end_time = time.time()  # Record the end time
        elapsed_time = end_time - start_time  # Calculate elapsed time
        print(f"Execution time for {func.__name__}: {elapsed_time:.4f} seconds")
        return result
    return wrapper

In [88]:
@measure_time
def answer_query(query, correct_answer=""):
    normalized_query = db.normalize_string(query)
    
    entity_matches = fuzzy_match(normalized_query, db.entity_list, threshold=30)
    
    # NER Model and NER Matching
    ner_person, ner_movies = ner_parser.process_query(query)
    
    if len(ner_movies):
        ner_movie_entities = fuzzy_match(" ".join(ner_movies), db.entity_list, threshold=75)
        subjects_ner_movies = db.fetch(ner_movie_entities, "subject_id")
        context_ner_movies = get_top_matches(subjects_ner_movies, normalized_query, top_n=1) 
            
    else:
        context_ner_movies = pd.DataFrame()
    
    if len(ner_person):
        ner_person_entities = fuzzy_match(" ".join(ner_person), db.entity_list, threshold=75)
        subjects_ner_person = db.fetch(ner_person_entities, "subject_id")
        context_ner_person = get_top_matches(subjects_ner_person, normalized_query, top_n=1)   
                
    else:
        context_ner_person = pd.DataFrame()
    
    # Fuzzy Matching
    subjects = db.fetch(entity_matches, "subject_id")   
    context = get_top_matches(subjects, normalized_query, top_n=1)    
    
    ner_context = pd.concat([context_ner_movies, context_ner_person])
        
    if not ner_context.empty:
        context = ner_context
    
    try:
        node_label = context["node label"].values[0]
    except Exception:
        node_label = ""
    
    if context.empty:
        print("No context data found for given IDs or string")
        context = pd.DataFrame()
        
        #Fallback Strategy
        
    # EXPERIMENTAL - remove unused columns
    elements_to_remove = ["image", "color", "sport"]
    context = context.drop(columns=elements_to_remove, errors='ignore')
    
    # EXPERIMENTAL - rename columns
    columns_to_rename = {
        "cast member":"movie cast",
        "notable work": "acted in"
    }
    
    columns_to_rename = {k: v for k, v in columns_to_rename.items() if k in context.columns}
    context = context.rename(columns=columns_to_rename)
    
    columns_to_duplicate = [("acted in", "played in"),
                            ("acted in", "appeared in"),
                            ("movie cast", "actors"),
                            ("movie cast", "players")]
    
    for col_to_duplicate, col in columns_to_duplicate: 
        try:
            context[col] = context[col_to_duplicate].copy()
        except KeyError:
            pass
        
    context.dropna(axis=1, inplace=True)
    
    query_filtered = filter_query(query, node_label)
        
    column_embeddings = {col: qe.embed_phrase(col) for col in context.columns}
    query_embeddings = [qe.embed_phrase(word) for word in query_filtered.split()]  
    top_columns_embeddings = find_closest_columns(query_embeddings, column_embeddings)
    
    # EXPERIMENTAL
    top_columns_dict = process.extract(normalized_query, context.columns, scorer=fuzz.partial_ratio, limit=3)
    top_columns_fuzzy = [c[0] for c in top_columns_dict]
    
    # MANUAL OVERWRITE:
    top_columns_fuzzy = []
        
    # EXPERIMENTAL - always keep columns
    col_always_keep = ["node label"]
    
    combined_columns = set(top_columns_fuzzy + top_columns_embeddings + col_always_keep)
    top_columns = [col for col in combined_columns if col in context.columns]
    filtered_context_df = context[top_columns]

    answer = llm.query(query, filtered_context_df)
    
    print("Answer")
    print(answer)
    
    ### EXPERIMENTAL
    #filtered_context_df["hint"] = answer
    #answer = llm.query(query, filtered_context_df)
        
    #normalized_answer = db.normalize_string(answer)
    #normalized_correct_answer = db.normalize_string(correct_answer)
    
    print(f"{'CORRECT' if answer and answer in correct_answer else 'WRONG'} - {query} - {answer}")


In [37]:
answer_query("Who is the director of Good Will Hunting?", "Gus Van Sant")

High confidence match found: director with similarity  0.6866
Answer
gus van sant
WRONG - Who is the director of Good Will Hunting? - gus van sant
Execution time for answer_query: 0.8751 seconds


In [38]:
answer_query("Who directed The Bridge on the River Kwai?", "David Lean")

High confidence match found: director with similarity  0.5111
Answer
david lean
WRONG - Who directed The Bridge on the River Kwai? - david lean
Execution time for answer_query: 0.7343 seconds


In [39]:
answer_query("Who directed the movie The Godfather?", "Francis Ford Coppola") # or Mario Puzo or Francis Ford Coppola

High confidence match found: director with similarity  0.4993
Answer
francis ford coppola
WRONG - Who directed the movie The Godfather? - francis ford coppola
Execution time for answer_query: 0.8252 seconds


In [40]:
answer_query("Who is the director of The Dark Knight?", "Christopher Nolan")

High confidence match found: director with similarity  0.6866
Answer
christopher nolan
WRONG - Who is the director of The Dark Knight? - christopher nolan
Execution time for answer_query: 0.7688 seconds


In [41]:
answer_query("Who directed The Dark Knight?", "Christopher Nolan")

High confidence match found: director with similarity  0.5111
Answer
christopher nolan
WRONG - Who directed The Dark Knight? - christopher nolan
Execution time for answer_query: 0.5735 seconds


In [42]:
answer_query("Where was Angelina Jolie born?", "Los Angeles")

High confidence match found: place of birth with similarity  0.4204
Answer
los angeles
WRONG - Where was Angelina Jolie born? - los angeles
Execution time for answer_query: 0.7665 seconds


In [43]:
answer_query("Which role had Leonardo Di Caprio in Inception?", "actor")

Column director has similarity  0.1256
Answer
director
WRONG - Which role had Leonardo Di Caprio in Inception? - director
Execution time for answer_query: 0.8958 seconds


In [44]:
answer_query("Who directed Inception?", "christopher nolan")

High confidence match found: director with similarity  0.5111
Answer
christopher nolan
CORRECT - Who directed Inception? - christopher nolan
Execution time for answer_query: 0.3803 seconds


In [45]:
answer_query("What is the genre of Good Neighbors?", "comedy-drama") # Could also be comedy-drama, and comedy film. 

High confidence match found: genre with similarity  0.6418
Answer
art film
WRONG - What is the genre of Good Neighbors? - art film
Execution time for answer_query: 0.5601 seconds


In [46]:
answer_query("What is the MPAA film rating of Weathering with You?", "PG-13")

High confidence match found: mpaa film rating with similarity  0.4317
Answer
nc17
WRONG - What is the MPAA film rating of Weathering with You? - nc17
Execution time for answer_query: 0.7856 seconds


In [47]:
answer_query("Who is the director of Star Wars: Episode VI - Return of the Jedi?", "Richard Marquand")

High confidence match found: director with similarity  0.6866
Answer
richard marquand
WRONG - Who is the director of Star Wars: Episode VI - Return of the Jedi? - richard marquand
Execution time for answer_query: 0.7615 seconds


In [48]:
answer_query("Who is the screenwriter of The Masked Gang: Cyprus?", "Murat Aslan")

High confidence match found: screenwriter with similarity  0.4989
Answer
murat aslan
WRONG - Who is the screenwriter of The Masked Gang: Cyprus? - murat aslan
Execution time for answer_query: 0.5764 seconds


In [49]:
answer_query("In which movie did Angelina Jolie play?", "Interrupted")

Column played in has similarity  0.1238
Column acted in has similarity  0.1176
Answer
changeling
WRONG - In which movie did Angelina Jolie play? - changeling
Execution time for answer_query: 0.4981 seconds


In [50]:
answer_query("In which movie did Brad Pitt act?")

Column actors has similarity  0.1387
Column movie cast has similarity  0.1224
Answer
the tree of life
WRONG - In which movie did Brad Pitt act? - the tree of life
Execution time for answer_query: 0.5178 seconds


In [51]:
answer_query("In which movie did Liam Neeson act?")

High confidence match found: acted in with similarity  0.4722
Answer
star wars
WRONG - In which movie did Liam Neeson act? - star wars
Execution time for answer_query: 0.4318 seconds


In [52]:
answer_query("In which movie did Liam Neeson play?")

Column played in has similarity  0.1265
Column acted in has similarity  0.1202
Answer
star wars
WRONG - In which movie did Liam Neeson play? - star wars
Execution time for answer_query: 0.4612 seconds


In [53]:
answer_query("In which movie did Rebel Wilson act?")

Column actors has similarity  0.1657
Column movie cast has similarity  0.1462
Column imdb id has similarity  0.1247
Column occupation has similarity  0.1230
Answer
ghost rider
WRONG - In which movie did Rebel Wilson act? - ghost rider
Execution time for answer_query: 0.4260 seconds


In [54]:
answer_query("In which movie did Tom Cruise play?")

Column actors has similarity  0.1180
Column players has similarity  0.1113
Answer
days of thunder
WRONG - In which movie did Tom Cruise play? - days of thunder
Execution time for answer_query: 0.4609 seconds


In [55]:
answer_query("When was the Godfather III released?")

Column appeared in has similarity  0.1264
Column publication date has similarity  0.1237
Answer
1974-12-12
WRONG - When was the Godfather III released? - 1974-12-12
Execution time for answer_query: 0.5597 seconds


In [56]:
answer_query("Who are the cast in Jurassic Park?")

High confidence match found: movie cast with similarity  0.5334
Answer
bd wong, laura dern, sam neill, samuel l jackson
WRONG - Who are the cast in Jurassic Park? - bd wong, laura dern, sam neill, samuel l jackson
Execution time for answer_query: 0.5631 seconds


In [57]:
answer_query("Who is an actor in Jurassic Park?")

High confidence match found: actors with similarity  0.5670
Answer
wayne knight
WRONG - Who is an actor in Jurassic Park? - wayne knight
Execution time for answer_query: 0.5303 seconds


In [58]:
answer_query("Who is an actor in Inception?")

High confidence match found: actors with similarity  0.5670
Answer
tohoru masamune
WRONG - Who is an actor in Inception? - tohoru masamune
Execution time for answer_query: 0.5407 seconds


In [61]:
answer_query("Who is the director of Star Wars?")

High confidence match found: director with similarity  0.4614
Answer
james glickenhaus
WRONG - Who is the director of Star Wars? - james glickenhaus
Execution time for answer_query: 0.5750 seconds


In [62]:
answer_query("When was Inception released?")

High confidence match found: publication date with similarity  0.4198
Answer
2010-07-08
WRONG - When was Inception released? - 2010-07-08
Execution time for answer_query: 0.5275 seconds


In [65]:
answer_query("Who was Angelina Jolie married to?")

High confidence match found: spouse with similarity  0.4737
Answer
billy bob thornton
WRONG - Who was Angelina Jolie married to? - billy bob thornton
Execution time for answer_query: 0.6448 seconds


In [66]:
answer_query("Who was Brad Pitt married to?")

High confidence match found: spouse with similarity  0.4737
Answer
jennifer aniston, angelina jolie
WRONG - Who was Brad Pitt married to? - jennifer aniston, angelina jolie
Execution time for answer_query: 0.5027 seconds


In [81]:
answer_query("When was Titanic published?")

High confidence match found: publication date with similarity  0.5336
Answer
1953-01-01
WRONG - When was Titanic published? - 1953-01-01
Execution time for answer_query: 0.4498 seconds


In [90]:
answer_query("Who is the main actor in harry potter and the philosopher's stone?")

Empty DataFrame
Columns: []
Index: []
High confidence match found: actors with similarity  0.4514
Answer
fiona shaw
WRONG - Who is the main actor in harry potter and the philosopher's stone? - fiona shaw
Execution time for answer_query: 0.5463 seconds
