In [1]:
import os
import re

import unicodedata
import time
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
import torch
from torch import py_int
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoModel
from transformers import logging as transformers_logging
from sentence_transformers import SentenceTransformer
from tabulate import tabulate
transformers_logging.set_verbosity_error()
import json

# Silencing TqdmWarning
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words_to_keep = ["what", "when", "where", "which", "while", "who", "whom", "why", "with", "how", "before", "after","same"]
stop_words = set([s for s in stopwords.words('english') if s not in stop_words_to_keep])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kevinbrundler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
class NERParser:
    def __init__(self, model_name: str = "dslim/bert-base-NER", lowercase: bool = False):
        """
        Initialize the NER parser with a model and optionally configure the lowercase preprocessing.
        """
        self.model_name = model_name
        self.lowercase = lowercase
        self.device = self.get_device()
        
        # Load the tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, do_lower_case=self.lowercase)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        
        # Set up the NER pipeline
        self.nlp_pipeline = pipeline(
            "ner", 
            model=self.model, 
            tokenizer=self.tokenizer, 
            device=self.device, 
            aggregation_strategy="simple"
        )

    def get_device(self):
        """
        Determines whether to use MPS, CUDA, or CPU depending on the available hardware.
        """
        if torch.backends.mps.is_available():
            print("MPS device found, using MPS backend.\n")
            return torch.device("mps")
        elif torch.cuda.is_available():
            print(f"CUDA device found, using CUDA backend. Device: {torch.cuda.get_device_name(0)}\n")
            return torch.device("cuda")
        else:
            print("Neither MPS nor CUDA found, using CPU.\n")
            return torch.device("cpu")

    
    def parse_ner_results(self, ner_results: list):
        """
        Parse the NER results and extract entities related to 'PER' (persons) and 'MISC' (potential movie titles).
        """
        per_entities, misc_entities = [], []
        
        for entity in ner_results:
            # Extraction of all Persons
            if entity['entity_group'] == 'PER':
                per_entities.append(entity['word'])
            # Extraction of all Misc that could indicate movies
            elif entity['entity_group'] == 'MISC':
                misc_entities.append(entity['word'])
        
        return per_entities, misc_entities

    
    def process_query(self, query: str):
        """
        Processes a text query, runs NER, and returns the extracted actors and movie names.
        """
        # Optionally lowercase the input if configured
        if self.lowercase:
            query = query.lower()
        
        # Run the NER pipeline
        ner_results = self.nlp_pipeline(query)

        # Parse the results to extract actors and movies
        per_entities, misc_entities = self.parse_ner_results(ner_results)
        
        return per_entities, misc_entities

ner_parser = NERParser(lowercase=False)

MPS device found, using MPS backend.



In [4]:
class DataBase:
    """Handles the extraction of context data for given people and movies from a database, with fuzzy matching for names."""

    def __init__(self):
        self.db = pd.read_pickle(os.path.join(os.getcwd(), r"exports/extended_graph_triples.pkl"))
        
        with open(r"exports/entity_db.json", encoding="utf-8") as f: 
            self.entities = json.load(f)
            self.entity_list = list(subject.lower() for subject, types in self.entities.values())
        
        # otherwise exact matching will fail
        self.db['subject_id'] = self.db['subject_id'].astype(str).str.strip()

    @staticmethod
    def normalize_string(s):
        """Cleans the input entity to a uniform naming convention, by removing non ascii characters, encoding it to utf, setting it to lowercase, and removing redundant spaces"""
        s = s.lower()
        s = unicodedata.normalize('NFKD', s)
        s = s.encode('ascii', 'ignore').decode('utf-8')
        s = re.sub(r'[^\w\s]', '', s)
        s = ' '.join(s.split())
        return s
    
    def fetch(self, entity_lst, search_column):
        
        relevant = self.db[self.db[search_column].isin(entity_lst)].dropna(axis=1)
           
        if relevant.empty:
            print(f"No context data found for given information.")
            return pd.DataFrame()          
        
        pivot_df = relevant.pivot_table(
            index='subject_id',
            columns='predicate_label',
            values='object_label',
            aggfunc=lambda x: ' | '.join(x.astype(str))
        )
    
        pivot_df.reset_index(inplace=True)
    
        return pivot_df

In [5]:
class QueryEmbedder:
    
    def __init__(self):
        self._glove_embeddings = self._load_glove_embeddings("exports/glove.6B/glove.6B.300d.txt")
        
    def _load_glove_embeddings(self, file_path):
        embeddings = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word] = vector
        return embeddings
    
    def embed_phrase(self, phrase):
        words = phrase.split()
        word_vectors = [self._glove_embeddings[word.lower()] for word in words if word.lower() in self._glove_embeddings]
        
        if len(word_vectors) == 0:
            return np.zeros(300)
        
        return np.mean(word_vectors, axis=0)
    

In [6]:
class QueryEmbedderContextualized:
    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2', device=None):
        """
        Initializes the QueryEmbedder with a SentenceTransformer model for sentence embeddings.
        """
        self.device = self.get_device()
        self.model = SentenceTransformer(model_name, device=self.device)
        self.cache = {}
    
    
    def get_device(self):
        """
        Determines whether to use MPS, CUDA, or CPU depending on the available hardware.
        """
        if torch.backends.mps.is_available():
            print("MPS device found, using MPS backend.\n")
            return torch.device("mps")
        elif torch.cuda.is_available():
            print(f"CUDA device found, using CUDA backend. Device: {torch.cuda.get_device_name(0)}\n")
            return torch.device("cuda")
        else:
            print("Neither MPS nor CUDA found, using CPU.\n")
            return torch.device("cpu")
    
    def embed_phrase(self, phrases):
        """
        Generates embeddings for given phrases using SentenceTransformer.

        Args:
            phrases (str or List[str]): The input phrase(s) to embed.

        Returns:
            np.ndarray: The embedding vector(s) for the phrase(s).
        """
        # Ensure phrases is a list
        if isinstance(phrases, str):
            phrases = [phrases]
        elif not isinstance(phrases, list):
            raise TypeError("Input must be a string or a list of strings.")

        embeddings = []
        phrases_to_compute = []
        indices_to_compute = []

        for idx, phrase in enumerate(phrases):
            if phrase in self.cache:
                embeddings.append(self.cache[phrase])
            else:
                embeddings.append(None)
                phrases_to_compute.append(phrase)
                indices_to_compute.append(idx)
        
        if phrases_to_compute:
            new_embeddings = self.model.encode(
                phrases_to_compute, 
                show_progress_bar=False, 
                convert_to_numpy=True, 
                normalize_embeddings=True
            )
            for idx, emb in zip(indices_to_compute, new_embeddings):
                embeddings[idx] = emb
                self.cache[phrases[idx]] = emb
        
        # Return embeddings
        if len(embeddings) == 1:
            return embeddings[0]
        else:
            return np.array(embeddings)

In [7]:
class LLM():
    
    def __init__(self):
        self.qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", top_k=1)
    
    def query(self, query, context_df):
        
        top_columns = context_df.columns
        
        context = ""
        for index, row in context_df.iterrows():
            node_label = row.get("node label", "Unknown")
            row_context = f"This text is about \"{node_label}\":\n"
            row_context += "\n".join([f"{col}: {row[col]}" for col in context_df[top_columns].columns if col != "node label"])
            row_context += "\n\n"
            context += row_context
        
        output = self.qa_model(question=query, context=context)
        
        answer_str = str()
        if isinstance(output, list) and output:
            answer_str = ", ".join([result['answer'] for result in output])
            
        elif isinstance(output, dict):
            answer_str = output['answer']
        
        if not answer_str:
            answer_str = "No answer found."
        
        return answer_str

In [316]:
def cosine_sim(vec1, vec2):
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0 
    
    return np.dot(vec1, vec2) / (norm_vec1 * norm_vec2)


def rescale_probabilities(similarities):
    """
    Rescales the similarity scores so that they sum to 1, turning them into a probability distribution.
    
    Args:
        similarities (List[float]): List of similarity scores.
        
    Returns:
        List[float]: Rescaled probabilities.
    """
    similarity_sum = sum(similarities)
    if similarity_sum == 0:
        return [0] * len(similarities)  # Avoid division by zero
    
    return [sim / similarity_sum for sim in similarities]

def find_closest_columns(query_embeddings, column_embeddings, high_threshold=0.4, top_n=10, rescaled_threshold=0.1):
    """
    Returns columns based on cosine similarity with a two-tiered strategy and rescaled probabilities.
    - If a column has similarity above 'high_threshold', return that column immediately.
    - Otherwise, return all columns with a similarity greater than 'low_threshold'.
    - Rescale the top N column similarities into probabilities and return columns with a rescaled probability greater than rescaled_threshold.
    
    Args:
        query_embeddings (List[np.ndarray]): Embeddings for query words.
        column_embeddings (Dict[str, np.ndarray]): Precomputed embeddings for columns.
        low_threshold (float): Minimum similarity threshold (default: 0.27).
        high_threshold (float): Confidence threshold to return immediately (default: 0.35).
        top_n (int): Number of top columns to consider for rescaling (default: 10).
        rescaled_threshold (float): Minimum rescaled probability threshold (default: 0.1).
    
    Returns:
        List[str]: The selected column names.
    """
    column_similarities = {}

    for col, col_vec in column_embeddings.items():
        similarities = [cosine_sim(col_vec, q_vec) for q_vec in query_embeddings if np.linalg.norm(q_vec) > 0]
        column_similarities[col] = np.mean(similarities) if similarities else -1

    sorted_columns = sorted(column_similarities.items(), key=lambda item: item[1], reverse=True)
    top_columns = sorted_columns[:top_n]
    
    column_names, similarities = zip(*top_columns)
    
    rescaled_probs = rescale_probabilities(similarities)
    
    selected_columns = []
    
    for col, sim in zip(column_names, similarities):
        if sim >= high_threshold:
            print(f"High confidence match found: {col} with similarity {sim: .4f}")
            return [col]
    
    for col, rescaled_prob in zip(column_names, rescaled_probs):
        print(f"Column {col} has similarity {rescaled_prob: .4f}")
        if rescaled_prob >= rescaled_threshold:
            selected_columns.append(col)
    
    return selected_columns

In [317]:
db = DataBase()

#qe = QueryEmbedder()

qe = QueryEmbedderContextualized()

llm = LLM()

MPS device found, using MPS backend.



In [318]:
def filter_query(query, node_label):
                
    if not len(query):
        return []
    
    relevant = []
    for word in query.replace(". ", " ").lower().split(" "):
        cleaned_word = re.sub(r'[^A-Za-z]', '', word)
        if cleaned_word in stop_words or cleaned_word in node_label.lower().replace(" ", "") or cleaned_word == "":
            continue
        
        relevant.append(cleaned_word)
        
    return " ".join(relevant)

In [319]:
def fuzzy_match(query_str, comparison_list, threshold=30):
    matches = process.extract(query_str, comparison_list, scorer=fuzz.partial_ratio, limit=100)
    
    id_name_score = []
    for match in matches:
        name = match[0]
        score = match[1]
        matched_id = next(key for key, value in db.entities.items() if value[0] == name)
            
        length_ratio = len(name) / len(query_str)
        adjusted_score = score * length_ratio
                
        id_name_score.append((matched_id, name, adjusted_score))
        
    # print(tabulate(id_name_score[:3], headers=["id", "Name", "Score"], tablefmt="grid"))
    
    return [id for id, _, score in id_name_score if score >= threshold]

In [320]:
def get_top_matches(df, normalized_query, top_n=2):
    concatenated_rows = df.apply(lambda row: ' '.join(row.astype(str)), axis=1).tolist()
    top_matches = process.extract(normalized_query, concatenated_rows, scorer=fuzz.partial_ratio, limit=top_n)
    
    top_indices = [match[2] for match in top_matches]
    
    return df.iloc[top_indices]

In [344]:
def answer_query(query, correct_answer=""):
    normalized_query = db.normalize_string(query)
    
    index_matches = fuzzy_match(normalized_query, db.entity_list, threshold=30)
        
    context = db.fetch(index_matches, "subject_id")
        
    # EXPERIMENTAL
    context = get_top_matches(context, normalized_query, top_n=1)
    
    try:
        node_label = context["node label"].values[0]
    except Exception:
        node_label = ""
    
    if context.empty:
        print("No context data found for given IDs or string")
        context = pd.DataFrame()
        
    # EXPERIMENTAL - remove unused columns
    elements_to_remove = ["image"]
    context = context.drop(columns=elements_to_remove, errors='ignore')
    
    # EXPERIMENTAL - rename columns
    columns_to_rename = {
        "cast member":"movie cast"
    }
    columns_to_rename = {k: v for k, v in columns_to_rename.items() if k in context.columns}
    context = context.rename(columns=columns_to_rename)
    
    query_filtered = filter_query(query, node_label)
        
    column_embeddings = {col: qe.embed_phrase(col) for col in context.columns}
    query_embeddings = [qe.embed_phrase(word) for word in query_filtered.split()]  
    top_columns_embeddings = find_closest_columns(query_embeddings, column_embeddings)
    
    # EXPERIMENTAL
    top_columns_dict = process.extract(normalized_query, context.columns, scorer=fuzz.partial_ratio, limit=3)
    top_columns_fuzzy = [c[0] for c in top_columns_dict]
    
    # MANUAL OVERWRITE:
    top_columns_fuzzy = []
        
    # EXPERIMENTAL - always keep columns
    col_always_keep = []
    
    combined_columns = set(top_columns_fuzzy + top_columns_embeddings + col_always_keep)
    top_columns = [col for col in combined_columns if col in context.columns]
    filtered_context_df = context[top_columns]

    answer = llm.query(query, filtered_context_df)
    
    print("Answer")
    print(answer)
    
    ### EXPERIMENTAL
    filtered_context_df["hint"] = answer
    answer = llm.query(query, filtered_context_df)
        
    normalized_answer = db.normalize_string(answer)
    normalized_correct_answer = db.normalize_string(correct_answer)
    
    print(f"{'CORRECT' if normalized_correct_answer and normalized_correct_answer in normalized_answer else 'WRONG'} - {query} - {answer}")


In [345]:
answer_query("Who is the director of Good Will Hunting?", "Gus Van Sant")

High confidence match found: director with similarity  0.7044
Answer
Gus Van Sant
CORRECT - Who is the director of Good Will Hunting? - Gus Van Sant
hint: Gus Van Sant


In [346]:
answer_query("Who directed The Bridge on the River Kwai?", "David Lean")

High confidence match found: director with similarity  0.4908
Answer
David Lean
CORRECT - Who directed The Bridge on the River Kwai? - David Lean


In [347]:
answer_query("Who directed the movie The Godfather?", "Francis Ford Coppola") # or Mario Puzo or Francis Ford Coppola

High confidence match found: director with similarity  0.4927
Answer
Francis Ford Coppola
CORRECT - Who directed the movie The Godfather? - Francis Ford Coppola


In [348]:
answer_query("Who is the director of The Dark Knight?", "Christopher Nolan")

High confidence match found: director with similarity  0.7044
Answer
Christopher Nolan
CORRECT - Who is the director of The Dark Knight? - Christopher Nolan


In [349]:
answer_query("Who directed The Dark Knight?", "Christopher Nolan")

High confidence match found: director with similarity  0.4908
Answer
Christopher Nolan
CORRECT - Who directed The Dark Knight? - Christopher Nolan


In [350]:
answer_query("Where was Angelina Jolie born?", "Los Angeles")

High confidence match found: place of birth with similarity  0.5401
Answer
Los Angeles
CORRECT - Where was Angelina Jolie born? - Los Angeles


In [351]:
answer_query("Which role had Lenardo di Caprio in Inception?", "actor")

Column director has similarity  0.1229
Column occupation has similarity  0.1138
Column executive producer has similarity  0.1018
Column winner has similarity  0.0991
Column screenwriter has similarity  0.0969
Column nominated for has similarity  0.0960
Column instance of has similarity  0.0948
Column performer has similarity  0.0937
Column movie cast has similarity  0.0926
Column IMDb ID has similarity  0.0883
Answer
actor
director
CORRECT - Which role had Lenardo di Caprio in Inception? - actor
director


In [352]:
answer_query("Who directed Inception?", "christopher nolan")

High confidence match found: director with similarity  0.4404
Answer
Sidney J. Furie
WRONG - Who directed Inception? - Sidney J. Furie


In [353]:
answer_query("What is the genre of Good Neighbors?", "art film") # Could also be comedy-drama, and comedy film. 

High confidence match found: genre with similarity  0.6792
Answer
art film, comedy-drama
CORRECT - What is the genre of Good Neighbors? - art film


In [354]:
answer_query("What is the MPAA film rating of Weathering with You?", "PG-13")

High confidence match found: MPAA film rating with similarity  0.4311
Answer
NC-17
WRONG - What is the MPAA film rating of Weathering with You? - NC-17


In [355]:
answer_query("Who is the director of Star Wars: Episode VI - Return of the Jedi?", "Richard Marquand")

High confidence match found: director with similarity  0.7044
Answer
Richard Marquand
CORRECT - Who is the director of Star Wars: Episode VI - Return of the Jedi? - Richard Marquand


In [356]:
answer_query("Who is the screenwriter of The Masked Gang: Cyprus?", "Murat Aslan")

High confidence match found: screenwriter with similarity  0.5085
Answer
Murat Aslan
CORRECT - Who is the screenwriter of The Masked Gang: Cyprus? - Murat Aslan


In [357]:
answer_query("In which movie did Angelina Jolie play?")

Column child has similarity  0.1181
Column mother has similarity  0.1063
Column director has similarity  0.1055
Column father has similarity  0.1049
Column sibling has similarity  0.1004
Column religion has similarity  0.0979
Column relative has similarity  0.0949
Column instance of has similarity  0.0916
Column winner has similarity  0.0910
Column occupation has similarity  0.0894
Answer
Land of Blood and Honey
WRONG - In which movie did Angelina Jolie play? - Land of Blood and Honey


In [358]:
answer_query("In which movie did Brad Pitt play?")

Column narrator has similarity  0.1141
Column director has similarity  0.1127
Column movie cast has similarity  0.1066
Column color has similarity  0.1040
Column screenwriter has similarity  0.1036
Column IMDb ID has similarity  0.0988
Column occupation has similarity  0.0978
Column genre has similarity  0.0931
Column production company has similarity  0.0890
Column instance of has similarity  0.0802
Answer
Unknown
WRONG - In which movie did Brad Pitt play? - Unknown


In [359]:
answer_query("In which movie did Liam Neeson play?")

High confidence match found: sport with similarity  0.4461
Answer
Unknown
WRONG - In which movie did Liam Neeson play? - Unknown


In [360]:
answer_query("In which movie did Liam Neeson have a role?")

High confidence match found: director with similarity  0.4111
Answer
Unknown
WRONG - In which movie did Liam Neeson have a role? - Unknown
