In [138]:
import os
import re

import unicodedata
import time
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoModel
from transformers import logging as transformers_logging
from sentence_transformers import SentenceTransformer
from tabulate import tabulate
transformers_logging.set_verbosity_error()
import json

# Silencing TqdmWarning
import warnings
warnings.filterwarnings('ignore')

In [139]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words_to_keep = ["what", "when", "where", "which", "while", "who", "whom", "why", "with", "how", "before", "after","same"]
stop_words = set([s for s in stopwords.words('english') if s not in stop_words_to_keep])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [140]:
class NERParser:
    def __init__(self, model_name: str = "dslim/bert-base-NER", lowercase: bool = False):
        """
        Initialize the NER parser with a model and optionally configure the lowercase preprocessing.
        """
        self.model_name = model_name
        self.lowercase = lowercase
        self.device = self.get_device()
        
        # Load the tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, do_lower_case=self.lowercase)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        
        # Set up the NER pipeline
        self.nlp_pipeline = pipeline(
            "ner", 
            model=self.model, 
            tokenizer=self.tokenizer, 
            device=self.device, 
            aggregation_strategy="simple"
        )

    def get_device(self):
        """
        Determines whether to use MPS, CUDA, or CPU depending on the available hardware.
        """
        if torch.backends.mps.is_available():
            print("MPS device found, using MPS backend.\n")
            return torch.device("mps")
        elif torch.cuda.is_available():
            print(f"CUDA device found, using CUDA backend. Device: {torch.cuda.get_device_name(0)}\n")
            return torch.device("cuda")
        else:
            print("Neither MPS nor CUDA found, using CPU.\n")
            return torch.device("cpu")

    
    def parse_ner_results(self, ner_results: list):
        """
        Parse the NER results and extract entities related to 'PER' (persons) and 'MISC' (potential movie titles).
        """
        per_entities, misc_entities = [], []
        
        for entity in ner_results:
            # Extraction of all Persons
            if entity['entity_group'] == 'PER':
                per_entities.append(entity['word'])
            # Extraction of all Misc that could indicate movies
            elif entity['entity_group'] == 'MISC':
                misc_entities.append(entity['word'])
        
        return per_entities, misc_entities

    
    def process_query(self, query: str):
        """
        Processes a text query, runs NER, and returns the extracted actors and movie names.
        """
        # Optionally lowercase the input if configured
        if self.lowercase:
            query = query.lower()
        
        # Run the NER pipeline
        ner_results = self.nlp_pipeline(query)

        # Parse the results to extract actors and movies
        per_entities, misc_entities = self.parse_ner_results(ner_results)
        
        return per_entities, misc_entities

ner_parser = NERParser(lowercase=False)

Neither MPS nor CUDA found, using CPU.



In [141]:
class DataBase:
    """Handles the extraction of context data for given people and movies from a database, with fuzzy matching for names."""

    def __init__(self):
        self.db = pd.read_pickle(os.path.join(os.getcwd(), r"exports/extended_graph_triples.pkl"))
        
        with open(r"exports/entity_db.json", encoding="utf-8") as f: 
            self.entities = json.load(f)
            self.entity_list = list(subject.lower() for subject, types in self.entities.values())
        
        # otherwise exact matching will fail
        self.db['subject_id'] = self.db['subject_id'].astype(str).str.strip()

    @staticmethod
    def normalize_string(s):
        """Cleans the input entity to a uniform naming convention, by removing non ascii characters, encoding it to utf, setting it to lowercase, and removing redundant spaces"""
        s = s.lower()
        s = unicodedata.normalize('NFKD', s)
        s = s.encode('ascii', 'ignore').decode('utf-8')
        s = re.sub(r'[^\w\s]', '', s)
        s = ' '.join(s.split())
        return s
    
    def fetch(self, entity_lst, search_column):
        
        relevant = self.db[self.db[search_column].isin(entity_lst)].dropna(axis=1)
           
        if relevant.empty:
            print(f"No context data found for given information.")
            return pd.DataFrame()          
        
        pivot_df = relevant.pivot_table(
            index='subject_id',
            columns='predicate_label',
            values='object_label',
            aggfunc=lambda x: ' | '.join(x.astype(str))
        )
    
        pivot_df.reset_index(inplace=True)
    
        return pivot_df

In [142]:
class QueryEmbedder:
    
    def __init__(self):
        self._glove_embeddings = self._load_glove_embeddings("exports/glove.6B/glove.6B.300d.txt")
        
    def _load_glove_embeddings(self, file_path):
        embeddings = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word] = vector
        return embeddings
    
    def embed_phrase(self, phrase):
        words = phrase.split()
        word_vectors = [self._glove_embeddings[word.lower()] for word in words if word.lower() in self._glove_embeddings]
        
        if len(word_vectors) == 0:
            return np.zeros(300)
        
        return np.mean(word_vectors, axis=0)
    

In [143]:
class QueryEmbedderContextualized:
    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2', device=None):
        """
        Initializes the QueryEmbedder with a SentenceTransformer model for sentence embeddings.
        """
        self.device = self.get_device()
        self.model = SentenceTransformer(model_name, device=self.device)
        self.cache = {}
    
    
    def get_device(self):
        """
        Determines whether to use MPS, CUDA, or CPU depending on the available hardware.
        """
        if torch.backends.mps.is_available():
            print("MPS device found, using MPS backend.\n")
            return torch.device("mps")
        elif torch.cuda.is_available():
            print(f"CUDA device found, using CUDA backend. Device: {torch.cuda.get_device_name(0)}\n")
            return torch.device("cuda")
        else:
            print("Neither MPS nor CUDA found, using CPU.\n")
            return torch.device("cpu")
    
    def embed_phrase(self, phrases):
        """
        Generates embeddings for given phrases using SentenceTransformer.

        Args:
            phrases (str or List[str]): The input phrase(s) to embed.

        Returns:
            np.ndarray: The embedding vector(s) for the phrase(s).
        """
        # Ensure phrases is a list
        if isinstance(phrases, str):
            phrases = [phrases]
        elif not isinstance(phrases, list):
            raise TypeError("Input must be a string or a list of strings.")

        embeddings = []
        phrases_to_compute = []
        indices_to_compute = []

        for idx, phrase in enumerate(phrases):
            if phrase in self.cache:
                embeddings.append(self.cache[phrase])
            else:
                embeddings.append(None)
                phrases_to_compute.append(phrase)
                indices_to_compute.append(idx)
        
        if phrases_to_compute:
            new_embeddings = self.model.encode(
                phrases_to_compute, 
                show_progress_bar=False, 
                convert_to_numpy=True, 
                normalize_embeddings=True
            )
            for idx, emb in zip(indices_to_compute, new_embeddings):
                embeddings[idx] = emb
                self.cache[phrases[idx]] = emb
        
        # Return embeddings
        if len(embeddings) == 1:
            return embeddings[0]
        else:
            return np.array(embeddings)

In [263]:
class LLM():
    
    def __init__(self):
        self.qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", top_k=1)
    
    def query(self, query, context_df):
        
        top_columns = context_df.columns
        
        context = ""
        for index, row in context_df.iterrows():
            node_label = row.get("name", "Unknown")
            row_context = f"This text is about \"{node_label}\":\n"
            row_context += "\n".join([f"{col}: {row[col]}" for col in context_df[top_columns].columns])
            row_context += "\n\n"
            context += row_context
        
        output = self.qa_model(question=query, context=context)
        
        answer_str = str()
        if isinstance(output, list) and output:
            answer_str = ", ".join([result['answer'] for result in output])
            
        elif isinstance(output, dict):
            answer_str = output['answer']
        
        if not answer_str:
            answer_str = "No answer found."
        
        return answer_str

In [264]:
def cosine_sim(vec1, vec2):
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0 
    
    return np.dot(vec1, vec2) / (norm_vec1 * norm_vec2)

def find_closest_columns(query_embeddings, column_embeddings, top_n=5):
    column_similarities = {}

    for col, col_vec in column_embeddings.items():
        similarities = [cosine_sim(col_vec, q_vec) for q_vec in query_embeddings if np.linalg.norm(q_vec) > 0]
        column_similarities[col] = np.mean(similarities) if similarities else -1

    sorted_columns = sorted(column_similarities.items(), key=lambda item: item[1], reverse=True)
    
    return [col for col, sim in sorted_columns[:top_n]]

In [265]:
db = DataBase()

#qe = QueryEmbedder()

qe = QueryEmbedderContextualized()

llm = LLM()

Neither MPS nor CUDA found, using CPU.



In [266]:
def fuzzy_match(query_str, comparison_list, threshold=30):
    matches = process.extract(query_str, comparison_list, scorer=fuzz.partial_ratio, limit=100)
    
    id_name_score = []
    for match in matches:
        name = match[0]
        score = match[1]
        matched_id = next(key for key, value in db.entities.items() if value[0] == name)
            
        length_ratio = len(name) / len(query_str)
        adjusted_score = score * length_ratio
                
        id_name_score.append((matched_id, name, adjusted_score))
        
    # print(tabulate(id_name_score[:3], headers=["id", "Name", "Score"], tablefmt="grid"))
    
    return [id for id, _, score in id_name_score if score >= threshold]

In [267]:
def get_top_matches(df, normalized_query, top_n=2):
    concatenated_rows = df.apply(lambda row: ' '.join(row.astype(str)), axis=1).tolist()
    top_matches = process.extract(normalized_query, concatenated_rows, scorer=fuzz.partial_ratio, limit=top_n)
    
    top_indices = [match[2] for match in top_matches]
    
    return df.iloc[top_indices]

In [285]:
def answer_query(query, correct_answer=""):
    normalized_query = db.normalize_string(query)
    
    index_matches = fuzzy_match(normalized_query, db.entity_list, threshold=30)
    
    context = db.fetch(index_matches, "subject_id")
    
    # EXPERIMENTAL
    context = get_top_matches(context, normalized_query, top_n=1)
    
    if context.empty:
        print("No context data found for given IDs or string")
        context = pd.DataFrame()
        
    # EXPERIMENTAL - remove unused columns
    elements_to_remove = ["image"]
    context = context.drop(columns=elements_to_remove, errors='ignore')
    
    # EXPERIMENTAL - rename columns
    columns_to_rename = {
        "node label": "name",
        "cast member":"movie cast member"
    }
    columns_to_rename = {k: v for k, v in columns_to_rename.items() if k in context.columns}
    context = context.rename(columns=columns_to_rename)
    
    
    column_embeddings = {col: qe.embed_phrase(col) for col in context.columns}
    query_embeddings = [qe.embed_phrase(word) for word in query.split()]  
    top_columns_embeddings = find_closest_columns(query_embeddings, column_embeddings, top_n=7)
    
    # EXPERIMENTAL
    top_columns_dict = process.extract(normalized_query, context.columns, scorer=fuzz.partial_ratio, limit=3)
    top_columns_fuzzy = [c[0] for c in top_columns_dict]
    
    # print("Fuzyy Columns:")
    # print("\n".join(top_columns_fuzzy))
    
    # print("\nEmbedding Columns:")
    # print("\n".join(top_columns_embeddings))
    
    # print(context)
    
    # EXPERIMENTAL - always keep columns
    col_always_keep = ["name"]
    
    combined_columns = set(top_columns_fuzzy + top_columns_embeddings + col_always_keep)
    top_columns = [col for col in combined_columns if col in context.columns]
    filtered_context_df = context[top_columns]

    answer = llm.query(query, filtered_context_df)
    
    ### EXPERIMENTAL
    # Check if the "name" column exists in the filtered context
    if "name" in filtered_context_df.columns:
        name = filtered_context_df.iloc[0]["name"]
        hint_df = pd.DataFrame([[answer, name]], columns=["hint", "name"])
        combined_df = pd.concat([hint_df, filtered_context_df], ignore_index=True)
        answer = llm.query(query, combined_df)
        
    normalized_answer = db.normalize_string(answer)
    normalized_correct_answer = db.normalize_string(correct_answer)
    
    print(f"{'CORRECT' if normalized_correct_answer in normalized_answer else 'WRONG'} - {query} - {answer}")


In [286]:
answer_query("Who is the director of Good Will Hunting?", "Gus Van Sant")

CORRECT - Who is the director of Good Will Hunting? - gus van sant


In [287]:
answer_query("Who directed The Bridge on the River Kwai?", "David Lean")

CORRECT - Who directed The Bridge on the River Kwai? - david lean


In [288]:
answer_query("Who directed the movie The Godfather?", "Francis Ford Coppola") # or Mario Puzo or Francis Ford Coppola

CORRECT - Who directed the movie The Godfather? - francis ford coppola


In [289]:
answer_query("Who is the director of The Dark Knight?", "Christopher Nolan")

CORRECT - Who is the director of The Dark Knight? - christopher nolan


In [290]:
answer_query("Who directed The Dark Knight?", "Christopher Nolan")

CORRECT - Who directed The Dark Knight? - christopher nolan


In [291]:
answer_query("Where was Angelina Jolie born?", "Los Angeles")

WRONG - Where was Angelina Jolie born? - Ángela Molina


In [292]:
answer_query("Which role had Lenardo di Caprio in Inception?", "actor")

CORRECT - Which role had Lenardo di Caprio in Inception? - male actor in a supporting role


In [293]:
answer_query("Who directed Inception?", "christopher nolan")

WRONG - Who directed Inception? - sidney j furie


In [294]:
answer_query("What is the genre of Good Neighbors?", "comedydrama") # Could also be comedy-drama, and comedy film. 

CORRECT - What is the genre of Good Neighbors? - comedydrama


In [295]:
answer_query("What is the MPAA film rating of Weathering with You?", "PG-13")

WRONG - What is the MPAA film rating of Weathering with You? - mpaa


In [296]:
answer_query("Who is the director of Star Wars: Episode VI - Return of the Jedi?", "Richard Marquand")

CORRECT - Who is the director of Star Wars: Episode VI - Return of the Jedi? - richard marquand


In [297]:
answer_query("Who is the screenwriter of The Masked Gang: Cyprus?", "Murat Aslan")

CORRECT - Who is the screenwriter of The Masked Gang: Cyprus? - murat aslan


In [298]:
answer_query("In which movie did Angelina Jolie play?")

CORRECT - In which movie did Angelina Jolie play? - Angelina Jolie
