In [93]:
import os
import re

import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from transformers import logging as transformers_logging
from tabulate import tabulate
transformers_logging.set_verbosity_error()

In [34]:
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Words that are removed from the stop-word set below, so query filtering
# does not discard them.
stop_words_to_keep = [
    "what", "when", "where", "which", "while", "who", "whom", "why",
    "with", "how", "before", "after", "same",
]

# English stop words minus the exceptions listed above.
stop_words = {
    candidate
    for candidate in stopwords.words('english')
    if candidate not in stop_words_to_keep
}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
class NERParser:
    """Wraps a Hugging Face token-classification pipeline and pulls out
    person ('PER') and miscellaneous ('MISC') entities from a text query."""

    def __init__(self, model_name: str = "dslim/bert-base-NER", lowercase: bool = False):
        """
        Load tokenizer and model for `model_name` and build the NER pipeline.

        `lowercase` is forwarded to the tokenizer as `do_lower_case` and also
        controls whether `process_query` lowercases its input.
        """
        self.model_name = model_name
        self.lowercase = lowercase
        self.device = self.get_device()

        # Tokenizer and model come from the same checkpoint.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, do_lower_case=self.lowercase)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)

        # "simple" aggregation makes the pipeline return grouped entities
        # (the 'entity_group' / 'word' keys read by parse_ner_results).
        self.nlp_pipeline = pipeline(
            "ner",
            model=self.model,
            tokenizer=self.tokenizer,
            device=self.device,
            aggregation_strategy="simple",
        )

    def get_device(self):
        """
        Pick the torch device: MPS first, then CUDA, otherwise CPU.
        Prints which backend was selected.
        """
        if torch.backends.mps.is_available():
            print("MPS device found, using MPS backend.\n")
            return torch.device("mps")

        if torch.cuda.is_available():
            print(f"CUDA device found, using CUDA backend. Device: {torch.cuda.get_device_name(0)}\n")
            return torch.device("cuda")

        print("Neither MPS nor CUDA found, using CPU.\n")
        return torch.device("cpu")

    def parse_ner_results(self, ner_results: list):
        """
        Split aggregated NER hits into two lists of entity words:
        those tagged 'PER' and those tagged 'MISC'. Other groups are ignored.
        """
        buckets = {'PER': [], 'MISC': []}

        for hit in ner_results:
            target = buckets.get(hit['entity_group'])
            if target is not None:
                target.append(hit['word'])

        return buckets['PER'], buckets['MISC']

    def process_query(self, query: str):
        """
        Run NER on `query` (lowercased first when configured) and return
        the tuple (per_entities, misc_entities).
        """
        text = query.lower() if self.lowercase else query
        return self.parse_ner_results(self.nlp_pipeline(text))

# Build the notebook-wide NER parser once; constructing it loads the tokenizer
# and model, and later cells reuse this instance via `ner_parser.process_query`.
ner_parser = NERParser(lowercase=False)

Neither MPS nor CUDA found, using CPU.



In [94]:
class DataBase:
    """Handles the extraction of context data for given people and movies from a database, with fuzzy matching for names.

    Attributes:
        db: triples table; the code reads its `subject_id`, `subject_label`,
            `predicate_label` and `object_label` columns.
        reverse_index: label lookup table; the code reads its `subject_label`
            and `subject_id` columns.
    """

    # A fuzzy match is accepted only when its score is strictly above this value.
    MATCH_THRESHOLD = 90

    # Columns the pivot in `_fetch_relevant` needs to be present and non-null.
    _PIVOT_COLUMNS = ['subject_id', 'predicate_label', 'object_label']

    def __init__(self, triples_path=None, reverse_index_path=None):
        """Load the triples table and the reverse index from pickle files.

        Args:
            triples_path: Optional path to the triples pickle. Defaults to
                `exports/extended_graph_triples.pkl` under the working directory.
            reverse_index_path: Optional path to the reverse-index pickle. Defaults
                to `exports/reverse_index.pkl` under the working directory.
        """
        if triples_path is None:
            triples_path = os.path.join(os.getcwd(), r"exports/extended_graph_triples.pkl")
        if reverse_index_path is None:
            reverse_index_path = os.path.join(os.getcwd(), r"exports/reverse_index.pkl")

        # NOTE(review): unpickling can execute arbitrary code; only load exports
        # that were produced by a trusted source.
        self.db = pd.read_pickle(triples_path)
        self.reverse_index = pd.read_pickle(reverse_index_path)

        # otherwise exact matching will fail
        self.db['subject_id'] = self.db['subject_id'].astype(str).str.strip()

    def get_context(self, people: list, movies: list):
        """Fetch context data for the given people and movies from the database.
        First tries matching with the entire people or movie string. If that fails,
        it processes each person or movie individually.

        Args:
            people (list): List of people names to search.
            movies (list): List of movie titles to search.

        Returns:
            pd.DataFrame: DataFrame containing context for the given people and movies
            (empty when nothing matched).
        """
        context_person = self._lookup(people)
        context_movie = self._lookup(movies)

        found = [ctx for ctx in (context_person, context_movie) if not ctx.empty]
        if not found:
            return pd.DataFrame()
        if len(found) == 1:
            return found[0]

        # NOTE(review): column-wise concat is kept for backward compatibility, but it
        # yields duplicate column labels (e.g. 'subject_id') when both frames share
        # columns - confirm whether row-wise stacking was intended.
        return pd.concat(found, axis=1)

    def _lookup(self, names):
        """Resolve a list of names: try the joined string first, then each name on its own."""
        if not names:
            return pd.DataFrame()

        combined = " ".join(names)
        context = self._fetch_relevant(combined) if combined else pd.DataFrame()
        if not context.empty:
            return context

        # With a single name the joined string IS that name, so retrying it
        # individually would only repeat the same fuzzy search.
        if len(names) == 1:
            return pd.DataFrame()

        hits = [frame for frame in (self._fetch_relevant(n) for n in names) if not frame.empty]
        if not hits:
            return pd.DataFrame()
        return pd.concat(hits, axis=1)

    def _fetch_relevant(self, name: str):
        """Fetch relevant information for a given name (either combined or individual) from the database.
        The name is fuzzy-matched against the reverse index; if the matched ids have
        no rows in the triples table, a case-insensitive prefix search on
        `subject_label` is used instead.

        Args:
            name (str): The name of the person or movie to search for.

        Returns:
            pd.DataFrame: One row per subject id with one column per predicate label
            (multiple objects joined by ' | '), or an empty DataFrame if no match is found.
        """
        if not name or not name.strip():
            return pd.DataFrame()

        matches = process.extractOne(name, self.reverse_index.subject_label, scorer=fuzz.ratio)

        if not matches or not matches[1] > self.MATCH_THRESHOLD:
            return pd.DataFrame()

        match_label = matches[0]
        print(f"Matched '{name}' to '{match_label}' with a score of {matches[1]}.")

        # Normalise ids the same way `db.subject_id` was normalised in __init__,
        # so the exact `isin` comparison below can succeed.
        same_label = self.reverse_index.subject_label == match_label
        lst_of_ids = (
            self.reverse_index.loc[same_label, 'subject_id']
            .astype(str)
            .str.strip()
            .tolist()
        )

        if len(lst_of_ids) == 0:
            return pd.DataFrame()

        relevant = self.db[self.db.subject_id.isin(lst_of_ids)]

        if relevant.empty:
            print(f"No context data found for {match_label}. Performing partial match search.")
            # na=False: rows without a label must count as "no match" instead of
            # producing a NaN mask that pandas refuses to index with.
            prefix_hit = self.db.subject_label.str.lower().str.startswith(name.lower(), na=False)
            relevant = self.db[prefix_hit]

        # Drop incomplete triples row-wise. Dropping whole columns here could remove
        # 'object_label'/'predicate_label' and make the pivot below fail.
        relevant = relevant.dropna(subset=self._PIVOT_COLUMNS)

        if relevant.empty:
            print(f"No context data found for {match_label}, even with partial match search.")
            return pd.DataFrame()

        pivot_df = relevant.pivot_table(
            index='subject_id',
            columns='predicate_label',
            values='object_label',
            aggfunc=lambda x: ' | '.join(x.astype(str))
        )

        return pivot_df.reset_index()

In [37]:
class QueryEmbedder:
    """Embeds words/phrases by averaging pre-trained GloVe word vectors."""

    # Location of the GloVe text file used when no path is given.
    DEFAULT_GLOVE_PATH = "exports/glove.6B/glove.6B.300d.txt"
    # Dimensionality used for the zero vector when no embeddings are loaded.
    DEFAULT_DIM = 300

    def __init__(self, glove_path=DEFAULT_GLOVE_PATH):
        """Load the embedding table from `glove_path` (defaults to the 300d GloVe file)."""
        self._glove_embeddings = self._load_glove_embeddings(glove_path)

    def _load_glove_embeddings(self, file_path):
        """Parse a GloVe text file into a dict mapping word -> float32 vector.

        Each line is expected to be a word followed by whitespace-separated
        numbers. Blank lines and lines without any vector component are skipped.
        """
        embeddings = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if len(values) < 2:
                    # Blank or vector-less line (e.g. a trailing newline).
                    continue
                embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
        return embeddings

    def _embedding_dim(self):
        """Return the length of the loaded vectors, or DEFAULT_DIM when none are loaded."""
        sample = next(iter(self._glove_embeddings.values()), None)
        return self.DEFAULT_DIM if sample is None else len(sample)

    def embed_phrase(self, phrase):
        """Return the mean vector of all known (lowercased) words in `phrase`.

        Words missing from the embedding table are ignored. If no word is known,
        a zero vector with the same dimensionality as the loaded embeddings is
        returned, so callers can always compare it with other embeddings.
        """
        table = self._glove_embeddings
        tokens = (word.lower() for word in phrase.split())
        word_vectors = [table[token] for token in tokens if token in table]

        if not word_vectors:
            return np.zeros(self._embedding_dim())

        return np.mean(word_vectors, axis=0)
    

In [38]:
class LLM():
    """Extractive question answering over a small context table."""

    # Returned whenever no answer text can be produced.
    NO_ANSWER = "No answer found."

    def __init__(self):
        """Create the question-answering pipeline used by `_query`."""
        self.qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", top_k=1)

    def _query(self, query, context_df, columns=None):
        """Answer `query` using the rows of `context_df` as textual context.

        Every row is rendered as "column: value" pairs and all rows are joined
        into one context string that is passed to the QA model.

        Args:
            query: The question text.
            context_df: DataFrame whose cells provide the context.
            columns: Optional subset/order of columns to render. Defaults to all
                columns of `context_df`, so the method does not depend on any
                notebook-level variable.

        Returns:
            str: The answer text (several answers joined by ", "), or
            "No answer found." when there is no context or no answer.
        """
        if columns is None:
            columns = list(context_df.columns)

        row_texts = [
            " ".join(f"{col}: {row[col]}" for col in columns)
            for _, row in context_df.iterrows()
        ]
        context = " ".join(row_texts)

        # Nothing to read from: skip the model call instead of handing it an
        # empty context.
        if not context.strip():
            return self.NO_ANSWER

        output = self.qa_model(question=query, context=context)

        # The model output is handled both as a list of results and as a single dict.
        answer_str = ""
        if isinstance(output, list):
            answer_str = ", ".join(result['answer'] for result in output)
        elif isinstance(output, dict):
            answer_str = output['answer']

        return answer_str or self.NO_ANSWER

In [39]:
def filter_query(query, person, movies, stopword_set=None):
    """Return the informative words of `query`, lowercased and stripped to ASCII letters.

    A word is dropped when, after cleaning, it is empty, is a stop word, or is
    one of the words making up a recognised person or movie name.

    Args:
        query (str): The raw user question.
        person (list): Person names found in the query.
        movies (list): Movie titles found in the query.
        stopword_set: Optional collection of stop words. Defaults to the
            notebook-level `stop_words` set.

    Returns:
        list: Remaining words in their original order.
    """
    if not query:
        return []

    if stopword_set is None:
        stopword_set = stop_words

    def _letters_only(token):
        # Same normalisation for query words and entity words, so that
        # e.g. "Spider-Man?" in the query matches the entity "Spider-Man".
        return re.sub(r'[^A-Za-z]', '', token.lower())

    entity_words = {
        _letters_only(part)
        for name in [*(person or []), *(movies or [])]
        for part in name.split()
    }

    relevant = []
    for token in query.split():
        cleaned_word = _letters_only(token)
        if cleaned_word == "" or cleaned_word in stopword_set or cleaned_word in entity_words:
            continue
        relevant.append(cleaned_word)

    return relevant

In [40]:
def cosine_sim(vec1, vec2):
    """Cosine similarity of two vectors; 0.0 when either vector has zero length."""
    length_a, length_b = np.linalg.norm(vec1), np.linalg.norm(vec2)

    # A zero-length vector has no direction, so similarity is defined as 0.0 here.
    if not (length_a and length_b):
        return 0.0

    return np.dot(vec1, vec2) / (length_a * length_b)

def find_closest_columns(query_embeddings, column_embeddings, top_n=5):
    """Rank columns by their mean cosine similarity to the query word vectors.

    Args:
        query_embeddings: Iterable of query word vectors. Zero vectors are ignored.
            Any iterable works, including a one-shot generator.
        column_embeddings (dict): Maps column name -> column embedding vector.
        top_n (int): Number of best-scoring column names to return.

    Returns:
        list: Up to `top_n` column names, best match first. When no usable query
        vector exists every column scores -1 and the original column order is kept.
    """
    # Filter once instead of once per column; materialising the iterable also
    # keeps it from being exhausted after the first column.
    usable_queries = [q_vec for q_vec in query_embeddings if np.linalg.norm(q_vec) > 0]

    column_similarities = {}
    for col, col_vec in column_embeddings.items():
        if usable_queries:
            column_similarities[col] = np.mean([cosine_sim(col_vec, q_vec) for q_vec in usable_queries])
        else:
            column_similarities[col] = -1

    # sorted() is stable, so ties keep the insertion order of `column_embeddings`.
    ranked = sorted(column_similarities, key=column_similarities.get, reverse=True)

    return ranked[:top_n]

In [96]:
# Main
# Build the long-lived helper objects once; the cells below reuse them.
# DataBase() reads the pickled triples table and reverse index from disk.
db = DataBase()

# QueryEmbedder() loads the GloVe vectors from a text file.
qe = QueryEmbedder()

# LLM() creates the question-answering pipeline.
llm = LLM()

In [97]:
# Example question; NER splits it into person names and MISC entities
# (the latter are treated as movie titles).
query = "Who is the screenwriter of The Masked Gang: Cyprus?"
person, movies = ner_parser.process_query(query)

# Look up the context table for the recognised entities and embed each of its
# column names.
context = db.get_context(person,movies)
column_embeddings = {col: qe.embed_phrase(col) for col in context.columns}

# Drop stop words and entity words from the question, then embed what is left
# word by word.
filtered_query = filter_query(query, person, movies)
query_embeddings = [qe.embed_phrase(word) for word in filtered_query]


Matched 'The Masked Gang' to 'The Masked Gang' with a score of 100.0.
+----+-----------------------------------------+-----------+----------------------------------------------------------------------------+---------+---------------------+-------------+------------------+--------------------+-------------+---------------+----------------------+--------------------------+-----------------+----------------------------------------+----------------------+--------------------+----------------+
|    | subject_id                              | IMDb ID   | cast member                                                                | color   | country of origin   | director    | distributed by   | filming location   | genre       | instance of   | narrative location   | node description         | node label      | original language of film or TV show   | production company   | publication date   | screenwriter   |
|----+-----------------------------------------+-----------+----------------------

In [52]:
# Keep only the context columns whose names are most similar to the remaining
# query words. NOTE: `top_columns` is also read as a global inside LLM._query.
top_columns = find_closest_columns(query_embeddings, column_embeddings, top_n=10)
filtered_context_df = context[top_columns]

In [47]:
# Ask the QA model the original question against the reduced context table;
# the returned answer string is displayed as the cell output.
llm._query(query, filtered_context_df)

'Murat Aslan'