In [241]:
import pandas as pd
from rapidfuzz import process, fuzz
import json

In [242]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from transformers import logging as transformers_logging
transformers_logging.set_verbosity_error()

class NERParser:
    def __init__(self, model_name: str = "dslim/bert-base-NER", lowercase: bool = False):
        """
        Initialize the NER parser with a model and optionally configure the lowercase preprocessing.
        """
        self.model_name = model_name
        self.lowercase = lowercase
        self.device = self.get_device()
        
        # Load the tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, do_lower_case=self.lowercase)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        
        # Set up the NER pipeline
        self.nlp_pipeline = pipeline("ner", 
                                     model=self.model, 
                                     tokenizer=self.tokenizer, 
                                     device=self.device, 
                                     aggregation_strategy="simple")

    def get_device(self):
        """
        Determines whether to use MPS, CUDA, or CPU depending on the available hardware.
        """
        if torch.backends.mps.is_available():
            print("MPS device found, using MPS backend.\n")
            return torch.device("mps")
        elif torch.cuda.is_available():
            print(f"CUDA device found, using CUDA backend. Device: {torch.cuda.get_device_name(0)}\n")
            return torch.device("cuda")
        else:
            print("Neither MPS nor CUDA found, using CPU.\n")
            return torch.device("cpu")

    
    def parse_ner_results(self, ner_results: list):
        """
        Parse the NER results and extract entities related to 'PER' (persons) and 'MISC' (potential movie titles).
        """
        per_entities, misc_entities = [], []
        
        for entity in ner_results:
            # Extraction of all Persons
            if entity['entity_group'] == 'PER':
                per_entities.append(entity['word'])
            # Extraction of all Misc that could indicate movies
            elif entity['entity_group'] == 'MISC':
                misc_entities.append(entity['word'])
        
        return per_entities, misc_entities

    
    def process_query(self, query: str):
        """
        Processes a text query, runs NER, and returns the extracted actors and movie names.
        """
        # Optionally lowercase the input if configured
        if self.lowercase:
            query = query.lower()
        
        # Run the NER pipeline
        ner_results = self.nlp_pipeline(query)

        # Parse the results to extract actors and movies
        per_entities, misc_entities = self.parse_ner_results(ner_results)
        
        return per_entities, misc_entities

ner_parser = NERParser(lowercase=False)

Neither MPS nor CUDA found, using CPU.



In [243]:
# db = pd.read_pickle('exports/graph.pkl')
db = pd.read_pickle("exports/extended_graph_triples.pkl")

In [244]:
query = "Where was Angelina Jolie born?"
query = "In which movies did Angela Jolie have a role?"
query = "Who is the director of Star Wars: Episode VI - Return of the Jedi?" # Richard Marquand
query = "Who is the screenwriter of The Masked Gang: Cyprus?" # Cengiz Küçükayvaz
query = "When was 'The Godfather' released?" # 1972
person, movies = ner_parser.process_query(query)

In [245]:
context_person = []

for p in person:
    # Exact Lookup
    relevant = db[db.index == p].dropna(axis=1)
    
    if not relevant.empty:
        context_person.append(relevant.iloc[0])
    
    else:
        # Levenshtein matching for closest match
        matches = process.extractOne(p, db.index, scorer=fuzz.ratio)
        
        if matches and matches[1] > 80:
            closest_match = matches[0]
            relevant = db[db.index == closest_match].dropna(axis=1)
            context_person.append(relevant.iloc[0])
   
context_df_person = pd.DataFrame(context_person)

context_movie = [] 
   
for m in movies:
    # Exact Lookup
    relevant = db[db.index == m].dropna(axis=1)
    
    if not relevant.empty:
        context_movie.append(relevant.iloc[0])
    
    else:
        # Levenshtein matching for closest match
        matches = process.extractOne(m, db.index, scorer=fuzz.ratio)
        
        if matches and matches[1] > 80:
            closest_match = matches[0]
            relevant = db[db.index == closest_match].dropna(axis=1)
            context_movie.append(relevant.iloc[0])   
        
context_df_movie = pd.DataFrame(context_movie)

In [246]:
context_df_movie = context_df_movie.rename(columns={'node label': 'name'})
context_df_person = context_df_person.rename(columns={'node label': 'name'})

context_df_movie = context_df_movie.rename(columns={'publication date': 'release date publication'})
context_df_person = context_df_person.rename(columns={'cast member': 'movies cast role play'})

In [247]:
context_df_person

In [248]:
context_df_movie

Predicate,BBFC rating,CNC film rating (France),FSK film rating,IMDb ID,after a work by,aspect ratio,assessment,author,award received,based on,...,original language of film or TV show,part of the series,performer,platform,present in work,production company,production designer,release date publication,publisher,screenwriter
The Godfather,"15 certificate, X certificate, 18 certificate",no minors under twelve,FSK 16,"tt0068646, tt0442674",Mario Puzo,widescreen,"Bechdel test, reverse Bechdel Test, Mako Mori ...",Mario Puzo,"Academy Award for Best Actor, Academy Award fo...","The Godfather, The Mook, the Chef, the Wife an...",...,"English, Italian",The Godfather trilogy,Nino Rota,"Microsoft Windows, Wii, PlayStation 2","Vito Corleone, Mark McCluskey","Alfran Productions, Paramount Pictures",Dean Tavoularis,"1972-03-15, 1969-03-10, 2006-03-21",Electronic Arts,"Mario Puzo, Francis Ford Coppola"


In [249]:
context_df_person.columns

Index([], dtype='object')

In [250]:
import requests
import os 

input_dir = 'input/glove.6B.zip'
url = 'http://nlp.stanford.edu/data/glove.6B.zip'

if not os.path.exists(input_dir):
    response = requests.get(url)
    
    with open(input_dir, 'wb') as file:
        file.write(response.content)
    print("Download complete.")
else:
    print("File already exists, skipped download.")

File already exists, skipped download.


In [251]:
import zipfile
import os

zip_file_path = 'input/glove.6B.zip'
extract_to_path = 'exports/glove.6B'

if not os.path.exists(extract_to_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to_path)
    print("Unzipping complete!")
else:
    print("Out dir already exists, skipped unzipping.")


Out dir already exists, skipped unzipping.


In [252]:
import numpy as np

def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Load the GloVe embeddings (300-dimensional)
glove_embeddings = load_glove_embeddings("exports/glove.6B/glove.6B.300d.txt")

In [253]:
def embed_phrase(phrase, embeddings):
    words = phrase.split()
    word_vectors = [embeddings[word.lower()] for word in words if word.lower() in embeddings]
    
    if len(word_vectors) == 0:
        return np.zeros(300)  # Return zero vector if no words have embeddings
    
    return np.mean(word_vectors, axis=0)

# Embed column names of knowledge graph db
column_embeddings = {col: embed_phrase(col, glove_embeddings) for col in context_df_person.columns}

In [254]:
column_embeddings

{}

In [255]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [256]:
def filter_query(query):
    
    person_list = [p.split(" ") for p in person][0] if len(person) else []
    movie_list = [m.split(" ") for m in movies][0] if len(movies) else []
        
    if not len(query):
        return []
    
    relevant = []
    for word in query.split(" "):
        if word in stop_words:
            continue
        
        if word in person_list:            
            continue
        
        if word in movie_list:
            continue
        
        relevant.append(word)
        
    return relevant
    

In [257]:
# Embed the query keywords
filtered_query = filter_query(query)
query_embeddings = [embed_phrase(word, glove_embeddings) for word in filtered_query]


In [258]:
def cosine_sim(vec1, vec2):
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0 
    
    return np.dot(vec1, vec2) / (norm_vec1 * norm_vec2)

def find_closest_columns(query_embeddings, column_embeddings, top_n=5):
    column_similarities = {}

    for col, col_vec in column_embeddings.items():
        similarities = [cosine_sim(col_vec, q_vec) for q_vec in query_embeddings if np.linalg.norm(q_vec) > 0]
        column_similarities[col] = np.mean(similarities) if similarities else -1

    sorted_columns = sorted(column_similarities.items(), key=lambda item: item[1], reverse=True)
    
    return [col for col, sim in sorted_columns[:top_n]]

top_columns = find_closest_columns(query_embeddings, column_embeddings, top_n=5)

print("Top columns:", top_columns)

Top columns: []


In [259]:
context_df = pd.concat([context_df_person, context_df_movie], axis=0)

In [260]:
context_df[top_columns]

Predicate
The Godfather


In [261]:
# Introduce a swift llm that can generate the answer based on the context_df
from transformers import pipeline

# Load pre-trained DistilBERT model for question answering
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", top_k=1)

In [262]:
# Context and query
context = ""
for index, row in context_df.iterrows():
    row_context = " ".join([f"{col}: {row[col]}" for col in context_df[top_columns].columns])
    context += row_context + " "

# Generate the answer
output = qa_model(question=query, context=context)

answer_str = str()
if isinstance(output, list):
    answer_str = ", ".join([result['answer'] for result in output])
elif isinstance(output, dict):
    answer_str = output['answer']

if not answer_str:
    answer_str = "No answer found."

In [267]:
print(f"DB:\n{json.dumps(context_df_movie.to_dict(), indent=2)}\n")
print(f"Columns:\n{context_df[top_columns]}\n")
print(f"Context:\n{context}\n")

DB:
{
  "BBFC rating": {
    "The Godfather": "15 certificate, X certificate, 18 certificate"
  },
  "CNC film rating (France)": {
    "The Godfather": "no minors under twelve"
  },
  "FSK film rating": {
    "The Godfather": "FSK 16"
  },
  "IMDb ID": {
    "The Godfather": "tt0068646, tt0442674"
  },
  "after a work by": {
    "The Godfather": "Mario Puzo"
  },
  "aspect ratio": {
    "The Godfather": "widescreen"
  },
  "assessment": {
    "The Godfather": "Bechdel test, reverse Bechdel Test, Mako Mori test"
  },
  "author": {
    "The Godfather": "Mario Puzo"
  },
  "award received": {
    "The Godfather": "Academy Award for Best Actor, Academy Award for Best Writing, Adapted Screenplay, National Film Registry, Academy Award for Best Picture"
  },
  "based on": {
    "The Godfather": "The Godfather, The Mook, the Chef, the Wife and Her Homer, The Godfather Part II"
  },
  "box office": {
    "The Godfather": "268500000"
  },
  "cast member": {
    "The Godfather": "Saro Urz\u00ec, 

In [268]:
print(f"LLM output:\n{output}\n")
print(f"Question:\n{query}\n")
print(f"Answer:\n{answer_str}")

LLM output:
{'score': 0.0, 'start': 0, 'end': 0, 'answer': ''}

Question:
When was 'The Godfather' released?

Answer:
No answer found.
