In [185]:
import pandas as pd
from rapidfuzz import process, fuzz
import json
import zipfile
import os
import requests
import numpy as np
import nltk
from nltk.corpus import stopwords
from transformers import pipeline

In [186]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from transformers import logging as transformers_logging
transformers_logging.set_verbosity_error()

class NERParser:
    def __init__(self, model_name: str = "dslim/bert-base-NER", lowercase: bool = False):
        """
        Initialize the NER parser with a model and optionally configure the lowercase preprocessing.
        """
        self.model_name = model_name
        self.lowercase = lowercase
        self.device = self.get_device()
        
        # Load the tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, do_lower_case=self.lowercase)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        
        # Set up the NER pipeline
        self.nlp_pipeline = pipeline("ner", 
                                     model=self.model, 
                                     tokenizer=self.tokenizer, 
                                     device=self.device, 
                                     aggregation_strategy="simple")

    def get_device(self):
        """
        Determines whether to use MPS, CUDA, or CPU depending on the available hardware.
        """
        if torch.backends.mps.is_available():
            print("MPS device found, using MPS backend.\n")
            return torch.device("mps")
        elif torch.cuda.is_available():
            print(f"CUDA device found, using CUDA backend. Device: {torch.cuda.get_device_name(0)}\n")
            return torch.device("cuda")
        else:
            print("Neither MPS nor CUDA found, using CPU.\n")
            return torch.device("cpu")

    
    def parse_ner_results(self, ner_results: list):
        """
        Parse the NER results and extract entities related to 'PER' (persons) and 'MISC' (potential movie titles).
        """
        per_entities, misc_entities = [], []
        
        for entity in ner_results:
            # Extraction of all Persons
            if entity['entity_group'] == 'PER':
                per_entities.append(entity['word'])
            # Extraction of all Misc that could indicate movies
            elif entity['entity_group'] == 'MISC':
                misc_entities.append(entity['word'])
        
        return per_entities, misc_entities

    
    def process_query(self, query: str):
        """
        Processes a text query, runs NER, and returns the extracted actors and movie names.
        """
        # Optionally lowercase the input if configured
        if self.lowercase:
            query = query.lower()
        
        # Run the NER pipeline
        ner_results = self.nlp_pipeline(query)

        # Parse the results to extract actors and movies
        per_entities, misc_entities = self.parse_ner_results(ner_results)
        
        return per_entities, misc_entities

ner_parser = NERParser(lowercase=False)

Neither MPS nor CUDA found, using CPU.



In [187]:
# db = pd.read_pickle('exports/graph.pkl')
db = pd.read_pickle("exports/extended_graph_triples.pkl")
reverse_index = pd.read_pickle("./exports/reverse_index.pkl")

In [188]:
# query = "Where was Angela Jolie born?"
# query = "In which movies did Angela Jolie have a role?"
query = "Who is the director of Star Wars: Episode VI - Return of the Jedi?" # Richard Marquand
# query = "Who is the screenwriter of The Masked Gang: Cyprus?" # Cengiz Küçükayvaz
# query = "When was 'The Godfather' released?" # 1972
person, movies = ner_parser.process_query(query)

In [189]:
db['subject_id'] = db['subject_id'].str.strip()
db['subject_label'] = db['subject_label'].str.strip()
db['predicate_label'] = db['predicate_label'].str.strip()
db['object_label'] = db['object_label'].str.strip()

In [232]:
def get_df_by_label(labels:list):
    context_person = list()
    for p in labels:
    
        # Levenshtein matching for closest match
        matched_subject_label = process.extractOne(p, reverse_index.subject_label, scorer=fuzz.ratio)

        if not matched_subject_label or not matched_subject_label[1] > 80:
            continue
    
        closest_match_label = matched_subject_label[0]
        
        lst_of_ids = reverse_index.get(reverse_index.subject_label == closest_match_label, []).subject_id.to_list()
        
        if len(lst_of_ids) == 0:
            continue
    
        relevant = db[db.subject_id.isin(lst_of_ids)].dropna(axis=1)
        for index, row in relevant.iterrows():
            context_person.append(row)
        
    c_df = pd.DataFrame(
        data=context_person,
        columns=['subject_id', 'subject_label', 'predicate_label', 'object_label']
    )
    
    c_df.set_index('subject_id', inplace=True)

    # Pivot the DataFrame with a custom aggregation function
    pivot_df = c_df.pivot_table(
        index='subject_id',
        columns='predicate_label',
        values='object_label',
        aggfunc=lambda x: ' | '.join(x.astype(str))
    )
    
    pivot_df.reset_index(inplace=True)
    
    return pivot_df


In [233]:
person, movies

([], ['Star Wars', 'Episode VI', 'Return of the Jedi'])

In [234]:
context_df_person = get_df_by_label(person)
context_df_movie = get_df_by_label(movies)

[('Star Wars', 100.0, 119592), ('Sakura Wars', 80.0, 112553), ('Stuart Waks', 80.0, 121184)]
[('episode', 70.58823529411764, 144706), ('prisoner', 55.55555555555556, 147739), ('La Flor - Episode 1', 55.172413793103445, 73110)]
[('Return of the Idiot', 86.48648648648648, 107168), ('Return of the Hero', 83.33333333333334, 107167), ('Return of the Seven', 81.08108108108108, 107178)]


In [235]:
context_df_movie = context_df_movie.rename(columns={'subject_id': 'id'})
context_df_person = context_df_person.rename(columns={'subject_id': 'id'})

context_df_movie = context_df_movie.rename(columns={'node label': 'name'})
context_df_person = context_df_person.rename(columns={'node label': 'name'})

context_df_movie = context_df_movie.rename(columns={'publication date': 'release date publication'})
context_df_person = context_df_person.rename(columns={'cast member': 'movies cast role play'})

In [236]:
context_df_person.head()

predicate_label,id


In [237]:
context_df_movie.head()

predicate_label,id,IMDb ID,assessment,award received,cast member,color,costume designer,country of origin,director,director of photography,...,make-up artist,node description,name,nominated for,original language of film or TV show,participant in,production company,release date publication,screenwriter,winner
0,http://www.wikidata.org/entity/Q3795587,tt0145929,reverse Mako Mori test,Czech Lion Awards,"Petr Vydra, Zdena Hadrbolcová, Jiří Macháček, ...",color,Milica Gedeonová,"Germany, Czech Republic",Saša Gedeon,Štěpán Kučera,...,Michaela Belíková,1999 film by Saša Gedeon,Return of the Idiot,European Film Award for Best Screenwriter,Czech,12th European Film Awards,"Negativ Film Productions, Česká televize",1999-02-25,Saša Gedeon,Czech Lion for Best Film


In [238]:
context_df_movie.columns

Index(['id', 'IMDb ID', 'assessment', 'award received', 'cast member', 'color',
       'costume designer', 'country of origin', 'director',
       'director of photography', 'film editor', 'genre', 'inspired by',
       'instance of', 'main subject', 'make-up artist', 'node description',
       'name', 'nominated for', 'original language of film or TV show',
       'participant in', 'production company', 'release date publication',
       'screenwriter', 'winner'],
      dtype='object', name='predicate_label')

In [239]:
context_df_person.columns

Index(['id'], dtype='object', name='predicate_label')

In [240]:
input_dir = 'input/glove.6B.zip'
url = 'http://nlp.stanford.edu/data/glove.6B.zip'

if not os.path.exists(input_dir):
    response = requests.get(url)
    
    with open(input_dir, 'wb') as file:
        file.write(response.content)
    print("Download complete.")
else:
    print("File already exists, skipped download.")

File already exists, skipped download.


In [241]:
zip_file_path = 'input/glove.6B.zip'
extract_to_path = 'exports/glove.6B'

if not os.path.exists(extract_to_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to_path)
    print("Unzipping complete!")
else:
    print("Out dir already exists, skipped unzipping.")

Out dir already exists, skipped unzipping.


In [242]:
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Load the GloVe embeddings (300-dimensional)
glove_embeddings = load_glove_embeddings("exports/glove.6B/glove.6B.300d.txt")

In [243]:
def embed_phrase(phrase, embeddings):
    words = phrase.split()
    word_vectors = [embeddings[word.lower()] for word in words if word.lower() in embeddings]
    
    if len(word_vectors) == 0:
        return np.zeros(300)  # Return zero vector if no words have embeddings
    
    return np.mean(word_vectors, axis=0)

# Embed column names of knowledge graph db
column_embeddings = {col: embed_phrase(col, glove_embeddings) for col in context_df_person.columns}

In [201]:
column_embeddings

{}

In [202]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [203]:
def filter_query(query):
    
    person_list = [p.split(" ") for p in person][0] if len(person) else []
    movie_list = [m.split(" ") for m in movies][0] if len(movies) else []
        
    if not len(query):
        return []
    
    relevant = []
    for word in query.split(" "):
        if word in stop_words:
            continue
        
        if word in person_list:            
            continue
        
        if word in movie_list:
            continue
        
        relevant.append(word)
        
    return relevant
    

In [204]:
filtered_query = filter_query(query)
query_embeddings = [embed_phrase(word, glove_embeddings) for word in filtered_query]


In [205]:
def cosine_sim(vec1, vec2):
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0 
    
    return np.dot(vec1, vec2) / (norm_vec1 * norm_vec2)

def find_closest_columns(query_embeddings, column_embeddings, top_n=5):
    column_similarities = {}

    for col, col_vec in column_embeddings.items():
        similarities = [cosine_sim(col_vec, q_vec) for q_vec in query_embeddings if np.linalg.norm(q_vec) > 0]
        column_similarities[col] = np.mean(similarities) if similarities else -1

    sorted_columns = sorted(column_similarities.items(), key=lambda item: item[1], reverse=True)
    
    return [col for col, sim in sorted_columns[:top_n]]

top_columns = find_closest_columns(query_embeddings, column_embeddings, top_n=5)

print("Top columns:", top_columns)

Top columns: []


In [206]:
context_df = pd.concat([context_df_person, context_df_movie], axis=0)

In [207]:
context_df[top_columns]

predicate_label
subject_id
http://www.wikidata.org/entity/Q3795587


In [208]:
# Introduce a swift llm that can generate the answer based on the context_df
# Load pre-trained DistilBERT model for question answering
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", top_k=1)

In [209]:
# Context and query
context = ""
for index, row in context_df.iterrows():
    row_context = " ".join([f"{col}: {row[col]}" for col in context_df[top_columns].columns])
    context += row_context + " "

# Generate the answer
output = qa_model(question=query, context=context)

answer_str = str()
if isinstance(output, list):
    answer_str = ", ".join([result['answer'] for result in output])
elif isinstance(output, dict):
    answer_str = output['answer']

if not answer_str:
    answer_str = "No answer found."

In [210]:
print(f"DB:\n{json.dumps(context_df_movie.to_dict(), indent=2)}\n")
print(f"Columns:\n{context_df[top_columns]}\n")
print(f"Context:\n{context}\n")

DB:
{
  "IMDb ID": {
    "http://www.wikidata.org/entity/Q3795587": "tt0145929"
  },
  "assessment": {
    "http://www.wikidata.org/entity/Q3795587": "reverse Mako Mori test"
  },
  "award received": {
    "http://www.wikidata.org/entity/Q3795587": "Czech Lion Awards"
  },
  "cast member": {
    "http://www.wikidata.org/entity/Q3795587": "Petr Vydra, Zdena Hadrbolcov\u00e1, Ji\u0159\u00ed Mach\u00e1\u010dek, Ji\u0159\u00ed Langmajer, Jan Tepl\u00fd, Pavel Li\u0161ka, Josef Oplt, Anna Geislerov\u00e1, Tatiana Vilhelmov\u00e1, Jitka Smutn\u00e1, Anna Pol\u00edvkov\u00e1, Pavel Marek, Zuzana Stiv\u00ednov\u00e1"
  },
  "color": {
    "http://www.wikidata.org/entity/Q3795587": "color"
  },
  "costume designer": {
    "http://www.wikidata.org/entity/Q3795587": "Milica Gedeonov\u00e1"
  },
  "country of origin": {
    "http://www.wikidata.org/entity/Q3795587": "Germany, Czech Republic"
  },
  "director": {
    "http://www.wikidata.org/entity/Q3795587": "Sa\u0161a Gedeon"
  },
  "director of 

In [211]:
print(f"LLM output:\n{output}\n")
print(f"Question:\n{query}\n")
print(f"Answer:\n{answer_str}")

LLM output:
{'score': 0.0, 'start': 0, 'end': 0, 'answer': ''}

Question:
Who is the director of Star Wars: Episode VI - Return of the Jedi?

Answer:
No answer found.
