In [4]:
import pandas as pd
from rapidfuzz import process, fuzz

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from transformers import logging as transformers_logging
transformers_logging.set_verbosity_error()

class NERParser:
    def __init__(self, model_name: str = "dslim/bert-base-NER", lowercase: bool = False):
        """
        Initialize the NER parser with a model and optionally configure the lowercase preprocessing.
        """
        self.model_name = model_name
        self.lowercase = lowercase
        self.device = self.get_device()
        
        # Load the tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, do_lower_case=self.lowercase)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        
        # Set up the NER pipeline
        self.nlp_pipeline = pipeline("ner", 
                                     model=self.model, 
                                     tokenizer=self.tokenizer, 
                                     device=self.device, 
                                     aggregation_strategy="simple")

    def get_device(self):
        """
        Determines whether to use MPS, CUDA, or CPU depending on the available hardware.
        """
        if torch.backends.mps.is_available():
            print("MPS device found, using MPS backend.\n")
            return torch.device("mps")
        elif torch.cuda.is_available():
            print(f"CUDA device found, using CUDA backend. Device: {torch.cuda.get_device_name(0)}\n")
            return torch.device("cuda")
        else:
            print("Neither MPS nor CUDA found, using CPU.\n")
            return torch.device("cpu")

    
    def parse_ner_results(self, ner_results: list):
        """
        Parse the NER results and extract entities related to 'PER' (persons) and 'MISC' (potential movie titles).
        """
        per_entities, misc_entities = [], []
        
        for entity in ner_results:
            # Extraction of all Persons
            if entity['entity_group'] == 'PER':
                per_entities.append(entity['word'])
            # Extraction of all Misc that could indicate movies
            elif entity['entity_group'] == 'MISC':
                misc_entities.append(entity['word'])
        
        return per_entities, misc_entities

    
    def process_query(self, query: str):
        """
        Processes a text query, runs NER, and returns the extracted actors and movie names.
        """
        # Optionally lowercase the input if configured
        if self.lowercase:
            query = query.lower()
        
        # Run the NER pipeline
        ner_results = self.nlp_pipeline(query)

        # Parse the results to extract actors and movies
        per_entities, misc_entities = self.parse_ner_results(ner_results)
        
        return per_entities, misc_entities

ner_parser = NERParser(lowercase=False)

Neither MPS nor CUDA found, using CPU.



In [6]:
# db = pd.read_pickle('exports/graph.pkl')
db = pd.read_pickle("exports/extended_graph_triples.pkl")

In [58]:
query = "Where was Angelina Jolie born?"
query = "In which movies did Angela Jolie have a role?"
person, movies = ner_parser.process_query(query)

In [59]:
context_person = []

for p in person:
    # Exact Lookup
    relevant = db[db.index == p].dropna(axis=1)
    
    if not relevant.empty:
        context_person.append(relevant.iloc[0])
    
    else:
        # Levenshtein matching for closest match
        matches = process.extractOne(p, db.index, scorer=fuzz.ratio)
        
        if matches and matches[1] > 80:
            closest_match = matches[0]
            relevant = db[db.index == closest_match].dropna(axis=1)
            context_person.append(relevant.iloc[0])
   
context_df_person = pd.DataFrame(context_person)

context_movie = [] 
   
for m in movies:
    # Exact Lookup
    relevant = db[db.index == m].dropna(axis=1)
    
    if not relevant.empty:
        context_movie.append(relevant.iloc[0])
    
    else:
        # Levenshtein matching for closest match
        matches = process.extractOne(m, db.index, scorer=fuzz.ratio)
        
        if matches and matches[1] > 80:
            closest_match = matches[0]
            relevant = db[db.index == closest_match].dropna(axis=1)
            context_movie.append(relevant.iloc[0])   
        
context_df_movie = pd.DataFrame(context_movie)

In [60]:
context_df_movie = context_df_movie.rename(columns={'node label': 'name'})
context_df_person = context_df_person.rename(columns={'node label': 'name'})

context_df_person = context_df_person.rename(columns={'cast member': 'movies'})


In [61]:
context_df_person

Predicate,IMDb ID,ancestral home,award received,movies,child,country of citizenship,described by source,director,executive producer,father,...,relative,religion,residence,screenwriter,sibling,significant event,spouse,unmarried partner,voice actor,winner
Angelina Jolie,nm0001401,Germany,"Saturn Award for Best Actress, Hollywood Film ...","Hell's Kitchen, Mr. & Mrs. Smith, The Good She...","Marcheline Bertrand, Jon Voight","United States of America, Cambodia",Obalky knih.cz,"In the Land of Blood and Honey, Unbroken, By t...","Difret, Maleficent",Jon Voight,...,Chip Taylor,Catholicism,Los Angeles,"In the Land of Blood and Honey, First They Kil...",James Haven,adoption,"Billy Bob Thornton, Brad Pitt, Jonny Lee Miller","Brad Pitt, Jenny Shimizu","Tigress, Kung Fu Panda 3, Kung Fu Panda 2, Sha...",Golden Globe Award for Best Supporting Actress...


In [62]:
context_df_movie

In [63]:
context_df_person.columns

Index(['IMDb ID', 'ancestral home', 'award received', 'movies', 'child',
       'country of citizenship', 'described by source', 'director',
       'executive producer', 'father', 'image', 'instance of',
       'languages spoken, written or signed', 'mother', 'native language',
       'node description', 'name', 'nominated for', 'notable work',
       'occupation', 'performer', 'place of birth', 'relative', 'religion',
       'residence', 'screenwriter', 'sibling', 'significant event', 'spouse',
       'unmarried partner', 'voice actor', 'winner'],
      dtype='object', name='Predicate')

In [64]:
import requests
import os 

input_dir = 'input/glove.6B.zip'
url = 'http://nlp.stanford.edu/data/glove.6B.zip'

if not os.path.exists(input_dir):
    response = requests.get(url)
    
    with open(input_dir, 'wb') as file:
        file.write(response.content)
    print("Download complete.")
else:
    print("File already exists, skipped download.")

File already exists, skipped download.


In [65]:
import zipfile
import os

zip_file_path = 'input/glove.6B.zip'
extract_to_path = 'exports/glove.6B'

if not os.path.exists(extract_to_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to_path)
    print("Unzipping complete!")
else:
    print("Out dir already exists, skipped unzipping.")


Out dir already exists, skipped unzipping.


In [66]:
import numpy as np

def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Load the GloVe embeddings (300-dimensional)
glove_embeddings = load_glove_embeddings("exports/glove.6B/glove.6B.300d.txt")

In [67]:
def embed_phrase(phrase, embeddings):
    words = phrase.split()
    word_vectors = [embeddings[word.lower()] for word in words if word.lower() in embeddings]
    
    if len(word_vectors) == 0:
        return np.zeros(300)  # Return zero vector if no words have embeddings
    
    return np.mean(word_vectors, axis=0)

# Embed column names of knowledge graph db
column_embeddings = {col: embed_phrase(col, glove_embeddings) for col in context_df_person.columns}

In [68]:
column_embeddings

{'IMDb ID': array([ 0.24651998,  0.07036   , -0.14642501,  0.116624  ,  0.056493  ,
        -0.22114   , -0.09043   , -0.075065  ,  0.00623   ,  0.04271001,
         0.137815  ,  0.33861   ,  0.440625  ,  0.16979034, -0.35968   ,
         0.061608  , -0.174365  , -0.03683   , -0.22728   , -0.42122   ,
         0.03673501,  0.04362   , -0.528735  , -0.296985  ,  0.07916079,
         0.24066399,  0.120575  ,  0.462405  ,  0.18459   , -0.377335  ,
        -0.31038502, -0.0232805 , -0.04938   ,  0.31717497,  0.005455  ,
        -0.235765  , -0.265915  , -0.0691155 , -0.132442  , -0.470625  ,
         0.28242052, -0.412005  ,  0.32108   ,  0.62975   , -0.30144352,
        -0.18836328,  0.036367  , -0.484735  , -0.39131   , -0.63302004,
         0.0996885 ,  0.139675  ,  0.49345   , -0.405625  ,  0.026585  ,
        -0.12878   ,  0.1299315 , -0.229645  , -0.50788   , -0.2958    ,
        -0.173519  ,  0.2378495 , -0.11698   ,  0.134761  , -0.13264701,
        -0.10877001, -0.159163  ,  0.019

In [69]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [70]:
def filter_query(query):
    
    person_list = [p.split(" ") for p in person][0] if len(person) else []
    movie_list = [m.split(" ") for m in movies][0] if len(movies) else []
        
    if not len(query):
        return []
    
    relevant = []
    for word in query.split(" "):
        if word in stop_words:
            continue
        
        if word in person_list:            
            continue
        
        if word in movie_list:
            continue
        
        relevant.append(word)
        
    return relevant
    

In [71]:
# Embed the query keywords
filtered_query = filter_query(query)
query_embeddings = [embed_phrase(word, glove_embeddings) for word in filtered_query]


In [72]:
def cosine_sim(vec1, vec2):
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0 
    
    return np.dot(vec1, vec2) / (norm_vec1 * norm_vec2)

def find_closest_columns(query_embeddings, column_embeddings, top_n=5):
    column_similarities = {}

    for col, col_vec in column_embeddings.items():
        similarities = [cosine_sim(col_vec, q_vec) for q_vec in query_embeddings if np.linalg.norm(q_vec) > 0]
        column_similarities[col] = np.mean(similarities) if similarities else -1

    sorted_columns = sorted(column_similarities.items(), key=lambda item: item[1], reverse=True)
    
    return [col for col, sim in sorted_columns[:top_n]]

top_columns = find_closest_columns(query_embeddings, column_embeddings, top_n=5)

print("Top columns:", top_columns)

Top columns: ['movies', 'instance of', 'notable work', 'place of birth', 'nominated for']


In [73]:
context_df = pd.concat([context_df_person, context_df_movie], axis=0)

In [74]:
context_df[top_columns]

Predicate,movies,instance of,notable work,place of birth,nominated for
Angelina Jolie,"Hell's Kitchen, Mr. & Mrs. Smith, The Good She...",human,"Girl, Interrupted, Changeling, Gia",Los Angeles,"Academy Award for Best Actress, Screen Actors ..."


In [None]:
# Introduce a swift llm that can generate the answer based on the context_df
from transformers import pipeline

# Load pre-trained DistilBERT model for question answering
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

In [24]:
# Context and query
context = ""
for index, row in context_df.iterrows():
    row_context = " ".join([f"{col}: {row[col]}" for col in context_df[top_columns].columns])
    context += row_context + " "

# Generate the answer
result = qa_model(question=query, context=context)

print(result['answer']) 

Girl, Interrupted, Changeling, Gia
