In [40]:
import os
import re
import unicodedata
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    pipeline,
    T5Tokenizer,
    T5ForConditionalGeneration
)

from transformers import logging as transformers_logging
from sentence_transformers import SentenceTransformer
transformers_logging.set_verbosity_error()
import json
import time
from functools import wraps

# Silencing TqdmWarning
import warnings
warnings.filterwarnings('ignore')

In [41]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Words that NLTK treats as stop words but that must NOT be filtered out of a
# query (they are excluded from `stop_words` below).
stop_words_to_keep = ["what", "when", "where", "which", "while", "who", "whom", "why", "with", "how", "before", "after","same"]

# English stop words minus the exceptions listed above.
stop_words = {
    word
    for word in stopwords.words('english')
    if word not in stop_words_to_keep
}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kevinbrundler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
def measure_time(func):
    """Decorator that prints the duration of every call to ``func``.

    The duration is taken from ``time.perf_counter()``, a monotonic
    high-resolution clock, so the reported value cannot be distorted (or go
    negative) when the system wall clock is adjusted during the call.

    Args:
        func (Callable): Function to wrap.

    Returns:
        Callable: Wrapper with the same signature and return value as ``func``.
        Nothing is printed when ``func`` raises.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        print(f"Execution time for {func.__name__}: {elapsed_time:.4f} seconds")
        return result
    return wrapper

In [43]:
class NERParser:
    """Wraps a Hugging Face "ner" pipeline and extracts PER and MISC entities."""

    def __init__(self, model_name="dslim/bert-base-NER", lowercase=False):
        """Load model and tokenizer and build the aggregated NER pipeline.

        Args:
            model_name (str): Checkpoint passed to ``from_pretrained``.
            lowercase (bool): Forwarded to the tokenizer as ``do_lower_case``;
                when true, queries are also lowercased in ``process_query``.
        """
        self.lowercase = lowercase
        self.device = self.get_device()

        ner_model = AutoModelForTokenClassification.from_pretrained(model_name)
        ner_tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=lowercase)
        self.nlp_pipeline = pipeline(
            "ner",
            model=ner_model,
            tokenizer=ner_tokenizer,
            device=self.device,
            aggregation_strategy="simple",
        )

    def get_device(self):
        """Return the torch device to use: MPS first, then CUDA, else CPU."""
        if torch.backends.mps.is_available():
            device_name = "mps"
        elif torch.cuda.is_available():
            device_name = "cuda"
        else:
            device_name = "cpu"
        return torch.device(device_name)

    def parse_ner_results(self, ner_results):
        """Split pipeline output into (PER words, MISC words).

        Entities of any other group are ignored.
        """
        people = []
        miscellaneous = []
        for entity in ner_results:
            group = entity['entity_group']
            if group == 'PER':
                people.append(entity['word'])
            elif group == 'MISC':
                miscellaneous.append(entity['word'])
        return people, miscellaneous

    def process_query(self, query):
        """Run NER on ``query`` (lowercased first if configured) and parse the result."""
        text = query.lower() if self.lowercase else query
        return self.parse_ner_results(self.nlp_pipeline(text))



In [44]:
class DataBase:
    """Loads the graph triples and the entity index and serves per-subject context tables."""

    def __init__(self):
        """Read the exported triples (pickle) and entity index (JSON) from ``exports/``."""
        # NOTE: unpickling can execute arbitrary code; only load trusted exports.
        triples_path = os.path.join(os.getcwd(), "exports/extended_graph_triples.pkl")
        self.db = pd.read_pickle(triples_path)

        with open("exports/entity_db.json", encoding="utf-8") as handle:
            self.entities = json.load(handle)

        # Each entity value is unpacked as a pair; the first item is the label.
        self.entity_list = [label.lower() for label, _ in self.entities.values()]

        self.db['subject_id'] = self.db['subject_id'].astype(str).str.strip()

    @staticmethod
    def normalize_string(s):
        """Lowercase ``s``, strip accents/non-ASCII and punctuation, collapse whitespace.

        Underscores survive because the punctuation pattern keeps ``\\w`` characters.
        """
        decomposed = unicodedata.normalize('NFKD', s.lower())
        ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8')
        without_punctuation = re.sub(r'[^\w\s]', '', ascii_only)
        return ' '.join(without_punctuation.split())

    def fetch(self, entity_list, search_column):
        """Return one row per subject with a column per predicate label.

        Rows whose ``search_column`` value is in ``entity_list`` are selected,
        columns containing any missing value are dropped, and the remainder is
        pivoted; multiple objects for the same predicate are joined with ' | '.
        An empty DataFrame is returned when nothing matches.
        """
        def join_values(values):
            return ' | '.join(values.astype(str))

        selection_mask = self.db[search_column].isin(entity_list)
        relevant = self.db[selection_mask].dropna(axis=1)

        if relevant.empty:
            return pd.DataFrame()

        pivoted = relevant.pivot_table(
            index='subject_id',
            columns='predicate_label',
            values='object_label',
            aggfunc=join_values,
        )
        return pivoted.reset_index()


In [45]:
class QueryEmbedderContextualized:
    """Embeds phrases with a SentenceTransformer model and memoises the results."""

    def __init__(self, model_name='sentence-transformers/all-mpnet-base-v2'):
        """Initializes the QueryEmbedder with a SentenceTransformer model and device setup."""
        self.device = self.get_device()
        self.model = SentenceTransformer(model_name, device=self.device)
        # phrase -> embedding vector, filled lazily by embed_phrase
        self.cache = {}

    @staticmethod
    def get_device():
        """Determines the available hardware device (MPS, CUDA, or CPU)."""
        if torch.backends.mps.is_available():
            return torch.device("mps")
        if torch.cuda.is_available():
            return torch.device("cuda")
        return torch.device("cpu")

    def embed_phrase(self, phrases):
        """
        Generates embeddings for given phrases using SentenceTransformer, with caching.

        The returned rows are always in the same order as ``phrases``, whether
        an individual phrase was served from the cache or freshly encoded.
        Each distinct uncached phrase is encoded only once per call.

        Args:
            phrases (str or List[str]): Input phrase(s) to embed.

        Returns:
            np.ndarray: A single embedding vector when exactly one phrase is
            given, otherwise an array with one row per input phrase (an empty
            array for an empty list).

        Raises:
            TypeError: If ``phrases`` is neither a string nor a list.
        """
        if isinstance(phrases, str):
            phrases = [phrases]
        elif not isinstance(phrases, list):
            raise TypeError("Input must be a string or a list of strings.")

        # dict.fromkeys de-duplicates while keeping first-seen order.
        phrases_to_compute = [p for p in dict.fromkeys(phrases) if p not in self.cache]

        if phrases_to_compute:
            new_embeddings = self.model.encode(
                phrases_to_compute,
                show_progress_bar=False,
                convert_to_numpy=True,
                normalize_embeddings=True
            )

            for phrase, emb in zip(phrases_to_compute, new_embeddings):
                self.cache[phrase] = emb

        # Read everything back from the cache so output order matches input order.
        embeddings = [self.cache[p] for p in phrases]

        return embeddings[0] if len(embeddings) == 1 else np.array(embeddings)


In [46]:
class QuestionAnsweringAgent():
    """Answers a question from a small context table with an extractive QA pipeline."""

    def __init__(self):
        self.qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", top_k=1)

    def query(self, query, context_df, max_values=5, value_separator=","):
        """Serialise ``context_df`` to text and ask the QA model ``query``.

        Every row becomes one paragraph: a header naming the row's
        "node label", followed by one ``column: values`` line per remaining
        column. Lines are newline-separated so neighbouring fields cannot run
        into each other.

        Args:
            query (str): The question.
            context_df (pd.DataFrame): Context rows; the "node label" column,
                if present, is used for the header only.
            max_values (int | None): Maximum number of values kept per cell
                after splitting (``None`` keeps all). Defaults to 5.
            value_separator (str): Separator used to split a cell into values.
                Defaults to "," as before.
                NOTE(review): DataBase.fetch joins values with ' | ' — confirm
                which separator the context cells really use.

        Returns:
            str: The model's answer(s) joined with ", ", or "No answer found."
            when there is no context or the model returns nothing.
        """
        paragraphs = []
        for _, row in context_df.iterrows():
            node_label = row.get("node label", "")
            lines = [f"This text is about \"{node_label}\":"]

            for col in context_df.columns:
                if col == "node label":
                    continue
                values_lst = str(row[col]).split(value_separator)
                # Slicing is a no-op when the cell has fewer than max_values items.
                lines.append(f"{col}: {', '.join(values_lst[:max_values])}")

            paragraphs.append("\n".join(lines) + "\n\n")

        context = "".join(paragraphs)

        # Nothing to extract an answer from; do not call the model.
        if not context.strip():
            return "No answer found."

        output = self.qa_model(question=query, context=context)

        answer_str = ""
        if isinstance(output, list) and output:
            answer_str = ", ".join([result['answer'] for result in output])
        elif isinstance(output, dict):
            answer_str = output['answer']

        return answer_str or "No answer found."

In [47]:
class ConversationAgent:
    """Text-to-text generator built on a T5 checkpoint (tokenizer + model)."""

    def __init__(self, model_name="google/flan-t5-large", max_length=150):
        """Load tokenizer and model for ``model_name`` and move the model to the device.

        Args:
            model_name (str): Checkpoint passed to ``from_pretrained``.
            max_length (int): ``max_length`` forwarded to ``generate``.
        """
        self.device = self.get_device()
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        loaded_model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.model = loaded_model.to(self.device)
        self.max_length = max_length

    @staticmethod
    def get_device():
        """Return the torch device to use: CUDA first, then MPS, else CPU."""
        if torch.cuda.is_available():
            return torch.device("cuda")
        if torch.backends.mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")

    def generate_response(self, prompt):
        """
        Tokenize ``prompt``, generate with 5-beam search and return the
        decoded first sequence with surrounding whitespace removed.
        """
        encoded = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        generation_options = {
            "max_length": self.max_length,
            "num_beams": 5,
            "early_stopping": True,
        }
        generated = self.model.generate(**encoded, **generation_options)
        text = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        return text.strip()


In [48]:
def cosine_sim(vec1, vec2):
    """Cosine similarity of two vectors; 0.0 when either has zero length."""
    magnitude_1, magnitude_2 = np.linalg.norm(vec1), np.linalg.norm(vec2)

    # A zero vector has no direction, so similarity is defined as 0 here.
    if any(magnitude == 0 for magnitude in (magnitude_1, magnitude_2)):
        return 0.0

    denominator = magnitude_1 * magnitude_2
    return np.dot(vec1, vec2) / denominator


def rescale_probabilities(similarities):
    """
    Divide every similarity score by the sum of all scores so the results add up to 1.

    Args:
        similarities (List[float]): Similarity scores.

    Returns:
        List[float]: The rescaled scores, or a list of zeros of the same
        length when the scores sum to zero.
    """
    total = sum(similarities)
    if total != 0:
        return [score / total for score in similarities]

    # Avoid division by zero
    return [0] * len(similarities)

def find_closest_columns(query_embeddings, column_embeddings, high_threshold=0.4, top_n=10, rescaled_threshold=0.11):
    """
    Selects the columns that best match the query, using mean cosine similarity.

    Each column is scored by the mean cosine similarity between its embedding
    and every non-zero query embedding (-1 if there are none). The `top_n`
    best-scoring columns are then examined:
    - If any of them has a score of at least `high_threshold`, the
      best-ranked such column is returned on its own.
    - Otherwise the `top_n` scores are rescaled to sum to 1 and every column
      whose rescaled share is at least `rescaled_threshold` is returned.

    Args:
        query_embeddings (List[np.ndarray]): Embeddings for query words.
        column_embeddings (Dict[str, np.ndarray]): Precomputed embeddings for columns.
        high_threshold (float): Similarity at which a column is returned immediately (default: 0.4).
        top_n (int): Number of top columns to consider for rescaling (default: 10).
        rescaled_threshold (float): Minimum rescaled share for a column to be kept (default: 0.11).

    Returns:
        List[str]: The selected column names; empty when there are no columns
        to choose from.
    """
    # Nothing to rank: avoids unpacking an empty zip() further down.
    if not column_embeddings:
        return []

    # The zero-vector filter does not depend on the column, so do it once.
    usable_queries = [q_vec for q_vec in query_embeddings if np.linalg.norm(q_vec) > 0]

    column_similarities = {}
    for col, col_vec in column_embeddings.items():
        similarities = [cosine_sim(col_vec, q_vec) for q_vec in usable_queries]
        column_similarities[col] = np.mean(similarities) if similarities else -1

    sorted_columns = sorted(column_similarities.items(), key=lambda item: item[1], reverse=True)
    top_columns = sorted_columns[:top_n]
    if not top_columns:  # e.g. top_n == 0
        return []

    column_names, similarities = zip(*top_columns)

    for col, sim in zip(column_names, similarities):
        if sim >= high_threshold:
            print(f"High confidence match found: {col} with similarity {sim: .4f}")
            return [col]

    # NOTE: the value printed below is the rescaled share, not the raw similarity.
    rescaled_probs = rescale_probabilities(similarities)

    selected_columns = []
    for col, rescaled_prob in zip(column_names, rescaled_probs):
        if rescaled_prob >= rescaled_threshold:
            print(f"Column {col} has similarity {rescaled_prob: .4f}")
            selected_columns.append(col)

    return selected_columns

In [49]:
def filter_query(query, node_label):
    """Strip a query down to the words that describe what is being asked.

    The query is lowercased and split on spaces; each word is reduced to its
    ASCII letters and dropped when it is empty, is in `stop_words`, or occurs
    as a substring of the node label (lowercased, spaces removed).

    Args:
        query (str): The raw user query.
        node_label (str): Label of the entity the query is about.

    Returns:
        str: The remaining words joined by single spaces. Always a string —
        an empty query yields "" so callers can safely call ``.split()``.
    """
    if not query:
        return ""

    # Loop-invariant: normalise the label once instead of once per word.
    # NOTE(review): this is a substring test against the collapsed label, so a
    # short word contained in the label is dropped too — confirm that is intended.
    label_compact = node_label.lower().replace(" ", "")

    relevant = []
    for word in query.replace(". ", " ").lower().split(" "):
        cleaned_word = re.sub(r'[^A-Za-z]', '', word)
        if not cleaned_word or cleaned_word in stop_words or cleaned_word in label_compact:
            continue

        relevant.append(cleaned_word)

    return " ".join(relevant)

In [50]:
def fuzzy_match(query_str, comparison_list, threshold=30, prioritize_exact=True, entities=None):
    """Return the ids of entities whose name fuzzily matches ``query_str``.

    Up to 50 candidates are taken from ``comparison_list`` with
    ``fuzz.partial_ratio``. Each candidate's score is scaled down by the
    relative length difference between the candidate and the query, and
    candidates scoring below ``threshold`` are discarded.

    Args:
        query_str (str): Text to look for.
        comparison_list (List[str]): Candidate entity names.
        threshold (float): Minimum length-adjusted score to keep a candidate.
        prioritize_exact (bool): If true and ``query_str`` itself is in
            ``comparison_list``, its entity is placed first with score 100.
        entities (Dict | None): Mapping ``id -> (name, ...)`` used to resolve
            names to ids. Defaults to the notebook-level ``db.entities``.

    Returns:
        List: Matching entity ids without duplicates, in the order found.
        Empty when the query or the candidate list is empty. Candidates whose
        name cannot be resolved to an id are skipped.
    """
    # An empty query would divide by zero in the length penalty below.
    if not query_str or not comparison_list:
        return []

    if entities is None:
        entities = db.entities

    # Build name -> id lookups once instead of scanning all entities per match.
    # setdefault keeps the first id for a name, like the previous linear scan.
    exact_ids = {}
    lowered_ids = {}
    for entity_id, value in entities.items():
        label = value[0]
        exact_ids.setdefault(label, entity_id)
        lowered_ids.setdefault(label.lower(), entity_id)

    def resolve(name):
        # Prefer a verbatim label; fall back to a case-insensitive hit.
        return exact_ids.get(name, lowered_ids.get(name))

    id_score = []

    if prioritize_exact and query_str in comparison_list:
        matched_id = resolve(query_str)
        if matched_id is not None:
            id_score.append((matched_id, 100))

    matches = process.extract(query_str, comparison_list, scorer=fuzz.partial_ratio, limit=50)
    for match in matches:
        name, score = match[0], match[1]
        matched_id = resolve(name)
        if matched_id is None:
            continue

        # Penalise candidates much longer or shorter than the query.
        length_diff = abs(len(name) - len(query_str)) / len(query_str)
        id_score.append((matched_id, score * (1 - length_diff)))

    # dict.fromkeys removes duplicate ids while preserving order.
    return list(dict.fromkeys(entity_id for entity_id, score in id_score if score >= threshold))

In [51]:
def get_top_matches(df, normalized_query, top_n=2):
    """Return at most ``top_n`` distinct rows of ``df`` that best match the query.

    Every row is turned into a single string (its values joined by spaces).
    Rows equal to ``normalized_query`` come first; remaining slots are filled
    with the best ``fuzz.partial_ratio`` matches that are not already selected.

    Args:
        df (pd.DataFrame): Candidate rows.
        normalized_query (str): Query text to compare against.
        top_n (int): Maximum number of rows to return.

    Returns:
        pd.DataFrame: The selected rows (never more than ``top_n``, never the
        same row twice); empty when ``df`` has no rows or ``top_n`` <= 0.
    """
    if len(df) == 0 or top_n <= 0:
        return df.iloc[0:0]

    concatenated_rows = df.apply(lambda row: ' '.join(row.astype(str)), axis=1).tolist()

    # Exact matches take priority but must still respect top_n.
    top_indices = [i for i, row in enumerate(concatenated_rows) if row == normalized_query][:top_n]

    remaining_slots = top_n - len(top_indices)
    if remaining_slots > 0:
        already_selected = set(top_indices)
        # Exact matches also rank highest in the fuzzy search, so request
        # enough extra candidates to still fill every slot after skipping them.
        fuzzy_matches = process.extract(
            normalized_query,
            concatenated_rows,
            scorer=fuzz.partial_ratio,
            limit=remaining_slots + len(already_selected),
        )
        for match in fuzzy_matches:
            row_index = match[2]
            if row_index in already_selected:
                continue
            top_indices.append(row_index)
            already_selected.add(row_index)
            if len(top_indices) == top_n:
                break

    return df.iloc[top_indices]

In [39]:
# Module-level singletons used by fuzzy_match() and answer_query().
# Run this cell before calling either of them.

# Reads exports/extended_graph_triples.pkl and exports/entity_db.json
# relative to the current working directory.
db = DataBase()

# NER pipeline; with lowercase=False the query is passed through unchanged.
ner_parser = NERParser(lowercase=False)

# Sentence-embedding model with an in-memory phrase cache.
qe = QueryEmbedderContextualized()

# Extractive question-answering pipeline.
qa = QuestionAnsweringAgent()

# Text generator used for small talk and for phrasing the final answer.
ca = ConversationAgent(model_name="google/flan-t5-xl") #### Could take some time, approx. 10 GB storage :)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/kevinbrundler/Desktop/ATAI/movie-bot/development/exports/extended_graph_triples.pkl'

In [14]:
def _ner_entity_context(entity_names, normalized_query):
    """Return the single best-matching entity row for NER-extracted names (empty frame if none)."""
    if not entity_names:
        return pd.DataFrame()

    entity_ids = fuzzy_match(" ".join(entity_names), db.entity_list, threshold=75)
    candidates = db.fetch(entity_ids, "subject_id")
    return get_top_matches(candidates, normalized_query, top_n=1)


@measure_time
def answer_query(query, correct_answer=""):
    """Answer a user query and print the result.

    Steps:
    1. Run NER on the query. Without any PER/MISC entity the query is treated
       as small talk and answered directly by the conversation agent.
    2. Build the context: the best entity row for the NER hits, or — if that
       yields nothing — the best row found by fuzzy-matching the whole query.
       If there is still no context, fall back to the conversation agent.
    3. Select the context columns closest to the query by embedding
       similarity, extract an answer with the QA agent and let the
       conversation agent phrase it as a sentence.

    Relies on the notebook-level objects `db`, `ner_parser`, `qe`, `qa`, `ca`.

    Args:
        query (str): The user's question.
        correct_answer (str): Expected answer; currently unused.

    Returns:
        None: The answer is printed, not returned.
    """
    normalized_query = db.normalize_string(query)

    # NER Model and NER Matching
    ner_person, ner_movies = ner_parser.process_query(query)

    is_domain_specific = bool(ner_person or ner_movies)
    if not is_domain_specific:
        # No database lookup is needed for small talk.
        small_talk = ca.generate_response(query)
        print(small_talk)
        return

    context_ner_movies = _ner_entity_context(ner_movies, normalized_query)
    context_ner_person = _ner_entity_context(ner_person, normalized_query)
    ner_context = pd.concat([context_ner_movies, context_ner_person])

    if not ner_context.empty:
        context = ner_context
    else:
        # Fuzzy Matching on the whole query, only when NER produced no context.
        entity_matches = fuzzy_match(normalized_query, db.entity_list, threshold=30)
        subjects = db.fetch(entity_matches, "subject_id")
        context = get_top_matches(subjects, normalized_query, top_n=1)

    if context.empty:
        print("No context data found for given IDs or string")

        # Fallback Strategy
        small_talk = ca.generate_response(query)
        print(small_talk)
        return

    # context is non-empty here, so the first row exists.
    if "node label" in context.columns:
        first_label = context["node label"].values[0]
        node_label = "" if pd.isna(first_label) else str(first_label)
    else:
        node_label = ""

    # EXPERIMENTAL - remove unused columns
    elements_to_remove = ["image", "color", "sport"]
    context = context.drop(columns=elements_to_remove, errors='ignore')

    # EXPERIMENTAL - rename columns
    columns_to_rename = {
        "cast member":"movie cast",
        "notable work": "acted in"
    }
    columns_to_rename = {k: v for k, v in columns_to_rename.items() if k in context.columns}
    context = context.rename(columns=columns_to_rename)

    # Synonym columns give the embedding match more phrasings to hit.
    columns_to_duplicate = [("acted in", "played in"),
                            ("acted in", "appeared in"),
                            ("movie cast", "actors"),
                            ("movie cast", "players")]

    for col_to_duplicate, col in columns_to_duplicate:
        if col_to_duplicate in context.columns:
            context[col] = context[col_to_duplicate].copy()

    context = context.dropna(axis=1)

    query_filtered = filter_query(query, node_label)

    column_embeddings = {col: qe.embed_phrase(col) for col in context.columns}
    query_embeddings = [qe.embed_phrase(word) for word in query_filtered.split()]
    top_columns_embeddings = find_closest_columns(query_embeddings, column_embeddings)

    # EXPERIMENTAL - fuzzy column matching is manually switched off.
    top_columns_fuzzy = []

    # EXPERIMENTAL - always keep columns
    col_always_keep = ["node label"]

    # Follow the context's own column order so the text handed to the QA
    # model is the same on every run (a plain set has no stable order).
    wanted_columns = set(top_columns_fuzzy + top_columns_embeddings + col_always_keep)
    top_columns = [col for col in context.columns if col in wanted_columns]
    filtered_context_df = context[top_columns]

    answer = qa.query(query, filtered_context_df)
    # NOTE(review): the prompt starts with a stray double quote (four quotes
    # open the literal) and carries the source indentation; left unchanged
    # because editing it changes the model input — confirm whether intended.
    formatted_answer = ca.generate_response(f""""Format the answer to the question into a sentence.
                                            If you think the answer is completely off, overrule with your own knowledge.
                                            Question: {query}\nAnswer: {answer}""")

    print(formatted_answer)


In [15]:
answer_query("Tell me a joke")

i saw a man in a tuxedo and he was wearing a tuxedo
Execution time for answer_query: 5.3766 seconds


In [16]:
answer_query("Tell me a good joke")

if you want to be a doctor, you have to be a dentist.
Execution time for answer_query: 2.3642 seconds


In [17]:
answer_query("Hello, how is life?")

I'm fine, thanks.
Execution time for answer_query: 1.8606 seconds


In [18]:
answer_query("Hello, how are you?")

I'm fine, thanks.
Execution time for answer_query: 1.3556 seconds


In [19]:
answer_query("Hell, how are you doing?")

I'm fine, thanks.
Execution time for answer_query: 1.3519 seconds


In [20]:
answer_query("Hello, what is the capital of switzerland?")

Bern
Execution time for answer_query: 1.0394 seconds


In [24]:
answer_query("Hello, what is the capital of the Paris?")

capital of france
Execution time for answer_query: 1.0653 seconds


In [25]:
answer_query("Who is the director of Good Will Hunting?", "Gus Van Sant")

High confidence match found: director with similarity  0.6866
Gus Van Sant is the director of Good Will Hunting.
Execution time for answer_query: 3.8853 seconds


In [26]:
answer_query("Who directed The Bridge on the River Kwai?", "David Lean")

High confidence match found: director with similarity  0.5111
David Lean directed The Bridge on the River Kwai.
Execution time for answer_query: 2.6572 seconds


In [22]:
answer_query("Who directed The Dark Knight?", "Christopher Nolan")

High confidence match found: director with similarity  0.5111
The Dark Knight was directed by Christopher Nolan.
Execution time for answer_query: 2.5928 seconds


In [23]:
answer_query("Where was Angelina Jolie born?", "Los Angeles")

High confidence match found: place of birth with similarity  0.4204
Angelina Jolie was born in Los Angeles.
Execution time for answer_query: 2.6926 seconds


In [24]:
answer_query("Who is the main actor in harry potter and the philosopher's stone?")

Rupert Grint
Execution time for answer_query: 1.3153 seconds


In [25]:
answer_query("Who was Brad Pitt married to?")

High confidence match found: spouse with similarity  0.4737
Jennifer Aniston and Angelina Jolie were married to Brad Pitt.
Execution time for answer_query: 2.6542 seconds


In [26]:
answer_query("When was Inception released?")

High confidence match found: publication date with similarity  0.4198
2010-07-08 was Inception released.
Execution time for answer_query: 2.2322 seconds


In [27]:
answer_query("Who is the director of Star Wars?")

High confidence match found: director with similarity  0.4614
James Glickenhaus is the director of Star Wars.
Execution time for answer_query: 2.3958 seconds


In [28]:
answer_query("When was the Godfather III published?")

High confidence match found: publication date with similarity  0.4091
1972-03-15 was the Godfather III published.
Execution time for answer_query: 1.9495 seconds


In [29]:
answer_query("Who is the director of Star Wars?")

High confidence match found: director with similarity  0.4614
James Glickenhaus is the director of Star Wars.
Execution time for answer_query: 1.7895 seconds


In [30]:
answer_query("When was Inception released?")

High confidence match found: publication date with similarity  0.4198
2010-07-08 was Inception released.
Execution time for answer_query: 1.7443 seconds


In [31]:
answer_query("Who was Angelina Jolie married to?")

High confidence match found: spouse with similarity  0.4737
Angelina Jolie was married to Billy Bob Thornton.
Execution time for answer_query: 2.2110 seconds


In [32]:
answer_query("Who was Brad Pitt married to?")

High confidence match found: spouse with similarity  0.4737
Jennifer Aniston and Angelina Jolie were married to Brad Pitt.
Execution time for answer_query: 2.1282 seconds


In [33]:
answer_query("What is the religion of Tom Cruise?")

High confidence match found: religion with similarity  0.6583
Scientology is the religion of Tom Cruise.
Execution time for answer_query: 1.6911 seconds


In [34]:
answer_query("Who is the main actor in harry potter and the philosopher's stone?")

Rupert Grint
Execution time for answer_query: 0.9352 seconds


In [35]:
answer_query("Who are the cast in Jurassic Park?")

High confidence match found: movie cast with similarity  0.5334
Bd wong, laura dern, sam neill, samuel l jackson are the cast in Jurassic Park.
Execution time for answer_query: 6.4506 seconds


In [36]:
answer_query("Who acted in Jurassic Park?")

High confidence match found: actors with similarity  0.4118
Wayne Knight acted in Jurassic Park.
Execution time for answer_query: 1.7174 seconds


In [37]:
answer_query("Who played in Jurassic Park?")

High confidence match found: players with similarity  0.4161
Wayne Knight played in Jurassic Park.
Execution time for answer_query: 1.3895 seconds


In [38]:
answer_query("In which movie did Tom Cruise play?")

Column actors has similarity  0.1180
Column players has similarity  0.1113
Days of Thunder is a movie that Tom Cruise played in.
Execution time for answer_query: 1.9912 seconds


In [39]:
answer_query("In which movie did Rebel Wilson act?")

Column actors has similarity  0.1657
Column movie cast has similarity  0.1462
Column imdb id has similarity  0.1247
Column occupation has similarity  0.1230
Rebel Wilson acted in Ghost Rider.
Execution time for answer_query: 2.1311 seconds


In [40]:
answer_query("In which movie did Liam Neeson play?")

Column played in has similarity  0.1265
Column acted in has similarity  0.1202
Liam Neeson played in Star Wars.
Execution time for answer_query: 1.9041 seconds


In [41]:
answer_query("Who is an actor in Taken 2?")

High confidence match found: actors with similarity  0.5670
Maggie Grace is an actor in Taken 2.
Execution time for answer_query: 1.5475 seconds


In [42]:
answer_query("What is the role of Vin Diesel in Fast and Furious?")

Column director has similarity  0.1230
Column actors has similarity  0.1118
Vin Diesel is the director of Fast and Furious.
Execution time for answer_query: 2.0773 seconds


In [43]:
answer_query("For which movie did Leonardo Di Caprio win an Oscar?")

Column actors has similarity  0.1623
Column imdb id has similarity  0.1592
Column movie cast has similarity  0.1467
Column nominated for has similarity  0.1392
Column players has similarity  0.1264
For which movie did Leonardo Di Caprio win an Oscar?
Execution time for answer_query: 2.7314 seconds
