<a href="https://colab.research.google.com/github/sumsumcity/chatbot/blob/main/ChatbotRaphael.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
# https://www.wikidata.org/wiki/Wikidata:List_of_properties/all_in_one_table might use this for predicates etc.
!pip install -q rdflib transformers spacy-universal-sentence-encoder
!pip install -q transformers[sentencepiece]
!pip install -q jdc requests
!pip install -q rdflib transformers spacy-universal-sentence-encoder

!wget -q --no-check-certificate https://files.ifi.uzh.ch/ddis/teaching/2021/ATAI/dataset/ddis-movie-graph.nt.zip && unzip -u ddis-movie-graph.nt.zip 
!wget -q --no-check-certificate https://files.ifi.uzh.ch/ddis/teaching/2021/ATAI/dataset/ddis-graph-embeddings.zip && unzip -u ddis-graph-embeddings.zip 
!wget -q --no-check-certificate https://files.ifi.uzh.ch/ddis/teaching/2021/ATAI/dataset/movienet/images.json.zip && unzip -u images.json.zip 
!wget -q --no-check-certificate https://files.ifi.uzh.ch/ddis/teaching/2021/ATAI/dataset/crowd_data/crowd_data.tsv
!wget -q --no-check-certificate https://raw.githubusercontent.com/radubauzh/AI_ChatBot/main/bloaters.py


import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
from bloaters import *
from io import BytesIO
from datetime import datetime
from rdflib import Graph, graph
from collections import defaultdict
from IPython.display import Javascript
from IPython.display import clear_output
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from nltk.corpus import stopwords as stopwords_nltk
from rdflib.namespace import RDF, RDFS, XSD, Namespace
from nltk.tokenize import RegexpTokenizer, word_tokenize
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import jdc, rdflib, spacy, spacy_universal_sentence_encoder, csv, torch, json, nltk, difflib, itertools, requests, time, warnings, atexit, getpass, random, urllib.parse
#from bloaters import QSTN_INTENT, PREDICATES, CLASSIFIED_PREDICATES, SPARQL_TEMPLATE_ENTITY, SPARQL_TEMPLATE, SPARQL_TEMPLATE_RATINGS, SPARQL_TEMPLATE_DESCRIPTION, linkErrorDict, humans_query, HUMANS_DICT, HUMANS_LBL, film_query, FILM_DICT, FILM_LBL


# Fetch the NLTK resources used by the stopword list and word_tokenize,
# then clear the cell output produced by the setup commands above.
for nltk_resource in ('stopwords', 'punkt'):
  nltk.download(nltk_resource)

clear_output()

## Pipelines (runtime: roughly 4 minutes)

In [2]:
# Let spaCy run on the GPU whenever torch reports that CUDA is available.
cuda_available = torch.cuda.is_available()
if cuda_available:
  spacy.require_gpu()

def load_graph(filename, format='turtle'):
  """Parse an RDF file into a new rdflib ``Graph``.

  Args:
    filename: Path of the RDF file to read.
    format: rdflib parser name handed to ``Graph.parse`` (default ``'turtle'``).

  Returns:
    The freshly populated ``Graph``.
  """
  parsed_graph = Graph()
  parsed_graph.parse(filename, format=format)
  return parsed_graph

def load_zero_shot():
  """Build the zero-shot classification pipeline (``facebook/bart-large-mnli``).

  The pipeline is placed on CUDA device 0 when torch reports a GPU and on the
  CPU (device ``-1``) otherwise.
  """
  device_index = 0 if torch.cuda.is_available() else -1
  return pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device_index)

def load_ner_pipeline():
  """Build the named-entity-recognition pipeline from ``Jean-Baptiste/camembert-ner``.

  Returns:
    A ``'ner'`` pipeline created with ``aggregation_strategy="simple"``.
  """
  checkpoint = "Jean-Baptiste/camembert-ner"
  ner_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  ner_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
  return pipeline('ner', model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")

def load_sentence_encoder():
  """Load and return the ``en_use_lg`` model of spacy_universal_sentence_encoder."""
  return spacy_universal_sentence_encoder.load_model('en_use_lg')

# NOTE(review): this is a verbatim second definition of load_graph; an identical
# one appears earlier in this cell. The later definition is the one that stays
# bound, so one of the two can be deleted.
def load_graph(filename, format='turtle'):
  """Parse the RDF file ``filename`` with the given rdflib ``format`` and return the Graph."""
  graph = Graph()
  graph.parse(filename, format=format)
  return graph

def load_graph_embeddings(kg=None):
  """Load the graph embeddings together with their id and label look-up tables.

  Args:
    kg: Graph whose ``rdfs:label`` triples provide the entity labels. When
      omitted, the notebook-level ``graph`` variable is used, which is what the
      original zero-argument version always did implicitly.

  Returns:
    A 7-tuple ``(entity_emb, relation_emb, ent2id, rel2id, id2ent, ent2lbl,
    lbl2ent)``:
      * ``entity_emb`` / ``relation_emb``: arrays read from the ``.npy`` files.
      * ``ent2id`` / ``rel2id``: URIRef -> integer id, read from the ``.del`` files.
      * ``id2ent``: inverse of ``ent2id``.
      * ``ent2lbl`` / ``lbl2ent``: entity -> label string and its inverse (when
        several entities share a label, the last one seen wins).
  """
  if kg is None:
    kg = graph  # module-level knowledge graph; must be loaded before calling

  entity_emb = np.load('./ddis-graph-embeddings/entity_embeds.npy')
  relation_emb = np.load('./ddis-graph-embeddings/relation_embeds.npy')

  # Each .del row is "<integer id>\t<URI>". newline='' is what the csv module
  # asks for when it is handed a file object.
  with open('./ddis-graph-embeddings/entity_ids.del', 'r', newline='') as ifile:
    ent2id = {rdflib.term.URIRef(ent): int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
  id2ent = {v: k for k, v in ent2id.items()}

  with open('./ddis-graph-embeddings/relation_ids.del', 'r', newline='') as ifile:
    rel2id = {rdflib.term.URIRef(rel): int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}

  ent2lbl = {ent: str(lbl) for ent, lbl in kg.subject_objects(RDFS.label)}
  lbl2ent = {lbl: ent for ent, lbl in ent2lbl.items()}
  return entity_emb, relation_emb, ent2id, rel2id, id2ent, ent2lbl, lbl2ent

def load_crowd_data(path="crowd_data.tsv"):
  """Read the crowd-sourcing TSV into a list of ``{column: value}`` dicts.

  The first row is taken as the header. Every following non-empty row becomes
  one dict; all values stay strings.

  Args:
    path: Location of the TSV file. Defaults to the file downloaded by the
      setup cell.

  Returns:
    A list with one dict per data row. An empty file yields ``[]``. Rows
    shorter than the header simply lack the trailing keys; surplus cells
    beyond the header are ignored.
  """
  with open(path, newline='') as file:
    reader = csv.reader(file, delimiter="\t")
    header = next(reader, None)
    if header is None:  # completely empty file
      return []
    # csv.reader yields [] for blank lines; skip them so that no key-less
    # dict reaches the filters that index rows by column name.
    return [dict(zip(header, row)) for row in reader if row]

In [3]:
# Instantiate the models and data sets that the rest of the notebook reads as globals.
sentence_encoder = load_sentence_encoder()
ner = load_ner_pipeline()
zero_shot = load_zero_shot()
graph = load_graph('14_graph.nt')
entity_emb, relation_emb, ent2id, rel2id, id2ent, ent2lbl, lbl2ent = load_graph_embeddings()
crowdList = load_crowd_data()

# Namespaces used to build URIs for graph look-ups.
WD = Namespace('http://www.wikidata.org/entity/')
WDT = Namespace('http://www.wikidata.org/prop/direct/')
RDFS = rdflib.namespace.RDFS
SCHEMA = Namespace('http://schema.org/')
DDIS = Namespace('http://ddis.ch/atai/')
# One sentence embedding per predicate label, grouped by question intent.
PREDICATE_EMBDS = {intent:{lbls:sentence_encoder(lbls) for lbls in CLASSIFIED_PREDICATES[intent]} for intent in QSTN_INTENT}
# Use a context manager so the file handle is closed right after parsing.
with open('images.json') as images_file:
  IMAGES = json.load(images_file)

Downloaded https://tfhub.dev/google/universal-sentence-encoder-large/5, Total size: 577.10MB



Downloading:   0%|          | 0.00/269 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/210 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [4]:
class Entity:
  """An entity mention from a question, linked to a knowledge-graph URI.

  Attributes:
    label: Surface form of the mention.
    uri: Linked URI object (must offer ``toPython()``), or ``None`` if unlinked.
    cls: Class tag assigned to the mention (e.g. ``"PER"``).
  """

  def __init__(self, label, uri, cls):
    self.label = label
    self.uri = uri
    self.cls = cls

  def __str__(self):
    """Return a one-line debug description, or ``"None"`` when no URI is set."""
    if self.uri is None:
      return "None"
    # f-string formatting keeps this usable even if label or cls is not a str.
    return f"Entity Label: {self.label}, Class: {self.cls}, URI:{self.uri.toPython()}"

class Predicate:
  """A predicate chosen for a question.

  Attributes:
    label: Human-readable predicate label.
    uri: Predicate URI object (must offer ``toPython()``), or ``None``.
  """

  def __init__(self, label, uri):
    self.label = label
    self.uri = uri

  def __str__(self):
    """Return a one-line debug description, or ``"None"`` when no URI is set."""
    if self.uri is None:
      return "None"
    # The previous concatenation produced a doubled separator (", , URI:").
    return f"Predicate Label: {self.label}, URI:{self.uri.toPython()}"

In [5]:
# SPARQL template shared by both look-ups: every (entity, label) pair whose
# entity is an instance of the given class or of one of its subclasses.
_LABEL_QUERY_TEMPLATE = """
PREFIX ddis: <http://ddis.ch/atai/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX schema: <http://schema.org/>

SELECT DISTINCT ?b ?label WHERE{
  ?b wdt:P31/wdt:P279* wd:%s.
  ?b rdfs:label ?label
}
"""


def _query_labelled_instances(class_qid):
  """Run the label query for the Wikidata class ``class_qid`` (e.g. ``"Q5"``)."""
  return graph.query(_LABEL_QUERY_TEMPLATE % class_qid)


def _build_label_index(rows):
  """Map lower-cased labels to entity URIs.

  When a label is already taken, "0" is appended until the key is unique, so
  entities sharing a name all stay reachable (as "name", "name0", "name00", ...).
  """
  index = {}
  for uri, label in rows:
    key = label.toPython().lower()
    while key in index:
      key += "0"
    index[key] = uri
  return index


humans_query = _query_labelled_instances("Q5")
HUMANS_DICT = _build_label_index(humans_query)
HUMANS_LBL = list(HUMANS_DICT.keys())
# ----------------------------------------------------
film_query = _query_labelled_instances("Q11424")
FILM_DICT = _build_label_index(film_query)
FILM_LBL = list(FILM_DICT.keys())

### Bloaters

In [6]:
# Tokenizer that keeps only runs of word characters (drops punctuation).
tokenizer = RegexpTokenizer(r'\w+')
# Greeting words/phrases recognised in user input.
greetings = ["hello", "hi", "hey", "howdy", "greetings", "welcome", "hi there", "hello there", "good morning", "good afternoon", "good evening","good day", "hello friend", "what's up", "how's it going", "hoi", "ola", "cheers", "hallo", "grüezi", "moin", "guten tag", "guten morgen"]
# NLTK's English stopwords, extended with question words and ":".
stopwords = stopwords_nltk.words('english')
newStopWords = ['where', 'which', 'how', 'can', 'when', 'what', 'whose', 'show',":"]
stopwords.extend(newStopWords)
# Canned replies to a greeting.
return_greetings = ["Hi, human", "Heeeeeeey :)", "Oi!!! Hello there human", "Moin", "Ni hao :)"]
# Opening phrases for recommendation answers (one is picked at random).
crazy_array = ["My recommendations are: ", "These are my recommendations: ", "My recommendations would include: ", "Oh you should watch the following: ", "Good choices! You could watch the following: "]
# Rating code -> display text.
all_ratings = {"G":"G - General Audiences", "PG":"PG - Parental Guidance Suggested", "PG-13":"PG-13 - Parents Strongly Cautioned", "R":"R - Restricted", "NC-17":"NC-17 - Adults Only"}

# Keyword lists for intent detection.
# 'portrait' was missing: the misspelt 'potrait' is not a substring of it, so
# questions using the correct spelling were never recognised. The misspelling
# is kept so that inputs containing it still match.
IMGS_WORD = ['poster', 'picture', 'image', 'portrait', 'potrait', 'photo', 'photograph','look like', 'looks like'] # Gives a picture back
FRAME_WORDS = ["frame"]
WEBPAGES_WORDS = ["webpage", "link", "website", "page"]
OCCUPATION_WORDS = ["occupation", "does for a living", "do for a living"]
TYPE_WORDS = ['genre']

# Fleiss kappa

In [7]:
# Filter malicious workers: keep a row only if the worker spent at least 10
# seconds on it, has a lifetime approval rate of at least 50 (the trailing
# character of that field is stripped before parsing), and did not answer
# "I don't understand" in FixPosition.
crowdList = [
  row for row in crowdList
  if int(row["WorkTimeInSeconds"]) >= 10
  and int(row["LifetimeApprovalRate"][:-1]) >= 50
  and "I don't understand" not in row["FixPosition"]
]

def get_dataframes():
  """Split the crowd data into one DataFrame per HIT type.

  Reads the module-level ``crowdList`` and keeps only the columns ``HITId``,
  ``HITTypeId``, ``WorkerId`` and ``AnswerID``.

  Returns:
    ``(frames, hit_types)`` where ``hit_types`` holds the distinct HITTypeId
    values in order of first appearance and ``frames[i]`` contains the rows
    belonging to ``hit_types[i]``.
  """
  answers = pd.DataFrame(crowdList)[["HITId","HITTypeId","WorkerId","AnswerID"]]
  hit_types = answers.HITTypeId.unique()
  frames = [answers[answers.HITTypeId == hit_type] for hit_type in hit_types]
  return frames, hit_types

def calculate_fleissKappa(listOfDf, listOfHITTypeId):
  """Compute Fleiss' kappa for every HIT type.

  Args:
    listOfDf: One DataFrame per HIT type with (at least) the columns ``HITId``
      and ``AnswerID``. Answers ``"1"`` and ``"2"`` are the two categories.
    listOfHITTypeId: HIT type identifiers, parallel to ``listOfDf``.

  Returns:
    Dict mapping each HIT type id to its kappa. The value is NaN when no HIT
    of that type has at least two judgements, or when the expected agreement
    is exactly 1 (kappa is undefined there).

  Rows are grouped by their actual ``HITId``. The earlier version iterated
  over ``range(first_id, first_id + N)``, which divided by zero as soon as the
  ids were not contiguous or a HIT was left with fewer than two judgements.
  """
  final_dic = {}

  for idx, dataframe in enumerate(listOfDf):
    share_correct, share_incorrect, agreement_per_hit = [], [], []

    for _, judgements in dataframe.groupby("HITId", sort=False):
      n = len(judgements)  # number of judgements for this HIT
      if n < 2:
        continue  # pairwise agreement needs at least two judgements

      yes = int((judgements.AnswerID == "1").sum())
      no = int((judgements.AnswerID == "2").sum())

      # Category shares (inputs to the chance agreement p_e).
      share_correct.append(yes / n)
      share_incorrect.append(no / n)
      # Fraction of agreeing pairs for this HIT.
      agreement_per_hit.append((yes * (yes - 1) + no * (no - 1)) / (n * (n - 1)))

    N = len(agreement_per_hit)
    if N == 0:
      final_dic[listOfHITTypeId[idx]] = float("nan")
      continue

    p_line = sum(agreement_per_hit) / N
    p_e = (sum(share_correct) / N) ** 2 + (sum(share_incorrect) / N) ** 2

    if p_e == 1:
      fleissKappa = float("nan")
    else:
      fleissKappa = (p_line - p_e) / (1 - p_e)
    final_dic[listOfHITTypeId[idx]] = fleissKappa

  return final_dic

# Split the filtered crowd data by HIT type, then compute one Fleiss' kappa per type.
listOfDf, listOfHITTypeId = get_dataframes()
fleissKappaDict = calculate_fleissKappa(listOfDf, listOfHITTypeId)

## Build the final crowd data, with the workers' fixes applied to each triple

In [8]:
# Change the triple with the correct data (fixvalue/fixposition)
# Input: List of all crowd data of one triple
# Returns correct triple even if it is wrong or right
def correctTriple(tempList):
  """Collapse all crowd judgements of one triple into a single record.

  Args:
    tempList: Non-empty list of crowd rows (dicts) that belong to the same HIT.

  Returns:
    A copy of the first row without the per-worker columns, extended with
    ``numCorrect`` / ``numIncorrect``. If more workers voted incorrect than
    correct, the proposed fix is applied to ``Input1ID`` (Subject),
    ``Input2ID`` (Predicate) or ``Input3ID`` (Object). Fix values starting
    with "Q" (or "P" for predicates) get the ``wd:`` (``wdt:``) prefix.

  The fix is taken from the first row that supplies both a FixPosition and a
  FixValue, so a position is not combined with a different worker's value.
  Only when no row provides both does it fall back to the first non-empty
  position and the first non-empty value, as the earlier version always did.
  """
  remove_list = ["Title", "Reward", "AssignmentId", 'WorkerId', 'AssignmentStatus', 'WorkTimeInSeconds', 'LifetimeApprovalRate', 'AnswerID', 'AnswerLabel', 'FixPosition', 'FixValue']

  counterCorrect = sum(1 for dic in tempList if dic["AnswerID"] == "1")
  counterIncorrect = len(tempList) - counterCorrect

  returnDic = {key: value for key, value in tempList[0].items() if key not in remove_list}
  returnDic["numCorrect"] = counterCorrect
  returnDic["numIncorrect"] = counterIncorrect

  if counterCorrect >= counterIncorrect:
    return returnDic  # majority (or tie) says the triple is right

  fix = next(
    ((dic["FixPosition"], dic["FixValue"]) for dic in tempList
     if dic["FixPosition"] != '' and dic["FixValue"] != ''),
    None)
  if fix is None:
    fixPosList = [dic["FixPosition"] for dic in tempList if dic["FixPosition"] != '']
    fixValList = [dic["FixValue"] for dic in tempList if dic["FixValue"] != '']
    if not fixPosList or not fixValList:
      return returnDic  # nobody proposed a usable fix
    fix = (fixPosList[0], fixValList[0])

  fixPosition, fixValue = fix
  # position -> (column to overwrite, id initial that gets prefixed, prefix)
  slots = {
    "Subject": ("Input1ID", "Q", "wd:"),
    "Predicate": ("Input2ID", "P", "wdt:"),
    "Object": ("Input3ID", "Q", "wd:"),
  }
  if fixPosition in slots:
    column, id_initial, prefix = slots[fixPosition]
    returnDic[column] = prefix + fixValue if fixValue.startswith(id_initial) else fixValue
  return returnDic

def makeFinalCrowdList():
  """Build one corrected record per HIT from the module-level ``crowdList``.

  Consecutive rows that share a ``HITId`` are treated as one group and handed
  to ``correctTriple``. Rows of the same HIT therefore have to be adjacent in
  ``crowdList``.

  Returns:
    List of the dicts returned by ``correctTriple``, in input order. An empty
    ``crowdList`` yields ``[]``.

  The earlier hand-written loop only emitted a group when the *next* HITId
  appeared, so the last HIT was silently dropped; ``groupby`` yields every
  group, including the final one.
  """
  from itertools import groupby

  return [
    correctTriple(list(rows))
    for _, rows in groupby(crowdList, key=lambda row: row["HITId"])
  ]

# One aggregated record per HIT, produced from the filtered crowdList.
finalCrowdList = makeFinalCrowdList()

# Question Class

In [9]:
class Question:
  """A user utterance together with everything the bot extracts from it.

  Constructing a ``Question`` runs the full analysis: greeting detection,
  entity linking, intent classification and, for graph-style intents,
  predicate selection.

  Attributes:
    text: The raw input.
    processed_question: The text after greetings, entity labels and stopwords
      have been stripped.
    intent: Intent label from the zero-shot classifier, possibly overridden by
      the crowd-data or keyword rules in ``preprocess``.
    entity: List of ``Entity`` objects, longest mention first. A ``None``
      entry marks a mention that could not be linked; ``[None]`` means no
      mention was found at all.
    predicates: ``Predicate`` chosen for the question, or ``None``.
    greet: True when the input is a greeting, is shorter than 7 characters, or
      nothing is left after processing.
    subclass: True when the text mentions "subclass".
  """

  # Keyword lists that override the classifier's intent in preprocess().
  # 'portrait' was missing before; the misspelt 'potrait' is not a substring
  # of it. The misspelling is kept so inputs containing it still match.
  IMAGE_KEYWORDS = ['poster', 'picture', 'image', 'portrait', 'potrait', 'photo', 'photograph', 'look like', 'looks like']
  FRAME_KEYWORDS = ["frame"]
  WEBPAGE_KEYWORDS = ["webpage", "link", "website", "page"]
  OCCUPATION_KEYWORDS = ["occupation", "does for a living", "do for a living"]
  TYPE_KEYWORDS = ['genre']
  WD_ENTITY_PREFIX = "http://www.wikidata.org/entity/"

  def __init__(self, text):
    self.text = text

    self.processed_question = text
    self.intent = None
    self.entity = []
    self.predicates = None
    self.greet = False
    self.subclass = False

    self.check_greetings()
    if self.greet:
      return

    self.preprocess()

  def get_predicates(self, embedding):
    """Pick the predicate whose label fits the processed question best.

    The score is the embedding similarity minus a small penalty proportional
    to the normalised edit distance between label and question.

    Returns:
      A ``Predicate``; the fixed ``indirectSubclassOf`` predicate when
      ``self.subclass`` is set; ``None`` if the intent has no candidate labels.
    """
    if self.subclass:
      return Predicate("indirect subclass of", DDIS.indirectSubclassOf)

    max_score = -1e6
    predicate_lbl = None
    for lbl, lbl_embedding in PREDICATE_EMBDS[self.intent].items():
      penalty = 0.1*nltk.edit_distance(lbl, self.processed_question) / (len(self.processed_question) + len(lbl))
      score = lbl_embedding.similarity(embedding) - penalty
      if score > max_score:
        predicate_lbl = lbl
        max_score = score

    if predicate_lbl is None:
      return None
    return Predicate(predicate_lbl, PREDICATES[predicate_lbl])

  def preprocess(self):
    """Run entity linking, intent detection and predicate selection.

    Fills ``processed_question``, ``entity``, ``intent`` and ``predicates``.
    Returns early with ``greet`` set if nothing is left of the question after
    greetings and entity labels have been removed.
    """
    # Normalise the spelling of the two site names.
    for variant, canonical in (("IMDB", 'imdb'), ("IMDb", 'imdb'), ("Wikidata", 'wikidata'), ("WikiData", 'wikidata')):
      self.processed_question = self.processed_question.replace(variant, canonical)

    self.processed_question = self.remove_greetings(self.processed_question)
    self.processed_question = self.check_subclass(self.processed_question)

    # Entities are detected on the text with only the question words removed.
    self.entity = self.get_entity(self.remove_stopwords(self.processed_question, newStopWords))
    linked = [ent for ent in self.entity if ent is not None]
    for ent in linked:
      self.processed_question = self.remove_entities(self.processed_question, ent.label)

    self.check_greetings()
    if self.greet:
      return

    self.intent = self.get_intents(self.processed_question, QSTN_INTENT)  # first guess
    flag = True  # True while the first guess may still be overridden

    # Crowd data takes precedence: if a linked entity occurs in it, answer from there.
    # Unlinked (None) entries are skipped; they used to raise AttributeError here.
    prefixed_ids = {ent.uri.replace(self.WD_ENTITY_PREFIX, "wd:") for ent in linked}
    bare_ids = {ent.uri.replace(self.WD_ENTITY_PREFIX, "") for ent in linked}
    if linked:
      for dic in crowdList:
        if (dic["Input1ID"] in prefixed_ids or dic["Input3ID"] in prefixed_ids
            or dic["FixValue"] in prefixed_ids or dic["FixValue"] in bare_ids):
          self.intent = "crowdsourcing"
          flag = False
          break

    lowered = self.processed_question.lower()

    # A keyword such as "poster" may have been swallowed into the entity label
    # and thus removed from the question, so the first label is checked too.
    # NOTE(review): as in the original expression, a keyword inside the label
    # selects 'images' even when an earlier rule already fixed the intent.
    first_label = self.entity[0].label if (self.entity and self.entity[0] is not None) else ""
    if any((w in lowered and flag) or (w in first_label) for w in self.IMAGE_KEYWORDS):
      self.intent = 'images'
      flag = False
    if flag and any(w in lowered for w in self.FRAME_KEYWORDS):
      self.intent = 'frame'
      flag = False
    if flag and any(w in lowered for w in self.WEBPAGE_KEYWORDS):
      self.intent = 'wikidata' if "wikidata" in lowered else "imdb"
      flag = False
    if flag and any(w in lowered for w in self.OCCUPATION_KEYWORDS):
      self.intent = 'description'
      flag = False
    if flag and any(w in lowered for w in self.TYPE_KEYWORDS):
      self.intent = 'type'
      flag = False

    self.processed_question = self.remove_stopwords(self.processed_question, stopwords)

    # The sentence embedding is only needed for predicate selection.
    if self.intent in ["person", "location", "time", "crowdsourcing"]:
      embedding = sentence_encoder(self.processed_question)
      self.predicates = self.get_predicates(embedding)

  def remove_punctuations(self, question):
    """Return ``question`` reduced to its word tokens, joined by single spaces."""
    return ' '.join(tokenizer.tokenize(question))

  def remove_greetings(self, question):
    """Return ``question`` without tokens that are (case-insensitively) in ``greetings``."""
    text_tokens = word_tokenize(question)
    return ' '.join(word for word in text_tokens if word.lower() not in greetings)

  def remove_entities(self, question, entity):
    """Remove the exact ``entity`` label from ``question`` and lower-case the result."""
    return question.replace(entity, "").lower()

  def remove_stopwords(self, question, sws):
    """Return ``question`` without tokens that are (case-insensitively) in ``sws``."""
    text_tokens = word_tokenize(question)
    return ' '.join(word for word in text_tokens if word.lower() not in sws)

  def get_entity(self, question_text):
    """Detect entity mentions in ``question_text`` and link them to URIs.

    Returns:
      List with one item per mention, longest mention first: an ``Entity`` or
      ``None`` when linking failed. ``[None]`` if no mention was found.
    """
    # Phrases the NER model is known to get wrong, with the tag to use instead.
    nerErrors = {"Conan the Barbarian":"MISC", "written work":"O"}

    # If a known phrase occurs in the text, it replaces the NER output entirely.
    mentions = [(phrase, tag) for phrase, tag in nerErrors.items() if phrase in question_text]
    if not mentions:
      for found in ner(question_text, aggregation_strategy="simple"):
        word = found['word']
        mentions.append((word, nerErrors.get(word, found['entity_group'])))

    if not mentions:
      return [None]

    return_list = []
    # Ascending stable sort, then reversed: longest mention first.
    for label, tag in reversed(sorted(mentions, key=lambda mention: len(mention[0]))):
      uri = self.get_entity_uri(label.lower(), tag)  # e.g. batman, MISC
      return_list.append(None if uri is None else Entity(label, uri, tag))
    return return_list

  def get_entity_uri(self, entity_lbl, entity_clss):
    """Resolve a lower-cased mention to a URI.

    Order of resolution:
      1. ``linkErrorDict`` overrides (substring match on the mention).
      2. Closest label match among humans (class ``"PER"``) or films (any
         other class). The best match is combined with all matches ending in
         "0", which is how the label index marks duplicate names.
      3. For films with several candidates: a candidate that occurs in the
         crowd data wins; otherwise the one with the most awards (P166);
         otherwise the earliest publication date (P577); otherwise the first
         candidate.

    Returns:
      The URI, or ``None`` when no label is close enough.
    """
    for key, uri in linkErrorDict.items():
      if key in entity_lbl:
        return uri

    if entity_clss == "PER":
      LBL, DICT = HUMANS_LBL, HUMANS_DICT
    else:
      LBL, DICT = FILM_LBL, FILM_DICT

    match = difflib.get_close_matches(entity_lbl, LBL, n=10)
    if not match:
      # Must be checked before match[0] is read (this used to raise IndexError).
      return None

    match_list = [match[0]] + [el for el in match if el.endswith("0")]
    if len(match_list) == 1 or entity_clss == "PER":
      return DICT[match[0]]

    # Prefer the candidates that contain the mention literally.
    narrowed = [el for el in match_list if entity_lbl in el]
    if narrowed:
      match_list = narrowed

    crowd_ids = set()
    for dic in crowdList:
      crowd_ids.add(dic["Input1ID"])
      crowd_ids.add(dic["Input3ID"])

    awardDict = {}
    yearDict = {}
    for el in match_list:
      candidate = DICT[el]
      if candidate.replace(self.WD_ENTITY_PREFIX, "wd:") in crowd_ids:
        return candidate
      awardDict[el] = list(graph.objects(candidate, rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P166')))
      for obj in graph.objects(candidate, rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P577')):
        yearDict[el] = obj

    if any(awardDict.values()):
      return DICT[max(awardDict, key=lambda k: len(awardDict[k]))]
    if yearDict:
      return DICT[min(yearDict, key=yearDict.get)]
    # No award and no date information: fall back to the first candidate.
    return DICT[match_list[0]]

  def get_intents(self, question, intents):
    """Return the top-ranked label of the zero-shot classifier for ``question``."""
    ranking = zero_shot(question, intents)
    return ranking['labels'][0]

  def check_greetings(self):
    """Set ``self.greet`` for greetings, very short inputs and empty processed text."""
    if self.text.lower() in greetings or len(self.text) < 7:
      self.greet = True
      return
    if len(self.processed_question) == 0:
      self.greet = True
      return

  def check_subclass(self, question):
    """Flag subclass questions and strip the words "subclass" / "indirect".

    When the raw text mentions "subclass", ``self.subclass`` is set and the
    returned question is lower-cased; otherwise ``question`` is returned as is.
    """
    if "subclass" not in self.text.lower():
      return question

    self.subclass = True
    processed_question = question.replace('subclass', "").lower()
    if "indirect" in self.text.lower():
      processed_question = processed_question.replace('indirect', "").lower()
    return processed_question

  # For debugging the Question class
  def __str__(self):
    out = ""
    if self.processed_question is not None:
      out = "Processed Question: " + self.processed_question + "\n"

    if self.entity is not None:
      for ent in self.entity:
        if ent is not None:
          out += str(ent)
          out += "\n"

    if self.predicates is not None:
      out += str(self.predicates)
      out += "\n"
    if self.intent is not None:
      out += "Intent: " + self.intent
    return out if out != "" else "None"


## Get entity list with the closest match

In [10]:
def get_entity_uri_list(entity_lbl, entity_clss):
  """Return the URIs of up to 25 labels that contain ``entity_lbl``.

  The search is a case-insensitive substring match over the human labels when
  ``entity_clss`` is ``"PER"`` and over the film labels otherwise. Results
  follow the order of the label list; an empty list means no label matched.
  """
  if entity_clss == "PER":
    labels, index = HUMANS_LBL, HUMANS_DICT
  else:
    labels, index = FILM_LBL, FILM_DICT

  uris = []
  for candidate in labels:
    if entity_lbl.lower() in candidate.lower():
      uris.append(index[candidate])
      if len(uris) >= 25:  # cap the number of results
        break
  return uris

# Chatbot Class

In [40]:
class ChatBot:
  """Front end of the bot: parses a question and routes it to an answer method.

  The answer methods referenced here (``get_images``, ``get_rating``, ...) are
  not defined in this class body; later cells attach them with
  ``%%add_to ChatBot``.

  Attributes:
    prev_entity: First entity of the most recent question that had one, or
      ``None``.
  """

  def __init__(self):
    self.prev_entity = None

  def process_question(self, question_text):
    """Parse ``question_text`` into a ``Question``, print it, and return it."""
    parsed = Question(question_text)
    print(parsed)
    return parsed

  def get_answer(self, question_text):
    """Answer ``question_text`` and return the reply as a string."""
    question = self.process_question(question_text)
    if question.greet:
      return random.choice(return_greetings)

    # intent -> name of the method that answers it
    handler_names = {
      "images": "get_images",
      "frame": "get_frame",
      "wikidata": "get_wikidata",
      "imdb": "get_imdb",
      "recommendations": "get_recommendation",
      "rating": "get_rating",
      "description": "get_description",
      "crowdsourcing": "get_answer_from_crowdsourcing",
      "type": "get_genre",
      "person": "get_answer_from_graph",
      "location": "get_answer_from_graph",
      "time": "get_dates",
    }
    intent = question.intent

    # Two entities plus a predicate are read as a yes/no question, unless the
    # intent is crowdsourcing or recommendations.
    is_yes_no = (
      len(question.entity) > 1
      and question.predicates != None
      and intent != "crowdsourcing"
      and intent != "recommendations"
    )

    if is_yes_no:
      holds = self.assertion_statements(question.entity[0], question.entity[1], question.predicates)
      ans = "Yes" if holds else "No, but hey, I was not trained for yes/no questions :)"
    elif intent in handler_names:
      ans = getattr(self, handler_names[intent])(question)
    else:
      ans = "Could you please rephrase your question?"

    # Remember the entity so that follow-up questions can refer back to it.
    if question.entity[0] != None:
      self.prev_entity = question.entity[0]
    if ans == None or ans == "None":
      ans = "I'm sorry, I don't know that. But you can ask me something like: What does Julia Roberts look like? (She is georgous)"

    return str(ans)

## Get_recommendation

In [41]:
%%add_to ChatBot

def get_recommendation(self, question):
  """Recommend up to four titles based on the entities in ``question``.

  Every entity contributes a list of ``{label: distance}`` candidates, chosen
  by its class (person, location or anything else). A mention that could not
  be linked falls back to ``self.prev_entity``.

  Returns:
    A sentence opening with a random phrase from ``crazy_array`` followed by
    the best labels, or ``None`` if an entity is missing and there is no
    previous entity to fall back on.
  """
  recommendation_array = []

  for mention in question.entity:
    if mention is not None:
      entity = mention
    elif self.prev_entity is not None:
      entity = self.prev_entity
    else:
      return None

    if entity.cls == "PER":
      recommendation_array.extend(self.get_per_recommendation(entity))
    elif entity.cls == "LOC":
      recommendation_array.extend(self.get_loc_recommendation(entity))
    else:
      recommendation_array.extend(self.get_movie_recommendation(entity))

  # Average the distance per label. A label proposed more than once gets 300
  # subtracted per repetition, which moves it ahead in the ascending sort.
  finalD = {}
  numberSameKey = {}
  for candidate in recommendation_array:
    label, score = next(iter(candidate.items()))
    if label not in finalD:
      finalD[label] = score
      numberSameKey[label] = 1
    else:
      finalD[label] = finalD[label] + score - 300
      numberSameKey[label] += 1

  for label in finalD:
    finalD[label] = finalD[label] / numberSameKey[label]

  sorted_mydict = dict(sorted(finalD.items(), key=lambda item: item[1]))

  # Labels containing one of these substrings are dropped from the result.
  unwanted_matches = ["universe"]
  for mention in question.entity:
    if mention is None:
      continue  # unlinked mention has no label (used to raise AttributeError)
    # Never recommend the title that was asked about, nor near-identical labels.
    sorted_mydict.pop(mention.label, None)
    unwanted_matches.extend(difflib.get_close_matches(mention.label, sorted_mydict.keys(), n=10, cutoff=0.9))

  # Build the filtered list in one pass. Deleting inside the nested loop raised
  # KeyError whenever a label matched two unwanted patterns.
  recommendation_array = [
    label for label in sorted_mydict
    if not any(unwanted in label for unwanted in unwanted_matches)
  ]
  output = random.choice(crazy_array) +', '.join(str(item) for item in recommendation_array[:4])
  return output

## Get_loc_recommendation

In [42]:
%%add_to ChatBot

def get_loc_recommendation(self, entity):
  """Return the four embedding neighbours of ``entity - P915`` as ``{label: distance}`` dicts.

  Before the look-up, ``entity.uri`` is replaced (and the new URI printed) by
  the first subject that carries the entity's English label and is an
  instance (P31) of Q6256 or Q515.
  """
  label_literal = rdflib.term.Literal(entity.label, lang='en')
  for subject, _, _ in graph.triples((None, RDFS.label, label_literal)):
    if (subject, WDT.P31, WD.Q6256) in graph or (subject, WDT.P31, WD.Q515) in graph:
      entity.uri = subject
      print("Updated entity", subject)
      break

  # Translate the entity vector backwards along the P915 relation.
  query_vector = entity_emb[ent2id[entity.uri]] - relation_emb[rel2id[WDT.P915]]
  dist = pairwise_distances(query_vector.reshape(1, -1), entity_emb).reshape(-1)
  nearest = dist.argsort()[:4]

  table = pd.DataFrame(
    [(id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], rank+1) for rank, idx in enumerate(nearest)],
    columns=('Entity', 'Label', 'Score', 'Rank'))

  return [{label: score} for label, score in zip(list(table['Label']), list(table['Score']))]

## Get_movie_recommendation

In [43]:
# legacy code
#%%add_to ChatBot

#def get_movie_recommendation(self, entity):

  # WDT.P136 = Genre
  # WD.P144  = Based on
  # WD.P31   = Instance of
  # WD.P577 = Publication Date
  # temp = []
  # head = entity_emb[ent2id[entity.uri]]
  # for el in [WDT.P136,WDT.P144,WDT.P31]:
  #   pred = relation_emb[rel2id[el]]
  #   lhs = head - pred
  #   dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
  #   most_likely = dist.argsort()


  #   data = pd.DataFrame([
  #     (
  #       id2ent[idx][len(WD):], # qid
  #       ent2lbl[id2ent[idx]],  # label
  #       dist[idx],             # score
  #       rank+1,                # rank
  #     )
  #     for rank, idx in enumerate(most_likely[:20])],
  #     columns=('Entity', 'Label', 'Score', 'Rank'))
  #   ans = list(data['Label'])[1:]
  #   temp = temp + ans
  # return temp

In [44]:
%%add_to ChatBot

def get_movie_recommendation(self, entity):
  """Return the 20 nearest embedding neighbours of ``entity`` as ``{label: distance}`` dicts.

  Neighbours are ranked by their distance to the entity's own embedding, so
  the queried entity is presumably the first hit (distance 0).
  """
  query_vector = entity_emb[ent2id[entity.uri]]
  dist = pairwise_distances(query_vector.reshape(1, -1), entity_emb).reshape(-1)
  nearest = dist.argsort()[:20]

  rows = [
    (id2ent[idx][len(WD):],   # qid
     ent2lbl[id2ent[idx]],    # label
     dist[idx],               # score
     rank+1)                  # rank
    for rank, idx in enumerate(nearest)
  ]
  table = pd.DataFrame(rows, columns=('Entity', 'Label', 'Score', 'Rank'))

  return [{label: score} for label, score in zip(list(table['Label']), list(table['Score']))]

## Get_per_recommendation

In [45]:
%%add_to ChatBot

def get_per_recommendation(self, entity):
  """Return the four embedding neighbours of ``person + relation`` as ``{label: distance}`` dicts.

  The relation is P161 when the person's ``schema:description`` mentions
  "actor" or "actress", and P57 otherwise. A person without any description
  is treated like the P57 case instead of raising IndexError.
  """
  descriptions = [o for s,p,o in graph.triples((entity.uri, SCHEMA.description, None))]
  description = descriptions[0].toPython() if descriptions else ""

  predicate = WDT.P57 # director
  if("actor" in description or "actress" in description):
    predicate = WDT.P161 # actors and actresses

  head = entity_emb[ent2id[entity.uri]]
  pred = relation_emb[rel2id[predicate]]
  lhs = head + pred
  dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
  most_likely = dist.argsort()

  ans = pd.DataFrame([
      (id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], rank+1)
      for rank, idx in enumerate(most_likely[:4])],
      columns=('Entity', 'Label', 'Score', 'Rank'))

  return [{label: score} for label, score in zip(list(ans['Label']), list(ans['Score']))]

## Get_rating

In [46]:
%%add_to ChatBot
def get_rating(self, question):
  """Answer a rating question about `question.entity[0]`.

  Candidate labels for relation `WDT.P1657` come from
  `self.get_answer_from_embeddings`; the first candidate that is a key of
  `all_ratings` is reported through its `all_ratings` value.

  Returns:
    None when the question has no first entity; otherwise a sentence with the
    rating, or an apology containing a Google search link when no candidate is
    a known rating.
  """
  entity = question.entity[0]
  if entity is None:
    return None

  candidates = self.get_answer_from_embeddings(entity, WDT.P1657)
  ratings = [all_ratings[label] for label in candidates if label in all_ratings]
  if len(ratings) > 0:
    return f"The rating of {entity.label} is " + ratings[0]

  search_query = urllib.parse.urlencode({"q": f"rating of {entity.label}"})
  google_search_url = "https://www.google.com/search?" + search_query
  return f"Sorry I haven't found the rating of {entity.label}, maybe you'll find something here: {google_search_url}"

## Get_genre

In [47]:
%%add_to ChatBot
def get_genre(self, question):
  """Answer a genre question about `question.entity[0]`.

  Candidate labels for relation `WDT.P136` come from
  `self.get_answer_from_embeddings`; the first three are reported.

  Returns:
    None when the question has no first entity; otherwise a sentence listing
    up to three genres, or an apology containing a Google search link when no
    candidate was returned.
  """
  entity = question.entity[0]
  if entity is None:
    return None

  genres = self.get_answer_from_embeddings(entity, WDT.P136)
  if len(genres) == 0:
    search_query = urllib.parse.urlencode({"q": f"genre of {entity.label}"})
    google_search_url = "https://www.google.com/search?" + search_query
    return f"Sorry I haven't found the genre of {entity.label}, maybe you'll find something here: {google_search_url}"

  return f"The genre of {entity.label} is " + ", ".join(genres[:3])

## Get_description

In [48]:
%%add_to ChatBot

def get_description(self, question):
  """Return the first `SCHEMA.description` of `question.entity[0]` as a string.

  Returns:
    The description converted with `toPython()` and `str()`, or None when the
    question has no first entity or the graph holds no description for it.
  """
  entity = question.entity[0]
  if entity is None:
    return None

  # Only the first description triple is used.
  for _, _, description in graph.triples((entity.uri, SCHEMA.description, None)):
    return str(description.toPython())
  return None

## Get_dates

In [49]:
%%add_to ChatBot

def get_dates(self, question):
  """Query the graph with `SPARQL_TEMPLATE` and return one value of the first row.

  The entity is `question.entity[0]`, or `self.prev_entity` when the question
  carries none. The template is filled with the question's predicate URI and
  the entity URI in N3 form.

  Returns:
    None when no entity is available; otherwise the element at index 1 of the
    first result row.

  Raises:
    IndexError: if the query returns no rows.
  """
  entity = question.entity[0]
  if entity is None:
    entity = self.prev_entity
    if entity is None:
      return None

  sparql = SPARQL_TEMPLATE.format(
      predicate_uri=question.predicates.uri,
      entity_uri=entity.uri.n3(),
  )
  rows = list(graph.query(sparql))
  return rows[0][1]

## Get_images

In [50]:
%%add_to ChatBot

def get_images(self, question):
  """Return an `image:` reply for the question's entity.

  The entity is `question.entity[0]`, or `self.prev_entity` when the question
  carries none. Its `WDT.P345` value is searched in `IMAGES`: entities of
  class "PER" match images of type "event" through the image's 'cast' field,
  all other entities match images of type "poster" through the 'movie' field.

  Returns:
    None when no entity is available. Otherwise a string: for the first
    matching image, "Here is an image of <label> image:" followed by the image
    path with everything from its first '.' removed; an apology when the
    entity has no `WDT.P345` value; or an apology with a Google search link
    when no image matches.
  """
  entity = question.entity[0]
  if entity is None:
    entity = self.prev_entity
  if entity is None:
    return None

  identifiers = [o for s, p, o in graph.triples((entity.uri, WDT.P345, None))]
  if len(identifiers) == 0:
    return "Sorry I cannot find the image"
  identifier = identifiers[0].toPython()

  if entity.cls == "PER":
    img_type, entity_type = "event", "cast"
  else:
    img_type, entity_type = "poster", "movie"

  for image in IMAGES:
    if identifier in image[entity_type] and image['type'] == img_type:
      # partition() keeps the whole path when it contains no '.'; the former
      # slice up to find('.') == -1 silently dropped the last character.
      return f"Here is an image of {entity.label} image:" + image['img'].partition('.')[0]

  google_helper = urllib.parse.urlencode({"q": f"image of {entity.label}"})
  google_search_url = "https://www.google.com/search?" + google_helper
  return f"Sorry I haven't found an image of {entity.label}, maybe you'll find something here: {google_search_url}"

## Get_imdb

In [51]:
%%add_to ChatBot

def get_imdb(self, question):
  """Return an `imdb:` reply built from the entity's `WDT.P345` value.

  The entity is `question.entity[0]`, or `self.prev_entity` when the question
  carries none.

  Returns:
    None when no entity is available; otherwise the IMDb reply for the first
    `WDT.P345` value, or an apology with a Google search link when the graph
    holds no such value.
  """
  entity = question.entity[0]
  if entity is None:
    entity = self.prev_entity
    if entity is None:
      return None

  identifiers = [o for s, p, o in graph.triples((entity.uri, WDT.P345, None))]
  if len(identifiers) > 0:
    return "Here is the IMDb page you're looking for imdb:" + identifiers[0].toPython()

  search_query = urllib.parse.urlencode({"q": f"imdb of {entity.label}"})
  google_search_url = "https://www.google.com/search?" + search_query
  return f"Sorry I haven't found the webpage of {entity.label}, maybe you'll find something here: {google_search_url}"

## Get_wikidata

In [52]:
%%add_to ChatBot

def get_wikidata(self, question):
  """Return a `wd:` reply for the question's entity.

  The entity is `question.entity[0]`, or `self.prev_entity` when the question
  carries none. The identifier is the slice of the URI's N3 form from its
  first 'Q' up to, but excluding, the last character.

  Returns:
    None when no entity is available, otherwise the reply string.
  """
  entity = question.entity[0]
  if entity is None:
    entity = self.prev_entity
    if entity is None:
      return None

  n3_uri = entity.uri.n3()
  qid = n3_uri[n3_uri.find('Q'):-1]
  return "Here is the wikidata page you are looking for wd:" + qid

## Get_frame

In [53]:
%%add_to ChatBot

def get_frame(self, question):
  """Return an `image:` reply with a still frame for the question's entity.

  The entity is `question.entity[0]`, or `self.prev_entity` when the question
  carries none. Entities of class 'PER' are delegated to `self.get_images`.
  For all others the entity's `WDT.P345` value is searched in the 'movie'
  field of the `IMAGES` entries of type "still_frame".

  Returns:
    None when no entity is available. Otherwise a string: for the first
    matching image, "Here is the still frame image:" followed by the image
    path with everything from its first '.' removed; or an apology with a
    Google search link when the entity has no `WDT.P345` value or no image
    matches.
  """
  entity = question.entity[0]
  if entity is None:
    entity = self.prev_entity
  if entity is None:
    return None

  if entity.cls == 'PER':
    return self.get_images(question)

  # Single definition of the fallback that was previously spelled out twice.
  google_helper = urllib.parse.urlencode({"q": f"image of {entity.label}"})
  google_search_url = "https://www.google.com/search?" + google_helper
  not_found = f"Sorry I haven't found an image of {entity.label}, maybe you'll find something here: {google_search_url}"

  identifiers = [o for s, p, o in graph.triples((entity.uri, WDT.P345, None))]
  if len(identifiers) == 0:
    return not_found
  identifier = identifiers[0].toPython()

  for image in IMAGES:
    if identifier in image["movie"] and image['type'] == "still_frame":
      # partition() keeps the whole path when it contains no '.'; the former
      # slice up to find('.') == -1 silently dropped the last character.
      return "Here is the still frame image:" + image['img'].partition('.')[0]

  return not_found

## Get_answer_from_embeddings

In [54]:
%%add_to ChatBot

def get_answer_from_embeddings(self, entity, predicate):
  """Return the labels of the entities nearest to `entity + predicate`.

  The relation embedding of `predicate` is added to the embedding of
  `entity`; the distances from that point to every row of `entity_emb` are
  computed with `pairwise_distances`.

  Args:
    entity: object whose `uri` attribute is a key of `ent2id`.
    predicate: key of `rel2id`.

  Returns:
    The labels of the (up to) ten nearest entities, nearest first.
  """
  target = entity_emb[ent2id[entity.uri]] + relation_emb[rel2id[predicate]]
  distances = pairwise_distances(target.reshape(1, -1), entity_emb).reshape(-1)
  return [ent2lbl[id2ent[idx]] for idx in distances.argsort()[:10]]

## Get_answer_from_graph

In [55]:
%%add_to ChatBot

def get_answer_from_graph(self, question):
  """Answer a question from the graph, falling back to the embeddings.

  The entity is `question.entity[0]`, or `self.prev_entity` when the question
  carries none. A "LOC" entity combined with the "filming location" predicate
  is delegated to `self.get_loc_recommendation`. Otherwise the objects of
  (entity, predicate) are read from the graph and their `RDFS.label` values
  are reported: three for "actor"/"cast member", else one. When the graph has
  no such object, the first three answers of
  `self.get_answer_from_embeddings` are used instead.

  Returns:
    None when no entity is available, otherwise the reply string.
    NOTE(review): graph objects without an `RDFS.label` contribute nothing,
    so the reply can be a bare "My answer is " -- confirm this is intended.
  """
  entity = question.entity[0]
  if entity is None:
    entity = self.prev_entity
    if entity is None:
      return None

  predicate = question.predicates
  if entity.cls == "LOC" and predicate.label == "filming location":
    return self.get_loc_recommendation(entity)

  num_answers = 3 if predicate.label in ("actor", "cast member") else 1

  objects = [o for s, p, o in graph.triples((entity.uri, predicate.uri, None))]
  if objects:
    answers = [
        label.toPython()
        for obj in objects
        for _, _, label in graph.triples((obj, RDFS.label, None))
    ]
  else:
    num_answers = 3
    answers = self.get_answer_from_embeddings(entity, predicate.uri)[:num_answers]

  return "My answer is " + ", ".join(answers[:num_answers])

## Get_answer_from_crowdsourcing

In [56]:
%%add_to ChatBot

def get_answer_from_crowdsourcing(self, question):
  """Answer a question from the crowd-sourced records in `finalCrowdList`.

  Entity and predicate URIs are rewritten to the prefixed ids used by the
  records ("wd:", "wdt:", "ddis:"). A record matches when
    * its predicate equals the question's predicate and one of the question's
      entities is its subject (answer: the object) or its object (answer: the
      subject), or
    * the question's first entity and one of its later entities are the
      record's subject and object in either order (answer: the predicate).
  When several records match, the last one wins.

  Returns:
    An apology when nothing matches; otherwise a sentence with the answer
    (entity ids replaced by their `ent2lbl` label, known "wdt:" ids by their
    `PREDICATES` label, "ddis:" ids by "indirect subclass"), the batch's
    value from `fleissKappaDict` rounded to two decimals, and the record's
    vote counts.
  """
  # Entities that were not recognised (None) are skipped instead of crashing.
  entity_ids = [
      ent.uri.replace("http://www.wikidata.org/entity/", "wd:")
      for ent in question.entity if ent is not None
  ]
  if not entity_ids:
    return "Sorry, I don't have the needed data for that"

  # Loop-invariant, so computed once instead of per record and entity.
  predicate_uri = question.predicates.uri
  predicate_ids = (
      predicate_uri.replace("http://www.wikidata.org/prop/direct/", "wdt:"),
      predicate_uri.replace("http://ddis.ch/atai/", "ddis:"),
  )

  ans = ""
  matched = None  # record the current answer was taken from
  first_id = entity_ids[0]
  for record in finalCrowdList:
    subject_id, object_id = record["Input1ID"], record["Input3ID"]
    predicate_matches = record["Input2ID"] in predicate_ids
    for position, entity_id in enumerate(entity_ids):
      if predicate_matches and subject_id == entity_id:
        ans, matched = object_id, record
      elif predicate_matches and object_id == entity_id:
        ans, matched = subject_id, record
      # From the second entity on: does the record link it to the first one?
      if position > 0 and (
          (subject_id == first_id and object_id == entity_id)
          or (subject_id == entity_id and object_id == first_id)):
        ans, matched = record["Input2ID"], record

  if ans == "":
    return "Sorry, I don't have the needed data for that"

  if ans.startswith("wd:"):
    link = ans.replace("wd:", "http://www.wikidata.org/entity/")
    ans = ent2lbl[rdflib.term.URIRef(link)]
  elif ans.startswith("wdt:"):
    link = ans.replace("wdt:", "http://www.wikidata.org/prop/direct/")
    pred_lbl = [i for i in PREDICATES if PREDICATES[i] == rdflib.term.URIRef(link)]
    if len(pred_lbl) > 0:
      ans = pred_lbl[0]  # label of predicate
    # Otherwise keep the prefixed id: the former `ans = temp` referenced an
    # undefined name and raised NameError.
  elif ans.startswith("ddis:"):
    ans = "indirect subclass"

  agreement = round(fleissKappaDict[matched["HITTypeId"]], 2)
  return (
      "My answer is " + ans
      + " - according to the crowd, who had an inter-rater agreement of {} in this batch.".format(agreement)
      + "\nThe answer distribution for this specific task was {} support votes and {} reject vote (If it has more reject votes than support votes, then the answer is the fix value from the crowd).".format(matched["numCorrect"], matched["numIncorrect"])
  )

## Assertion_statements

In [57]:
%%add_to ChatBot

def assertion_statements(self, entity1, entity2, predicate):
  """Check whether the graph links the two entities through the predicate.

  If the predicate's label belongs to a group of `CLASSIFIED_PREDICATES`
  (the "crowdsourcing" group is ignored; with several matching groups the
  last one wins), every predicate of that group is tried, resolved through
  `PREDICATES`. Otherwise only `predicate.uri` is tried. Both directions
  (entity1 -> entity2 and entity2 -> entity1) count as a match.

  Returns:
    True if any such triple is in the graph, else False.
  """
  classified_key = ""
  for key in CLASSIFIED_PREDICATES:
    if key != "crowdsourcing" and predicate.label in CLASSIFIED_PREDICATES[key]:
      classified_key = key  # no break: the last matching group wins

  if classified_key != "":
    # Generator, so PREDICATES is only consulted until the first hit.
    candidate_uris = (PREDICATES[label] for label in CLASSIFIED_PREDICATES[classified_key])
  else:
    candidate_uris = (predicate.uri,)

  # The leftover debug print("YES") of the first branch has been removed.
  return any(
      (entity1.uri, uri, entity2.uri) in graph
      or (entity2.uri, uri, entity1.uri) in graph
      for uri in candidate_uris)

# Main

In [29]:
# Question & Answer

# Old ones
# Legacy question -> expected answer pairs. In this part of the notebook they
# are only referenced by the test loop that is disabled inside a string literal.
Test_Questions = {
    "Who is the director of the Batman movie?":"Tim Burton",  
    "Did Christopher Nolan ever work on a Batman movie?":"Yes",  
    "What is the name of the lead actor in the movie Catch Me If You Can?":"Nathalie Baye, Frank Abagnale, Gerald R. Molen",  
    "I like the Jurassic Park movie; can you recommend any similar movies?":"The Lost World: Jurassic Park, Jurassic Park III, Jurassic World: Dominion, Top Gun",  
    "Show me the pictures of the lead actors of the movie Jurassic Park.":"image:2176/rm3044105728", 
    "Can you show me the poster of the movie Batman?":"image:1298/rm4086331136",  
    "Show me an action movie poster.":"image:1298/rm4086331136",
    "where does christopher nolan live":"idk",
    "can you recommend me some action movies?":"idk"
}

# Question -> expected answer pairs, one dict per question category. The test
# cells below print each expected string next to the chatbot's answer; nothing
# compares the two automatically.

# Questions whose expected answers name a film's director.
Factual_Questions = {
    "Who is the director of Good Will Hunting?": "Gus Van Sant is the director of Good Will Hunting.",
    "Who directed The Bridge on the River Kwai?": "David Lean directed The Bridge on the River Kwai.",
    "Who is the director of Star Wars: Episode VI - Return of the Jedi?": "I think it is Richard Marquand."
}

# Screenwriter, rating and genre questions.
# NOTE(review): presumably meant to be answered from the embeddings -- confirm.
Embedding_Questions = {
    "Who is the screenwriter of The Masked Gang: Cyprus?":"The screenwriter of The Masked Gang: Cyprus is Cengiz Küçükayvaz, Murat Aslan, and Melih Ekener.", 
    "What is the MPAA film rating of Weathering with You?":"The MPAA film rating of Weathering with You is PG-13.", 
    "What is the genre of Good Neighbors?":"The genre of Good Neighbors is drama, comedy-drama, and comedy film."
}

# Picture requests; the expected value is an "image:<path>" token.
Multimedia_Questions = {
    "Show me a picture of Halle Berry.":"image:0353/rm3257480192",
    "What does Julia Roberts look like?":"image:3739/rm3651656960", 
    "Let me know what Sandra Bullock looks like.":"image:0453/rm2393611008"
}

# Recommendation requests; the expected value describes what an adequate
# recommendation looks like rather than giving an exact answer.
Recommendation_Questions = {
    "Recommend movies similar to Hamlet and Othello.":"Adequate recommendations will be movies in the drama genre that are based on classic literature (e.g., Shakespeare, Dickens, or Jane Austen).", 
    "Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?":"Adequate recommendations will be (2-D) animated movies or real-life remakes of Disney movies.", 
    "Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.":"Adequate recommendations will be horror movies from the 1970s or 1980s or sequels to the movies."
}

# Box office, publication date and executive producer questions.
# NOTE(review): presumably meant to be answered from the crowd data -- confirm.
Crowdsourcing_Questions = {
    "What is the box office of The Princess and the Frog?":"The box office of The Princess and the Frog is 267000000.", 
    "Can you tell me the publication date of Tom Meets Zizou?":"The publication date of Tom Meets Zizou is 2011-01-01.", 
    "Who is the executive producer of X-Men: First Class?":"The executive producer is Sheryl Lee Ralph."
}

# Greeting inputs printed by one of the test cells below.
# NOTE(review): the name and the "ka alte" values look like throw-away
# placeholders -- confirm whether this dict and its test cell can be removed.
# The duplicated "Hi" key was dropped: a repeated key in a dict literal only
# overwrites the earlier entry, so the resulting dict is unchanged.
deleteme = {
    "Hello":"ka alte",
    "Hi":"ka alte",
    "Hallo":"ka alte",
    }

## Chatbot

In [58]:
# Chatbot instance that the test cells below query through chatbot.get_answer().
# DemoBot.listen does not use it: it creates its own ChatBot per chatroom.
chatbot = ChatBot()

## Test Questions

In [31]:
# Installs an "ignore" filter for every warning raised from here on
# (presumably to keep the test output below readable).
warnings.filterwarnings('ignore')

In [32]:
# Disabled: the loop over Test_Questions is wrapped in a string literal, so it
# is never executed; as the cell's last expression the string is only echoed.
"""
for k,v in Test_Questions.items():
  print("-----------------------------------------------------------------------")
  print("\n",f"- Question:   {k}\n - Answer:     {chatbot.get_answer(k)}\n - Expected:   {v}")
"""

'\nfor k,v in Test_Questions.items():\n  print("-----------------------------------------------------------------------")\n  print("\n",f"- Question:   {k}\n - Answer:     {chatbot.get_answer(k)}\n - Expected:   {v}")\n'

In [33]:
# Print the chatbot's answer next to the expected one for each factual question.
for test_question, expected_answer in Factual_Questions.items():
  print("---------------------------------------------------------------------")
  # get_answer() is called after the separator is printed, so anything it
  # prints itself appears below the separator.
  bot_answer = chatbot.get_answer(test_question)
  print(
      "\n",
      f"- Question:   {test_question}\n"
      f" - Answer:     {bot_answer}\n"
      f" - Expected:   {expected_answer}",
  )

---------------------------------------------------------------------
Processed Question: director ?
Entity Label: Good Will Hunting, Class: MISC, URI:http://www.wikidata.org/entity/Q193835
Predicate Label: director, , URI:http://www.wikidata.org/prop/direct/P57
Intent: person

 - Question:   Who is the director of Good Will Hunting?
 - Answer:     My answer is Gus Van Sant
 - Expected:   Gus Van Sant is the director of Good Will Hunting.
---------------------------------------------------------------------
Processed Question: directed ?
Entity Label: The Bridge on the River Kwai, Class: LOC, URI:http://www.wikidata.org/entity/Q188718
Predicate Label: director, , URI:http://www.wikidata.org/prop/direct/P57
Intent: person

 - Question:   Who directed The Bridge on the River Kwai?
 - Answer:     My answer is David Lean
 - Expected:   David Lean directed The Bridge on the River Kwai.
---------------------------------------------------------------------
Processed Question: director star wa

In [34]:
# Print the chatbot's answer next to the expected one for each embedding question.
for test_question, expected_answer in Embedding_Questions.items():
  print("---------------------------------------------------------------------")
  # get_answer() is called after the separator is printed, so anything it
  # prints itself appears below the separator.
  bot_answer = chatbot.get_answer(test_question)
  print(
      "\n",
      f"- Question:   {test_question}\n"
      f" - Answer:     {bot_answer}\n"
      f" - Expected:   {expected_answer}",
  )

---------------------------------------------------------------------
Processed Question: screenwriter masked gang cyprus ?
Entity Label: The Masked Gang Cyprus, Class: MISC, URI:http://www.wikidata.org/entity/Q7750525
Predicate Label: screenwriter, , URI:http://www.wikidata.org/prop/direct/P58
Intent: person

 - Question:   Who is the screenwriter of The Masked Gang: Cyprus?
 - Answer:     My answer is Cengiz Küçükayvaz, Murat Aslan, Melih Ekener
 - Expected:   The screenwriter of The Masked Gang: Cyprus is Cengiz Küçükayvaz, Murat Aslan, and Melih Ekener.
---------------------------------------------------------------------
Processed Question: mpaa film rating
Entity Label: Weathering with You ?, Class: MISC, URI:http://www.wikidata.org/entity/Q59692464
Entity Label: MPAA film, Class: ORG, URI:http://www.wikidata.org/entity/Q3012563
Intent: rating

 - Question:   What is the MPAA film rating of Weathering with You?
 - Answer:     The rating of Weathering with You ? is PG-13 - Parents

In [59]:
# Print the chatbot's answer next to the expected one for each multimedia question.
for test_question, expected_answer in Multimedia_Questions.items():
  print("-----------------------------------------------------------------------")
  # get_answer() is called after the separator is printed, so anything it
  # prints itself appears below the separator.
  bot_answer = chatbot.get_answer(test_question)
  print(
      "\n",
      f"- Question:   {test_question}\n"
      f" - Answer:     {bot_answer}\n"
      f" - Expected:   {expected_answer}",
  )

-----------------------------------------------------------------------
Processed Question: picture .
Entity Label: Halle Berry, Class: PER, URI:http://www.wikidata.org/entity/Q1033016
Intent: images
xnx [rdflib.term.Literal('nm0000932', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))]

 - Question:   Show me a picture of Halle Berry.
 - Answer:     Here is an image of Halle Berry image:0353/rm3257480192
 - Expected:   image:0353/rm3257480192
-----------------------------------------------------------------------
Processed Question: look like ?
Entity Label: Julia Roberts, Class: PER, URI:http://www.wikidata.org/entity/Q40523
Intent: images
xnx [rdflib.term.Literal('nm0000210', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))]

 - Question:   What does Julia Roberts look like?
 - Answer:     Here is an image of Julia Roberts image:3739/rm3651656960
 - Expected:   image:3739/rm3651656960
-----------------------------------------------------

In [36]:
# Print the chatbot's answer next to the expected one for each recommendation question.
for test_question, expected_answer in Recommendation_Questions.items():
  print("-----------------------------------------------------------------------")
  # get_answer() is called after the separator is printed, so anything it
  # prints itself appears below the separator.
  bot_answer = chatbot.get_answer(test_question)
  print(
      "\n",
      f"- Question:   {test_question}\n"
      f" - Answer:     {bot_answer}\n"
      f" - Expected:   {expected_answer}",
  )

-----------------------------------------------------------------------
Processed Question: recommend movies similar hamlet .
Entity Label: Othello, Class: MISC, URI:http://www.wikidata.org/entity/Q2634673
Entity Label: Hamlet, Class: MISC, URI:http://www.wikidata.org/entity/Q27178
Intent: recommendations

 - Question:   Recommend movies similar to Hamlet and Othello.
 - Answer:     Oh you should watch the following: Henry V, Richard III, Brief Encounter, Becket
 - Expected:   Adequate recommendations will be movies in the drama genre that are based on classic literature (e.g., Shakespeare, Dickens, or Jane Austen).
-----------------------------------------------------------------------
Processed Question: given like lion king , pocahontas , , recommend movies ?
Entity Label: The Beauty and the Beast, Class: MISC, URI:http://www.wikidata.org/entity/Q19946102
Entity Label: The Lion King, Class: MISC, URI:http://www.wikidata.org/entity/Q36479
Entity Label: Pocahontas, Class: MISC, URI:ht

In [37]:
# Print the chatbot's answer next to the expected one for each crowdsourcing question.
for test_question, expected_answer in Crowdsourcing_Questions.items():
  print("-----------------------------------------------------------------------")
  # get_answer() is called after the separator is printed, so anything it
  # prints itself appears below the separator.
  bot_answer = chatbot.get_answer(test_question)
  print(
      "\n",
      f"- Question:   {test_question}\n"
      f" - Answer:     {bot_answer}\n"
      f" - Expected:   {expected_answer}",
  )

-----------------------------------------------------------------------
Processed Question: box office ?
Entity Label: The Princess and the Frog, Class: MISC, URI:http://www.wikidata.org/entity/Q171300
Predicate Label: box office, , URI:http://www.wikidata.org/prop/direct/P2142
Intent: crowdsourcing

 - Question:   What is the box office of The Princess and the Frog?
 - Answer:     My answer is 267000000 - according to the crowd, who had an inter-rater agreement of 0.24 in this batch.
The answer distribution for this specific task was 2 support votes and 1 reject vote (If it has more reject votes than support votes, then the answer is the fix value from the crowd).
 - Expected:   The box office of The Princess and the Frog is 267000000.
-----------------------------------------------------------------------
Processed Question: tell publication date ?
Entity Label: Tom Meets Zizou, Class: MISC, URI:http://www.wikidata.org/entity/Q1410031
Predicate Label: publication date, , URI:http://w

In [38]:
# Print the chatbot's answer next to the placeholder value for each greeting.
for test_question, expected_answer in deleteme.items():
  print("-----------------------------------------------------------------------")
  # get_answer() is called after the separator is printed, so anything it
  # prints itself appears below the separator.
  bot_answer = chatbot.get_answer(test_question)
  print(
      "\n",
      f"- Question:   {test_question}\n"
      f" - Answer:     {bot_answer}\n"
      f" - Expected:   {expected_answer}",
  )

-----------------------------------------------------------------------
Processed Question: Hello


 - Question:   Hello
 - Answer:     Hi, human
 - Expected:   ka alte
-----------------------------------------------------------------------
Processed Question: Hi


 - Question:   Hi
 - Answer:     Moin
 - Expected:   ka alte
-----------------------------------------------------------------------
Processed Question: Hallo


 - Question:   Hallo
 - Answer:     Ni hao :)
 - Expected:   ka alte


# Speak Easy Interface

In [60]:
# Base URL of the Speakeasy server; the DemoBot methods append their
# "/api/..." paths to it.
url = 'https://speakeasy.ifi.uzh.ch'
# Passed to time.sleep() after every polling pass over the chatrooms in
# DemoBot.listen, i.e. the pause between polls in seconds.
listen_freq = 3

In [61]:
class DemoBot:
    """Polling client that connects ChatBot instances to Speakeasy chatrooms.

    On construction the bot logs in and registers `logout` to run at
    interpreter exit. `listen` then polls the server forever, greets every new
    chatroom, and answers each new message with the reply of that room's own
    `ChatBot`.
    """

    def __init__(self, username, password):
        """Log in and set up the per-room bookkeeping."""
        self.agent_details = self.login(username, password)
        self.session_token = self.agent_details['sessionToken']
        # Per room: messages already seen (keyed by ordinal), whether the
        # welcome message was sent, and the bot's own alias in that room.
        self.chat_state = defaultdict(lambda: {'messages': defaultdict(dict), 'initiated': False, 'my_alias': None})
        atexit.register(self.logout)

    def listen(self):
        """Poll all chatrooms forever and answer every new message.

        Never returns; sleeps `listen_freq` seconds between polling passes.
        """
        chatBotArray = {}  # room id -> ChatBot dedicated to that room
        while True:
            # check for all chatrooms
            current_rooms = self.check_rooms(session_token=self.session_token)['rooms']
            for room in current_rooms:
                # ignore finished conversations
                if not room['remainingTime'] > 0:
                    continue
                room_id = room['uid']
                state = self.chat_state[room_id]

                if not state['initiated']:
                    chatBotArray[room_id] = ChatBot()
                    # send a welcome message and get the alias of the agent in the chatroom
                    self.post_message(room_id=room_id, session_token=self.session_token, message='Hi, I am Rafaels bot, you can send me messages about movies & stuff :)')
                    state['initiated'] = True
                    state['my_alias'] = room['alias']

                chatbot = chatBotArray[room_id]
                # since=0: the complete history is fetched on every pass and
                # filtered below against the ordinals already seen.
                all_messages = self.check_room_state(room_id=room_id, since=0, session_token=self.session_token)['messages']

                for message in all_messages:
                    # skip the bot's own messages and messages already handled
                    if message['authorAlias'] == state['my_alias']:
                        continue
                    if message['ordinal'] in state['messages']:
                        continue
                    state['messages'][message['ordinal']] = message
                    # Prints message which was received in terminal
                    print('\t- Chatroom {} - new message #{}: \'{}\' - {}'.format(room_id, message['ordinal'], message['message'], self.get_time()))

                    try:
                        response = chatbot.get_answer(message["message"])
                    except Exception as err:
                        # Keep the conversation alive, but make the failure
                        # visible instead of swallowing it silently. Only
                        # Exception is caught, so Ctrl-C still stops the loop.
                        print('\t\t Error: failed to answer message: {!r}'.format(err))
                        response = "Unfortunately, I can't answer that, maybe you'll find something here: www.google.com"

                    self.post_message(room_id=room_id, session_token=self.session_token, message=response)

            time.sleep(listen_freq)

    def login(self, username: str, password: str):
        """Log in and return the decoded JSON response of the login endpoint."""
        agent_details = requests.post(url=url + "/api/login", json={"username": username, "password": password}).json()
        # NOTE(review): this prints the session token into the notebook output.
        print('- User {} successfully logged in with session \'{}\'!'.format(agent_details['userDetails']['username'], agent_details['sessionToken']))
        return agent_details

    def check_rooms(self, session_token: str):
        """Return the decoded JSON response of the rooms endpoint."""
        return requests.get(url=url + "/api/rooms", params={"session": session_token}).json()

    def check_room_state(self, room_id: str, since: int, session_token: str):
        """Return the decoded JSON state of one room, requested with `since`."""
        return requests.get(url=url + "/api/room/{}/{}".format(room_id, since), params={"roomId": room_id, "since": since, "session": session_token}).json()

    def post_message(self, room_id: str, session_token: str, message: str):
        """Post `message` to a room; on failure try to post an apology instead.

        Failures are printed, never raised.
        """
        endpoint = url + "/api/room/{}".format(room_id)
        params = {"roomId": room_id, "session": session_token}
        try:
            # A str body is encoded implicitly by the HTTP layer (ISO-8859-1
            # in http.client), which fails for characters outside Latin-1.
            # Sending UTF-8 bytes makes the encoding explicit.
            payload = message.encode('utf-8') if isinstance(message, str) else message
            tmp_des = requests.post(url=endpoint, params=params, data=payload).json()
            if tmp_des['description'] != 'Message received':
                print('\t\t Error: failed to post message: {}'.format(message))
        except Exception as err:
            print('\t\t Error: failed to post message: {!r}'.format(err))
            err_msg = "Ohh I can't answer that, I guess I still need some training... can you ask me something else?"
            try:
                requests.post(url=endpoint, params=params, data=err_msg.encode('utf-8')).json()
            except Exception as retry_err:
                # The apology is best effort; a second failure must not stop
                # the polling loop.
                print('\t\t Error: failed to post fallback message: {!r}'.format(retry_err))

    def get_time(self):
        """Return the local time formatted as "HH:MM:SS, DD-MM-YYYY"."""
        return time.strftime("%H:%M:%S, %d-%m-%Y", time.localtime())

    def logout(self):
        """Log out of the session and print a confirmation on success."""
        if requests.get(url=url + "/api/logout", params={"session": self.session_token}).json()['description'] == 'Logged out':
            print('- Session \'{}\' successfully logged out!'.format(self.session_token))



In [None]:
if __name__ == '__main__':
    username = 'rafael.dubach_bot'
    # Ask for the password at run time. A literal password in the notebook is
    # readable by everyone with access to the file or its history; the one
    # that used to be hard-coded here should be considered leaked and rotated.
    password = getpass.getpass('Password of the demo bot:')
    demobot = DemoBot(username, password)
    demobot.listen()

- User rafael.dubach_bot successfully logged in with session 'node015y6ukn2tg45sstui4ecqri2h2751813'!
	- Chatroom b9da01f0-3672-4736-8036-464b8b3365af - new message #1: 'Who is the director of Good Will Hunting?' - 17:19:19, 09-12-2022
Processed Question: director ?
Entity Label: Good Will Hunting, Class: MISC, URI:http://www.wikidata.org/entity/Q193835
Predicate Label: director, , URI:http://www.wikidata.org/prop/direct/P57
Intent: person
	- Chatroom b9da01f0-3672-4736-8036-464b8b3365af - new message #3: 'Who directed The Bridge on the River Kwai?' - 17:19:25, 09-12-2022
Processed Question: directed ?
Entity Label: The Bridge on the River Kwai, Class: LOC, URI:http://www.wikidata.org/entity/Q188718
Predicate Label: director, , URI:http://www.wikidata.org/prop/direct/P57
Intent: person
	- Chatroom b9da01f0-3672-4736-8036-464b8b3365af - new message #5: 'Who is the director of Star Wars: Episode VI - Return of the Jedi?' - 17:19:35, 09-12-2022
Processed Question: director star wars episo

In [None]:
# Disabled: wrapped in a string literal, so it is never executed.
# NOTE(review): it calls a module-level `logout` and reads `agent_details`,
# neither of which is defined in the visible part of this notebook; DemoBot
# already registers its own logout() with atexit.
"""
print("--- log out")
r = logout(session_token=agent_details["sessionToken"])
print(r.json())
"""