In [1]:
### Imports ###

import nltk
import re
import string
import os
import pandas as pd
import numpy as np
import ast

### Changing the work directory ###

"""
Kindly change your working directory here 
"""
os.chdir("D:\\UniSaarland\\HiWi_Fraunhofer\\incar-dialogue") # kindly chnage the directory here 

In [2]:
### Uploading Datasets ###

train_df = pd.read_csv("train.csv")
valid_df = pd.read_csv("valid.csv")
test_df = pd.read_csv("test.csv")

DFs = [train_df,valid_df,test_df]

In [3]:
### Utility functions for text cleaning ###

"""
These functions will help to clean the corpus.
---NOT USED ---
I am not using these functions here as we will have to clean the KG as well in order to use the clean utterances as
the entities in the kg have special characters and punctuations with them
"""

# def clean_text(x):
#     text = re.sub('(\d+)','',x)   
#     text = text.lower()
#     return text
# def remove_url(x):
#     text = re.sub('(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})\/([a-zA-Z0-9_]+]*)',' ',x)
#     return text
# def remove_punct(x):
#     text_without_puct = [t for t in x if t not in string.punctuation]
#     text_without_puct = ''.join(text_without_puct)
#     return text_without_puct

### Cleaning the dataframes ###

# for df in DFs:
    
#     df["utterance"] = df["utterance"].apply(clean_text)
#     df["utterance"] = df["utterance"].apply(remove_url)
#     df["utterance"] = df["utterance"].apply(remove_punct)
#     df["context"] = df["context"].apply(clean_text)
#     df["context"] = df["context"].apply(remove_url)
#     df["context"] = df["context"].apply(remove_punct)


'\nThese functions will help to clean the corpus.\n---NOT USED ---\nI am not using these functions here as we will have to clean the KG as well in order to use the clean utterances as\nthe entities in the kg have special characters and punctuations with them\n'

In [4]:
### Utility functions for triple extraction ###

"""
This is sourced from https://www.kaggle.com/rahulvks/knowledge-graphs-information-extraction
Note : After a lot of experiments around triple extractors with custom modifications, this seemed to be the best choice 
but it fails on longer texts very badly. 
"""


import spacy
import re
import string
from spacy.lang.en import English
import nltk
def getSentences(text):
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    document = nlp(text)
    return [sent.string.strip() for sent in document.sents]

def printToken(token):
    #print(token.text, "->", token.dep_)
    pass
def appendChunk(original, chunk):
    return original + ' ' + chunk
def isRelationCandidate(token):
    deps = ["ROOT", "adj", "attr", "agent", "amod"]
    return any(subs in token.dep_ for subs in deps)
def isConstructionCandidate(token):
    deps = ["compound", "prep", "conj", "mod"]
    return any(subs in token.dep_ for subs in deps)
def processSubjectObjectPairs(tokens):
    subject = ''
    object = ''
    relation = ''
    subjectConstruction = ''
    objectConstruction = ''
    for token in tokens:
       # printToken(token)
        if "punct" in token.dep_:
            continue
        if isRelationCandidate(token):
            relation = appendChunk(relation, token.lemma_)
        if isConstructionCandidate(token):
            if subjectConstruction:
                subjectConstruction = appendChunk(subjectConstruction, token.text)
            if objectConstruction:
                objectConstruction = appendChunk(objectConstruction, token.text)
        if "subj" in token.dep_:
            subject = appendChunk(subject, token.text)
            subject = appendChunk(subjectConstruction, subject)
            subjectConstruction = ''
        if "obj" in token.dep_:
            object = appendChunk(object, token.text)
            object = appendChunk(objectConstruction, object)
            objectConstruction = ''
            
    #print (subject.strip(), ",", relation.strip(), ",", object.strip())
    return [subject.strip(), relation.strip(), object.strip()]


def processSentence(sentence):
    nlp_model = spacy.load('en_core_web_sm')
    tokens = nlp_model(sentence)
    return processSubjectObjectPairs(tokens)

In [5]:
def match(list1, list2):
    """
    Utility function for PredictEntities
    Input : knowledge-graph's triples and extracted triples from the paragraph
    Returns : Predicted triples 
    """
    
    return [l1 for l1 in list1 if any(l2[0]==l1[0] for l2 in list2)]

In [6]:
def PredictTriples(dataframe):
    """
    This is the main function that performs the triple predictions after extracting triples from the utterances 
    Input : The dataframe from which triples are to be extracted an is to be evaluated
    Returns : Predicted triples from each row of the dataframe 
    
    Note : Please not that as recommended in the problem, I have only used utterance, context and kg as inputs 
    """
    
    df = dataframe
    predicted_labels=[]
    for index, row in df.iterrows():
        paragraph = []
        row["context"] = ast.literal_eval(row["context"])
        for utterance in row["context"]:
            paragraph.append(utterance)
        paragraph.append(row["utterance"])
        text = ' '.join(map(str, paragraph))
        sentences = getSentences(text)
        nlp_model = spacy.load('en_core_web_sm')
        triples = []
        for sentence in sentences:
            triples.append(processSentence(sentence))
        #print(index,triples)
        kg_entities = ast.literal_eval(row["kg"])
        #print(type(kg_entities),type(triples))
        predicted_labels.append(match(kg_entities,triples)) # function call to match()
        if index % 100 == 0 and index != 0: 
            print("{} values extracted!".format(index))
    return predicted_labels           

In [7]:
def CalculateAccuracy(predicted_labels,ground_labels):
    
    """
    This function calculates instance/row wise score(accuracy)
    Input : predicted-labels and ground-lables
    Returns : A list with accuracy for each row 
    """
    instance_accuracy = []
    for idx in range(len(predicted_labels)): 
        ground_val = ast.literal_eval(ground_labels.iloc[idx])
        intersection_vals = [set(tuple(row) for row in predicted_labels[idx] ) & set(tuple(row) for row in ground_val)]
        if intersection_vals == [set()]:
            instance_accuracy.append(0)
        else:
            if len(ground_val) == 0:
                instance_accuracy.append(0)
            else:
                score = (len(intersection_vals))/len(ground_val)
                instance_accuracy.append(score*100)
        

    return instance_accuracy

In [8]:
### Driver Section ###

"""
Please select the dataframe you want to evaluate from - train_df, valid_df,test_df
Default = test_df
Compute : On avgerage for computing 100 data points it takes about 3 minutes. 

"""
df = test_df
df_name = "test_df" # kindly enter the dataframe's name here
predicted_labels = PredictTriples(df)
ground_labels = df.labels
#print(len(predicted_labels),len(ground_labels))
if len(predicted_labels) == len(ground_labels):
    instance_accuracy = CalculateAccuracy(predicted_labels,ground_labels)
else :
    print("Dimension mismatch between the predicted and ground labels")
average_accuracy = round(sum(instance_accuracy)/len(instance_accuracy),2)
print("The average accuracy for the dataset {} is {} %.".format(df_name,average_accuracy))

100 values extracted!
200 values extracted!
300 values extracted!
400 values extracted!
500 values extracted!
600 values extracted!
700 values extracted!
800 values extracted!
The average accuracy for the dataset test_df is 0.58 %.


In [9]:
### Driver Section ###

"""
Please select the dataframe you want to evaluate from - train_df, valid_df,test_df
Default = test_df
Compute : On avgerage for computing 100 data points it takes about 3 minutes. 

"""
df = valid_df
df_name = "valid_df" # kindly enter the dataframe's name here
predicted_labels = PredictTriples(df)
ground_labels = df.labels
#print(len(predicted_labels),len(ground_labels))
if len(predicted_labels) == len(ground_labels):
    instance_accuracy = CalculateAccuracy(predicted_labels,ground_labels)
else :
    print("Dimension mismatch between the predicted and ground labels")
average_accuracy = round(sum(instance_accuracy)/len(instance_accuracy),2)

#print(predicted_labels)
#print(instance_accuracy)
print("The average accuracy for the dataset {} is {} %.".format(df_name,average_accuracy))

100 values extracted!
200 values extracted!
300 values extracted!
400 values extracted!
500 values extracted!
600 values extracted!
700 values extracted!
The average accuracy for the dataset valid_df is 0.67 %.
