## Init

In [None]:
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
import pandas as pd
import numpy as np

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.probability import FreqDist

from nltk import pos_tag

# NLTK resources fetched for this notebook. Each name is listed once: the
# original cell requested 'averaged_perceptron_tagger' twice, which only
# repeated the "already up-to-date" check.
NLTK_RESOURCES = (
    'averaged_perceptron_tagger_eng',
    'averaged_perceptron_tagger',  # For POS tagging
    'wordnet',
    'omw-1.4',  # Optional: For better coverage of languages
    'stopwords',
    'punkt_tab',
)

for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/princee1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/princee1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/princee1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/princee1/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/princee1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/princee1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_dat

True

In [None]:
# English stop words from NLTK, extended with two question-specific tokens
# ('?' and the capitalised 'What') that should never be kept as keywords.
stop_words = {*stopwords.words('english'), '?', 'What'}

In [41]:
# Raw texts, the cached sentence corpus and the cached OpenIE relations,
# loaded in that order from their CSV files.
texts_data, corpus_text, open_ie = (
    pd.read_csv(csv_path)
    for csv_path in ('data/texts.csv', 'corpus_text.csv', 'rel.csv')
)

## AllenNLP

## Production

In [163]:
from typing import Literal,TypedDict
import re


# Numbered argument tags ARG0 .. ARG5.
args_noun = [f'ARG{position}' for position in range(6)]
# Adjunct tags that SentenceInformation.neighborhood exposes under their own label.
args_info = ['ARGM-TMP', 'ARGM-PRD', 'ARGM-MNR', 'ARGM-LOC']
args_neg = ['ARGM-NEG']
args_v_plus = ['ARGM-DIR', 'ARGM-NEG', 'ARGM-ADV', 'ARGM-MOD']

# Every tag that gets a slot in the parsed-information dictionary.
possible_tags = [*args_noun, *args_info, *args_v_plus, 'ARGM-CAU', 'V', 'ARGM-DIS']

# Typing aliases. Note that ArgIdentifiers lists neither 'V' nor 'ARGM-DIS'.
ArgIdentifiers = Literal[
    'ARG0', 'ARG1', 'ARG2', 'ARG3', 'ARG4', 'ARG5',
    'ARGM-TMP', 'ARGM-PRD', 'ARGM-MNR', 'ARGM-LOC',
    'ARGM-DIR', 'ARGM-NEG', 'ARGM-ADV', 'ARGM-MOD', 'ARGM-CAU',
]
Neo4JNodeType = Literal['ARGM-TMP', 'ARGM-PRD', 'ARGM-MNR', 'ARGM-LOC', 'ARGS']

# Captures "[TAG: value]" spans: group 1 is the tag, group 2 the (lazy) value.
OpenIE_REGEX = r'\[(ARG\d*|ARGM-[A-Z]+|V):\s*(.+?)\]'

# Node label given to the first argument of a relation.
Subj_Label = 'F-ARG'

class NoVerbException(Exception):
    """Raised by SentenceInformation when a description contains no ``V`` span."""


class NoArgsException(Exception):
    """Raised by SentenceInformation when the first argument of a relation is empty."""

class NoNeighborhoodException(Exception):
    """Raised by SentenceInformation.neighborhood when there is nothing to link to."""

In [21]:
from allennlp.predictors import Predictor
from allennlp_models.structured_prediction.predictors.openie import OpenIePredictor
from typing import Dict
from py2neo import Graph, Node, Relationship

In [22]:
class SentenceInformation:
    """Structured view of one OpenIE description string.

    The description is expected to contain bracketed spans such as
    ``[ARG0: John] [V: ate] [ARG1: an apple]``. Spans are extracted with
    ``OpenIE_REGEX`` and grouped by tag.

    Attributes:
        sentence: the raw description passed to the constructor.
        info: mapping tag -> extracted values. Tags listed in ``args_noun``
            hold one space-joined string (``None`` when absent); every other
            tag holds a list of strings.
        verb: text of the first ``V`` span.
        verb_plus: ``verb`` together with its ARGM-MOD / ARGM-ADV / ARGM-NEG
            prefixes and ARGM-DIR suffix, separated by single spaces.
        first_arg: value of the lowest-numbered ``ARGn`` tag that is present.
        other_args: values of the remaining ``ARGn`` tags, in tag order.

    Raises:
        NoVerbException: the description has no ``V`` span.
        NoArgsException: the description has no ``ARGn`` span, or the first
            one is empty.
    """

    def __init__(self,sentence):
        self.sentence = sentence

        self.info: Dict[str, list[str] | str | None] = self.extract_openie_information()

        self._build_verb()
        self._order_args()

    def _build_verb(self):
        """Set ``verb`` and ``verb_plus``; raise NoVerbException without a ``V`` span."""
        if not self.info['V']:
            raise NoVerbException
        self.verb = self.info['V'][0].strip()
        # Join every piece with a space. Plain string concatenation of the
        # groups would glue them together ("can" + "not" + "eat" -> "cannoteat").
        pieces = [
            *self.info['ARGM-MOD'],
            *self.info['ARGM-ADV'],
            *self.info['ARGM-NEG'],
            self.verb,
            *self.info['ARGM-DIR'],
        ]
        self.verb_plus: str = ' '.join(piece for piece in pieces if piece).strip()

    def extract_openie_information(self):
        """Parse ``self.sentence`` into a ``{tag: values}`` dictionary.

        Every tag of ``possible_tags`` is present in the result. A tag matched
        by the regex but missing from ``possible_tags`` is stored as well
        instead of raising ``KeyError``.
        """
        openie_info = {tag: [] for tag in possible_tags}

        for tag, value in re.findall(OpenIE_REGEX, self.sentence):
            openie_info.setdefault(tag, []).append(value.strip())

        # Noun arguments are collapsed to a single string, or None when absent.
        for tag in args_noun:
            values = openie_info.get(tag)
            openie_info[tag] = ' '.join(values) if values else None

        return openie_info

    @property
    def same_v_plus(self):
        """True when the verb carries no modifier, i.e. ``verb_plus == verb``."""
        return self.verb_plus == self.verb

    def _order_args(self):
        """Set ``first_arg`` / ``other_args`` from the noun arguments that are present.

        Raises:
            NoArgsException: when no noun argument exists or the first is empty.
        """
        present = [self.info[tag] for tag in args_noun if self.info[tag] is not None]
        # Check emptiness before indexing so a missing argument is reported as
        # NoArgsException rather than an IndexError.
        if not present or present[0] == '':
            raise NoArgsException
        self.first_arg = present[0]
        self.other_args = present[1:]

    @property
    def neighborhood(self):
        """List of ``(label, text)`` pairs the first argument should be linked to.

        Adjunct values keep their own tag (one of ``args_info``) as label; the
        remaining noun arguments are labelled ``'ARGS'``.

        Raises:
            NoNeighborhoodException: when the list would be empty.
        """
        linked = [(tag, value) for tag in args_info for value in self.info[tag]]
        linked.extend(('ARGS', argument) for argument in self.other_args)
        if not linked:
            raise NoNeighborhoodException
        return linked
        


In [None]:
def is_alpha_numeric(word):
    """Tell whether ``word`` mixes letters and digits.

    Returns True when the string contains at least one ASCII letter and at
    least one digit (both look-aheads are evaluated from the start of the
    string), False otherwise.
    """
    mixed = re.match(r'(?=.*[a-zA-Z])(?=.*\d)', word)
    return mixed is not None

# Flatten the tokens of every entry of texts_data['text'] into one list and
# count how often each token occurs.
T = [token for document in texts_data['text'] for token in word_tokenize(document)]

token_freq = FreqDist(T)

In [23]:
# Pretrained AllenNLP predictors; KGOpenIExtractor uses these two objects as the
# default values of its constructor arguments, so this cell must run before the
# class is defined. Both archives are referenced by remote URL, so presumably
# they are fetched on first use - TODO confirm caching before running offline.
coreference_predictor =  Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz")
openie_predictor = OpenIePredictor.from_path("https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz")

In [None]:
from tqdm import tqdm
class KGOpenIExtractor:
    """Turn raw texts into a Neo4j knowledge graph and query it for question context.

    Stages:
      1. ``build`` resolves coreferences in every text and splits the result
         into sentences, then runs OpenIE on each sentence and keeps one
         description string per predicted verb. Either stage can be replaced
         by a previously saved CSV file.
      2. ``build_info_dependency`` parses every description with
         ``SentenceInformation`` and merges
         (first argument)-[verb]->(other argument or adjunct) into the graph.
      3. ``retrieve_question_context`` looks up relations whose source node
         name contains keywords of a question.
    """

    # Number of n-gram matches considered enough context; below it
    # _retrieve_KG_relations falls back to a broader lookup.
    MIN_CONTEXT_RELATIONS = 6

    def __init__(
        self,
        dataset: pd.Series,
        skip_dep=False,
        skip_coref_file=None,
        skip_openie_file=None,
        coreference_predictor:Predictor=coreference_predictor,
        openie_predictor:OpenIePredictor=openie_predictor,
    ):
        """Store the configuration and open the graph connection.

        Args:
            dataset: series of raw texts.
            skip_dep: when true, ``build_info_dependency`` does nothing.
            skip_coref_file: CSV of already resolved sentences; when given,
                coreference resolution is skipped.
            skip_openie_file: CSV of already extracted descriptions; when
                given, OpenIE extraction is skipped.
            coreference_predictor: predictor exposing ``predict(document=...)``.
            openie_predictor: predictor exposing ``predict(sentence=...)``.
        """
        # Local import: the notebook's import cells do not bring in `os`.
        import os

        self.coreference_predictor = coreference_predictor
        self.openie_predictor = openie_predictor
        self.dataset = dataset
        self.relationship:list[str] = []
        self.corpus = []
        self.skip_coref_file = skip_coref_file
        self.skip_openie_file = skip_openie_file
        self.skip_dep = skip_dep

        # Connection settings may come from the environment so credentials do
        # not have to be written in the notebook; the fallbacks are the values
        # that were previously hard-coded here.
        neo_uri = os.environ.get('NEO4J_URI', '')
        password = os.environ.get('NEO4J_PASSWORD', '')
        self.graph = Graph(neo_uri, auth=("neo4j", password))
        self.index = 0
        self.lemmatizer = WordNetLemmatizer()

    @staticmethod
    def _read_cached_sentences(path):
        """Return the second column of a two-column CSV as a series."""
        cached = pd.read_csv(path)
        cached.columns = ['index', 'sentences']
        return cached['sentences']

    @staticmethod
    def _quote_identifier(name):
        """Back-tick quote a label / relationship type for use in Cypher.

        Labels such as ``ARGM-TMP`` or verbs containing spaces are not valid
        bare identifiers. A value that is already wrapped in back-ticks is
        returned unchanged.
        """
        name = str(name)
        if len(name) >= 2 and name.startswith('`') and name.endswith('`'):
            return name
        return '`' + name.replace('`', '``') + '`'

    def build(self):
        """Populate ``self.dataset`` (sentences) and ``self.relationship`` (descriptions)."""
        if self.skip_coref_file is None:
            self._resolve_coreference()
            self.dataset = pd.Series(self.corpus)
            self.dataset.to_csv('corpus_text.csv')
        else:
            self.dataset = self._read_cached_sentences(self.skip_coref_file)
        print('coreference done !')

        if self.skip_openie_file is None:
            self._extract_information()
        else:
            self.relationship = self._read_cached_sentences(self.skip_openie_file).to_list()
        print('extract information done !')

    def _co_references_resolver(self,text):
        """Append the coreference-resolved sentences of ``text`` to ``self.corpus``.

        Every mention of a cluster is replaced by the cluster's first mention.
        If resolution fails for any reason the original text is used instead.
        """
        self.index += 1
        print(f'{self.index}/{len(self.dataset)}')
        try:
            prediction = self.coreference_predictor.predict(document=text)
            words = prediction['document']

            for cluster in prediction['clusters']:
                main_mention = " ".join(words[cluster[0][0]: cluster[0][1] + 1])
                for start, end in cluster[1:]:
                    # Keep the list length unchanged so later spans stay valid:
                    # first slot gets the replacement, the rest are blanked.
                    words[start:end + 1] = [main_mention] + [''] * (end - start)

            resolved_text = " ".join(word for word in words if word)
        except Exception:
            # Best effort: keep the unresolved text. Unlike a bare `except:`
            # this no longer intercepts KeyboardInterrupt / SystemExit.
            resolved_text = text
        self.corpus.extend(sent_tokenize(resolved_text))

    def _resolve_coreference(self,):
        """Run coreference resolution over the whole dataset."""
        # Start from a clean state so that calling build() again neither
        # duplicates sentences nor continues the progress counter.
        self.index = 0
        self.corpus = []
        self.dataset.apply(self._co_references_resolver)

    def _compute_information_extraction(self,sentence):
        """Append the OpenIE description of every verb of ``sentence``."""
        output = self.openie_predictor.predict(sentence=sentence)
        self.relationship.extend(relation['description'] for relation in output["verbs"])
        self.index += 1
        print(f'{self.index}/{len(self.dataset)}')

    def _extract_information(self,):
        """Run OpenIE over ``self.dataset`` and save the result to 'rel.csv'.

        Does nothing when ``self.relationship`` is already populated.
        """
        self.index = 0
        if self.relationship:
            return
        self.dataset.apply(self._compute_information_extraction)
        pd.Series(self.relationship).to_csv('rel.csv')

    # Unimplemented placeholder, kept as-is (note that it takes no `self`).
    def _split(info:str):
        ...

    def build_info_dependency(self,):
        """Merge every parsed relation into the graph.

        Descriptions without verb, arguments or neighborhood are skipped.
        Any other error is also skipped (best effort), but is counted and
        reported at the end instead of being silently discarded.
        """
        if self.skip_dep:
            return

        unexpected_failures = 0
        last_error = None
        for rel in tqdm(self.relationship):
            try:
                sentence_info = SentenceInformation(rel)
                # Evaluate first: no node is created when there is nothing to link.
                neighborhood = sentence_info.neighborhood

                # The subject is the same for every triplet of this relation,
                # so it is created and merged once.
                subj_node = Node(Subj_Label, name=sentence_info.first_arg)
                self.graph.merge(subj_node, Subj_Label, "name")

                for obj_label, obj in neighborhood:
                    obj_node = Node(obj_label, name=obj)
                    self.graph.merge(obj_node, obj_label, "name")

                    self.graph.merge(Relationship(subj_node, sentence_info.verb, obj_node))
                    if not sentence_info.same_v_plus:
                        self.graph.merge(Relationship(subj_node, sentence_info.verb_plus, obj_node))
            except (NoVerbException, NoArgsException, NoNeighborhoodException):
                continue
            except Exception as error:
                unexpected_failures += 1
                last_error = error
                continue

        if unexpected_failures:
            print(f'{unexpected_failures} relation(s) skipped after unexpected errors, last one: {last_error!r}')
        print('Knowledge Graph computed !')

    def query_node_kg(self,query,type_:Neo4JNodeType|None=None):
        """Return relations whose source node name contains ``query``.

        Args:
            query: substring searched in the source node name. It is sent as a
                Cypher parameter, so quotes in it cannot break the statement.
            type_: optional label the target node must have; the whole string
                is treated as a single label.

        Returns:
            A list of dicts with keys ``source``, ``relation`` and ``target``.
        """
        label_filter = '' if not type_ else ':' + self._quote_identifier(type_)
        cypher_query = f"""
        MATCH (n)-[r]->(m{label_filter})
        WHERE n.name CONTAINS $query
        RETURN n.name AS source, type(r) AS relation, m.name AS target
        """

        return self.graph.run(cypher_query, {'query': query}).data()

    def query_relation_kg(self,query):
        """Return every relation of type ``query``.

        The type is back-tick quoted, so verbs containing spaces or hyphens
        are accepted; the whole string is treated as one relationship type.
        """
        cypher_query = f"""
        MATCH (n)-[r:{self._quote_identifier(query)}]->(m)
        RETURN n.name AS source, type(r) AS relation, m.name AS target
        """
        return self.graph.run(cypher_query).data()

    def remove_stop_word(self,question:str):
        """Return the keywords of ``question``.

        Tokens found in ``stop_words`` are dropped; of the rest, only tokens
        whose POS tag starts with 'NN' or that mix letters and digits are kept.
        """
        content_words = [word for word in word_tokenize(question) if word not in stop_words]
        return [
            word
            for word, tag in pos_tag(content_words)
            if tag.startswith('NN') or is_alpha_numeric(word)
        ]

    def generate_bigrams_trigrams(self,words, center_word):
        """Build the n-grams of ``words`` around the first ``center_word``.

        Returns:
            ``(bigrams, trigrams)`` where bigrams are (previous, center) and
            (center, next), and the trigram is (previous, center, next); each
            is included only when the neighbours exist. Both lists are empty
            when ``center_word`` is not in ``words``.
        """
        try:
            center_index = words.index(center_word)
        except ValueError:
            return [], []

        has_previous = center_index > 0
        has_next = center_index < len(words) - 1

        bigrams = []
        trigrams = []
        if has_previous:
            bigrams.append((words[center_index - 1], words[center_index]))
        if has_next:
            bigrams.append((words[center_index], words[center_index + 1]))
        if has_previous and has_next:
            trigrams.append((words[center_index - 1], words[center_index], words[center_index + 1]))

        return bigrams, trigrams

    def _get_question_subj(self,words:list[str]):
        """Return the word of ``words`` with the lowest count in ``token_freq``.

        Ties are resolved in favour of the earliest word. Raises ``ValueError``
        on an empty list.
        """
        return min(words, key=lambda word: token_freq[word])

    def _retrieve_KG_relations(self,questions:str):
        """Collect graph relations relevant to a question.

        The least frequent keyword is taken as the subject. Lookups go from the
        most specific to the least specific: trigrams around the subject, then
        bigrams, then the subject alone. A hyphenated subject is also tried
        with the hyphens replaced by spaces. Returns an empty list when the
        question yields no keyword.
        """
        words = self.remove_stop_word(questions)
        if not words:
            return []

        q_subj = self._get_question_subj(words)
        bigrams, trigrams = self.generate_bigrams_trigrams(words, q_subj)

        r_q_subj = None
        if '-' in q_subj:
            r_q_subj = ' '.join(q_subj.split('-'))
            # The spaced variant never occurs in `words` itself, so build its
            # n-grams from a copy where the subject has been substituted.
            spaced_words = [r_q_subj if word == q_subj else word for word in words]
            extra_bigrams, extra_trigrams = self.generate_bigrams_trigrams(spaced_words, r_q_subj)
            bigrams += extra_bigrams
            trigrams += extra_trigrams

        trigram_hits = []
        for trigram in trigrams:
            trigram_hits.extend(self.query_node_kg(' '.join(trigram)))
        if len(trigram_hits) >= self.MIN_CONTEXT_RELATIONS:
            return trigram_hits

        bigram_hits = []
        for bigram in bigrams:
            bigram_hits.extend(self.query_node_kg(' '.join(bigram)))
        if len(bigram_hits) + len(trigram_hits) >= self.MIN_CONTEXT_RELATIONS:
            return bigram_hits + trigram_hits

        subject_hits = self.query_node_kg(q_subj)
        if r_q_subj is not None:
            subject_hits = subject_hits + self.query_node_kg(r_q_subj)
        return subject_hits

    def retrieve_question_context(self,question:str):
        """Return the relevant relations as "source relation target" strings."""
        return [
            rel['source'] + ' ' + rel['relation'] + ' ' + rel['target']
            for rel in self._retrieve_KG_relations(question)
        ]


In [380]:
# Reuse the cached sentence and relation CSV files instead of recomputing
# them; skip_dep=True turns build_info_dependency() into a no-op.
openIE = KGOpenIExtractor(
    dataset=texts_data['text'],
    skip_dep=True,
    skip_coref_file='corpus_text.csv',
    skip_openie_file='rel.csv',
)

In [291]:
# Load (or compute) the coreference-resolved sentences and the OpenIE relations.
openIE.build()
# Merge the relations into the graph; returns immediately when the extractor
# was created with skip_dep=True.
openIE.build_info_dependency()

coreference done !
extract information done !


In [123]:
# Validation questions; check_answer() reads the 'question' and 'answer' columns.
question_val= pd.read_csv('data/questions_val.csv')

In [124]:
def check_answer(index):
    """Return the ``(question, answer)`` pair stored at ``index`` in ``question_val``."""
    question = question_val['question'][index]
    answer = question_val['answer'][index]
    return question, answer
