In [1]:
import numpy as np
import json
import pandas as pd
import os
import re
import ast
import xml.etree.ElementTree as ET
import ast

from __future__ import print_function
from gensim import corpora

from nltk.sem.logic import *

In [3]:

# TODO: load this state from an external source instead of defining it inline.
# Corpus-level accumulators: one entry is appended per converted sentence.
predicate_arr, variable_arr, formulas = [], [], []
# Per-sentence collectors filled in by the coq_string_* converters:
# names starting with '_' go to pre_pre, every other token goes to pre_var.
pre_pre, pre_var = [], []
# Names already recorded for the sentence currently being converted.
check_dup = set()

from nltk.sem.logic import LogicParser
from nltk.sem.logic import *

# Shared parser instance; type checking is switched off.
logic_parser = LogicParser(type_check=False)


def lexpr(formula_str):
    """Parse ``formula_str`` with the shared ``logic_parser`` and return the result."""
    parsed_expression = logic_parser.parse(formula_str)
    return parsed_expression

def normalize_interpretation(expression):
    """Return the normalized string form of ``expression``.

    Thin wrapper: the work is delegated to ``coq_string_expr``.
    """
    return coq_string_expr(expression)
def coq_string_expr(expression):
    """Convert a logic expression (or a formula string) to a prefix-notation string.

    A ``str`` argument is parsed with ``lexpr`` first. The expression is then
    dispatched on its class to the matching ``coq_string_*`` converter.

    Side effect: for quantified, and/or, negated and other binary expressions
    the tokens ``'('``, ``')'`` and the operator token are appended to the
    module-level ``pre_var`` list *before* the sub-expressions are converted.

    The operator token recorded here is the one the converter actually emits
    into the formula string:
      * quantifiers are recorded through ``nltk2coq_quantifier`` (so
        ``'exists'``/``'forall'``, or the raw quantifier when no renaming is
        defined), matching ``coq_string_quantified_expr``;
      * generic binary expressions record ``expression.getOp()``, matching
        ``coq_string_binary_expr`` (previously ``'='`` was recorded even for
        operators such as ``->``).

    Returns:
        str: the converted expression.
    """
    if isinstance(expression, str):
        expression = lexpr(expression)

    if isinstance(expression, ApplicationExpression):
        return coq_string_application_expr(expression)
    if isinstance(expression, AbstractVariableExpression):
        return coq_string_abstract_variable_expr(expression)
    if isinstance(expression, LambdaExpression):
        return coq_string_lambda_expr(expression)
    if isinstance(expression, QuantifiedExpression):
        nltk_quantifier = expression.getQuantifier()
        # Fall back to the raw quantifier when no renaming exists, exactly as
        # the quantified-expression converter does when it builds the string.
        recorded = nltk2coq_quantifier.get(nltk_quantifier, nltk_quantifier)
        pre_var.extend(['(', ')', recorded])
        return coq_string_quantified_expr(expression)
    # And/Or must be tested before the generic BinaryExpression branch so that
    # they get their word-form operator tokens.
    if isinstance(expression, AndExpression):
        pre_var.extend(['(', ')', 'and'])
        return coq_string_and_expr(expression)
    if isinstance(expression, OrExpression):
        pre_var.extend(['(', ')', 'or'])
        return coq_string_or_expr(expression)
    if isinstance(expression, NegatedExpression):
        pre_var.extend(['(', ')', 'not'])
        return coq_string_not_expr(expression)
    if isinstance(expression, BinaryExpression):
        pre_var.extend(['(', ')', expression.getOp()])
        return coq_string_binary_expr(expression)
    if isinstance(expression, Variable):
        return '%s' % expression
    return str(expression)


# Short alias used by the individual converters for recursive calls.
coqstr = coq_string_expr

def coq_string_application_expr(expression):
    """Render a function application as ``(<function> <args>)``.

    Atomic applications are uncurried so that all arguments are listed after
    the base function; any other application keeps its single curried
    argument. The arguments are converted before the function, and the
    function part is wrapped in an extra pair of parentheses when it is itself
    an application, or a lambda whose body is neither a variable-headed
    application nor a boolean expression.
    """
    if expression.is_atom():
        # is_atom: whether this is an atomic formula.
        function, args = expression.uncurry()
        arg_str = ' '.join(["%s" % coqstr(one_arg) for one_arg in args])
    else:
        # Leave the arguments curried.
        function = expression.function
        arg_str = "%s" % coqstr(expression.argument)

    function_str = "%s" % coqstr(function)

    if isinstance(function, LambdaExpression):
        body = function.term
        if isinstance(body, ApplicationExpression):
            needs_parens = not isinstance(body.function, AbstractVariableExpression)
        else:
            needs_parens = not isinstance(body, BooleanExpression)
    else:
        needs_parens = isinstance(function, ApplicationExpression)

    if needs_parens:
        function_str = ''.join([Tokens.OPEN, function_str, Tokens.CLOSE])

    return ''.join([Tokens.OPEN, function_str, ' ', arg_str, Tokens.CLOSE])

# Reserved names that are rewritten when they appear as variables.
reserved_s = {'AND': 'and', 'OR': 'or', 'neg': 'not', 'EMPTY': '', 'TrueP': 'True'}


def coq_string_abstract_variable_expr(expression):
    """Return the name of a variable expression and record it once per sentence.

    The name is ``str(expression.variable)``, rewritten through ``reserved_s``
    when it is a reserved name.

    Side effects (module-level collectors): the first time a name is seen it
    is added to ``check_dup`` and appended to ``pre_pre`` when it starts with
    ``'_'``, otherwise to ``pre_var``. Later occurrences are not recorded
    again.

    The duplicate check uses the exact name that is stored. Previously the
    lookup used the underscore-stripped name while the stored key kept its
    underscore, so ``_``-prefixed names were never de-duplicated and could be
    dropped when a same-named variable without the underscore came first.

    Returns:
        str: the (possibly rewritten) name.
    """
    expr_str = str(expression.variable)
    # NOTE(review): the original author suspected that only TrueP is actually
    # substituted in practice - confirm against real input.
    expr_str = reserved_s.get(expr_str, expr_str)

    if expr_str not in check_dup:
        check_dup.add(expr_str)
        if expr_str.startswith('_'):
            pre_pre.append(expr_str)
        else:
            pre_var.append(expr_str)
    return expr_str

def coq_string_lambda_expr(expression):
    """Render a lambda abstraction as ``(fun <vars> => <body>)``.

    Directly nested abstractions of the same class are merged, so their bound
    variables are listed together, space separated, in front of one body.
    """
    bound = [expression.variable]
    body = expression.term
    while body.__class__ == expression.__class__:
        bound.append(body.variable)
        body = body.term

    # Bound variables are converted before the body.
    bound_str = ' '.join(["%s" % coqstr(one_var) for one_var in bound])
    body_str = "%s" % coqstr(body)
    return ''.join([Tokens.OPEN, 'fun ', bound_str, ' => ', body_str, Tokens.CLOSE])

# Quantifier spellings and the names they are rendered as.
nltk2coq_quantifier = {
    'exists': 'exists',
    'exist': 'exists',
    'all': 'forall',
    'forall': 'forall',
}


def coq_string_quantified_expr(expression):
    """Render a quantified expression as ``(<quantifier> <vars>. <body>)``.

    Directly nested quantifications of the same class are merged into a single
    variable list. The quantifier is renamed through ``nltk2coq_quantifier``;
    when no renaming is defined the original quantifier is used unchanged.
    """
    bound = [expression.variable]
    body = expression.term
    while body.__class__ == expression.__class__:
        bound.append(body.variable)
        body = body.term

    quantifier = expression.getQuantifier()
    quantifier = nltk2coq_quantifier.get(quantifier, quantifier)

    # Bound variables are converted before the body.
    pieces = [Tokens.OPEN, quantifier, ' ']
    pieces.append(' '.join(["%s" % coqstr(one_var) for one_var in bound]))
    pieces.append('. ')
    pieces.append("%s" % coqstr(body))
    pieces.append(Tokens.CLOSE)
    return ''.join(pieces)

def coq_string_and_expr(expression):
    """Render a conjunction in prefix form: ``(and <first> <second>)``."""
    left = coqstr(expression.first)
    right = coqstr(expression.second)
    return ''.join([Tokens.OPEN, 'and ', left, ' ', right, Tokens.CLOSE])


def coq_string_or_expr(expression):
    """Render a disjunction in prefix form: ``(or <first> <second>)``."""
    left = coqstr(expression.first)
    right = coqstr(expression.second)
    return ''.join([Tokens.OPEN, 'or ', left, ' ', right, Tokens.CLOSE])

def coq_string_not_expr(expression):
    """Render a negation in prefix form: ``(not <term>)``."""
    negated = coqstr(expression.term)
    return ''.join([Tokens.OPEN, 'not ', negated, Tokens.CLOSE])

def coq_string_binary_expr(expression):
    """Render a generic binary expression infix: ``(<first> <op> <second>)``.

    The operator text is whatever ``expression.getOp()`` returns.
    """
    left = coqstr(expression.first)
    right = coqstr(expression.second)
    operator = expression.getOp()
    return ''.join([Tokens.OPEN, left, ' ', operator, ' ', right, Tokens.CLOSE])


In [4]:
def substituteString(text, lst):
    """Renumber generated variable names inside ``text``.

    Names in ``lst`` that look like generated variables are given canonical
    names in order of first appearance in ``lst``:

      * ``e0`` followed by digits  ->  ``e00``, ``e01``, ...
      * ``z`` followed by digits   ->  ``z00``, ``z01``, ...

    Every other entry of ``lst`` (parentheses, operators, ordinary names) is
    ignored.

    Fixes over the earlier implementation:
      * numbering follows the order of ``lst`` instead of set-iteration order,
        so the result is deterministic;
      * a name must match the pattern as a whole, so a token that merely
        contains a ``z`` is no longer treated as a variable;
      * all replacements happen in a single pass on whole tokens, so ``z65``
        is not rewritten inside ``z650`` and an already-renamed variable is
        never renamed a second time.

    Args:
        text: formula string to rewrite.
        lst: iterable of token strings collected for that formula.

    Returns:
        str: ``text`` with the generated variable names replaced.
    """
    event_var = re.compile(r'^e0[0-9]*$')
    indiv_var = re.compile(r'^z[0-9]*$')

    # Build the old-name -> new-name table.
    renaming = {}
    ice = 0  # next event-variable index
    icz = 0  # next individual-variable index
    for name in lst:
        if name in renaming:
            continue
        if event_var.match(name):
            renaming[name] = "e0" + str(ice)
            ice += 1
        elif indiv_var.match(name):
            renaming[name] = "z0" + str(icz)
            icz += 1

    if not renaming:
        return text

    # Longest names first; together with the look-arounds this guarantees that
    # only complete identifiers are replaced.
    alternatives = '|'.join(
        re.escape(name) for name in sorted(renaming, key=len, reverse=True))
    token_re = re.compile(
        r'(?<![A-Za-z0-9_])(?:' + alternatives + r')(?![A-Za-z0-9_])')
    return token_re.sub(lambda found: renaming[found.group(0)], text)

In [5]:
# Extract the sentence text and the formula (the 'sem' attribute) from the
# parsed XML, convert each formula with coq_string_expr, and collect the
# predicate / variable token lists per sentence.
# NOTE(review): the original remark here says the (stack-style) ordering of
# the logical symbols probably still needs to be fixed.
tree = ET.parse('sem/snli_dev.txt.candc.sem.xml')
root = tree.getroot()

root = root[0]
root = root[0]  # originally annotated "the first sentence"; its children are iterated below

mydict = {}
c = 0  # index of the current child; used as the key into mydict
conversion_errors = 0  # formulas skipped because conversion raised
print(len(root))

# Whole-name patterns for generated variables (e0<digits> / z<digits>).
event_var_re = re.compile(r'^e0[0-9]*$')
indiv_var_re = re.compile(r'^z[0-9]*$')

# Walk the child elements (tag and contents).
for child in root:

    formula = child[2]  # child[2] is the semantics element
    check = child[2].attrib['status']

    if check == 'success':
        # Rebuild the surface sentence: '.' and ',' attach to the previous
        # token, every other token is preceded by a space.
        plain = ""
        toridashi = child[0]

        for i in toridashi:
            p = i.attrib['surf']
            if p == '.' or p == ',':
                plain = plain + p
            else:
                plain = plain + " " + p
        plain = plain + '\n'

        # The original author assumed this value ends with a newline (unverified).
        formula = child[2][0].attrib['sem']

        # Reset the per-sentence collectors that the converters fill in.
        check_dup = set([])
        pre_pre = []
        pre_var = []

        try:
            formula = coq_string_expr(formula)
            formula = substituteString(formula, pre_var)

            # Apply the same renumbering to the collected token list.
            # Both counters must advance; previously ice was never
            # incremented, so every event variable collapsed to "e00".
            ice = 0
            icz = 0
            for i in range(len(pre_var)):
                if event_var_re.match(pre_var[i]):
                    pre_var[i] = "e0" + str(ice)
                    ice += 1
                elif indiv_var_re.match(pre_var[i]):
                    pre_var[i] = "z0" + str(icz)
                    icz += 1

            # Append both lists together so the two corpora stay aligned.
            predicate_arr.append(pre_pre)
            variable_arr.append(pre_var)
            test_val = pre_var  # kept for the substitution experiments
            test_fomula = formula
            pair = {'text': plain, 'formula': formula}
            mydict.update({str(c): pair})
        except Exception:
            # Best effort: skip formulas that cannot be converted, but keep a
            # count so the failures are visible (and let Ctrl-C through).
            conversion_errors += 1
        pre_pre = []
        pre_var = []
    else:
        print('faild parse')
    # Advance the key for every child; without this every pair was stored
    # under '0' and overwrote the previous one.
    c += 1
    #if(c==100):
    #    break

if conversion_errors:
    print('conversion errors:', conversion_errors)

#f = open('snli_input_data_100_1208.json', 'w') # open in write mode
#json.dump(mydict, f,ensure_ascii=False)
#f.close()

19949
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse
faild parse


In [26]:
# Load the saved text/formula pairs and show the first formula before and
# after stripping all parentheses.
# The context manager closes the file even if json.load raises.
with open('snli_input_data_1206_100.json') as f:
    json_data = json.load(f)

# Original (infix) formula kept for manual comparison with the printed output.
org = "exists x.(_woman(x) & _two(x) & TrueP & exists e.(_embrace(e) & (Subj(e) = x) & TrueP) & exists e.(_hold(e) & (Subj(e) = x) & TrueP) & exists z1.(_package(z1) & TrueP & exists e.(_go(e) & (Subj(e) = x) & (Acc(e) = z1) & TrueP))"


for i in json_data:
    lis = json_data[i]
    txt = lis['text']
    fom = lis['formula']
    print("before: ", fom)
    fom = fom.replace('(', '').replace(')', '')
    print("after: ", fom)
    break  # only the first entry is inspected
    
/*org before after みると確実にバグある*/

before:  (* x, (& (& (& (& (& (woman x) (two x)) True) (* e, (& (& (embrace e) ((Subj e) = x)) True))) (* e, (& (& (hold e) ((Subj e) = x)) True))) (* z00, (& (& (package z00) True) (* e, (& (& (& (go e) ((Subj e) = x)) ((Acc e) = z00)) True))))))
after:  * x, & & & & & woman x two x True * e, & & embrace e Subj e = x True * e, & & hold e Subj e = x True * z00, & & package z00 True * e, & & & go e Subj e = x Acc e = z00 True


In [6]:
# Build a gensim Dictionary from the per-sentence predicate token lists and
# write it out as text. Frequency filtering is currently disabled, which is
# what "nofilter" in the file name refers to.
pre_dict = corpora.Dictionary(predicate_arr)
#pre_dict.filter_extremes(no_below=4, no_above=0.4)
pre_dict.save_as_text('itemdic_predicates_nofilter_1208.txt')

# Same for the per-sentence variable/operator token lists.
val_dict = corpora.Dictionary(variable_arr)
#val_dict.filter_extremes(no_below=4, no_above=0.4)
val_dict.save_as_text('itemdic_variables_nofilter_1208.txt')

In [36]:
# NOTE(review): `variables` is not defined in any cell visible here, so this
# fails on a fresh kernel; possibly `variable_arr` was intended - confirm.
print(variables[10])

['e', 'Subj', 'x', 'z65', 'True', 'e068', 'z73', 'z69', 'Acc', 'z71', 'z66', 'z67']
