In [108]:
import numpy as np
import pandas as pd
import logging
import os
import json
import glob
import regex as re
import torch
import argparse
import random
import itertools
import ast
import sys
import ast
from tqdm import tqdm
import warnings

from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import pipeline
from datasets import load_dataset, get_dataset_split_names

# Install a blanket "ignore" filter: no category or module is specified,
# so every warning raised after this point is silenced.
warnings.filterwarnings("ignore")

In [109]:
# Load the saved model outputs. Later cells read the `input` and `response`
# columns of this frame.
res = pd.read_csv("conll_flan_fewshot_results.csv")
# Show (n_rows, n_columns) as the cell output.
res.shape

(231, 2)

In [25]:
# Preview the first rows of the loaded results.
res.head()

Unnamed: 0,input,response
0,"TEXT: John Wilkes Booth , who assassinated Pre...","[[""John Wilkes Booth"", ""Occupation"", ""Actor""]]"
1,TEXT: The opera company performed at the Palac...,"[[""Kevin O'Brien"", ""Worked_For"", Palace of Fin..."
2,"TEXT: In the field of mechanics , Wang Ziqiang...","[[""Wang Ziqiang"", ""Works_at"", ""Institute of Me..."
3,"TEXT: Sun Hung Kai Properties , a Hong Kong co...","[[""Sun Hung Kai Properties"", ""Type"", ""Construc..."
4,"TEXT: Marie Magdefrau Ferraro , 50 , of Bethan...","[[""Marie Magdefrau Ferraro"", ""Death_Place"", ""B..."


In [94]:
# Keep only the rows whose `response` parses as a Python literal made of
# 3-element triplets; every other row is reported and removed from `results`.
results = res.copy()

error_count = 0
bad_indices = []  # index labels of the rows that fail validation

for ix, row in results.iterrows():
    try:
        response = ast.literal_eval(row["response"])
        for triplets in response:
            # A bare 3-character string also has len() == 3, so require an
            # actual list/tuple before checking the length.
            if not isinstance(triplets, (list, tuple)) or len(triplets) != 3:
                raise ValueError
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the exceptions ast.literal_eval (and iterating / len() on
        # a non-sequence) can raise for malformed text. A bare `except` would
        # also swallow KeyboardInterrupt and SystemExit.
        error_count += 1
        print ("Error on index: ", ix)
        print (row["response"])
        bad_indices.append(ix)

# Drop the rejected rows once, after the loop, instead of mutating the frame
# while it is being iterated.
results = results.drop(index=bad_indices)

Error on index:  1
[["Kevin O'Brien", "Worked_For", Palace of Fine Arts]]
Error on index:  13
[["Yang Jianbai", "Org", Economic Research]]
Error on index:  20
[["La Valette Underground Military Museum", "Owner", Gerald Przenislawski]]
Error on index:  22
[["Dueling Over a Gun", "Place", Dallas]]
Error on index:  25
[["Mark Moseley", "Sibling", Pamela Moseley]]
Error on index:  27
[["Rolling Fork", "is_part_of", Mississippi]]
Error on index:  34
[["Phnom Penh", "is_part_of", Cambodia]]
Error on index:  46
[["River North", "has_a_name"]]
Error on index:  48
[["Kyoto"], "Showcased_in", "Tokyo"]]
Error on index:  61
[["Robert C. Corduwener", "Nationality", Netherlands]]
Error on index:  62
[["Volcanoes in Nicaragua and Indonesia", "Subject", "deforestation in Brazil and Madagascar", "the Sahara desert in Africa"]]
Error on index:  72
[["Grand Rapids, Michigan"], "City", "Grand Rapids, Michigan"]]
Error on index:  95
[["Solidarity", "Person", Lech Walesa]]
Error on index:  135
[["Brian Mulr

In [95]:
# Number of rows rejected by the validation loop.
error_count

29

In [96]:
# Shape of `results` after the rejected rows were dropped.
results.shape

(202, 2)

In [97]:
# Lower-cased relation labels that count as valid when generated relation
# types are checked.
valid_r_types = [
    "work_for",
    "live_in",
    "located_in",
    "orgbased_in",
    "kill",
]

In [98]:
# Tally the generated relations: how many have exactly three elements
# ("conforming"), how many of those use a label from `valid_r_types`
# (compared in lower case), and which distinct labels occur.
total_count = nc_rel = c_rel = v_rels = 0
r_types = []

for response_text in results["response"]:
    for relation in ast.literal_eval(response_text):
        total_count += 1
        if len(relation) != 3:
            nc_rel += 1
            continue
        c_rel += 1
        r_type = relation[1]
        r_types.append(r_type)
        if r_type.lower() in valid_r_types:
            v_rels += 1
            print(relation)

# Collapse the collected labels to the distinct ones.
r_types = set(r_types)

print("\n\nTOTAL: ", total_count)
print("NON-CONFORMING: ", nc_rel)
print("CONFORMING: ", c_rel)
print("TOTAL VALID R-TYPES: ", v_rels)
print("TOTAL UNIQUE CONFORMING R-TYPES: ", len(r_types))

['Savannah River Plant', 'located_in', 'Aiken, S.C.']
['Novonezhino', 'located_in', 'Vladivostok']
['Parkland Memorial Hospital', 'located_in', 'Dallas']
['Parkland Memorial Hospital', 'located_in', 'Texas Theater']


TOTAL:  247
NON-CONFORMING:  0
CONFORMING:  247
TOTAL VALID R-TYPES:  4
TOTAL UNIQUE CONFORMING R-TYPES:  121


In [99]:
# Display the set of distinct relation labels collected from conforming triplets.
r_types

{'Activity',
 'Age',
 'Aircraft',
 'Appointed_By',
 'Assassin',
 'Assassinates',
 'Assassination',
 'Attached_to',
 'Bank',
 'Birth_Place',
 'Born',
 'Born_in',
 'Brother',
 'Callers',
 'Capsule_name',
 'Cause',
 'Chairman',
 'City',
 'City,',
 'Client',
 'Component',
 'Creditor',
 'Crime',
 'Date',
 'Date_of_Death',
 'Day',
 'Death_Penalty',
 'Death_Place',
 'Director',
 'Education',
 'Employee',
 'Flight_Status',
 'Friend',
 'Governor',
 'Hometown',
 'Killed',
 'Killed_By',
 'Killing',
 'Kingdom',
 'Known_as',
 'Location',
 'Mother',
 'Movie',
 'Nationality',
 'Native_Son',
 'Native_name',
 'Newspaper',
 'Nickname',
 'Number_Of_Passengers',
 'Occupation',
 'Office',
 'Office_Space',
 'Officer',
 'Operator',
 'Order_to_ignore',
 'Org',
 'Owner',
 'Part',
 'Percentage',
 'Person',
 'Piano',
 'Place',
 'President',
 'Purpose',
 'Rank',
 'Said',
 'Sale_to',
 'Secretary',
 'Sentence',
 'Sentenced_To_Death',
 'Sex',
 'Shot_By',
 'Size',
 'Son',
 'Source',
 'Speaker',
 'Spin-off',
 'Spouse'

In [100]:
# Load the gold annotations. The context manager closes the file handle
# deterministically (the bare open() call never closed it), and the explicit
# encoding keeps the read independent of the platform's default locale.
with open('conll04_dev.json', encoding="utf-8") as gold_file:
    conll_gold = json.load(gold_file)

In [101]:
# Build the gold frame: one row per annotated sentence, holding the sentence
# text (prefixed with "TEXT: ") and its list of [head, type, tail] triplets.
ips = []
gold = []

for sample in conll_gold:
    tokens = sample["tokens"]
    entities = sample["entities"]
    ips.append("TEXT: " + ' '.join(tokens))

    sample_triplets = []
    for relation in sample["relations"]:
        # `head` / `tail` are positions in the sample's entity list; each
        # entity carries a token slice [start:end] that spells out its text.
        head_entity = entities[relation["head"]]
        tail_entity = entities[relation["tail"]]
        head_text = " ".join(tokens[head_entity["start"]:head_entity["end"]])
        tail_text = " ".join(tokens[tail_entity["start"]:tail_entity["end"]])
        sample_triplets.append([head_text, relation["type"], tail_text])
    gold.append(sample_triplets)

df = pd.DataFrame({'input': ips, 'gold': gold}, index=None)

In [102]:
# Pair every validated model response with the gold triplets of the same
# sentence. The three lists are filled in lockstep so they can be combined
# into one frame afterwards.
#
# NOTE: `df` must still be the gold frame with an `input` column here; a later
# cell rebinds `df`, so re-running this cell after that one requires rebuilding
# the gold frame first.
text = []
gold = []
generated = []

for ix, row in results.iterrows():
    gold_df = df.loc[df['input'] == row["input"]]
    # Require exactly one gold sentence. Ambiguous (duplicated) inputs are
    # skipped, and so are inputs with no gold match: appending their text and
    # response without a gold entry would leave the lists with different lengths.
    if gold_df.shape[0] != 1:
        continue
    text.append(row["input"])
    gold.append(gold_df.iloc[0]["gold"])
    generated.append(ast.literal_eval(row["response"]))
#     print ("TRUE: ", gold_df["gold"])
#     print ("\n-------------------------------------------\n")

In [103]:
# Combine the aligned lists into the evaluation frame (this rebinds `df`).
eval_columns = {
    'text': text,
    'generated': generated,
    'true': gold,
}
df = pd.DataFrame(eval_columns)
df.head()

Unnamed: 0,text,generated,true
0,"TEXT: John Wilkes Booth , who assassinated Pre...","[[John Wilkes Booth, Occupation, Actor]]","[[John Wilkes Booth, Kill, President Lincoln]]"
1,"TEXT: In the field of mechanics , Wang Ziqiang...","[[Wang Ziqiang, Works_at, Institute of Mechani...","[[Wang Ziqiang, Work_For, Institute of Mechani..."
2,"TEXT: Sun Hung Kai Properties , a Hong Kong co...","[[Sun Hung Kai Properties, Type, Construction ...","[[Sun Hung Kai Properties, OrgBased_In, Hong K..."
3,"TEXT: Marie Magdefrau Ferraro , 50 , of Bethan...","[[Marie Magdefrau Ferraro, Death_Place, Bethan...","[[Marie Magdefrau Ferraro, Live_In, Bethany], ..."
4,TEXT: Ten oil workers were missing off Morgan ...,"[[Morgan City, is_part_of, Louisiana]]","[[Morgan City, Located_In, La.]]"


In [107]:
# Shape of the evaluation frame (rows with exactly aligned text/generated/true).
df.shape

(197, 3)

In [105]:
# Spot check: print the length of every generated triplet for one sentence.
target_text = "TEXT: Widespread street flooding was reported throughout Brazoria , Fort Bend , Galveston and southern Harris Counties in southeast Texas."

for ix, row in df.iterrows():
    if row["text"] != target_text:
        continue
    for triplet in row["generated"]:
        print(len(triplet))

In [111]:
def _is_partial_match(t_triplet, g_triplet):
    """Return True when a generated triplet loosely matches a gold triplet.

    Head and tail match when either string contains the other; the relation
    type must be equal as a string.
    """
    t_head, t_type, t_tail = str(t_triplet[0]), str(t_triplet[1]), str(t_triplet[2])
    g_head, g_type, g_tail = str(g_triplet[0]), str(g_triplet[1]), str(g_triplet[2])
    heads_overlap = (t_head in g_head) or (g_head in t_head)
    tails_overlap = (t_tail in g_tail) or (g_tail in t_tail)
    # NOTE(review): this comparison is case-sensitive, while the relation-type
    # validity check lower-cases labels first, so a gold 'Located_In' never
    # matches a generated 'located_in'. Confirm whether that is intended.
    return heads_overlap and tails_overlap and t_type == g_type


# Collect true positives: gold triplets that appear in the generated list,
# either exactly or as a partial match. Each gold triplet contributes at most
# one entry, so the tally stays consistent with a per-gold-triplet count of
# false negatives.
tp = ()

# NOTE(review): these lists are not filled by this cell and `input` shadows the
# builtin; they are kept only in case cells outside this view reference them.
input = []
relations = []
prefix = []
gold_relations = []

count = 0  # number of gold triplets matched (exactly or partially)
for _, sample in df.iterrows():
    curr_tp = ()  # true positives of the current sentence only
    generated_triplets = sample["generated"]
    for t_triplet in sample["true"]:
        if t_triplet in generated_triplets:
            tp += (t_triplet,)
            curr_tp += (t_triplet,)
            count += 1
            continue
        for g_triplet in generated_triplets:
            if _is_partial_match(t_triplet, g_triplet):
                tp += (g_triplet,)
                curr_tp += (g_triplet,)
                count += 1
                print (sample["text"] + "\n")
                print ("TRUE: " + str(t_triplet))
                print ("PARTIAL TP GENERATED: " + str(g_triplet))
                print ("-----------------------------------------------")
                # Stop at the first partial match so one gold triplet is not
                # counted several times.
                break

print ("TOTAL TRUE RELATIONS: ", count)

TOTAL TRUE RELATIONS:  0


In [122]:
# Collect false positives: generated triplets that match no gold triplet of
# the same sentence, neither exactly nor partially (head/tail containment in
# either direction plus an identical relation-type string).
fp = ()

fp_list = []
text = []

for _, sample in df.iterrows():
    true_triplets = sample["true"]
    for g_triplet in sample["generated"]:
        if g_triplet in true_triplets:
            continue
        partially_matched = any(
            ((str(t[0]) in str(g_triplet[0])) or (str(g_triplet[0]) in str(t[0])))
            and ((str(t[2]) in str(g_triplet[2])) or (str(g_triplet[2]) in str(t[2])))
            and (str(t[1]) == str(g_triplet[1]))
            for t in true_triplets
        )
        if partially_matched:
            continue
        text.append(sample["text"])
        fp_list.append(g_triplet)
        fp += (g_triplet,)
        print(sample["text"] + "\n")
        print("TRUE (SET): ", true_triplets)
        print("FALSE POSITIVE GENERATED: " + str(g_triplet))
        print("-----------------------------------------------")

TEXT: John Wilkes Booth , who assassinated President Lincoln , was an actor .

TRUE (SET):  [['John Wilkes Booth', 'Kill', 'President Lincoln']]
FALSE POSITIVE GENERATED: ['John Wilkes Booth', 'Occupation', 'Actor']
-----------------------------------------------
TEXT: In the field of mechanics , Wang Ziqiang at the Institute of Mechanics has made considerable headway in the area of elastoplastic crack mechanics .

TRUE (SET):  [['Wang Ziqiang', 'Work_For', 'Institute of Mechanics']]
FALSE POSITIVE GENERATED: ['Wang Ziqiang', 'Works_at', 'Institute of Mechanics']
-----------------------------------------------
TEXT: Sun Hung Kai Properties , a Hong Kong construction firm with a 27 percent share ;

TRUE (SET):  [['Sun Hung Kai Properties', 'OrgBased_In', 'Hong Kong']]
FALSE POSITIVE GENERATED: ['Sun Hung Kai Properties', 'Type', 'Construction company']
-----------------------------------------------
TEXT: Marie Magdefrau Ferraro , 50 , of Bethany , Conn. , was shot to death Thursday whe

In [123]:
# Persist the false positives (sentence text + offending generated triplet).
fp_columns = {"text": text, "gen_fp": fp_list}
temp_df = pd.DataFrame(fp_columns)
temp_df.to_csv("fp_conll_flan_eval.csv", index=False)

In [119]:
# Collect false negatives: gold triplets that no generated triplet of the same
# sentence matches, neither exactly nor partially (head/tail containment in
# either direction plus an identical relation-type string).
fn = ()
fn_list = []
text = []
gen_list = []

for _, sample in df.iterrows():
    generated_triplets = sample["generated"]
    for t_triplet in sample["true"]:
        if t_triplet in generated_triplets:
            continue
        partially_matched = any(
            ((str(t_triplet[0]) in str(g[0])) or (str(g[0]) in str(t_triplet[0])))
            and ((str(t_triplet[2]) in str(g[2])) or (str(g[2]) in str(t_triplet[2])))
            and (str(t_triplet[1]) == str(g[1]))
            for g in generated_triplets
        )
        if partially_matched:
            continue
        text.append(sample["text"])
        fn_list.append(t_triplet)
        gen_list.append(str(generated_triplets))
        fn += (t_triplet,)
        print(sample["text"] + "\n")
        print("TRUE (SET): ", sample["true"])
        print("GENERATED: " + str(generated_triplets))
        print("FALSE NEGATIVE: " + str(t_triplet))
        print("-----------------------------------------------")

TEXT: John Wilkes Booth , who assassinated President Lincoln , was an actor .

TRUE (SET):  [['John Wilkes Booth', 'Kill', 'President Lincoln']]
GENERATED: [['John Wilkes Booth', 'Occupation', 'Actor']]
FALSE NEGATIVE: ['John Wilkes Booth', 'Kill', 'President Lincoln']
-----------------------------------------------
TEXT: In the field of mechanics , Wang Ziqiang at the Institute of Mechanics has made considerable headway in the area of elastoplastic crack mechanics .

TRUE (SET):  [['Wang Ziqiang', 'Work_For', 'Institute of Mechanics']]
GENERATED: [['Wang Ziqiang', 'Works_at', 'Institute of Mechanics']]
FALSE NEGATIVE: ['Wang Ziqiang', 'Work_For', 'Institute of Mechanics']
-----------------------------------------------
TEXT: Sun Hung Kai Properties , a Hong Kong construction firm with a 27 percent share ;

TRUE (SET):  [['Sun Hung Kai Properties', 'OrgBased_In', 'Hong Kong']]
GENERATED: [['Sun Hung Kai Properties', 'Type', 'Construction company']]
FALSE NEGATIVE: ['Sun Hung Kai Proper

TEXT: LD2804174994 Moscow INTERFAX in English 1601 GMT 28 Apr 94

TRUE (SET):  [['INTERFAX', 'OrgBased_In', 'Moscow']]
GENERATED: [['LD2804174994', 'language', 'English'], ['LD2804174994', 'date', '1994-04-28']]
FALSE NEGATIVE: ['INTERFAX', 'OrgBased_In', 'Moscow']
-----------------------------------------------
TEXT: Navy sources said the Coral Sea , on duty with the 6th Fleet in the Mediterranean , left ` ` a few hours ' ' ahead of its scheduled departure.

TRUE (SET):  [['6th Fleet', 'OrgBased_In', 'Mediterranean']]
GENERATED: [['USS Coral Sea', 'on_duty_with', '6th Fleet']]
FALSE NEGATIVE: ['6th Fleet', 'OrgBased_In', 'Mediterranean']
-----------------------------------------------


In [120]:
# Persist the false negatives (sentence text, missed gold triplet, and the
# stringified list of what was generated instead).
fn_columns = {"text": text, "true_fn": fn_list, "gen": gen_list}
temp = pd.DataFrame(fn_columns)
temp.to_csv("fn_conll_flan_eval.csv", index=False)