In [None]:
 # this mounts your Google Drive to the Colab VM.
from google.colab import drive
import sys

# Attach Google Drive to the Colab VM; force_remount refreshes a stale mount.
drive.mount('/content/drive', force_remount=True)

# Drive-relative folder that holds the unzipped assignment.
FOLDERNAME = 'CS224u/cs224u/'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Put the assignment folder on the import path so that Python files stored
# there can be imported from this notebook.
sys.path.append(f'/content/drive/My Drive/{FOLDERNAME}')

Mounted at /content/drive


## Overview



In [None]:
# Notebook metadata. NOTE(review): these use single underscores rather than
# the conventional dunder names (`__author__`, `__version__`), so tools that
# look for the dunder names will not find them.
_author_ = "Krutarth Rao, ANA, VM"
_version_ = "CS224u, Stanford, Spring 2021"

# !pip install fuzzywuzzy[speedup]



import numpy as np
import os
from sklearn.linear_model import LogisticRegression
import utils
import nltk
import random
from fuzzywuzzy import process
from multiprocessing import Pool
import json

# OS-entropy-backed RNG; it cannot be seeded, so the spot-check samples
# printed further down differ from run to run.
rand = random.SystemRandom()

import rel_ext

# Local (non-Colab) layout would be:
#DATA_DIR = os.path.join('data', 'rel_ext_data')

# On Colab the data lives under the Drive folder that was appended to
# sys.path last, hence sys.path[-1].
DATA_DIR = os.path.join(sys.path[-1], 'data/rel_ext_data')


KB = rel_ext.KB(DATA_DIR)

# Every entity ID that appears in the KB, as subject OR object.
# `set.union` returns a new set and leaves the receiver untouched, so the
# object IDs must be merged in place with `update`.
entity_ids = {x.sbj for x in KB.kb_triples}
entity_ids.update(x.obj for x in KB.kb_triples)

relation_ids = {x.rel for x in KB.kb_triples}

# Displayed only when this is the last expression of a notebook cell.
len(relation_ids), len(entity_ids)

# Load every DocRED split and pool their documents into one list.
files = ["train_distant.json", "train_annotated.json", "dev.json"]

raw_relations = []
for split_name in files:
    split_path = os.path.join(DATA_DIR, f'doc-red/{split_name}')
    with open(split_path, "r") as split_file:
        raw_relations.extend(json.load(split_file))
len(raw_relations)

print ("len  of raw relations: ", len (raw_relations))


# Lookup table used later to translate a label's "r" value into a name.
rel_info_path = os.path.join(DATA_DIR, 'doc-red/rel_info.json')
with open(rel_info_path, "r") as rel_info_file:
    relation_names = json.load(rel_info_file)

# Spot-check: draw three random (key, name) pairs.
rand.sample(list(relation_names.items()), 3)



# Flatten the pooled documents into unique (head name, relation name,
# tail name) triples. Each label refers to its head and tail by index into
# the document's "vertexSet"; every entry there is a list of mentions, and
# one triple is emitted per (head mention, tail mention) pair.
relations = set()
for nested_rel in raw_relations:
    vertex_set = nested_rel["vertexSet"]
    for links in nested_rel["labels"]:
        rel_name = relation_names[links["r"]]

        heads = vertex_set[links["h"]]
        tails = vertex_set[links["t"]]

        for h in heads:
            for t in tails:
                relations.add(
                    (h["name"], rel_name, t["name"])
                )

# Materialise as a list before sampling: random.sample() no longer accepts
# a set (deprecated in Python 3.9, TypeError from 3.11).
relations = list(relations)

# Spot-check; the sample size is capped so a tiny dataset does not raise.
rand.sample(relations, min(3, len(relations)))

# Cap the workload. Set iteration order is arbitrary, so which 200000
# triples are kept is not reproducible across runs.
relations = relations[:200000]

print(len(relations), "pre-id relations")

def get_id(name: str, is_relation: bool = False, min_score: int = 85) -> str:
    """Return the KB identifier that best fuzzy-matches *name*.

    Args:
        name: Free-text entity or relation name to resolve.
        is_relation: When true, search ``relation_ids``; otherwise search
            ``entity_ids``.
        min_score: Lowest fuzzywuzzy score that is accepted as a match.
            Defaults to 85.

    Returns:
        The best-matching identifier from the chosen search space.

    Raises:
        RuntimeError: If the search space yields no candidate at all, or
            if the best candidate scores below ``min_score``.
    """
    search_space = relation_ids if is_relation else entity_ids

    # extractOne returns None (rather than a tuple) when there is nothing
    # to choose from; surface that as the same error type as a weak match
    # instead of an opaque unpacking TypeError.
    best = process.extractOne(name, search_space)
    if best is None:
        raise RuntimeError(f"no match for {name}. Search space is empty")

    match, score = best
    if score < min_score:
        raise RuntimeError(f"no match for {name}. Best match is {match}")
    return match

def get_relation_dict(raw_relation):
    """Map a (subject, relation, object) name triple onto KB identifiers.

    Returns a dict with keys ``rel``, ``sbj`` and ``obj`` holding the
    resolved identifiers, or ``None`` when resolving any of the three
    raises. The lookup is best-effort on purpose: one unresolvable triple
    must not abort a bulk run.
    """
    sbj, rel, obj = raw_relation
    try:
        # Same resolution order as the output keys: relation, subject, object.
        rel_id = get_id(rel, is_relation=True)
        sbj_id = get_id(sbj)
        obj_id = get_id(obj)
    except Exception:
        return None
    return {"rel": rel_id, "sbj": sbj_id, "obj": obj_id}

# Resolve every triple to KB identifiers in parallel worker processes.
n_cpus = 8

# One chunk per worker, but never less than 1: with fewer relations than
# workers the integer division gives 0, and a chunksize of 0 makes Pool.map
# dispatch no tasks, so every relation would be silently dropped.
chunk_size = max(1, len(relations) // n_cpus)

processed = []
with Pool(n_cpus) as p:
    print("doc-red triggered")
    results = p.map(get_relation_dict, relations, chunksize=chunk_size)
    # get_relation_dict returns None for triples it could not resolve.
    processed.extend(x for x in results if x is not None)

print(len(processed), "post-id relations")

# Report the outcome: either a notice that nothing could be mapped, or a
# small random sample of the mapped relations.
if not processed:
    print(
        "No relations that can be mapped to the original "
        "relation/entitiy IDs found in dataset",
        processed
    )
else:
    # Cap the sample size so that one or two surviving relations do not
    # raise ValueError ("sample larger than population").
    print(rand.sample(processed, min(3, len(processed))))

# Persist the mapped relations as one JSON array in the working directory.
with open("./docred_extracted.json", "w+") as out_file:
    json.dump(processed, out_file)