In [23]:
import xml
import xml.etree.ElementTree as ET
import os
from os import path
import glob
from collections import defaultdict
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import Image
import shutil

import random
# Seed Python's built-in RNG with a fixed value so random draws are repeatable across runs.
random.seed(10)

In [24]:
# Root directory of the dataset. The original hard-coded location is kept as
# the default; set the KBP_2015_DIR environment variable to use another copy
# of the data without editing the notebook.
base_dir = os.environ.get(
    "KBP_2015_DIR",
    "/home/shtoshni/Research/events/data/kbp_2014-2015/data/2015",
)

# For each existing split, the subdirectory name used for each file category.
split_to_subdir_dict = {
    "training": {"source": "source", "mention": "event_nugget", "coref": "event_hopper"},
    "eval": {"source": "source", "mention": "nugget", "coref": "hopper"}
}

# Suffix appended to a doc ID to form the file name for each category.
file_suffix_dict = {"source":".txt", "mention": ".event_nuggets.xml", "coref": ".event_hoppers.xml"}

# (split name, number of documents) pairs, in the order they are sliced off
# the shuffled list of training doc IDs.
proposed_splits = [("mod_training", 128), ("mod_dev", 30)]

### Get all doc IDs and assign them to the proposed splits

In [25]:
training_dir = path.join(base_dir, "training")

# A doc ID is the source file's base name with its extension removed.
# The glob result is sorted so the shuffle below starts from a fixed order.
source_files = sorted(glob.glob(path.join(training_dir, "source", "*.txt")))
doc_ids = [path.splitext(path.basename(source_file))[0] for source_file in source_files]

# Slicing past the end of a list silently returns fewer items, so check up
# front that there are enough documents to fill every requested split.
total_requested = sum(split_size for _split_name, split_size in proposed_splits)
if total_requested > len(doc_ids):
    raise ValueError(
        "proposed_splits asks for {} documents but only {} were found in {}".format(
            total_requested, len(doc_ids), path.join(training_dir, "source")
        )
    )

# Shuffle (re-seeded here so re-running this cell alone gives the same order)
random.seed(10)
random.shuffle(doc_ids)


# Carve consecutive, non-overlapping slices of the shuffled IDs, one per split.
proposed_split_to_doc_ids = {}
offset = 0
for proposed_split, split_size in proposed_splits:
    proposed_split_to_doc_ids[proposed_split] = doc_ids[offset: offset + split_size]
    offset += split_size

### Create directory structure for proposed splits

In [26]:
# Files are read from the training layout and written using the eval layout's
# subdirectory names. Both lookups are the same for every split, so they are
# done once, outside the loop.
training_subdir_dict = split_to_subdir_dict["training"]
subdir_dict = split_to_subdir_dict["eval"]

# The loop variable is named `split_doc_ids` rather than `doc_ids` so that it
# does not overwrite the notebook-level `doc_ids` list of all training IDs.
for proposed_split, split_doc_ids in proposed_split_to_doc_ids.items():
    split_dir = path.join(base_dir, proposed_split)
    # exist_ok=True makes the cell safe to re-run without a separate existence check.
    os.makedirs(split_dir, exist_ok=True)

    # Create the same subdirectory structure as eval
    for category, subdir in subdir_dict.items():
        split_subdir = path.join(split_dir, subdir)
        os.makedirs(split_subdir, exist_ok=True)

        # Corresponding training subdirectory
        training_subdir = path.join(training_dir, training_subdir_dict[category])

        # File suffix for this subdirectory
        file_suffix = file_suffix_dict[category]

        # Copy each document's file for this category; the file name is unchanged.
        for doc_id in split_doc_ids:
            src_file_path = path.join(training_subdir, doc_id + file_suffix)
            dest_file_path = path.join(split_subdir, doc_id + file_suffix)

            shutil.copy(src_file_path, dest_file_path)