# Prepare Data for Semantic Role Classification

In [1]:
import json
import os

import dill
import numpy as np
from sklearn.model_selection import KFold, train_test_split

In [2]:
# Input: annotation JSON, resolved relative to the current working directory.
annotate_example_fn = 'annotate_example.json'
# Directory that receives every serialized split written below.
OUTPUT_PATH = './raw/'

In [3]:
# Create the output directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [4]:
# Load the annotation file. JSON is UTF-8 by specification and this file holds
# Thai text, so the encoding is pinned instead of being left to the platform
# default (which is not UTF-8 on every system).
with open(annotate_example_fn, 'r', encoding='utf-8') as f:
    data = json.load(f)

Convert each example sentence into a list of (token, semantic-role tag) pairs.

In [5]:
# Colour codes that map to the same tag for every verb entry:
# '#A4A4A4' marks the verb itself; an empty code or '#FFFFFF' maps to 'Z-O'.
base_color_roles = {'#A4A4A4': 'Verb', '': 'Z-O', '#FFFFFF': 'Z-O'}

sentences = []   # one list of (token, role tag) pairs per example sentence
verb_tag = []    # verb_tag[i] is the key in `data` that sentences[i] came from

for k, v in data.items():
    # Per-verb legend: each 'summary_concept' row is unpacked as
    # (role label, colour code, <ignored>); only the first whitespace-separated
    # word of the label is kept as the role tag.
    color_dict = dict(base_color_roles)
    for r, c, _ in v['summary_concept']:
        color_dict[c] = r.split()[0]

    # Convert every example sentence to a list of (token, tag) pairs.
    for s in v['example_sentence']:
        # Fail with full context when a colour code is missing from the legend,
        # instead of printing and then hitting a bare KeyError in the lookup below.
        unknown = [t for _, t in s['sent'] if t not in color_dict]
        if unknown:
            raise KeyError(
                f'verb {k!r}: colour code(s) {unknown!r} missing from the legend; sentence: {s!r}'
            )

        # A chunk may hold several tokens separated by a non-breaking space;
        # every token inherits the tag of its chunk.
        word_list = [
            (token, color_dict[tag])
            for w, tag in s['sent']
            for token in w.split('\xa0')
        ]
        sentences.append(word_list)
        verb_tag.append(k)

In [6]:
# Sanity check: inspect the (token, tag) pairs of the first converted sentence.
sentences[0]

[('ป้องกัน', 'Z-O'),
 ('หนู', 'Agent'),
 ('นา', 'Agent'),
 ('กัด', 'Verb'),
 ('กิน', 'Verb'),
 ('ต้น', 'Object'),
 ('ข้าว', 'Object')]

In [7]:
# Verb key that the first sentence was collected under.
verb_tag[0]

'กัดกิน'

In [8]:
# Flatten the tag column of every (token, tag) pair, then keep the distinct
# tags in sorted order.
ner_list = [role for pairs in sentences for _token, role in pairs]
all_ner = sorted(set(ner_list))

In [9]:
# Full tag inventory found in the data.
all_ner

['Accompanyment',
 'Agent',
 'Benefactor',
 'Experiencer',
 'Instrument',
 'Location',
 'Manner',
 'Measure',
 'Object',
 'Time',
 'Verb',
 'Z-O']

## แบ่งแบบสุ่ม


In [10]:
# Random sentence-level hold-out: 20% of the sentences go to validation.
# The fixed random_state keeps the shuffle repeatable across runs.
train_sents, val_sents= train_test_split(sentences, test_size=0.2, random_state=112)

In [11]:
# Sizes of the random split (train, then validation).
print(len(train_sents))
print(len(val_sents))

1790
448


In [12]:
# Persist the training sentences of the random split. os.path.join keeps the
# path correct whether or not OUTPUT_PATH ends with a path separator.
with open(os.path.join(OUTPUT_PATH, 'train.data'), 'wb') as f:
    dill.dump(train_sents, f)

In [13]:
# Persist the validation sentences of the random split. os.path.join keeps the
# path correct whether or not OUTPUT_PATH ends with a path separator.
with open(os.path.join(OUTPUT_PATH, 'val.data'), 'wb') as f:
    dill.dump(val_sents, f)

## แบ่งแบบ OOV (Out-Of-Vocabulary)

In [14]:
# Split the distinct verb keys (not the sentences) 80/20; the next cell routes
# each sentence to the side its verb landed on.
train_tag, val_tag = train_test_split(np.unique(verb_tag), test_size=0.2, random_state=0)

In [15]:
# Membership sets: `verb in <ndarray>` re-scans the whole array for every
# sentence, whereas a set lookup is a single hash probe.
train_verbs = set(train_tag)
val_verbs = set(val_tag)

# Route every sentence to the side its verb was assigned to, preserving the
# original sentence order within each side.
train_sents = []
val_sents = []
for t, s in zip(verb_tag, sentences):
    if t in train_verbs:
        train_sents.append(s)
    elif t in val_verbs:
        val_sents.append(s)

In [16]:
# Sizes of the verb-level (OOV) split (train, then validation).
print(len(train_sents))
print(len(val_sents))

1807
431


In [17]:
# Persist the training sentences of the verb-level (OOV) split. os.path.join
# keeps the path correct whether or not OUTPUT_PATH ends with a path separator.
with open(os.path.join(OUTPUT_PATH, 'train_oov.data'), 'wb') as f:
    dill.dump(train_sents, f)

In [18]:
# Persist the validation sentences of the verb-level (OOV) split. os.path.join
# keeps the path correct whether or not OUTPUT_PATH ends with a path separator.
with open(os.path.join(OUTPUT_PATH, 'val_oov.data'), 'wb') as f:
    dill.dump(val_sents, f)

## แบ่งแบบ KFold (K=5)

In [19]:
# 5-fold splitter over sentence indices; shuffling with a fixed random_state
# keeps the fold assignment repeatable. KFold is imported in the notebook's
# top import cell.
kf = KFold(n_splits=5, shuffle=True, random_state=112)
print(kf)

KFold(n_splits=5, random_state=112, shuffle=True)


In [20]:
# Write one train/validation pair of files per fold: train_cv{k}.data / val_cv{k}.data.
for k, (train_index, test_index) in enumerate(kf.split(sentences)):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))

    # Index straight into `sentences` instead of testing `i in <ndarray>` for
    # every sentence, which re-scans the index arrays each time (quadratic).
    # NOTE(review): relies on KFold yielding each index array in ascending
    # order so the sentence order matches the previous implementation.
    train_sents = [sentences[i] for i in train_index]
    val_sents = [sentences[i] for i in test_index]

    # os.path.join keeps the paths correct whether or not OUTPUT_PATH ends
    # with a path separator.
    with open(os.path.join(OUTPUT_PATH, f'train_cv{k}.data'), 'wb') as f:
        dill.dump(train_sents, f)

    with open(os.path.join(OUTPUT_PATH, f'val_cv{k}.data'), 'wb') as f:
        dill.dump(val_sents, f)

TRAIN: 1790 TEST: 448
TRAIN: 1790 TEST: 448
TRAIN: 1790 TEST: 448
TRAIN: 1791 TEST: 447
TRAIN: 1791 TEST: 447
