In [6]:
import pandas as pd
import numpy as np
import re
import random
import itertools
import os

from tensorflow.keras.preprocessing.sequence import pad_sequences
from pythainlp import word_tokenize, subword_tokenize, syllable_tokenize
from sklearn.model_selection import train_test_split

# [Test] Read .ann file and convert to .csv file

In [2]:
df = pd.read_csv('data/csd_rel_data2_annotated/01_nut.a/xaaa.ann', sep='^([^\s]*)\s', engine='python', header=None).drop(0, axis=1)
df.tail()

Unnamed: 0,1,2
28,R10,crime_located_at Arg1:T3 Arg2:T12
29,R11,crime_located_at Arg1:T5 Arg2:T11
30,R12,crime_located_at Arg1:T5 Arg2:T12
31,T20,ORGANIZATION 38 53\tสน.พลับพลาไชย 2
32,R13,employee_or_member_of Arg1:T2 Arg2:T20


In [37]:
with open('data/csd_rel_data_annotated/01_nut.a/xaaa.txt' ,'r', encoding = 'utf-8') as file:
    text = file.read()
print(type(text))

<class 'str'>


In [45]:
add_no_relation(df)

{1: 'R7', 2: 'no_relation Arg1:T6 Arg2:T29'}
{1: 'R8', 2: 'no_relation Arg1:T27 Arg2:T31'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relation_df[3] = relation_df[2].str.split(' ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,1,2
0,T1,PERSON 19 43\tพล.ต.ต.เอกราช ลิ้มสังกาศ
1,T2,DATE 12 18\t9 ม.ค.
2,T3,PERSON 63 82\tพ.ต.อ.วชิรา ยาวไธสง
3,T4,PERSON 96 122\tพ.ต.ต.กฤตย์ ธีรเวศย์สุวรรณ
4,T5,PERSON 152 166\tนายณรงค์ นินสง
5,T6,DATE 228 239\t23 ส.ค.2562
6,T7,TIME 172 177\t32 ปี
7,T8,DATE 391 402\t23 ก.ค.2562
8,T9,LOCATION 470 486\tตลาดระแหง 100 ปี
9,T10,LOCATION 487 494\tต.ระแหง


# Convert .ann file to be in form of ready-to-use .csv file

In [7]:
def read_raw_text(filename):
    
    with open(filename, 'r', encoding = 'utf-8') as file:
        
        document = file.read()
        
    return document

def add_no_relation(df):
    
    token_df = df[df[1].str.contains('T')]
    relation_df = df[df[1].str.contains('R')]
    relation_df[3] = relation_df[2].str.split(' ')
    
    relation_dict = relation_df[3].apply(lambda x : dict([((x[1][5:], x[2][5:]), x[0])]))
    relation_df.drop(3, axis = 1, inplace = True)
    token_comb = list(itertools.combinations(token_df[1].tolist(), 2))
    random.shuffle(token_comb)
    
    no_rel_sample = 4
    c = 0
    recent_n_relation = relation_df.shape[0] + 1 
    
    for i in range(len(token_comb)):
        if i not in relation_dict.keys():
            
            tok_1 = token_comb[i][0]
            tok_2 = token_comb[i][1]
            relation_df = relation_df.append({1 : f'R{recent_n_relation}', 
                                              2 : f'no_relation Arg1:{tok_1} Arg2:{tok_2}'}, 
                                              ignore_index = True)
            c += 1
            recent_n_relation += 1
            
        if c== 2:
            break
    
    df = pd.concat([token_df, relation_df])
    
    return df
            
            
    
    
    
def read_ann_file(PATH, filename): #filename e.g. 01_nut.a/xxaa.ann
    
    
    
    document = read_raw_text(PATH + filename[:-4] + '.txt')
    df = pd.read_csv(PATH + filename, sep='^([^\s]*)\s', engine='python', header=None).drop(0, axis=1)
    
    df = add_no_relation(df)
    
    token_df = df[df[1].str.contains('T')]
    relation_df = df[df[1].str.contains('R')]
    
    list_tokens = []
    list_relations = []
    
    for index, row in token_df.iterrows():
        
        text = re.findall('\t.*', row[2])[0][1:]
        entityLabel, start, end = re.findall('.*\t', row[2])[0][:-1].split(' ')
        dict_token = {'text' : text,
                      'start' : int(start),
                      'end' : int(end),
                      'entityLabel' : entityLabel}
        
        list_tokens.append(dict_token)
    
    for index, row in relation_df.iterrows():
        
        relationLabel, token_id_1, token_id_2 = row[2].split(' ')
        token_id_1, token_id_2 = token_id_1[5:], token_id_2[5:]
        
        _, start_1, __ = re.findall('.*\t', token_df[token_df[1] == token_id_1].iloc[0][2])[0][:-1].split()
        _, start_2, __ = re.findall('.*\t', token_df[token_df[1] == token_id_2].iloc[0][2])[0][:-1].split()
        
        dict_relation = {'child' : int(start_2),
                         'head' : int(start_1),
                         'relationLabel' : relationLabel}
        
        list_relations.append(dict_relation)
        
    dict_ann = {'document' : document,
                'tokens' : list_tokens,
                'relations' : list_relations}
    
    
    return dict_ann
        
        
def read_all_file(PATH):
    
    
    assignee_folder_list = os.listdir(PATH)[3:3+15]

    result = []
    for assignee_folder in assignee_folder_list:
        text_folder_list = sorted(os.listdir(PATH + assignee_folder))
        text_folder_list = [i for i in text_folder_list if i[-3:] in ['ann', 'txt']]
        text_folder_list = set(map(lambda x : x[:-4], text_folder_list))
        
        
        for text_folder in text_folder_list:
            
            filename = assignee_folder + '/' + text_folder + '.ann'
            
            try:
                dict_ann = read_ann_file(PATH, filename)
                result.append(dict_ann)
                
            except:
                print(filename)
                
            
            
    return result


# Add more columns 

ถ้าอยากเพิ่มคอลัมน์ใหม่ ให้แก้ส่วนนี้

In [8]:
# input น่าจะเป็น list of dicts

def get_E1(text, start_E1, end_E1):
    
    return text[start_E1: end_E1]

def get_E2(text, start_E2, end_E2):
    
    return text[start_E2: end_E2]

def get_before_E1(text, start_E1):
    
    return text[:start_E1]

def get_before_E2(text, start_E2):
    
    return text[:start_E2]

def get_after_E1(text, end_E1):
    
    return text[end_E1:]

def get_after_E2(text, end_E2):
    
    return text[end_E2:]

def get_between_E1_E2(text, start_E1, start_E2, end_E1, end_E2):
    
    left, right = start_E1, end_E2
    if start_E1 > end_E2:
        left, right = start_E2, end_E1
        
    return text[left:right]

def get_POS_E1(text):
    
    pass

def get_POS_E1(text):
    
    pass

def get_POS_E2(text):
    
    pass

def get_POS_E2(text):
    
    pass

def prep_data(text, start_E1, start_E2, end_E1, end_E2, E1_entity, E2_entity):
    
    E1 = get_E1(text, start_E1, end_E1)
    E2 = get_E2(text, start_E2, end_E2)
    before_E1 = get_before_E1(text, start_E1)
    before_E2 = get_before_E2(text, start_E2)
    after_E1 = get_after_E1(text, end_E1)
    after_E2 = get_after_E2(text, end_E2)
    between_E1_E2 = get_between_E1_E2(text, start_E1, start_E2, end_E1, end_E2)
    POS_E1 = get_POS_E1(text)
    POS_E2 = get_POS_E2(text)
    
    dict_prep_data = {'text' : text,
                      'E1' : E1, 'E2' : E2,
                      'E1_entity' : E1_entity, 'E2_entity': E2_entity,
                      'before_E1' : before_E1, 'before_E2' : before_E2,
                      'after_E1' : after_E1, 'after_E2' : after_E2,
                      'between_E1_E2' : between_E1_E2}
#                       'POS_E1' : POS_E1, 'POS_E2' : POS_E2,
#                       'NER_E1' : NER_E1, 'NER_E2' : NER_E2}
    
    return dict_prep_data

def p(inp):
    
    d = []
    
    for doc in inp:
        
        text = doc['document']
        tokens = doc['tokens']
        relations = doc['relations']
        
        map_start2end = dict([(i['start'], i['end']) for i in tokens])
        map_start2entity = dict([(i['start'], i['entityLabel']) for i in tokens])
        for rel in relations:
            
            start_E1, start_E2 = rel['head'], rel['child']
            end_E1, end_E2 = map_start2end[start_E1], map_start2end[start_E2]
            E1_entity, E2_entity = map_start2entity[start_E1], map_start2entity[start_E2]
            
            dict_prep_data = prep_data(text, start_E1, start_E2, end_E1, end_E2, E1_entity, E2_entity)
            dict_prep_data['label'] = rel['relationLabel']
            d.append(dict_prep_data)
            
    df = pd.DataFrame(d)
    
    return df
            

# Preprocessing data

In [9]:
def tokenize(df, method = 'word',columns = None):
    
    tokenizers = {'word' : word_tokenize,
                 'subword' : subword_tokenize,
                 'syllable' : syllable_tokenize}
    
    tokenizer = tokenizers[method]
    
    dict_columns = dict([(name, []) for name in columns])
    
    for index, row in df.iterrows():
        
        for name in columns:
            
            tokenized_text = tokenizer(row[name])
            dict_columns[name].append(tokenized_text)
    
    max_len = 0
    
    for name in columns:
        
        df[name] = dict_columns[name]
        max_len_column = max([len(i) for i in dict_columns[name]])
        max_len = max(max_len, max_len_column)
        
    return df, max_len

def build_map_token_to_index(df, columns = None):
    
    token_list = []
    
    for name in columns:
        
        tok = [j for i in df[name] for j in i]
        print(tok)
        token_list += tok
        
    token_set = sorted(set(token_list))
    map_tok2ind = dict([(v, k) for k, v in enumerate(token_set)])
    map_tok2ind['<UNK>'] = len(map_tok2ind)
    map_tok2ind['<PAD>'] = len(map_tok2ind)
    
    return map_tok2ind

def convert_to_index(df, map_tok2ind, columns = None ):
    
    for name in columns:
        
        df[name] = df[name].apply(lambda x: np.array([map_tok2ind[i] for i in x]))
    

    return df

def pad_sequences_(df, max_len, map_tok2ind, columns = None):
    
    for name in columns:
        
        
        padded_seq = pad_sequences(df[name], 
                                 maxlen = max_len, 
                                 dtype ='int32', 
                                 padding ='post', 
                                 value = map_tok2ind['<PAD>'])
        _ = []
        
        for seq in padded_seq:
            _.append(list(seq))
        
        df[name] = _
        

    return df

def preprocessing(df, method = 'word', columns = None):
    
    tokenized_df, max_len = tokenize(df = df, 
                                     columns = columns, 
                                     method = method)
    
    map_tok2ind = build_map_token_to_index(df = tokenized_df, 
                                           columns = columns)
    
    indexed_df = convert_to_index(df = tokenized_df, 
                                  columns = columns, 
                                  map_tok2ind = map_tok2ind)
    
    padded_df = pad_sequences_(df = indexed_df,
                               columns = columns,
                               map_tok2ind = map_tok2ind,
                               max_len = max_len)
    return padded_df, map_tok2ind, max_len


    
#     return padded_df
    
            
        
    
        
        
        
        
    
    


In [10]:
def convert_tok_to_index_for_test_set(df, map_tok2ind, columns):
    
    dict_columns = dict([(v, []) for k,v in enumerate(columns)])
    
    for index, row in df.iterrows():
        for name in columns:
            res = []
            for i in row[name]:
                if i in map_tok2ind:
                    res.append(map_tok2ind[i])
                else:
                    res.append(map_tok2ind['<UNK>'])
            dict_columns[name].append(res)
    
    for name in columns:
        df[name] = dict_columns[name]
        
    return df
            
    
        
def return_train_test(df):
    
    columns = ['E1', 'E2', 'before_E1', 'before_E2', 'after_E1', 'after_E2', 'between_E1_E2']
    train_df, test_df = train_test_split(df, test_size = 0.2, random_state = 42)
    train_df, map_tok2ind, max_len = preprocessing(train_df, method = 'word', columns = columns)
    
    test_df, _ = tokenize(test_df, method = 'word', columns = columns)
    test_df = convert_tok_to_index_for_test_set(test_df, map_tok2ind, columns)   
    test_df = pad_sequences_(test_df, max_len , map_tok2ind, columns = columns)
    
    map_tag2idx = sorted(set(list(train_df['E1_entity'].unique()) +\
                             list(train_df['E2_entity'].unique()) +\
                             list(test_df['E1_entity'].unique()) +\
                             list(test_df['E2_entity'].unique())))
    
    map_tag2idx = dict([(v, k) for k,v in enumerate(map_tag2idx)])
    
    train_df['E1_entity'] = train_df['E1_entity'].apply(lambda x: map_tag2idx[x])
    train_df['E2_entity'] = train_df['E2_entity'].apply(lambda x: map_tag2idx[x])
    test_df['E1_entity'] = test_df['E1_entity'].apply(lambda x: map_tag2idx[x])
    test_df['E2_entity'] = test_df['E2_entity'].apply(lambda x: map_tag2idx[x])
    
    map_rel2idx = sorted(set(list(train_df['label'].unique()) +\
                             list(train_df['label'].unique()) +\
                             list(test_df['label'].unique()) +\
                             list(test_df['label'].unique())))
    
    map_rel2idx = dict([(v, k) for k,v in enumerate(map_rel2idx)])
    
    train_df['label'] = train_df['label'].apply(lambda x: map_rel2idx[x])
    test_df['label'] = test_df['label'].apply(lambda x: map_rel2idx[x])
    
    mapping = {'tok2idx' : map_tok2ind,
               'tag2idx' : map_tag2idx,
               'rel2idx' : map_rel2idx,
               'max_len' : max_len}
    
    return train_df, test_df, mapping

# Run and save

In [11]:
# result = read_all_file() 
result_1 = read_all_file('data/csd_rel_data_annotated/')
df_1 = p(result_1)
# result = preprocessing(df, columns = ['before_E1', 'before_E2'], method = 'word')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relation_df[3] = relation_df[2].str.split(' ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


10_sun.y/xaso.ann


In [12]:
result_2 = read_all_file('data/csd_rel_data2_annotated/')
df_2 = p(result_2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relation_df[3] = relation_df[2].str.split(' ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [13]:
df = pd.concat([df_1, df_2], ignore_index = True)

In [31]:
df['label'].value_counts()

no_relation              3017
crime_located_at         2371
located_at               2021
found_object_at          1605
employee_or_member_of     757
crime_relation            711
own_weapon                674
crime_date_at             474
own_vehicle               440
color_of                  403
vehicle_lp                334
crime_time_at             209
Name: label, dtype: int64

In [15]:
df.to_csv('REL_prepared_df.csv')

In [16]:
train, test, mapping = return_train_test(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = dict_columns[name]


['นาย', 'ณัฐพล', ' ', 'พล', 'เสน', 'นาย', 'อุดม', ' ', 'แย้ม', 'ศิ', 'ริ', 'โทรศัพท์มือถือ', ' ', 'ยี่ห้อ', 'วี', 'โว่', 'นาย', 'จิร', 'ศักดิ์', ' ', 'อุ', 'นัย', 'บัน', 'นาย', 'บอย', 'นาย', 'ศิ', 'ริ', 'ชัย', ' ', 'แก้ว', 'หลัก', 'คำ', 'หัว', 'กระสุนปืน', 'รถจักรยาน', 'รถยนต์', 'โตโยต้า', ' ', 'รุ่น', 'ยาริส', 'นาย', 'อาชีพ', ' ', 'ประกอบ', 'ธรรม', 'พ.ต.อ.', 'ยุทธนา', ' ', 'พฤกษา', 'รุ่งเรือง', 'พล.ต.ต.', 'ธีร', 'เดช', ' ', 'ธรรม', 'สุ', 'ธีร', '์', 'นาย', 'จิมมี่', 'อาหาร', 'พ.ต.อ.', 'รัก', 'ศักดิ์', ' ', 'เมฆ', 'จินดา', 'สีน้ำเงิน', 'ขาว', 'นาย', 'แสน', ' ', 'ปทุม', 'สูตร', 'รถกระบะ', 'อีซูซุ', 'นาย', 'รอ', 'ซา', 'ลี', ' ', 'เจะ', 'เลาะ', 'อ.', 'แม่', 'อาย', 'นาย', 'ณัฐ', 'กานต์', ' ', 'เกสร', 'สิทธิ์', 'รถยนต์', 'เฮโรอีน', 'นาย', 'ธุรการ', ' ', 'ยัง', 'บรรเทา', 'หมู่', ' ', '3', 'รองเท้าบูท', 'หนัง', 'ตก', 'อยู่', 'พ.ต.อ.', 'ประเสริฐ', 'สุข', ' ', 'เฮง', 'สุวรรณ', '์', 'นาย', 'คุ', 'ณัชญ์', 'น.', 'อ.', 'สัน', 'ชัย', ' ', 'ประชุม', 'พันธ์', 'รถกระบะ', 'มิตซูบิชิ', 'ไทร', 'ทัน', 'นาย

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = df[name].apply(lambda x: np.array([map_tok2ind[i] for i in x]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = _
A value is trying to be set on a copy of a slice from 

In [33]:
mapping['rel2idx']

{'color_of': 0,
 'crime_date_at': 1,
 'crime_located_at': 2,
 'crime_relation': 3,
 'crime_time_at': 4,
 'employee_or_member_of': 5,
 'found_object_at': 6,
 'located_at': 7,
 'no_relation': 8,
 'own_vehicle': 9,
 'own_weapon': 10,
 'vehicle_lp': 11}

In [19]:
df.to_csv('data/ohm_data/REL_prepared_data.csv')
train.to_csv('data/ohm_data/train.csv')
test.to_csv('data/ohm_data/test.csv')

In [20]:
import pickle

with open('mapping/REL/tok2idx.pickle', 'wb') as dict_:
    pickle.dump(mapping['tok2idx'], dict_)
    
with open('mapping/REL/tag2idx.pickle', 'wb') as dict_:
    pickle.dump(mapping['tag2idx'], dict_)
    
with open('mapping/REL/rel2idx.pickle', 'wb') as dict_:
    pickle.dump(mapping['rel2idx'], dict_)
    
with open('mapping/REL/max_len.pickle', 'wb') as dict_:
    pickle.dump(mapping['max_len'], dict_)