In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Reshape, Concatenate, Activation
from tensorflow.keras.utils import plot_model
from transformers import TFAutoModel
from tensorflow.keras import backend as K
from focal_loss import sparse_categorical_focal_loss
from transformers import AutoModel
from tensorflow.keras.layers import concatenate
from keras_contrib.layers import CRF
# import tensorflow_hub as hub
# import tensorflow_text as text
import pythainlp
import spacy_thai
from nltk.tokenize import RegexpTokenizer
import re
import string
import os
from string import punctuation
import pickle
import itertools

In [2]:
def load_model(filename, customize = True):
    """Load a saved Keras model from ``filename``.

    Parameters
    ----------
    filename : str
        Path of the saved model file.
    customize : bool, default True
        When true the model is loaded with ``compile=False``; when false
        it is loaded with ``compile=True``.

    Returns
    -------
    The object returned by ``tf.keras.models.load_model``.
    """
    should_compile = not customize
    return tf.keras.models.load_model(filename, compile = should_compile)

def load_mapping(filename):
    """Read one pickled object from ``filename`` and return it.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so only call this on
    files that come from a trusted source.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

def check_condition(condition):
    """Return the string ``'True'`` when ``condition`` is truthy, else ``'False'``."""
    return 'True' if condition else 'False'
        
def preprocess(text, tokenize_method = 'syllable'):
    """Tokenize ``text`` and encode every token into the NER index features.

    Parameters
    ----------
    text : str
        Raw input text.
    tokenize_method : str, default 'syllable'
        One of ``'word'``, ``'subword'`` or ``'syllable'``; selects the
        pythainlp tokenizer applied to ``text``.

    Returns
    -------
    NER_input : dict
        ``'list_text'`` (the tokens) plus one equally long list of integer
        indices per feature: ``'words_idx'``, ``'pos_idx'``,
        ``'contain_digit_idx'``, ``'contain_punc_idx'`` and
        ``'contain_vowel_idx'``.
    NER_mapping : dict
        The lookup tables used for the encoding: ``'tok2idx'``,
        ``'tag2idx'``, ``'pos2idx'`` and ``'max_len'`` (loaded from the
        pickle files under ``mapping/NER/``) and the fixed ``'digit2idx'``,
        ``'punc2idx'`` and ``'vowel2idx'`` tables.

    Raises
    ------
    KeyError
        If ``tokenize_method`` is not one of the supported names, or if an
        unknown token is met and the vocabulary has no ``'<UNK>'`` entry.
    """
    list_tokenizer = {'word' : pythainlp.word_tokenize,
                      'subword' : pythainlp.subword_tokenize,
                      'syllable' : pythainlp.syllable_tokenize}

    tokenizer = list_tokenizer[tokenize_method]
    list_text = tokenizer(text)

    # The mapping files are unpickled on every call; they must come from a
    # trusted source because unpickling can execute arbitrary code.
    NER_mapping = {
                   'tok2idx' : load_mapping('mapping/NER/tok2idx.pickle'),
                   'tag2idx' : load_mapping('mapping/NER/tag2idx.pickle'),
                   'pos2idx' : load_mapping('mapping/NER/pos2idx.pickle'),
                   'max_len' : load_mapping('mapping/NER/max_len.pickle')
                  }

    # One independent table per boolean feature (keys are the strings
    # produced by check_condition, plus the padding symbol).
    for flag_table in ('digit2idx', 'punc2idx', 'vowel2idx'):
        NER_mapping[flag_table] = {'True' : 1, 'False' : 0, '<PAD>' : 2}

    # NOTE(review): one vowel appears twice in this string, which looks like
    # a typo for a different vowel. It is kept byte-for-byte because the
    # trained model may have been fitted with exactly this feature
    # definition -- confirm against the training code before changing it.
    thai_vowel = 'ะาิีุุึืโเแัำไใฤๅฦ'

    def _encode(tokens, vocab):
        # Explicit fallback instead of a bare ``except:`` -- only a missing
        # token maps to '<UNK>'; any other error is no longer hidden.
        return [vocab[token] if token in vocab else vocab['<UNK>'] for token in tokens]

    def _flag_indices(has_feature, flag2idx):
        # Encode a per-token boolean test through its 'True'/'False' table.
        return [flag2idx[check_condition(has_feature(token))] for token in list_text]

    words_input = _encode(list_text, NER_mapping['tok2idx'])

    # NOTE(review): the POS table is indexed with the token text itself, not
    # with a POS tag, so presumably most tokens fall back to '<UNK>'. Kept
    # as-is to match the original behaviour -- confirm against the training
    # pipeline.
    pos_input = _encode(list_text, NER_mapping['pos2idx'])

    contain_digit_input = _flag_indices(lambda token: any(char.isdigit() for char in token),
                                        NER_mapping['digit2idx'])
    contain_punc_input = _flag_indices(lambda token: any(char in punctuation for char in token),
                                       NER_mapping['punc2idx'])
    contain_vowel_input = _flag_indices(lambda token: any(char in thai_vowel for char in token),
                                        NER_mapping['vowel2idx'])

    NER_input = {'list_text' : list_text,
                 'words_idx' : words_input,
                 'pos_idx' : pos_input,
                 'contain_digit_idx' : contain_digit_input,
                 'contain_punc_idx' : contain_punc_input,
                 'contain_vowel_idx' : contain_vowel_input}

    return NER_input, NER_mapping

def return_span(entities):
    """Merge token-level BIO tags into character-offset entity spans.

    Parameters
    ----------
    entities : list of (str, str)
        ``(token, tag)`` pairs in reading order. A tag of the form
        ``'B-TYPE'`` opens an entity, ``'I-TYPE'`` continues an open entity
        of the same type, and any other tag (for example ``'O'``) closes
        the entity that is currently open. The list is not modified.

    Returns
    -------
    list of tuple
        One ``(start, end, entity_type, surface)`` tuple per entity, where
        ``start``/``end`` are character offsets (end exclusive) into the
        concatenation of all tokens and ``surface`` is that slice of text.
    """
    # Work on a copy: a trailing outside-token guarantees that an entity
    # reaching the end of the input is still closed, without mutating the
    # caller's list.
    tagged = list(entities) + [('.', 'O')]
    text = ''.join(token for token, _ in tagged)

    spans = []
    seek = 0                   # character offset of the current token
    start_seek = None          # offset where the open entity began, if any
    start_entity_type = None   # type the open entity was opened with

    def _close(end_seek):
        # A span is always labelled with the type it was OPENED with, not
        # with the type of the tag that happens to terminate it.
        spans.append((start_seek, end_seek, start_entity_type, text[start_seek:end_seek]))

    for word, named_entity in tagged:

        if '-' in named_entity:
            # Split on the first dash only so that types which themselves
            # contain a dash do not raise.
            entity_prefix, entity_type = named_entity.split('-', 1)
        else:
            # 'O', padding symbols and any other dash-less tag are outside.
            entity_prefix, entity_type = 'O', None

        if entity_prefix == 'B':
            # A new entity begins; flush the previous one if still open.
            if start_seek is not None:
                _close(seek)
            start_seek = seek
            start_entity_type = entity_type

        elif entity_prefix == 'I':
            # An I- tag without an open entity is ignored. An I- tag whose
            # type differs from the open entity terminates that entity.
            if start_seek is not None and entity_type != start_entity_type:
                _close(seek)
                start_seek = None
                start_entity_type = None

        else:
            # Outside token: close the entity that is open, if any.
            if start_seek is not None:
                _close(seek)
                start_seek = None
                start_entity_type = None

        seek += len(word)

    return spans

def run_NER(raw_text):
    """Run the NER model on ``raw_text`` and return the detected entity spans.

    The text is tokenized and encoded by ``preprocess``, every feature
    sequence is padded to the model's ``max_len``, the saved model is
    loaded and asked for a prediction, and the per-token tags are merged
    into spans by ``return_span``.

    Parameters
    ----------
    raw_text : str
        Text to analyse.

    Returns
    -------
    raw_text : str
        The input, unchanged.
    spans : list of tuple
        ``(start, end, entity_type, surface)`` tuples as produced by
        ``return_span``.

    Notes
    -----
    The model file is loaded from disk on every call. Only the first
    ``max_len`` tokens receive a tag; later tokens are not covered.
    """
    NER_input, NER_mapping = preprocess(raw_text)
    model = load_model('model/NER/NER_model_v2_26_1_2022.h5')
    max_len = NER_mapping['max_len']

    # (feature key, padding index) in the order the model inputs are passed.
    feature_specs = [
        ('words_idx', NER_mapping['tok2idx']['<PAD>']),
        ('pos_idx', NER_mapping['pos2idx']['<PAD>']),
        ('contain_digit_idx', NER_mapping['digit2idx']['<PAD>']),
        ('contain_punc_idx', NER_mapping['punc2idx']['<PAD>']),
        ('contain_vowel_idx', NER_mapping['vowel2idx']['<PAD>']),
    ]

    X_test = []
    for feature_key, pad_value in feature_specs:
        # truncating='post' keeps the FIRST max_len tokens. The default
        # ('pre') would drop tokens from the front, so the predictions would
        # describe the tail of the text while being zipped with its head
        # below -- every tag would be attached to the wrong token.
        padded = pad_sequences([NER_input[feature_key]], maxlen = max_len,
                               padding = 'post', truncating = 'post', value = pad_value)
        NER_input['padded_' + feature_key] = list(padded)
        X_test.append(np.array(NER_input['padded_' + feature_key]))

    y_pred = model.predict(X_test)
    # Highest-scoring tag per position, for the single sequence in the batch.
    y_pred = np.argmax(y_pred, axis = 2)[0]

    idx2tag = {idx: tag for tag, idx in NER_mapping['tag2idx'].items()}
    prediction = [idx2tag[i] for i in y_pred]

    # zip stops at the shorter sequence, which drops the padded positions.
    entities = list(zip(NER_input['list_text'], prediction))

    spans = return_span(entities)

    return raw_text, spans
    


In [3]:
# NOTE: on a fresh kernel this cell raises NameError (see the output below),
# because preprocessed_span_for_REL is only defined in a later cell. Run the
# defining cell first, or move this cell below it, so that
# "Restart Kernel -> Run All" works.
t, s = run_NER('พบศพนายสมชาย และพบมีดสั้นตกอยู่ข้างๆ คาดว่าน่าจะเป็นฝีมือของนายสมปอง ที่เป็นเจ้าของรถยนต์โตโยต้า')
preprocessed_span_for_REL(t, s)

NameError: name 'preprocessed_span_for_REL' is not defined

In [4]:
def preprocessed_span_for_REL(raw_text, spans):
    """Build one row of relation-extraction context per eligible entity pair.

    Every ordered pair of distinct spans whose ``(type, type)`` combination
    is in the allowed list becomes a row holding the word-tokenized
    entities and the word-tokenized text before, after and between them.

    Parameters
    ----------
    raw_text : str
        The text the span offsets refer to.
    spans : list of tuple
        ``(start, end, entity_type, surface)`` tuples.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``E1``, ``E2``, ``E1_entity``, ``E2_entity``,
        ``before_E1``, ``before_E2``, ``after_E1``, ``after_E2`` and
        ``between_E1_E2``; an empty frame without columns when no pair is
        eligible.
    """
    # (source type, target type) combinations for which a row is produced.
    allowed_type_pairs = {
        ('PERSON', 'VEHICLE'), ('VEHICLE', 'LP'), ('PERSON', 'PERSON'), ('PERSON', 'WEAPON'),
        ('PERSON', 'LOCATION'), ('PERSON', 'DATE'), ('PERSON', 'TIME'), ('VEHICLE', 'COLOR'),
        ('OBJECT', 'LOCATION'), ('PERSON', 'ORGANIZATION')
    }

    # All unordered pairs first, then the same pairs reversed, so both
    # directions are considered and the original row order is preserved.
    forward_pairs = list(itertools.combinations(spans, 2))
    candidate_pairs = forward_pairs + [(second, first) for first, second in forward_pairs]
    eligible_pairs = [pair for pair in candidate_pairs
                      if (pair[0][2], pair[1][2]) in allowed_type_pairs]

    if not eligible_pairs:
        return pd.DataFrame()

    tokenizer = pythainlp.word_tokenize

    # Collect plain records and build the frame once; concatenating inside
    # the loop copies the whole frame on every iteration.
    records = []
    for tok_1, tok_2 in eligible_pairs:
        start_E1, end_E1, E1_entity, E1 = tok_1[:4]
        start_E2, end_E2, E2_entity, E2 = tok_2[:4]

        # NOTE(review): when E2 precedes E1 this slice runs from the start
        # of E2 to the end of E1, i.e. it contains both entities rather than
        # only the text between them. Left unchanged because the trained
        # model may expect exactly this feature -- confirm against training.
        if end_E1 > start_E2:
            btw_E1_E2 = raw_text[start_E2:end_E1]
        else:
            btw_E1_E2 = raw_text[end_E1:start_E2]

        records.append({'text' : raw_text,
                        'E1' : tokenizer(E1),
                        'E2' : tokenizer(E2),
                        'E1_entity' : E1_entity,
                        'E2_entity' : E2_entity,
                        'before_E1' : tokenizer(raw_text[:start_E1]),
                        'before_E2' : tokenizer(raw_text[:start_E2]),
                        'after_E1' : tokenizer(raw_text[end_E1:]),
                        'after_E2' : tokenizer(raw_text[end_E2:]),
                        'between_E1_E2' : tokenizer(btw_E1_E2)})

    return pd.DataFrame(records)

def convert_to_idx(pairs_df, mapping):
    """Add padded index columns for every token and entity-type column.

    For each token-list column a ``<column>_idx`` column is added that
    holds the token indices padded at the end to ``mapping['max_len']``;
    ``E1_entity_idx`` / ``E2_entity_idx`` hold the entity-type indices.

    Parameters
    ----------
    pairs_df : pandas.DataFrame
        Frame with the columns ``E1``, ``E2``, ``before_E1``,
        ``before_E2``, ``after_E1``, ``after_E2``, ``between_E1_E2``
        (lists of tokens) and ``E1_entity``, ``E2_entity``.
    mapping : dict
        Must provide ``'tok2idx'`` (with ``'<PAD>'``, and ``'<UNK>'`` when
        unknown tokens occur), ``'tag2idx'`` and ``'max_len'``.

    Returns
    -------
    pandas.DataFrame
        The same ``pairs_df`` object; the new columns are added in place.

    Raises
    ------
    KeyError
        If a required column is missing, an entity type is not in
        ``'tag2idx'``, or an unknown token occurs without ``'<UNK>'``.
    """
    tok2idx = mapping['tok2idx']
    tag2idx = mapping['tag2idx']

    token_columns = ['E1', 'E2', 'before_E1', 'before_E2', 'after_E1', 'after_E2', 'between_E1_E2']

    for col in token_columns:
        # Explicit fallback instead of a bare ``except:`` -- only a missing
        # token maps to '<UNK>'; any other error is no longer hidden.
        encoded_rows = [
            [tok2idx[token] if token in tok2idx else tok2idx['<UNK>'] for token in tokens]
            for tokens in pairs_df[col]
        ]

        # list(...) splits the 2-D result into one array per row so that
        # each cell of the new column holds one padded sequence.
        pairs_df[col + '_idx'] = list(pad_sequences(encoded_rows, maxlen = mapping['max_len'],
                                                    padding = 'post', value = tok2idx['<PAD>']))

    pairs_df['E1_entity_idx'] = pairs_df['E1_entity'].apply(lambda x: tag2idx[x])
    pairs_df['E2_entity_idx'] = pairs_df['E2_entity'].apply(lambda x: tag2idx[x])

    return pairs_df

def run_REL(text):
    """Run entity recognition and relation extraction on ``text``.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    ENTITIES_RESULT : dict
        On success: ``'success'`` (True), ``'input'`` and ``'keysEntities'``
        (one dict per span with ``'type'``, ``'name'`` and
        ``'mentionOffsets'``).
    RELATION_RESULT : dict
        On success: ``'success'`` (True), ``'input'``, ``'keyEntities'``
        (the same list as ``'keysEntities'`` above) and ``'relationships'``
        (one dict per entity pair with ``'source'``, ``'target'``,
        ``'source_type'``, ``'target_type'`` and ``'relation'``).

    On any error both results are ``{'success': False, 'error': <message>}``.

    Notes
    -----
    * The two results spell the entity key differently (``'keysEntities'``
      vs ``'keyEntities'``). The names are kept as they are because
      consumers may depend on them.
    * When the text yields no eligible entity pair the relation model is
      not run: the entities are still reported and ``'relationships'`` is
      an empty list.
    * The model file is loaded from disk on every call.
    """
    try:

        text, spans = run_NER(text)

        ENTITIES_RESULT = {
                            'success' : True,
                            'input' : text,
                            'keysEntities' : [
                                {
                                'type' : ent[2],
                                'name' : ent[3],
                                'mentionOffsets' : {'start' : ent[0], 'end' : ent[1], 'entity_index' : index}
                                }  for index, ent in enumerate(spans)]
                          }

        pairs_df = preprocessed_span_for_REL(text, spans)

        relationships = []

        # Without at least one eligible pair there is nothing to classify;
        # previously this case failed with a swallowed KeyError and reported
        # success=False even though the entities had been found.
        if not pairs_df.empty:

            REL_mapping = {
                           'tok2idx' : load_mapping('mapping/REL/tok2idx.pickle'),
                           'tag2idx' : load_mapping('mapping/REL/tag2idx.pickle'),
                           'rel2idx' : load_mapping('mapping/REL/rel2idx.pickle'),
                           'max_len' : load_mapping('mapping/REL/max_len.pickle')
                          }

            pairs_df = convert_to_idx(pairs_df, REL_mapping)

            model = load_model('model/REL/REL_model_v2_27_1_2022.h5')

            # Order in which the feature arrays are passed to the model.
            feature_columns = ['E1_idx', 'E2_idx', 'E1_entity_idx', 'E2_entity_idx',
                               'before_E1_idx', 'before_E2_idx', 'after_E1_idx', 'after_E2_idx',
                               'between_E1_E2_idx']

            # One array per feature, stacking the per-row values.
            X_test = [np.array(list(pairs_df[col])) for col in feature_columns]

            y_pred = model.predict(X_test)
            y_pred = np.argmax(y_pred, axis = 1)
            idx2rel = {idx: rel for rel, idx in REL_mapping['rel2idx'].items()}

            pairs_df['prediction'] = [idx2rel[i] for i in y_pred]
            # Entities were tokenized for the model; join them back to text.
            pairs_df['E1'] = pairs_df['E1'].apply(lambda x: ''.join(x))
            pairs_df['E2'] = pairs_df['E2'].apply(lambda x: ''.join(x))

            # Every predicted label is reported (the original code had the
            # filtering of a 'no_relation' label disabled).
            relationships = [
                                {
                                    'source' : row['E1'],
                                    'target' : row['E2'],
                                    'source_type' : row['E1_entity'],
                                    'target_type' : row['E2_entity'],
                                    'relation' : row['prediction']
                                } for _, row in pairs_df.iterrows()]

        RELATION_RESULT = {
                            'success' : True,
                            'input' : text,
                            'keyEntities' : ENTITIES_RESULT['keysEntities'],
                            'relationships' : relationships
                          }

    except Exception as exc:
        # Best-effort contract is kept (never raise to the caller), but
        # KeyboardInterrupt/SystemExit are no longer swallowed and the cause
        # of the failure is reported instead of being discarded.
        failure_reason = '{}: {}'.format(type(exc).__name__, exc)
        ENTITIES_RESULT = {'success' : False, 'error' : failure_reason}
        RELATION_RESULT = {'success' : False, 'error' : failure_reason}

    return ENTITIES_RESULT, RELATION_RESULT
    

In [5]:
# Run the full pipeline (NER followed by relation extraction) on a sample sentence.
entities, relation = run_REL('เมื่อเวลา 18.30 น. วันที่ 26 ม.ค. 65 พบศพนายมงคล ชัยมงคล ถูกแทงด้วยมีดสั้นที่กลางอกจนเสียชีวิต ทราบภายหลังว่าผู้ก่อเหตุคือ นายสมหวัง แซ่ตั้ง อายุ 25 ปี')

In [6]:
# Display the entity result (first value returned by run_REL).
entities

{'success': True,
 'input': 'เมื่อเวลา 18.30 น. วันที่ 26 ม.ค. 65 พบศพนายมงคล ชัยมงคล ถูกแทงด้วยมีดสั้นที่กลางอกจนเสียชีวิต ทราบภายหลังว่าผู้ก่อเหตุคือ นายสมหวัง แซ่ตั้ง อายุ 25 ปี',
 'keysEntities': [{'type': 'TIME',
   'name': '18.30 น.',
   'mentionOffsets': {'start': 10, 'end': 18, 'entity_index': 0}},
  {'type': 'DATE',
   'name': '26 ม.ค. 65',
   'mentionOffsets': {'start': 26, 'end': 36, 'entity_index': 1}},
  {'type': 'PERSON',
   'name': 'นายมงคล ชัยมงคล ',
   'mentionOffsets': {'start': 41, 'end': 57, 'entity_index': 2}},
  {'type': 'WEAPON',
   'name': 'มีดสั้น',
   'mentionOffsets': {'start': 67, 'end': 74, 'entity_index': 3}},
  {'type': 'PERSON',
   'name': 'นายสมหวัง แซ่ตั้ง ',
   'mentionOffsets': {'start': 123, 'end': 141, 'entity_index': 4}},
  {'type': 'TIME',
   'name': '25 ปี',
   'mentionOffsets': {'start': 146, 'end': 151, 'entity_index': 5}}]}

In [7]:
# Display the relation result (second value returned by run_REL).
relation

{'success': True,
 'input': 'เมื่อเวลา 18.30 น. วันที่ 26 ม.ค. 65 พบศพนายมงคล ชัยมงคล ถูกแทงด้วยมีดสั้นที่กลางอกจนเสียชีวิต ทราบภายหลังว่าผู้ก่อเหตุคือ นายสมหวัง แซ่ตั้ง อายุ 25 ปี',
 'keyEntities': [{'type': 'TIME',
   'name': '18.30 น.',
   'mentionOffsets': {'start': 10, 'end': 18, 'entity_index': 0}},
  {'type': 'DATE',
   'name': '26 ม.ค. 65',
   'mentionOffsets': {'start': 26, 'end': 36, 'entity_index': 1}},
  {'type': 'PERSON',
   'name': 'นายมงคล ชัยมงคล ',
   'mentionOffsets': {'start': 41, 'end': 57, 'entity_index': 2}},
  {'type': 'WEAPON',
   'name': 'มีดสั้น',
   'mentionOffsets': {'start': 67, 'end': 74, 'entity_index': 3}},
  {'type': 'PERSON',
   'name': 'นายสมหวัง แซ่ตั้ง ',
   'mentionOffsets': {'start': 123, 'end': 141, 'entity_index': 4}},
  {'type': 'TIME',
   'name': '25 ปี',
   'mentionOffsets': {'start': 146, 'end': 151, 'entity_index': 5}}],
 'relationships': [{'source': 'นายมงคล ชัยมงคล ',
   'target': 'มีดสั้น',
   'source_type': 'PERSON',
   'target_type': '