In [None]:
!pip install pythainlp

Collecting pythainlp
  Downloading pythainlp-3.0.5-py3-none-any.whl (11.5 MB)
[K     |████████████████████████████████| 11.5 MB 9.5 MB/s 
Collecting tinydb>=3.0
  Downloading tinydb-4.6.1-py3-none-any.whl (24 kB)
Installing collected packages: tinydb, pythainlp
Successfully installed pythainlp-3.0.5 tinydb-4.6.1


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer , Embedding , LSTM , TimeDistributed , Dense , Activation
import glob
import os
from pythainlp.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm

In [None]:
# Unpack the LST20 corpus archive into the current working directory.
# NOTE(review): the /content/drive/... location presumably needs Google Drive to
# be mounted first; no mount cell is visible in this notebook - confirm.
!tar -xf /content/drive/MyDrive/SuperAI_NLP/AIFORTHAI-LST20Corpus.tar.gz

In [None]:
# Directories holding the training and test splits of the extracted corpus.
train_path, test_path = (
  '/content/LST20_Corpus/train',
  '/content/LST20_Corpus/test',
)

def get_doc(path):
  """Load every corpus file found directly inside ``path``.

  Each file is read as text, split into blocks on blank lines, and every line
  of a block is reduced to its first tab-separated field.

  Args:
    path: Directory to scan. Entries whose name starts with '.' are skipped.

  Returns:
    A list with one item per file. Each item is a list of blocks and each
    block is a list of first-column strings. Whatever follows the final blank
    line of a file (possibly just an empty string) is kept as the last block.
  """
  docs = []
  for entry in tqdm(list(os.scandir(path))):
    if entry.name.startswith('.'):
      continue
    # Context manager closes the handle promptly instead of leaking one
    # descriptor per corpus file. The explicit encoding keeps decoding
    # independent of the platform default.
    # NOTE(review): UTF-8 is assumed for the corpus files - confirm.
    with open(entry.path, 'r', encoding='utf-8') as handle:
      raw = handle.read()
    blocks = raw.split('\n\n')
    docs.append([[line.split('\t')[0] for line in block.split('\n')]
                 for block in blocks])
  return docs

def get_label(docz):
  """Build boundary tags for every block of every document.

  The last block of each document is ignored. For each remaining block the
  tags are 'I_SENT' throughout, with the first position set to 'B_SENT' and
  then the last position set to 'E_SENT' (so a one-element block ends up as
  just ['E_SENT']).

  Args:
    docz: List of documents, each a list of blocks, each a list of tokens.

  Returns:
    A list parallel to ``docz`` holding one tag list per kept block.
  """
  label = []
  for document in docz:
    doc_tags = []
    for block in document[:-1]:
      tags = ['I_SENT' for _ in block]
      tags[0] = 'B_SENT'
      tags[-1] = 'E_SENT'
      doc_tags.append(tags)
    label.append(doc_tags)
  return label

# Load both corpus splits and derive the per-token boundary tags for each.
docs = get_doc(train_path)
docs_test = get_doc(test_path)
label = get_label(docs)
label_test = get_label(docs_test)


# Vocabulary (token -> integer id) built from the training split only.
# The unique tokens are sorted before numbering: iterating a bare set of
# strings follows hash order, which changes between kernel restarts, so ids
# would otherwise not line up with a previously trained or saved model.
dic = {j: i for i, j in enumerate(sorted({x for i in docs for d in i for x in d}))}
# The two special entries take the next free ids after the corpus tokens.
dic['<PAD_TOKEN>'] = len(dic)
dic['<UNKNOWN>'] = len(dic)

# Tag name -> class id, numbered in this order (I=0, B=1, E=2, O=3, P=4).
dic_class = {}
for tag_name in ('I_SENT', 'B_SENT', 'E_SENT', 'O_SENT', 'P_SENT'):
  dic_class[tag_name] = len(dic_class)
# Reverse lookup used when turning predicted class ids back into tag names.
inv_dic_class = {i: j for j, i in dic_class.items()}


100%|██████████| 7588/7588 [00:02<00:00, 3261.23it/s]
100%|██████████| 966/966 [00:00<00:00, 4054.05it/s]


In [None]:
# Containers for the training samples and labels; both names are reassigned
# from get_data's return value further down in this cell.
X_train, Y_train = [],[]
# Fixed sequence length that every sample is padded to.
max_length = 1024

def get_data(docz, labelz, vocab=None, classes=None, max_len=None):
  """Encode documents and their tags as padded, fixed-length id sequences.

  Every document is flattened into one token-id sequence with the '_' token
  (tagged 'O_SENT') appended after each sentence. A sequence longer than
  ``max_len`` is cut once, at the middle 'B_SENT' position, and each half is
  kept only if it is non-empty and fits in ``max_len``; halves that are still
  too long are dropped. Kept sequences are right-padded with '<PAD_TOKEN>'
  (tagged 'P_SENT').

  Each call builds and returns fresh lists. Earlier code appended to shared
  module-level lists, so a second call (e.g. for the test split) returned the
  very same lists with the first call's samples still in them.

  Args:
    docz: Documents, each a list of sentences, each a list of token strings.
    labelz: Tag names parallel to ``docz`` (paired per sentence with zip, so
      any extra trailing sentence in a document is ignored).
    vocab: Token -> id mapping; defaults to the notebook-level ``dic``. Must
      contain '_', '<PAD_TOKEN>' and '<UNKNOWN>'.
    classes: Tag name -> id mapping; defaults to the notebook-level
      ``dic_class``.
    max_len: Length every returned sequence is padded to; defaults to the
      notebook-level ``max_length``.

  Returns:
    ``(features, targets)``: two parallel lists of ``max_len``-long int lists.
  """
  # Resolve the notebook-level lookups only when no override is supplied.
  vocab = dic if vocab is None else vocab
  classes = dic_class if classes is None else classes
  max_len = max_length if max_len is None else max_len

  pad_id, unk_id, sep_id = vocab['<PAD_TOKEN>'], vocab['<UNKNOWN>'], vocab['_']
  begin_tag, gap_tag, pad_tag = classes['B_SENT'], classes['O_SENT'], classes['P_SENT']

  features, targets = [], []

  def get_middle_index_b(tags):
    """Return the position of the middle B_SENT tag, or the midpoint if none."""
    b_index = [pos for pos, tag in enumerate(tags) if tag == begin_tag]
    if not b_index:
      # No sentence start to cut at: fall back to a plain halfway split
      # instead of raising IndexError.
      return len(tags) // 2
    return b_index[round(len(b_index) / 2)]

  def add_sample(tokens, tags):
    """Pad one chunk to max_len and store it; skip empty or oversized chunks."""
    if not tokens or len(tokens) > max_len:
      return
    fill = max_len - len(tokens)
    features.append(tokens + [pad_id] * fill)
    targets.append(tags + [pad_tag] * fill)

  for x, y in zip(docz, labelz):
    string, lab = [], []
    for xx, yy in zip(x, y):
      # Tokens missing from the vocabulary map to the <UNKNOWN> id.
      string.extend(vocab.get(word, unk_id) for word in xx)
      string.append(sep_id)
      lab.extend(classes[tag] for tag in yy)
      lab.append(gap_tag)
    if len(string) > max_len:
      cut = get_middle_index_b(lab)
      # A half of exactly max_len tokens fits and is kept (it used to be
      # discarded by a strict '<' comparison).
      add_sample(string[:cut], lab[:cut])
      add_sample(string[cut:], lab[cut:])
    else:
      add_sample(string, lab)
  return features, targets

# Encode and pad the training split first, then the test split.
X_train, Y_train = get_data(docs, label)
X_test, Y_test = get_data(docs_test, label_test)

In [None]:
# Per-token tagger: embedding -> LSTM over the whole sequence -> softmax over
# the tag classes at every timestep.
model = Sequential()
# Input length and class count follow max_length / dic_class so the network
# stays in step with the padding and tag tables defined earlier.
model.add(InputLayer(input_shape=(max_length,)))
model.add(Embedding(len(dic), 64))
model.add(LSTM(64, return_sequences=True))
model.add(TimeDistributed(Dense(len(dic_class))))
model.add(Activation('softmax'))
# The network already ends in a softmax, so the loss must be told it receives
# probabilities. With from_logits=True the loss applied softmax a second time
# to already-normalised outputs.
# NOTE(review): run_eagerly=True disables graph compilation and slows
# training; presumably left on for debugging - confirm it is still needed.
model.compile(optimizer="Adam",
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              weighted_metrics=['accuracy'], run_eagerly=True)

In [None]:
# Tag frequencies and inverse-frequency class weights.
# The counts are taken from the training labels: weights meant for training
# should not be derived from the validation split. Y_test is left untouched
# (it used to be rebound to an array and back to a list here).
train_tags = np.asarray(Y_train)
tag_counts = {name: int(np.sum(train_tags == idx)) for name, idx in dic_class.items()}

for short_name, tag_name in (('B', 'B_SENT'), ('I', 'I_SENT'),
                             ('E', 'E_SENT'), ('O', 'O_SENT')):
  print(short_name, tag_counts[tag_name])

total = train_tags.size

# weight = total positions / positions carrying that tag; a tag that never
# occurs gets 0.0 instead of dividing by zero.
class_weight = {dic_class[name]: (total / count if count else 0.0)
                for name, count in tag_counts.items()}

print(dic_class)
class_weight

B 54009
I 2149320
E 54056
O 54056
{'I_SENT': 0, 'B_SENT': 1, 'E_SENT': 2, 'O_SENT': 3, 'P_SENT': 4}


{0: 2.1720432508886534,
 1: 86.4377418578385,
 2: 86.36258694686991,
 3: 86.36258694686991,
 4: 1.9806811697196618}

In [None]:
# Hand Keras explicit arrays rather than nested Python lists, so the inputs
# are unambiguous 2-D tensors (every sample is padded to the same length).
model.fit(np.asarray(X_train), np.asarray(Y_train),
          validation_data=(np.asarray(X_test), np.asarray(Y_test)),
          epochs=10)

Epoch 1/10
  1/143 [..............................] - ETA: 27s - loss: 0.0716 - accuracy: 0.9738

  return dispatch_target(*args, **kwargs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

In [None]:
# Accuracy on the 'O_SENT' positions of the first training sample.
correct = 0

# y_pred is not assigned by any earlier cell in this notebook, so on a fresh
# kernel this cell raised NameError. Predictions are computed on X_train
# because they are compared against Y_train below.
y_pred = model.predict(np.asarray(X_train))

pred = [inv_dic_class[d] for d in np.argmax(y_pred, axis=2)[0].tolist()]
gt = [inv_dic_class[d] for d in Y_train[0]]

# Number of O_SENT positions in the reference, and how many were predicted as such.
total = sum(g == 'O_SENT' for g in gt)
corr = sum(p == g for p, g in zip(pred, gt) if g == 'O_SENT')

# Report NaN instead of dividing by zero when the sample has no O_SENT tag.
print('ac', corr / total if total else float('nan'))

In [None]:
# Dimensions of the prediction array.
y_pred.shape

(4559, 1024, 5)

In [None]:
# Display the prediction array itself.
y_pred

array([[[5.37955761e-01, 2.03589052e-01, 9.64102000e-02, 1.10552162e-01,
         5.14927842e-02],
        [8.51204693e-01, 5.37104681e-02, 6.74351528e-02, 1.87505577e-02,
         8.89909733e-03],
        [8.86707723e-01, 8.57885461e-03, 9.57685411e-02, 5.85847162e-03,
         3.08643840e-03],
        ...,
        [7.11025450e-07, 7.16690693e-06, 2.11648094e-05, 9.55411178e-06,
         9.99961376e-01],
        [7.11025450e-07, 7.16690693e-06, 2.11648094e-05, 9.55411178e-06,
         9.99961376e-01],
        [7.11025450e-07, 7.16690693e-06, 2.11648094e-05, 9.55411178e-06,
         9.99961376e-01]],

       [[3.92610669e-01, 2.28094921e-01, 1.28554985e-01, 1.48810521e-01,
         1.01928905e-01],
        [7.64553189e-01, 1.30523667e-01, 3.80669236e-02, 4.89239432e-02,
         1.79322343e-02],
        [8.72603238e-01, 2.37289723e-02, 8.87984410e-02, 7.77447596e-03,
         7.09482562e-03],
        ...,
        [7.11025450e-07, 7.16690693e-06, 2.11648094e-05, 9.55411178e-06,
        

In [None]:
# Shape of a one-sample slice of X_test once converted to an array.
np.array(X_test[:1]).shape

(1, 1024)

In [None]:
# Output shape when predicting on the first X_test sample only.
model.predict(X_test[:1]).shape

(1, 1024, 4)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1024, 64)          3132672   
                                                                 
 lstm_2 (LSTM)               (None, 1024, 64)          33024     
                                                                 
 time_distributed_2 (TimeDis  (None, 1024, 5)          325       
 tributed)                                                       
                                                                 
 activation_2 (Activation)   (None, 1024, 5)           0         
                                                                 
Total params: 3,166,021
Trainable params: 3,166,021
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Preview the first 19 label rows as an array: NumPy abbreviates large arrays
# in its repr, whereas the bare nested list printed every integer on its own
# line and flooded the output.
np.asarray(Y_test[:19])

[[1,
  0,
  0,
  0,
  0,
  2,
  3,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  3,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  3,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  3,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  3,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  3,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  3,
  1,
  0,
