In [1]:
!pip install pythainlp

Collecting pythainlp
  Downloading pythainlp-3.0.5-py3-none-any.whl (11.5 MB)
[K     |████████████████████████████████| 11.5 MB 12.0 MB/s 
Collecting tinydb>=3.0
  Downloading tinydb-4.6.1-py3-none-any.whl (24 kB)
Installing collected packages: tinydb, pythainlp
Successfully installed pythainlp-3.0.5 tinydb-4.6.1


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer , Embedding , LSTM , TimeDistributed , Dense , Activation
import glob
import os
from pythainlp.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm

In [3]:
!tar -xf /content/drive/MyDrive/SuperAI_NLP/AIFORTHAI-LST20Corpus.tar.gz

In [7]:
train_path = '/content/LST20_Corpus/train'
test_path = '/content/LST20_Corpus/test'

def get_doc(path):
  docs = []
  for i in tqdm(list(os.scandir(path))):
    if i.name[0] != '.':
      doc = open(i.path,'r').read()
      doc = doc.split('\n\n')
      doc = [[k.split('\t')[0] for k in d.split('\n')] for d in doc]
      # doc = [d.split('\t')[0] for j in doc for d in j]
      docs.append(doc)
  return docs

def get_label(docz):
  label = []
  for i in docz:
    temp = []
    for index, j in enumerate(i[:-1]):
      # print(len(j))
      temp.append(['I_SENT']*len(j))
      temp[index][0] = 'B_SENT'
      temp[index][-1] = 'E_SENT'
    label.append(temp)
  return label

docs = get_doc(train_path)
docs_test = get_doc(test_path)
label = get_label(docs)
label_test = get_label(docs_test)


dic = {j:i for i,j in enumerate(list(set([x for i in docs for d in i for x in d])))}
dic_class = {}
dic['<PAD_TOKEN>'] = len(dic)
dic['<UNKNOWN>'] = len(dic)
dic_class['I_SENT'] = len(dic_class)
dic_class['B_SENT'] = len(dic_class)
dic_class['E_SENT'] = len(dic_class)
dic_class['O_SENT'] = len(dic_class)
dic_class['P_SENT'] = len(dic_class)
inv_dic_class = {i:j for j,i in dic_class.items()}


100%|██████████| 7588/7588 [00:02<00:00, 3583.60it/s]
100%|██████████| 966/966 [00:00<00:00, 5614.09it/s]


In [8]:
X_train, Y_train = [],[]
max_length = 1024

def get_data(docz, labelz):
  def get_middle_index_b(label):
    # print(label)
    b_index = []
    for i in range(len(label)):
      if label[i] == dic_class['B_SENT']: b_index.append(i)
    sorted(b_index)
    return b_index[round(len(b_index) / 2)]

  for x,y in zip(docz,labelz):
    string = []
    lab = []
    for xx, yy in zip(x,y):
      xx = [dic[word] if word in dic.keys() else dic['<UNKNOWN>'] for word in xx]
      string.extend(xx)
      string.append(dic['_'])
      yy = [dic_class[word] for word in yy]
      lab.extend(yy)
      lab.append(dic_class['O_SENT'])
      # print(lab)
    if len(string) > max_length:
      id = get_middle_index_b(lab)
      if len(string[:id]) < max_length:
        X_train.append(string[:id] + [dic['<PAD_TOKEN>']] * (max_length - len(string[:id])))
        Y_train.append(lab[:id] + [dic_class['P_SENT']] * (max_length - len(string[:id])))
        # print(id,len(string[:id]), len( [dic['<PAD_TOKEN>']] * (max_length - len(string[:id]))))
      if len(string[id:]) < max_length:
        X_train.append(string[id:] + [dic['<PAD_TOKEN>']] * (max_length - len(string[id:])))
        Y_train.append(lab[id:] + [dic_class['P_SENT']] * (max_length - len(string[id:])))
    else:
      X_train.append(string + [dic['<PAD_TOKEN>']] * (max_length - len(string)))
      Y_train.append(lab + [dic_class['P_SENT']] * (max_length - len(string)))
  return X_train, Y_train
    # print(len(string))

X_train, Y_train = get_data(docs, label)
X_test, Y_test = get_data(docs_test, label_test)

In [9]:
model = Sequential() 
model.add(InputLayer(input_shape=(1024,)))
model.add(Embedding(len(dic),64))
model.add(LSTM(64,return_sequences=True))
model.add(TimeDistributed(Dense(5)))
model.add(Activation('softmax'))
model.compile(optimizer="Adam",loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              weighted_metrics=['accuracy'], run_eagerly=True)

In [16]:
Y_test = np.array(Y_test)

print('B', np.sum(Y_test == dic_class['B_SENT']))
print('I', np.sum(Y_test == dic_class['I_SENT']))
print('E', np.sum(Y_test == dic_class['E_SENT']))
print('O', np.sum(Y_test == dic_class['O_SENT']))

total = len(Y_test.flatten())

# class_weight = {dic_class['B_SENT']: total/(np.sum(Y_test == dic_class['B_SENT'])),
#                 dic_class['I_SENT']: total/(np.sum(Y_test == dic_class['I_SENT'])),
#                 dic_class['E_SENT']: total/(np.sum(Y_test == dic_class['E_SENT'])),
#                 dic_class['O_SENT']: total/(np.sum(Y_test == dic_class['O_SENT'])),
#                 dic_class['P_SENT']: total/(np.sum(Y_test == dic_class['P_SENT'])),}
class_weight = {dic_class['B_SENT']: 20.0,
                dic_class['I_SENT']: 1.0,
                dic_class['E_SENT']: 20.0,
                dic_class['O_SENT']: 30.0,
                dic_class['P_SENT']: 0.005,}

Y_test = Y_test.tolist()
print(dic_class)
class_weight

B 54009
I 2149320
E 54056
O 54056
{'I_SENT': 0, 'B_SENT': 1, 'E_SENT': 2, 'O_SENT': 3, 'P_SENT': 4}


{0: 1.0, 1: 20.0, 2: 20.0, 3: 30.0, 4: 0.005}

In [17]:
def gen_sample_weight(Y, class_weight):
  Y = np.array(Y).astype(np.float32)
  for k,v in class_weight.items():
    Y[Y==k] = v
  return Y

gen_sample_weight(Y_train, class_weight)

array([[2.e+01, 1.e+00, 1.e+00, ..., 5.e-03, 5.e-03, 5.e-03],
       [2.e+01, 1.e+00, 1.e+00, ..., 5.e-03, 5.e-03, 5.e-03],
       [2.e+01, 1.e+00, 1.e+00, ..., 5.e-03, 5.e-03, 5.e-03],
       ...,
       [2.e+01, 1.e+00, 1.e+00, ..., 5.e-03, 5.e-03, 5.e-03],
       [2.e+01, 1.e+00, 1.e+00, ..., 5.e-03, 5.e-03, 5.e-03],
       [2.e+01, 1.e+00, 1.e+00, ..., 5.e-03, 5.e-03, 5.e-03]],
      dtype=float32)

In [18]:
model.fit(X_train, Y_train, batch_size = 128, validation_data = (X_test, Y_test), sample_weight = gen_sample_weight(Y_train, class_weight), epochs=20)

Epoch 1/20
 2/36 [>.............................] - ETA: 3s - loss: 0.3015 - accuracy: 0.9133

  return dispatch_target(*args, **kwargs)


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
 1/36 [..............................] - ETA: 2s - loss: 0.2050 - accuracy: 0.9406

KeyboardInterrupt: ignored

In [23]:
y_pred = model.predict(X_test)

In [30]:
model.save('/content/drive/MyDrive/SuperAI_NLP/model_09422.h5')

In [29]:
correct = 0
total = np.sum(np.array(Y_test) == dic_class['O_SENT'])
corr = 0
for yp,yt in zip(y_pred, Y_test):
  pred = [inv_dic_class[d] for d in np.argmax(yp, axis=1).tolist()]
  gt = [inv_dic_class[d] for d in yt]

  for p,g in zip(pred, gt):
    if g == 'O_SENT':
      if p == g: corr += 1
      # print(p,g)
  # break

print('ac', corr/total)

ac 0.9919897883676188


In [None]:
y_pred[0].argmax(1).tolist()

In [None]:
import pandas as pd

my_file =  open('/content/drive/MyDrive/SuperAI_NLP/ss_test.txt', 'r')


r = my_file.read()


data = r.split("\n")
# print(data)
my_file.close()
text = ''
for  i in range(len(data)):
    
    if data[i] =='':
        text = text+' '
        data[i] = '_'
    else:
        text = text+data[i]
# print(text)
# print(data)
df = pd.DataFrame({'word':data})
data = np.array(data)
id = np.where(data == '_')[0]


In [67]:
inc = 0
idx = list(range(0,len(data),1024))
len(idx)
res = []
for i in idx:
  dat = data[i:i+1024]
  temp = []
  for word in dat:
    if word in dic.keys():
      temp.append(dic[word])
    else: temp.append(dic['<UNKNOWN>'])
  temp = temp + ([dic['<PAD_TOKEN>']]*(1024-len(temp)))
  pred = model.predict([temp])[0]
  pred = pred.argmax(1)
  pred = [inv_dic_class[i] for i in pred]
  pred = pred[:len(dat)]
  res.extend(pred)
  # print(pred)
  # index = np.sort(id[(id > inc) & (id < 1024 + inc)])[-1]
  # inc = i
  # print(index)

In [68]:
print(res.__len__())
print(data.__len__())
for 

74405
74405


In [70]:
map_class = {
    'B_SENT':'B_SENT',
    'I_SENT':'I_SENT',
    'E_SENT':'E_SENT',
    'O_SENT':'O',
    'P_SENT':'I_SENT',
}

res = [map_class[i] for i in res]

In [72]:
sub = pd.read_csv('/content/drive/MyDrive/SuperAI_NLP/ss_sample_submission.csv')
sub['Predicted'] = res[:-2]

In [74]:
sub.to_csv('submit.csv',index=False)