In [None]:
!pip install pythainlp

Collecting pythainlp
  Downloading pythainlp-3.0.5-py3-none-any.whl (11.5 MB)
[K     |████████████████████████████████| 11.5 MB 4.3 MB/s 
[?25hCollecting tinydb>=3.0
  Downloading tinydb-4.6.1-py3-none-any.whl (24 kB)
Installing collected packages: tinydb, pythainlp
Successfully installed pythainlp-3.0.5 tinydb-4.6.1


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import InputLayer , Embedding , LSTM , TimeDistributed , Dense , Activation, Bidirectional,Concatenate, Input, Dropout
import glob
import os
from pythainlp.tokenize import word_tokenize, sent_tokenize
from pythainlp.tag import pos_tag,pos_tag_sents
from pythainlp import tag
from tqdm import tqdm

In [None]:
# Unpack the LST20 corpus archive into the current working directory.
# NOTE(review): the archive path lives under /content/drive, but no Drive-mount cell is visible in this notebook — confirm Drive is mounted first.
!tar -xf /content/drive/MyDrive/SuperAI_NLP/AIFORTHAI-LST20Corpus.tar.gz

In [None]:
# Directories scanned by get_doc for the training and test splits.
train_path = '/content/LST20_Corpus/train'
test_path = '/content/LST20_Corpus/test'

def get_doc(path):
  """Read every non-hidden file in `path` into nested lists of tokens.

  Each file is split on blank lines ('\n\n') into chunks, each chunk on
  newlines into rows, and only the first tab-separated field of every row is
  kept. The split result is kept verbatim, including whatever trailing
  fragment the final separator leaves behind (get_label in this file skips
  the last chunk of each document).

  Args:
    path: directory to scan; entries whose name starts with '.' are ignored.

  Returns:
    A list with one item per file; result[file][chunk][row] is a token string.
  """
  docs = []
  for i in tqdm(list(os.scandir(path))):
    if i.name[0] != '.':
      # The context manager closes the handle even if reading fails, and the
      # explicit UTF-8 keeps the Thai text independent of the platform locale.
      with open(i.path, 'r', encoding='utf-8') as f:
        doc = f.read()
      doc = doc.split('\n\n')
      doc = [[k.split('\t')[0] for k in d.split('\n')] for d in doc]
      docs.append(doc)
  return docs

def get_label(docz):
  """Build B/I/E sentence-position tags for every document.

  The last sentence entry of each document is skipped. Every remaining
  sentence gets one tag per token: 'I_SENT' everywhere, then the last token
  is set to 'E_SENT' and the first to 'B_SENT' (so a one-token sentence ends
  up as 'B_SENT').

  Args:
    docz: list of documents, each a list of token lists.

  Returns:
    A list parallel to `docz` holding, per document, one tag list per
    labelled sentence.
  """
  label = []
  for document in docz:
    doc_tags = []
    for sentence in document[:-1]:
      tags = ['I_SENT' for _ in range(len(sentence))]
      # End is written before begin so that begin wins on one-token sentences.
      tags[-1] = 'E_SENT'
      tags[0] = 'B_SENT'
      doc_tags.append(tags)
    label.append(doc_tags)
  return label

# Load both splits and derive the per-token sentence-position tags.
docs = get_doc(train_path)
docs_test = get_doc(test_path)
label = get_label(docs)
label_test = get_label(docs_test)

# POS tag -> id table used for the tagger output in get_data, plus a padding id.
list_pos = ['AJ', 'AV', 'AX', 'CC', 'CL', 'FX', 'IJ', 'NG', 'NN', 'NU', 'PA', 'PR', 'PS', 'PU', 'VV', 'XX']
dic_pos = {j:i for i,j in enumerate(list_pos)}
dic_pos['<PAD_TOKEN>'] = len(dic_pos)
inv_dic_pos = {i:j for j,i in dic_pos.items()}

# Word -> id table built from the training split only.
# sorted() makes the ids reproducible: iterating a set of strings follows hash
# order, which changes between interpreter sessions, so an unsorted vocabulary
# would not line up with a model saved in an earlier session.
dic = {j:i for i,j in enumerate(sorted(set(x for i in docs for d in i for x in d)))}
dic['<PAD_TOKEN>'] = len(dic)
dic['<UNKNOWN>'] = len(dic)

# Label -> id table: I/B/E mark position inside a sentence, O_SENT is the
# separator appended after each sentence and P_SENT is the padding label.
dic_class = {}
dic_class['I_SENT'] = len(dic_class)
dic_class['B_SENT'] = len(dic_class)
dic_class['E_SENT'] = len(dic_class)
dic_class['O_SENT'] = len(dic_class)
dic_class['P_SENT'] = len(dic_class)
inv_dic_class = {i:j for j,i in dic_class.items()}


100%|██████████| 7588/7588 [00:01<00:00, 4329.68it/s]
100%|██████████| 966/966 [00:00<00:00, 6559.79it/s]


In [None]:
# Show the label tables (both directions) and the POS table.
for mapping in (dic_class, inv_dic_class, dic_pos):
  print(mapping)



{'I_SENT': 0, 'B_SENT': 1, 'E_SENT': 2, 'O_SENT': 3, 'P_SENT': 4}
{0: 'I_SENT', 1: 'B_SENT', 2: 'E_SENT', 3: 'O_SENT', 4: 'P_SENT'}
{'AJ': 0, 'AV': 1, 'AX': 2, 'CC': 3, 'CL': 4, 'FX': 5, 'IJ': 6, 'NG': 7, 'NN': 8, 'NU': 9, 'PA': 10, 'PR': 11, 'PS': 12, 'PU': 13, 'VV': 14, 'XX': 15, '<PAD_TOKEN>': 16}


In [None]:
# Fixed number of token positions in every encoded sample.
max_length = 1024
# Start empty; both names are rebound by the get_data call further down.
X_train = []
Y_train = []

def get_data(docz, labelz):
  """Encode documents as fixed-length (word ids, POS ids) inputs plus label ids.

  For each document the labelled sentences are concatenated, with a '_'
  separator token (label O_SENT) appended after every sentence. Words missing
  from `dic` map to '<UNKNOWN>'. POS ids come from tagging each token on its
  own with the 'lst20' tagger.

  A document of at most `max_length` tokens becomes one right-padded sample.
  A longer one is split at the middle sentence start, repeatedly, until every
  piece fits; a piece with no sentence start to split on is cut at
  `max_length`. No token is dropped.

  Args:
    docz: documents as produced by get_doc.
    labelz: tags as produced by get_label; sentences without tags are ignored
      because the two are zipped together.

  Returns:
    (X, Y, Original):
      X: list of [word_ids, pos_ids] pairs, each list `max_length` long.
      Y: list of label-id lists, each `max_length` long, parallel to X.
      Original: one raw token list per *document* (separators included), so
        it is not index-aligned with X/Y whenever a document was split.
  """
  X, Y, Original = [], [], []
  pos_cache = {}

  def pos_id(word):
    # Tokens are tagged one at a time, so the tag depends only on the token
    # itself and can be looked up once per distinct token.
    if word not in pos_cache:
      pos_cache[word] = dic_pos[pos_tag([word], corpus='lst20')[0][1]]
    return pos_cache[word]

  def split_spans(lab):
    """Return in-order (start, end) index pairs, each at most max_length long."""
    spans = []
    pending = [(0, len(lab))]
    while pending:
      lo, hi = pending.pop()
      if hi - lo <= max_length:
        spans.append((lo, hi))
        continue
      b_index = [k for k in range(lo, hi) if lab[k] == dic_class['B_SENT']]
      mid = b_index[round(len(b_index) / 2)] if b_index else lo
      if mid == lo:
        # No later sentence start inside this piece: fall back to a hard cut.
        mid = lo + max_length
      # Push the right part first so the left part is handled first.
      pending.append((mid, hi))
      pending.append((lo, mid))
    return spans

  for x, y in tqdm(zip(docz, labelz), total=len(docz)):
    string, pstag, lab, original = [], [], [], []
    for xx, yy in zip(x, y):
      original.extend(xx + ['_'])
      string.extend(dic[word] if word in dic else dic['<UNKNOWN>'] for word in xx)
      string.append(dic['_'])
      pstag.extend(pos_id(word) for word in xx)
      pstag.append(pos_id('_'))
      lab.extend(dic_class[name] for name in yy)
      lab.append(dic_class['O_SENT'])
    Original.append(original)

    for lo, hi in split_spans(lab):
      n_pad = max_length - (hi - lo)
      padded_string = string[lo:hi] + [dic['<PAD_TOKEN>']] * n_pad
      padded_pos = pstag[lo:hi] + [dic_pos['<PAD_TOKEN>']] * n_pad
      padded_label = lab[lo:hi] + [dic_class['P_SENT']] * n_pad
      X.append([padded_string, padded_pos])
      Y.append(padded_label)
  return X, Y, Original

# Encode both splits; each call returns (model inputs, label ids, raw tokens per document).
X_train, Y_train, Original_train = get_data(docs, label)
X_test, Y_test, Original_test = get_data(docs_test, label_test)

100%|██████████| 3794/3794 [01:31<00:00, 41.55it/s]
100%|██████████| 483/483 [00:04<00:00, 99.82it/s]


In [None]:
# Sanity check on the first training sample: print word id, POS tag name and raw token side by side.
# Row 0 of the sample holds word ids and row 1 POS ids; zip stops at the shortest input.
temp = np.array(X_train[0])
ori = np.array(Original_train[0])
for i,j,k in zip(temp[0,:], temp[1,:], ori):
  print(i,inv_dic_pos[j],k)


In [None]:
# Flatten the first training document into a single list of tokens.
# Plain iteration is used because converting the ragged sentence lists to a
# numpy array gives shape-dependent results (and fails on recent numpy).
[word for sentence in docs[0] for word in sentence]

In [None]:
# model = Sequential() 
# model.add(InputLayer(input_shape=(1024,)))
# model.add(Embedding(len(dic),64))
# model.add(Bidirectional(LSTM(64,return_sequences=True)))
# model.add(TimeDistributed(Dense(5)))
# model.add(Activation('softmax'))
# model.compile(optimizer="Adam",loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#               weighted_metrics=['accuracy'], run_eagerly=True)

# Two parallel integer inputs per sample: word ids and POS-tag ids, both max_length long.
input_word = Input(shape=(max_length,))
embed_word = Embedding(len(dic),64)(input_word)
input_pos = Input(shape=(max_length,))
embed_pos = Embedding(len(dic_pos),10)(input_pos)

# Join both embeddings per position, run a BiLSTM over the sequence and
# classify every position into one of the label ids in dic_class.
concat = Concatenate()([embed_word, embed_pos])
bidirec = Bidirectional(LSTM(80,return_sequences=True))(concat)
time_dis = TimeDistributed(Dense(30))(bidirec)
drop = TimeDistributed(Dropout(0.3))(time_dis)
dense = TimeDistributed(Dense(13))(drop)
dense = TimeDistributed(Dense(len(dic_class)))(dense)
act = Activation('softmax')(dense)
model = Model(inputs=[input_word, input_pos], outputs=[act])
# The model already ends in a softmax, so the loss must treat its input as
# probabilities (from_logits=False); from_logits=True would apply a second
# softmax inside the loss.
model.compile(optimizer="Adam",loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              weighted_metrics=['accuracy'], run_eagerly=True)

# input_word = Input(shape=(1024,))
# embed_word = Embedding(len(dic),64)(input_word)
# input_pos = Input(shape=(1024,))
# embed_pos = Embedding(len(dic_pos),4)(input_pos)

# concat = Concatenate()([embed_word, embed_pos])
# bidirec = Bidirectional(LSTM(128,return_sequences=True))(concat)
# bidirec = Bidirectional(LSTM(64,return_sequences=True))(bidirec)
# bidirec = Bidirectional(LSTM(32,return_sequences=True))(bidirec)
# time_dis = TimeDistributed(Dense(5))(bidirec)
# act = Activation('softmax')(time_dis)
# model = Model(inputs=[input_word, input_pos], outputs=[act])
# model.compile(optimizer="Adam",loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#               weighted_metrics=['accuracy'], run_eagerly=True)





In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1024)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1024)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1024, 64)     3132672     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 1024, 10)     170         ['input_2[0][0]']                
                                                                                              

In [None]:
# Temporarily view the test labels as an array to count each non-padding tag.
Y_test = np.array(Y_test)

for short_name in ('B', 'I', 'E', 'O'):
  print(short_name, np.sum(Y_test == dic_class[short_name + '_SENT']))

total = Y_test.size

# class_weight = {dic_class['B_SENT']: total/(np.sum(Y_test == dic_class['B_SENT'])),
#                 dic_class['I_SENT']: total/(np.sum(Y_test == dic_class['I_SENT'])),
#                 dic_class['E_SENT']: total/(np.sum(Y_test == dic_class['E_SENT'])),
#                 dic_class['O_SENT']: total/(np.sum(Y_test == dic_class['O_SENT'])),
#                 dic_class['P_SENT']: total/(np.sum(Y_test == dic_class['P_SENT'])),}

# V1 submit 0.82
# Hand-picked per-label weights, keyed by label id.
class_weight = {
    dic_class[label_name]: weight
    for label_name, weight in (
        ('B_SENT', 20.0),
        ('I_SENT', 0.5),
        ('E_SENT', 20.0),
        ('O_SENT', 30.0),
        ('P_SENT', 0.005),
    )
}

#V2
# class_weight = {dic_class['B_SENT']: 30.0,
#                 dic_class['I_SENT']: 1.0,
#                 dic_class['E_SENT']: 20.0,
#                 dic_class['O_SENT']: 30.0,
#                 dic_class['P_SENT']: 1.0,}

# Hand the labels back as nested lists, then show the tables.
Y_test = Y_test.tolist()
print(dic_class)
class_weight

B 5116
I 191333
E 5113
O 5116
{'I_SENT': 0, 'B_SENT': 1, 'E_SENT': 2, 'O_SENT': 3, 'P_SENT': 4}


{0: 0.5, 1: 20.0, 2: 20.0, 3: 30.0, 4: 0.005}

In [None]:
def gen_sample_weight(Y, class_weight):
  """Turn an array of label ids into a same-shaped array of per-token weights.

  Args:
    Y: array-like of label ids.
    class_weight: dict mapping label id -> weight.

  Returns:
    A new float32 array shaped like `Y`. Each position holds the weight of
    its label; labels missing from `class_weight` keep their own numeric
    value. `Y` itself is not modified.
  """
  labels = np.asarray(Y)
  weights = labels.astype(np.float32)  # astype copies, so `labels` stays intact
  for k, v in class_weight.items():
    # Match against the untouched labels: matching against the array being
    # written would re-map a weight that happens to equal a later label id.
    weights[labels == k] = v
  return weights

# Preview the per-token weights computed from the training labels.
gen_sample_weight(Y_train, class_weight)

array([[2.e+01, 5.e-01, 5.e-01, ..., 5.e-03, 5.e-03, 5.e-03],
       [2.e+01, 5.e-01, 5.e-01, ..., 5.e-03, 5.e-03, 5.e-03],
       [2.e+01, 5.e-01, 5.e-01, ..., 5.e-03, 5.e-03, 5.e-03],
       ...,
       [2.e+01, 5.e-01, 5.e-01, ..., 5.e-03, 5.e-03, 5.e-03],
       [2.e+01, 5.e-01, 5.e-01, ..., 5.e-03, 5.e-03, 5.e-03],
       [2.e+01, 5.e-01, 5.e-01, ..., 5.e-03, 5.e-03, 5.e-03]],
      dtype=float32)

In [None]:
# Convert all four nested lists to float32 arrays in one pass; the tuple of
# inputs is captured before any of the names is rebound.
X_train, X_test, Y_train, Y_test = (
    np.array(values).astype(np.float32)
    for values in (X_train, X_test, Y_train, Y_test)
)



In [None]:
# Train on (word ids, POS ids) with per-token weights; the test split is used as validation data.
model.fit(
    [X_train[:, 0, :], X_train[:, 1, :]],
    Y_train,
    sample_weight=gen_sample_weight(Y_train, class_weight),
    validation_data=([X_test[:, 0, :], X_test[:, 1, :]], Y_test),
    batch_size=128,
    epochs=20,
)

Epoch 1/20
 1/32 [..............................] - ETA: 7s - loss: 1.7487 - accuracy: 0.2047

  return dispatch_target(*args, **kwargs)


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f1f44408190>

In [None]:
# Per-position class scores for the test split (word ids and POS ids fed as two inputs).
y_pred = model.predict([X_test[:,0,:],X_test[:,1,:]])

In [None]:
# model.save('/content/drive/MyDrive/SuperAI_NLP/model_0999_bilstm_1.h5')

In [None]:
correct = 0
# Share of gold O_SENT positions for which the most likely predicted class is also O_SENT.
o_id = dic_class['O_SENT']
is_sep = np.array(Y_test) == o_id
total = np.sum(is_sep)
corr = np.sum(np.argmax(y_pred, axis=-1)[is_sep] == o_id)

print('ac', corr/total)

ac 0.7570367474589523


In [None]:
import pandas as pd

# Read the token-per-line test file. The context manager guarantees the handle
# is closed; UTF-8 is stated explicitly instead of relying on the platform
# default (NOTE(review): assumes the file is UTF-8 — confirm).
with open('/content/drive/MyDrive/SuperAI_NLP/ss_test.txt', 'r', encoding='utf-8') as my_file:
    r = my_file.read()

data = r.split("\n")

# Empty lines stand for a space in the running text and become the '_' token in the token list.
text = ''.join(' ' if line == '' else line for line in data)
data = ['_' if line == '' else line for line in data]

df = pd.DataFrame({'word':data})
data = np.array(data)
# Positions of the '_' tokens (name kept as in the original cell).
id = np.where(data == '_')[0]


In [None]:
inc = 0
# Start offsets of consecutive windows of max_length tokens over the test tokens.
idx = list(range(0,len(data),max_length))
res = []
pos_cache = {}  # token -> POS id, filled on first use
for i in idx:
  dat = data[i:i+max_length]
  temp = [dic[word] if word in dic else dic['<UNKNOWN>'] for word in dat]

  # Tag every token on its own, exactly as get_data does for the training
  # features; tagging the whole window at once would give the model
  # context-dependent tags it was not trained on.
  pos = []
  for word in dat.tolist():
    if word not in pos_cache:
      pos_cache[word] = dic_pos[pos_tag([word], corpus='lst20')[0][1]]
    pos.append(pos_cache[word])

  # Right-pad the (possibly shorter) last window and add the batch axis.
  lenz = len(temp)
  temp = np.expand_dims(np.array(temp + ([dic['<PAD_TOKEN>']]*(max_length-lenz))).astype(np.float32), axis = 0)
  pos = np.expand_dims(np.array(pos + ([dic_pos['<PAD_TOKEN>']]*(max_length-lenz))).astype(np.float32), axis = 0)

  pred = model.predict([temp, pos])[0]
  pred = pred.argmax(1)
  pred = [inv_dic_class[k] for k in pred]
  # Drop the predictions made for padding positions.
  pred = pred[:len(dat)]
  res.extend(pred)
  # print(pred)
  # index = np.sort(id[(id > inc) & (id < 1024 + inc)])[-1]
  # inc = i
  # print(index)

In [None]:
# Number of predictions next to the number of input tokens.
print(len(res))
print(len(data))


74405
74405


In [None]:
# Translate model label names into the names written to the submission:
# O_SENT becomes 'O' and a padding prediction falls back to 'I_SENT'.
map_class = {
    'B_SENT':'B_SENT',
    'I_SENT':'I_SENT',
    'E_SENT':'E_SENT',
    'O_SENT':'O',
    'P_SENT':'I_SENT',
    # Identity entry so that re-running this cell on already-mapped labels
    # does not raise KeyError.
    'O':'O',
}

res = [map_class[i] for i in res]

In [None]:
# Distribution of the predicted submission labels.
np.unique(res, return_counts=True)

(array(['B_SENT', 'E_SENT', 'I_SENT', 'O'], dtype='<U6'),
 array([ 1196,  1240, 70745,  1224]))

In [None]:
sub = pd.read_csv('/content/drive/MyDrive/SuperAI_NLP/ss_sample_submission.csv')
# Keep one prediction per submission row, counted from the start, instead of
# hard-coding how many surplus trailing tokens the test file produced.
# NOTE(review): presumably the surplus entries come from trailing empty lines in ss_test.txt — confirm.
assert len(res) >= len(sub), 'fewer predictions than submission rows'
sub['Predicted'] = res[:len(sub)]

In [None]:
# Write the submission to the working directory without the DataFrame index column.
sub.to_csv('submit3.csv',index=False)

In [None]:
# Display the submission frame for a visual check.
sub

Unnamed: 0,Id,Predicted
0,1,I_SENT
1,2,I_SENT
2,3,I_SENT
3,4,I_SENT
4,5,I_SENT
...,...,...
74398,74399,I_SENT
74399,74400,I_SENT
74400,74401,I_SENT
74401,74402,I_SENT




In [None]:
# Most likely class id per position, flattened over all test samples.
rez = y_pred.argmax(2).flatten()

In [None]:
import copy

# Rebuild a full tag sequence from the predicted O_SENT separators alone:
# every run of non-separator predictions is re-labelled B ... I ... E and the
# separator after it is kept as O_SENT. Predictions are class ids, so the
# separator is matched by id and written back under its dic_class name.
o_id = dic_class['O_SENT']
temp_rez = []
t = []
# Read the ids straight from y_pred so the cell can be re-run after `rez` has been rebound below.
for i in y_pred.argmax(2).flatten():
  if i == o_id:
    temp_rez.append(t)
    t = []
  else:
    t.append(i)
temp_rez.append(t)

rez = [['I_SENT']*len(i) for i in temp_rez]
for i in range(len(rez)):
  if len(rez[i]) == 0:
    # An empty run means the separator followed another separator directly;
    # that position is emitted as a single I_SENT with no separator after it.
    rez[i] = ['I_SENT']
    continue
  elif len(rez[i]) == 1: rez[i] = ['B_SENT']
  elif len(rez[i]) == 2: rez[i] = ['B_SENT','E_SENT']
  else:
    rez[i][0] = 'B_SENT'
    rez[i][-1] = 'E_SENT'
  rez[i].append('O_SENT')

# Flatten to label ids. The final run also contributes one entry that has no
# matching prediction, so the result is one element longer than the input.
temp = [dic_class[name] for run in rez for name in run]

KeyError: ignored

In [None]:
from sklearn.metrics import f1_score

# Micro-averaged F1 of the tags rebuilt in the previous cell against the flattened gold label ids.
# NOTE(review): the [:-1] presumably drops the one surplus entry the rebuild emits for its final run — confirm the lengths match.
ytest = np.array(Y_test).flatten()
ypred = temp[:-1]

f1_score(ytest, ypred, average='micro')

ValueError: ignored

In [None]:
from sklearn.metrics import f1_score

# Micro-averaged F1 over every position (padding included): gold ids vs. arg-max predictions.
ytest = np.array(Y_test).ravel()
ypred = np.argmax(y_pred, axis=2).ravel()

f1_score(ytest, ypred, average='micro')

0.9683412977867203

In [None]:
from sklearn.metrics import f1_score

# Micro-averaged F1 over every position (padding included): gold ids vs. arg-max predictions.
ytest = np.array(Y_test).ravel()
ypred = np.argmax(y_pred, axis=2).ravel()

f1_score(ytest, ypred, average='micro')

0.9763286751760564

In [None]:
from sklearn.metrics import f1_score

# Micro-averaged F1 over every position (padding included): gold ids vs. arg-max predictions.
ytest = np.array(Y_test).ravel()
ypred = np.argmax(y_pred, axis=2).ravel()

f1_score(ytest, ypred, average='micro')

0.9598194636569416

In [None]:
# Replace the in-memory model with one loaded from Drive.
# NOTE(review): the only model.save call for this path in the notebook is commented out, so the file must already exist — confirm.
model = tf.keras.models.load_model('/content/drive/MyDrive/SuperAI_NLP/model_0999_bilstm_1.h5')

In [None]:
# Example: tag the first sentence of the first training document as one sequence with the 'lst20' tagger.
pos_tag(docs[0][0],corpus='lst20')

[('เปิด', 'VV'),
 ('ขนส่ง', 'VV'),
 ('สินค้า', 'NN'),
 ('เชื่อม', 'VV'),
 (' ', 'PU'),
 ('3', 'NU'),
 (' ', 'PU'),
 ('ประเทศ', 'CL')]

IndexError: ignored

In [None]:
# Display the first training document (a list of token lists).
docs[0]