# **1. Clone Git Repositories and Prepare Files**

In [None]:
# Clone the DeepCut fork (huak95/deepcut) into ./deepcut; later cells install its
# requirements.txt and import its utils/model/train modules.
!git clone https://github.com/huak95/deepcut.git
# Clone the additional corpus repository (korakot/corpus) into ./corpus.
!git clone https://github.com/korakot/corpus.git

Cloning into 'deepcut'...
remote: Enumerating objects: 523, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 523 (delta 10), reused 0 (delta 0), pack-reused 505[K
Receiving objects: 100% (523/523), 11.54 MiB | 39.13 MiB/s, done.
Resolving deltas: 100% (275/275), done.
Cloning into 'corpus'...
remote: Enumerating objects: 26836, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 26836 (delta 16), reused 23 (delta 9), pack-reused 26806[K
Receiving objects: 100% (26836/26836), 62.86 MiB | 17.77 MiB/s, done.
Resolving deltas: 100% (670/670), done.
Checking out files: 100% (27996/27996), done.


In [None]:
# Download and UnZIP Kaggle Word SegmentDataSet (LST20)
!gdown --id '1OxAPWlilK1gAl0_hDlMEHAUe_r-WogoG'
!unzip '/content/dataset.zip'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: dataset/train/T10215.txt  
  inflating: __MACOSX/dataset/train/._T10215.txt  
  inflating: dataset/train/T00760.txt  
  inflating: __MACOSX/dataset/train/._T00760.txt  
  inflating: dataset/train/T10573.txt  
  inflating: __MACOSX/dataset/train/._T10573.txt  
  inflating: dataset/train/T00774.txt  
  inflating: __MACOSX/dataset/train/._T00774.txt  
  inflating: dataset/train/T10567.txt  
  inflating: __MACOSX/dataset/train/._T10567.txt  
  inflating: dataset/train/T11679.txt  
  inflating: __MACOSX/dataset/train/._T11679.txt  
  inflating: dataset/train/T12370.txt  
  inflating: __MACOSX/dataset/train/._T12370.txt  
  inflating: dataset/train/T12416.txt  
  inflating: __MACOSX/dataset/train/._T12416.txt  
  inflating: dataset/train/T10201.txt  
  inflating: __MACOSX/dataset/train/._T10201.txt  
  inflating: dataset/train/T12371.txt  
  inflating: __MACOSX/dataset/train/._T12371.txt  
  inflating: dataset/trai

# **2. Install Dependencies**

In [None]:
# install DeepCut Requirements
!pip install -r /content/deepcut/requirements.txt -qq

In [None]:
%cd deepcut/deepcut
%ls

/content/deepcut/deepcut
deepcut.py  __init__.py  model.py  stop_words.py  train.py  utils.py  [0m[01;34mweight[0m/


In [None]:
# Show the current working directory; the bare `import utils` / `from train import *`
# in the import cell rely on the kernel being inside deepcut/deepcut.
%pwd

'/content/deepcut/deepcut'

# **3. Import Libraries**

In [None]:
#!/usr/bin/env python
# encoding: utf-8
import os
from glob import glob
import pandas as pd
from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
import utils
from train import *

if __package__ != 'deepcut':
    from utils import create_n_gram_df, CHAR_TYPE_FLATTEN, CHARS_MAP, CHAR_TYPES_MAP
    from model import get_convo_nn2
else:
    from .utils import create_n_gram_df, CHAR_TYPE_FLATTEN, CHARS_MAP, CHAR_TYPES_MAP
    from .model import get_convo_nn2


# **4. Load LST20 Corpus**

### DataFrame แบบแบ่ง by word (1 row คือ 1 word)

ผู้เขียนได้เพิ่ม column sentenceBound เข้าไป เพื่อให้ง่ายต่อการใช้ DataFrame (ปกติ sentence จะ bound ด้วย blank line)
                                                                                                                                                                  
***value ใน sentenceBound***

*   *B-SEN* สำหรับคำที่เริ่มต้นประโยค
*   *I-SEN* สำหรับคำที่อยู่ภายในประโยค (ไม่ใช่คำเริ่มต้นหรือคำจบประโยค)
*   *E-SEN* สำหรับคำจบประโยค

## **!!!! _ (วรรค) จะถูกนับเป็น I-SEN**






In [None]:
# Collect the LST20 file paths of each split. glob() returns paths in arbitrary
# order, so sort them to make the resulting DataFrames/CSVs reproducible.
path_train = sorted(glob('/content/dataset/train/*'))
path_test  = sorted(glob('/content/dataset/test/*'))
path_eval  = sorted(glob('/content/dataset/eval/*'))
print(path_train[1])

/content/dataset/train/T00432.txt


In [None]:
def create_dataframe(list_path):
    """Read tab-separated LST20 token files into one DataFrame (one row per token).

    Every non-blank line is stripped and split on tabs into up to four fields:
    word, POS tag, named-entity tag and sentence-segmentation tag. Blank and
    whitespace-only lines are skipped.

    Args:
        list_path: iterable of paths to the text files to read.

    Returns:
        pandas.DataFrame with columns ['word', 'pos', 'entity-name',
        'sentence-seg']. Missing tag fields are filled with 'O'.
    """
    columns = ['word', 'pos', 'entity-name', 'sentence-seg']
    data_list = []
    for item in list_path:
        # The corpus is Thai text: decode explicitly as UTF-8 instead of relying
        # on the platform's default encoding.
        with open(item, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    # Covers '\n' as well as whitespace-only lines, which would
                    # otherwise become rows with an empty word.
                    continue
                fields = stripped.split('\t')
                # Pad short rows so the frame always has four columns, even when
                # no line in the input carries every tag.
                fields.extend([None] * (len(columns) - len(fields)))
                data_list.append(fields)
    df = pd.DataFrame(data_list, columns=columns)
    for tag_column in columns[1:]:
        df[tag_column] = df[tag_column].fillna('O')
    return df

In [None]:
# train_df = create_dataframe(path_train)
# test_df  = create_dataframe(path_test)
# eval_df  = create_dataframe(path_eval)

In [None]:
# print(f'train_df.shape: {train_df.shape}')
# train_df.head()

In [None]:
!mkdir '/content/LST20/'

In [None]:
def mkdir_not_error(path):
    """Create directory `path`, including missing parents, on a best-effort basis.

    An already-existing directory is not an error. Any other OS-level failure
    (e.g. permission denied, or a regular file in the way) is reported as a
    RuntimeWarning instead of being silently discarded.

    Args:
        path: directory to create.
    """
    import warnings  # local import keeps this cell self-contained

    try:
        os.makedirs(path, exist_ok=True)
    except OSError as err:
        warnings.warn(f'could not create directory {path!r}: {err}', RuntimeWarning)

def df_to_csv(paths, output_root='/content/LST20'):
    """Convert one dataset split into a character-level CSV.

    The split name is taken from the directory that contains the first path
    (e.g. '/content/dataset/train/T00432.txt' -> 'train'). The files are read
    with create_dataframe, the 'word' column is passed to
    create_char_dataframe, and the result is written to
    '<output_root>/<split>/df_best_LST20_<split>.csv'.

    Args:
        paths: non-empty sequence of file paths belonging to a single split.
        output_root: directory under which the per-split folder is created.

    Raises:
        ValueError: if `paths` is empty.
    """
    if len(paths) == 0:
        raise ValueError('df_to_csv: `paths` is empty; check the dataset location')
    filename = os.path.basename(os.path.dirname(paths[0]))
    print(f'filename: {filename}')
    word_df = create_dataframe(paths)
    words = word_df['word'].to_numpy()
    # NOTE(review): create_char_dataframe presumably comes from `from train import *` - confirm.
    char_df = create_char_dataframe(words)
    split_dir = os.path.join(output_root, filename)
    mkdir_not_error(output_root)
    mkdir_not_error(split_dir)
    char_df.to_csv(os.path.join(split_dir, f'df_best_LST20_{filename}.csv'))

#### Create all CSVs in DeepCut format

In [None]:
!rm -rf '/content/LST20'

df_to_csv(path_train)
df_to_csv(path_test)
df_to_csv(path_eval)

filename: train
filename: test
filename: eval


# **5. Fine-Tune Model**

## Prepare Feature

In [None]:
# x_char, x_type, y = prepare_feature('/content/deepcut/deepcut/cleaned_data')

In [None]:
# utils.CHARS_MAP.get(x_char, 80)

In [None]:
!cp -r /content/LST20/eval /content/LST20/val

In [None]:
%%time
# Fine-tune on the CSVs under /content/LST20 and keep the returned model for the
# save cell below. The logged run used 10 epochs with batch size 4096 and saved
# the best val_loss checkpoint to ../weight/model_weight.h5.
# NOTE(review): train_model presumably comes from `from train import *` - confirm.
model = train_model('/content/LST20', verbose=1)

train with 10 epochs and 4096 batch size
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.03713, saving model to ../weight/model_weight.h5
Epoch 2/10
Epoch 00002: val_loss improved from 0.03713 to 0.02946, saving model to ../weight/model_weight.h5
Epoch 3/10
Epoch 00003: val_loss improved from 0.02946 to 0.02627, saving model to ../weight/model_weight.h5
Epoch 4/10
Epoch 00004: val_loss improved from 0.02627 to 0.02506, saving model to ../weight/model_weight.h5
Epoch 5/10
Epoch 00005: val_loss improved from 0.02506 to 0.02385, saving model to ../weight/model_weight.h5
Epoch 6/10
Epoch 00006: val_loss improved from 0.02385 to 0.02303, saving model to ../weight/model_weight.h5
Epoch 7/10
Epoch 00007: val_loss improved from 0.02303 to 0.02222, saving model to ../weight/model_weight.h5
Epoch 8/10
Epoch 00008: val_loss did not improve from 0.02222
Epoch 9/10
Epoch 00009: val_loss improved from 0.02222 to 0.02207, saving model to ../weight/model_weight.h5
Epoch 10/10
Epoch 00010: val_

In [None]:
# Confirm the working directory; the weight paths used below are relative to it.
!pwd

/content/deepcut/deepcut


In [None]:
!mkdir last_weight

# Save Model

In [None]:
# Check the working directory before saving: the save path is relative to it.
!pwd

/content/deepcut/deepcut


In [None]:
# Save the fine-tuned weights under the prefix ../../last_weight/last_weight; the
# zip listing further down shows the resulting `checkpoint`, `.index` and `.data-*` files.
model.save_weights('../../last_weight/last_weight')

In [None]:
!zip -r ../../content/last_weight.zip ../../content/last_weight

UsageError: Line magic function `%zip` not found.


In [None]:
# Check the working directory so the relative paths passed to zip can be verified.
!pwd

/content/deepcut/deepcut


In [None]:
# Archive the saved weight directory (two levels above the working directory)
# into last_weight.zip next to it.
!zip -r ../../last_weight.zip ../../last_weight

  adding: ../../last_weight/ (stored 0%)
  adding: ../../last_weight/checkpoint (deflated 38%)
  adding: ../../last_weight/last_weight.data-00000-of-00001 (deflated 9%)
  adding: ../../last_weight/last_weight.index (deflated 76%)


# Load Model

#### **Load test**

In [None]:
!gdown --id '1zcSGEPXk-DotYSgkU3vHxLiy6kmYqNWX'
!unzip 'nlp-1-word-segmentation-without-dataset.zip'

Downloading...
From: https://drive.google.com/uc?id=1zcSGEPXk-DotYSgkU3vHxLiy6kmYqNWX
To: /content/deepcut/deepcut/nlp-1-word-segmentation-without-dataset.zip
  0% 0.00/101k [00:00<?, ?B/s]100% 101k/101k [00:00<00:00, 76.4MB/s]
Archive:  nlp-1-word-segmentation-without-dataset.zip
  inflating: nlp-1-word-segmentation/ws_list.txt  
  inflating: nlp-1-word-segmentation/ws_sample_submission.csv  
  inflating: nlp-1-word-segmentation/ws_test.txt  


In [None]:
# Load the test text. The archive is extracted into the current working
# directory, so the folder is ./nlp-1-word-segmentation/ (not ../../).
# Decode the Thai text explicitly as UTF-8; `with` closes the file even on error.
path = 'nlp-1-word-segmentation/'
with open(path + 'ws_test.txt', 'r', encoding='utf-8') as f:
    txtData = f.read()

FileNotFoundError: ignored

In [None]:
def tokenize_word(text, weight_path="last_weight", model=None):
    """Split `text` into word tokens using the fine-tuned network.

    Args:
        text: string to segment.
        weight_path: location handed to `load_weights` when `model` is None.
            NOTE(review): the save cell writes to '../../last_weight/last_weight';
            confirm that the default resolves from the working directory in use.
        model: optional, already-loaded model. Pass it to avoid rebuilding the
            network and re-reading the weights on every call.

    Returns:
        List of tokens in input order; an empty list for empty `text`.
    """
    if not text:
        return []
    if model is None:
        model = get_convo_nn2()
        model.load_weights(weight_path)
    x_char, x_type = utils.create_feature_array(text)

    # Threshold the flattened predictions at 0.5 to get one 0/1 flag per position.
    # NOTE(review): presumably each flag marks a character that starts a word - confirm.
    y_predict = model.predict([x_char, x_type])
    y_predict = (y_predict.ravel() > 0.5).astype(int)
    # Shift the flags left by one so flag i tells whether character i closes a
    # word; the final character always closes the last word.
    word_end = y_predict[1:].tolist() + [1]
    tokens = []
    word = ''
    for char, w_e in zip(text, word_end):
        word += char
        if w_e:
            tokens.append(word)
            word = ''
    return tokens

In [None]:
# Smoke-test the tokenizer on a short Thai phrase; the cell output is the token list.
tokenize_word('นางสุดารัตน์เกยุราพันธ์ รมว.สาธารณสุข')



['นาง', 'สุดารัตน์', 'เกยุราพันธ์ ', 'รมว.', 'สาธารณสุข']