In [None]:
import sys
import numpy as np
import pandas as pd
import random
import collections
import re
from tqdm import tqdm
from nltk.tokenize import word_tokenize

In [None]:
from enum import Enum

In [None]:
class Token(Enum):
    """Reserved vocabulary symbols and the fixed indices paired with them.

    Every special symbol is declared as two members: ``*_TOK`` holds its
    string form and ``*_IDX`` holds its integer index. Read either one
    through ``.value``. ``Preprocessor.build_set`` places the four strings
    at the front of its token list in this same order, which is what makes
    the ``*_IDX`` values line up with the generated ``word2idx`` table.
    """
    # NOTE(review): presumably the padding symbol for batching; it is not
    # used anywhere in the visible code -- confirm against callers.
    PAD_TOK = '<PAD>'
    PAD_IDX = 0
    # Substituted by Preprocessor.encode_sen for tokens missing from word2idx.
    UNK_TOK = '<UNK>'
    UNK_IDX = 1
    # Index prepended to every sentence by Preprocessor.encode_sen.
    SOS_TOK = '<SOS>'
    SOS_IDX = 2
    # Index appended to every sentence by Preprocessor.encode_sen.
    EOS_TOK = '<EOS>'
    EOS_IDX = 3

In [None]:
class Preprocessor:
    """Build a token vocabulary from raw sentences and map tokens to indices.

    Each sentence is lower-cased and tokenized once at construction time.
    A token enters the vocabulary when it occurs at least ``th`` times over
    the whole data set and contains no ASCII digit. The four reserved symbols
    of ``Token`` always occupy indices 0-3; the remaining tokens receive the
    following indices in shuffled (``random.shuffle``) order.
    """

    def __init__(self, data, tokenize, th=3):
        """Tokenize ``data`` and build the lookup tables.

        Args:
            data: Iterable of sentence strings.
            tokenize: Callable turning one string into a sequence of tokens.
            th: Minimum number of occurrences a token needs to be kept.
        """
        self.token_data = [tokenize(sen.lower()) for sen in data]
        self.tokenize = tokenize
        self.th = th

        self.word2idx, self.idx2word = self.build_set()

    def build_set(self):
        """Create the token->index and index->token tables.

        Returns:
            A ``(word2idx, idx2word)`` tuple of dicts that are inverses of
            each other.
        """
        counts = collections.Counter()
        for sen in self.token_data:
            counts.update(sen)

        reserved = [
            Token.PAD_TOK.value,
            Token.UNK_TOK.value,
            Token.SOS_TOK.value,
            Token.EOS_TOK.value,
        ]

        # Compile once instead of re-resolving the pattern for every token.
        has_digit = re.compile(r'[0-9]').search

        # Reserved symbols are skipped here: if the tokenizer ever emitted one,
        # the duplicate entry would overwrite its fixed index (0-3) and
        # disagree with the Token.*_IDX constants used by encode_sen.
        valid_tok = [
            tok
            for tok, count in counts.items()
            if count >= self.th
            and has_digit(tok) is None
            and tok not in reserved
        ]

        random.shuffle(valid_tok)
        tok_list = reserved + valid_tok

        word2idx = {tok: idx for idx, tok in enumerate(tok_list)}
        idx2word = {idx: tok for tok, idx in word2idx.items()}

        return word2idx, idx2word

    def __len__(self):
        """Return the number of tokenized sentences held by this instance."""
        return len(self.token_data)

    def get_size(self):
        """Return the vocabulary size, reserved symbols included."""
        return len(self.word2idx)

    def encode_sen(self, sen):
        """Encode one tokenized sentence as a list of indices.

        Args:
            sen: Iterable of tokens. A raw string would be iterated character
                by character, so tokenize it first.

        Returns:
            ``[SOS index, <token indices>, EOS index]``; tokens missing from
            the vocabulary are replaced by the ``<UNK>`` index.
        """
        unk_idx = self.word2idx[Token.UNK_TOK.value]
        body = [self.word2idx.get(tok, unk_idx) for tok in sen]
        return [Token.SOS_IDX.value] + body + [Token.EOS_IDX.value]

    def encode(self):
        """Encode every stored sentence; see ``encode_sen`` for the format."""
        return [self.encode_sen(sen) for sen in self.token_data]

    def decode(self, idx_list):
        """Map indices back to tokens.

        Raises:
            KeyError: If an index is not present in ``idx2word``.
        """
        return [self.idx2word[idx] for idx in idx_list]
    