In [1]:
import os
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.sequence import pad_sequences

from drivers.loaders.imdb import IMDB
from drivers.loaders.newsspace200 import Newsspace200
from drivers.loaders.sentimentMD import SentimentMD
from drivers.loaders.sentiment140 import Sentiment140

from drivers.tokenizers.word_piece_vocab import WordPieceVocab
from drivers.tokenizers.word_level_vocab import WordLevelVocab
from drivers.tokenizers.unigram_vocab import UnigramVocab
from drivers.tokenizers.bpe_vocab import BPEVocab

from drivers.models.simple import Simple

In [2]:
VOCAB_SIZE = 1000
UNK_TOKEN = "[UNK]"
PATH_VOCABS = "vocabs/"
PATH_ENCODES = "encodes/"
MAX_PEDDING_RATIO = 0.8

In [3]:
def encode(model, data):
    result = []
    for item in data:
        output = model.encode(item)
        result.append(output.ids)
    return result

# Data loading

In [4]:
dbs = [
    { "db" : SentimentMD("data") },
    { "db" : Sentiment140("data") },
    { "db" : Newsspace200("data") },
    { "db" : IMDB("data") }
]

# Train tokenizers

In [5]:
for item in dbs:
    item["vocabs"] = [
        WordPieceVocab(item["db"].get_train()["text"].values, UNK_TOKEN, VOCAB_SIZE),
        WordLevelVocab(item["db"].get_train()["text"].values, UNK_TOKEN, VOCAB_SIZE),
        UnigramVocab(item["db"].get_train()["text"].values, UNK_TOKEN, VOCAB_SIZE),
        BPEVocab(item["db"].get_train()["text"].values, UNK_TOKEN, VOCAB_SIZE)
    ] 

In [6]:
for item in dbs:
    for vocab in item["vocabs"]:
        file_name_vocabs = PATH_VOCABS + vocab.name + "_" + item["db"].name + ".json"
        print(file_name_vocabs)
        if os.path.isfile(file_name_vocabs) == False:
            vocab.train()
            print("TRAINED:", item["db"].name)
            vocab.save(file_name_vocabs)
            print("SAVED:", item["db"].name)
        else:
            vocab.load(file_name_vocabs)
            print("LOADED:", item["db"].name)

vocabs/word_piece_IMDB.json
LOADED: IMDB
vocabs/word_level_IMDB.json
LOADED: IMDB
vocabs/unigram_IMDB.json
LOADED: IMDB
vocabs/bpe_IMDB.json
LOADED: IMDB


In [7]:
for item in dbs:
    item["encodes"] = []
    for vocab in item["vocabs"]:
        file_name_encodes_train = PATH_ENCODES + vocab.name + "_" + item["db"].name + "_train.json"
        file_name_encodes_test = PATH_ENCODES + vocab.name + "_" + item["db"].name + "_test.json"
        item["encodes"].append({
            "train" : pd.DataFrame(),
            "test" : pd.DataFrame(),
        })
        if os.path.isfile(file_name_encodes_train) == False or os.path.isfile(file_name_encodes_test) == False :
            item["encodes"][-1]["train"]["X"] = encode(vocab, item["db"].get_train()["text"].values)
            item["encodes"][-1]["train"]["y"] = item["db"].get_train()["label"].values
            item["encodes"][-1]["train"].to_json(file_name_encodes_train, orient="records", lines=True)

            item["encodes"][-1]["test"]["X"] = encode(vocab, item["db"].get_test()["text"].values)
            item["encodes"][-1]["test"]["y"] = item["db"].get_test()["label"].values
            item["encodes"][-1]["test"].to_json(file_name_encodes_test, orient="records", lines=True)
            print("ENCODED (CREATED AND LOADED):", 
                  file_name_encodes_train, 
                  file_name_encodes_test,
                  vocab.name, 
                  item["db"].name)
        else:                        
            item["encodes"][-1]["train"] = pd.read_json(file_name_encodes_train, orient="records", lines=True)
            item["encodes"][-1]["test"] = pd.read_json(file_name_encodes_test, orient="records", lines=True)
            print("ENCODED (LOADED):", 
                  file_name_encodes_train, 
                  file_name_encodes_test, 
                  vocab.name, 
                  item["db"].name)

        tmp_sorted = list(item["encodes"][-1]["train"].X.map(len).sort_values())
        index = round(len(tmp_sorted) * MAX_PEDDING_RATIO)
        item["encodes"][-1]["train"]["X"] = list(pad_sequences(item["encodes"][-1]["train"]["X"].values, 
                                                               maxlen=tmp_sorted[index]))
        item["encodes"][-1]["test"]["X"] = list(pad_sequences(item["encodes"][-1]["test"]["X"].values, 
                                                              maxlen=tmp_sorted[index]))
        print("PADDED TRAIN AND TEST: ", tmp_sorted[index], vocab.name, item["db"].name)                                 
    # break

ENCODED (LOADED): encodes/word_piece_IMDB_train.json encodes/word_piece_IMDB_test.json word_piece IMDB
PADDED TRAIN AND TEST:  694 word_piece IMDB
ENCODED (LOADED): encodes/word_level_IMDB_train.json encodes/word_level_IMDB_test.json word_level IMDB
PADDED TRAIN AND TEST:  412 word_level IMDB
ENCODED (LOADED): encodes/unigram_IMDB_train.json encodes/unigram_IMDB_test.json unigram IMDB
PADDED TRAIN AND TEST:  720 unigram IMDB
ENCODED (LOADED): encodes/bpe_IMDB_train.json encodes/bpe_IMDB_test.json bpe IMDB
PADDED TRAIN AND TEST:  650 bpe IMDB


# Train and Test

In [8]:
 for item in dbs:
    for i in range(len(item["vocabs"])):
        simple_name = "Simple_" + item["db"].name + "_" + item["vocabs"][i].name
        simple = Simple(vocab_size=item["vocabs"][i].vocab_size, 
                        input_lenght=len(item["encodes"][i]["train"]["X"].values[0]), 
                        embedding_size=8,
                        output_size=item["db"].get_labels(),
                        repeate=2,
                        name=simple_name)
        
        simple.set_data(train_X=np.array([item for item in item["encodes"][i]["train"]["X"].values]), 
                        train_y=np.array(item["encodes"][i]["train"]["y"].values), 
                        test_X=np.array([item for item in item["encodes"][i]["test"]["X"].values]), 
                        test_y=np.array(item["encodes"][i]["test"]["y"].values))

        history = simple.fit()
        for h in history: print(h.history)
    # break

Model: "Simple_IMDB_word_piece"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 694, 8)            8000      
_________________________________________________________________
flatten (Flatten)            (None, 5552)              0         
_________________________________________________________________
dense (Dense)                (None, 64)                355392    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 363,457
Trainable params: 363,457
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Model: "Simple_IMDB_word_piece"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (Non

InvalidArgumentError:  indices[15,439] = 1000 is not in [0, 1000)
	 [[node Simple_IMDB_unigram/embedding_4/embedding_lookup (defined at f:\projects\POC\evaltok\drivers\models\simple.py:79) ]] [Op:__inference_train_function_30448]

Errors may have originated from an input operation.
Input Source operations connected to node Simple_IMDB_unigram/embedding_4/embedding_lookup:
 Simple_IMDB_unigram/embedding_4/embedding_lookup/30219 (defined at C:\Users\opell.DESKTOP-UEQ8DPV\anaconda3\envs\E01\lib\contextlib.py:112)

Function call stack:
train_function
