# English - Alice in Wonderland

## Data preparation

In [2]:
from importlib import reload

In [1]:
import preprocessor
preprocessor = reload(preprocessor)


# Build a Preprocessor for the English corpus file; "eng" is the language tag
# handed to the project-local preprocessor module.
pp = preprocessor.Preprocessor("data/original/alice_in_wonderland.txt", "eng")
# process() yields the cleaned corpus that split() consumes.
cleaned_corpus = pp.process()
# presumably writes the train/test files under data/processed/ that later
# cells read — TODO confirm against preprocessor.py
pp.split(cleaned_corpus)

## Subword segmentation

In [3]:
import sentpiece
sentpiece = reload(sentpiece)


# Configuration for the English SentencePiece run.
# NUM_CHARS is passed as the third positional argument of train_model; the
# Bengali section passes its vocab_size in that same position.
NUM_CHARS = 100
TRAIN_DATA_PATH = "data/processed/eng_train.txt"
MODEL_NAME = "eng_model_sml"
# Bare file name (no directory). The LM-training cell later reads
# "data/segmented/eng_sml.txt", so segmentation() presumably writes into
# data/segmented/ — TODO confirm against sentpiece.py
SEG_DATA_PATH = "eng_sml.txt"


# Train the segmentation model on the training split.
sentpiece.train_model(TRAIN_DATA_PATH, MODEL_NAME, NUM_CHARS)

In [4]:
# Move every file whose name starts with MODEL_NAME into spm_models/.
# The spm_models/ directory has to exist already for this `mv` to succeed.
!mv $MODEL_NAME* spm_models/

In [5]:
# Segment the training split with the model named MODEL_NAME; SEG_DATA_PATH is
# presumably the name of the segmented output file — TODO confirm.
sentpiece.segmentation(TRAIN_DATA_PATH, MODEL_NAME, SEG_DATA_PATH)

## Train LM

In [18]:
# Inputs for the RNNLM training script.
TRAIN_SEG_DATA_PATH = "data/segmented/eng_sml.txt"
# Despite its name, the trainer reports this value as "Hidden layer size"
# in the run log printed below this cell.
NUM_LAYERS = 40
CLASS = 9999
BPTT = 3


# NOTE(review): the validation file is the unsegmented
# data/processed/eng_test.txt while training uses segmented text; the log
# below shows VALID entropy far above TRAIN entropy — confirm whether the test
# split should be segmented first.
# NOTE(review): this call passes 5 arguments, whereas the Bengali training
# cell passes 6 to the same script (an extra output name in third position)
# — confirm which form train_script.sh currently expects.
!bash train_script.sh $TRAIN_SEG_DATA_PATH "data/processed/eng_test.txt" $NUM_LAYERS $CLASS $BPTT

rm: cannot remove 'rnnlm/models/model': No such file or directory
rm: cannot remove 'rnnlm/models/model.output.txt': No such file or directory
debug mode: 2
train file: data/segmented/eng_sml.txt
valid file: data/processed/eng_test.txt
class size: 9999
Hidden layer size: 40
BPTT: 3
Rand seed: 1
rnnlm file: model
Starting training using file data/segmented/eng_sml.txt
Vocab size: 98
Words in train file: 70003
Iter:   0	Alpha: 0.100000	   TRAIN entropy: 4.9914    Words/sec: 1226.2   VALID entropy: 11.5537
Iter:   1	Alpha: 0.100000	   TRAIN entropy: 4.1096    Words/sec: 1382.2   VALID entropy: 12.4125
Iter:   2	Alpha: 0.050000	   TRAIN entropy: 4.0804    Words/sec: 1380.0   VALID entropy: 12.5771

real	2m43.524s
user	2m42.983s
sys	0m0.176s


In [19]:
# Name of the LM's companion output file, derived from MODEL_NAME.
MODEL_OUTPUT = MODEL_NAME + ".output.txt"

# The training log above reports "rnnlm file: model"; rename that file and
# its model.output.txt companion after MODEL_NAME.
!mv model $MODEL_NAME
!mv model.output.txt $MODEL_OUTPUT

# Archive both renamed files under rnnlm_models/.
!mv $MODEL_NAME $MODEL_OUTPUT rnnlm_models/

## Data generation

In [21]:
GEN_DATA_PATH = "data/generated/eng_model_sml/"
MODEL_PATH = f"./rnnlm_models/{MODEL_NAME}"

!bash gen_script.sh $GEN_DATA_PATH $MODEL_PATH

^C


In [22]:
# Decoding generated texts
#
# Decode with the SentencePiece model trained in this section (MODEL_NAME);
# no model named "eng_model_chr" is trained anywhere in this notebook, so
# referring to it only works when a stale file happens to be on disk.

sentpiece.desegmentation(GEN_DATA_PATH + "100.txt", MODEL_NAME)

# Bengali

In [1]:
from importlib import reload

In [2]:
import preprocessor
preprocessor = reload(preprocessor)


# Same pipeline as the English section, applied to the Bengali corpus with
# the "bng" language tag.
pp = preprocessor.Preprocessor("data/original/bengali_corpus.txt", "bng")
cleaned_corpus = pp.process()
# The two numbers printed below this cell come from somewhere inside these
# calls — presumably the train/test split sizes; TODO confirm.
pp.split(cleaned_corpus)

10861 2716


## Subword segmentation

In [3]:
import sentpiece
sentpiece = reload(sentpiece)

In [None]:
# character-level
TRAIN_DATA_PATH = "data/processed/bng_train.txt"
TEST_DATA_PATH = "data/processed/bng_test.txt"
models = []
test_models = []
seg_train_path = []
seg_test_path = []
start, stop, step = 50, 100, 50

# One outer pass per split: every train-split model is trained before any
# test-split model. Each entry is
# (corpus path, model-name prefix, segmented-file prefix, name list, path list).
# NOTE(review): a separate SentencePiece model is trained on the test split
# rather than reusing the train-split model — confirm this is intended.
for corpus_path, model_prefix, seg_prefix, name_list, path_list in (
    (TRAIN_DATA_PATH, "bng_train_", "bng_tr_chr_", models, seg_train_path),
    (TEST_DATA_PATH, "bng_test_", "bng_te_chr_", test_models, seg_test_path),
):
    for vocab_size in range(start, stop, step):
        MODEL_NAME = f"{model_prefix}{vocab_size}"
        SEG_DATA_PATH = f"{seg_prefix}{vocab_size}.txt"
        sentpiece.train_model(corpus_path, MODEL_NAME, vocab_size, lang="bng")
        # Remember the model name and its segmented-output file name for the
        # cells that move, segment and train on them.
        name_list.append(MODEL_NAME)
        path_list.append(SEG_DATA_PATH)

In [None]:
import shutil, glob, os

# Collect every file produced by the SentencePiece runs into spm_models/.
cwd = os.getcwd()  # loop-invariant, so resolve it once
dst = os.path.join(cwd, "spm_models")
# shutil.move cannot place a file inside a folder that does not exist yet.
os.makedirs(dst, exist_ok=True)

for model in models+test_models:
    # Relative pattern: matches files in the current working directory whose
    # names start with the model name.
    for file in glob.glob(f'{model}*'):
        shutil.move(os.path.join(cwd, file), os.path.join(dst, file))

In [None]:
# Segment each split with the SentencePiece models trained on that same
# split: all train-split files first, then all test-split files.
for data_path, model_names, seg_names in (
    (TRAIN_DATA_PATH, models, seg_train_path),
    (TEST_DATA_PATH, test_models, seg_test_path),
):
    for model, seg_path in zip(model_names, seg_names):
        sentpiece.segmentation(data_path, model, seg_path)

## Train LM

In [None]:
# List the segmented training file names, one per trained model, that the
# training grid below iterates over.
for seg_path in seg_train_path[:len(models)]:
    print(seg_path)

In [None]:
import subprocess

import itertools
import os

# Hyper-parameter grid: every combination below is trained once per
# (train, test) file pair, i.e. 2 * 6 * 2 = 24 runs per pair.
NUM_LAYERS = [60,70]  # handed to train_script.sh as the `hid` argument
CLASS = [50,6000,7000,8000,9000,9999]
BPTT = [3,4]

dir_path="data/segmented/"

for train_file, test_file in zip(seg_train_path, seg_test_path):
    # File name without directory or extension. splitext leaves a name that
    # has no "." intact, whereas slicing up to rfind('.') == -1 would silently
    # drop its last character.
    stem = os.path.splitext(os.path.basename(train_file))[0]
    # product() iterates hid slowest and bp fastest, like nested for-loops.
    for hid, cl, bp in itertools.product(NUM_LAYERS, CLASS, BPTT):
        fname = f"hd{hid}_cl{cl}_{bp}_{stem}"
        print(fname)
        args = ['bash', 'train_script.sh', dir_path+train_file, dir_path+test_file, fname, str(hid), str(cl), str(bp)]
        p = subprocess.run(args)
        print(p)
        if p.returncode != 0:
            # Keep the grid going, but make the failed configuration easy to spot.
            print(f"WARNING: {fname} exited with status {p.returncode}")


## Data generation

In [None]:
# Generate text from one specific model. Its name follows the
# hd{hid}_cl{cl}_{bp}_{stem} scheme used by the training grid, i.e.
# hid=70, cl=6000, bp=3 on bng_tr_chr_50.
# presumably train_script.sh stored that model under rnnlm_models/ — TODO confirm
GEN_DATA_PATH = "data/generated/hd70_cl6000_3_bng_tr_chr_50/"
MODEL_PATH = f"./rnnlm_models/hd70_cl6000_3_bng_tr_chr_50"

from pathlib import Path
# Create the output directory up front; a no-op when it already exists.
Path(GEN_DATA_PATH).mkdir(parents=True, exist_ok=True)

!bash gen_script.sh $GEN_DATA_PATH $MODEL_PATH

In [None]:
# Decoding generated texts
# Desegment the 100.txt sample with "bng_train_50", the name given to the
# train-split SentencePiece model for vocab size 50.

sentpiece.desegmentation("data/generated/hd70_cl6000_3_bng_tr_chr_50/100.txt", "bng_train_50")

## OOV comparison

In [None]:
from collections import Counter

def get_vocab(path, encoding="utf-8"):
    """Count the whitespace-separated tokens of a text file.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII corpora decode the same way regardless of the
            platform's default encoding.

    Returns:
        A ``collections.Counter`` mapping each token to its number of
        occurrences in the file.
    """
    with open(path, encoding=encoding) as f:
        return Counter(f.read().split())

# Token counts for the Bengali train and test splits.
TRAIN_DATA_PATH = "data/processed/bng_train.txt"
TEST_DATA_PATH = "data/processed/bng_test.txt"
train_vocab, test_vocab = map(get_vocab, (TRAIN_DATA_PATH, TEST_DATA_PATH))
# Number of distinct tokens in each split (train first, then test).
print(*map(len, (train_vocab, test_vocab)))

In [None]:
def compute_oov_rate(train_vocab, test_vocab):
    """Return the token-level out-of-vocabulary rate of ``test_vocab``.

    Args:
        train_vocab: Mapping from token to count for the reference vocabulary;
            only its keys are used.
        test_vocab: Mapping from token to count for the evaluated text.

    Returns:
        The summed count of test tokens that are absent from ``train_vocab``,
        divided by the summed count of all test tokens. Returns 0.0 when
        ``test_vocab`` holds no tokens, instead of dividing by zero.
    """
    total = sum(test_vocab.values())
    if total == 0:
        return 0.0
    # Direct membership test on the mapping: constant time per token, rather
    # than scanning a list of OOV words for every test token.
    count_oov = sum(v for k, v in test_vocab.items() if k not in train_vocab)
    return count_oov / total

# OOV rate of the test split against train_vocab, shown as the cell's output.
compute_oov_rate(train_vocab, test_vocab)

In [None]:
def augment_vocab(train_vocab, add_vocab):
    """Add the counts of ``add_vocab`` into ``train_vocab`` in place.

    Args:
        train_vocab: Mapping from token to count; it is modified in place.
            Pass a copy to keep the original untouched.
        add_vocab: Mapping from token to count whose counts are added.

    Returns:
        The same ``train_vocab`` object, where every token of ``add_vocab``
        now has its previous count (0 if it was absent) plus the added count.
    """
    for k, v in add_vocab.items():
        # Plain assignment on purpose: Counter.update() *adds* to the stored
        # count, so feeding it "old + v" would leave 2*old + v behind.
        train_vocab[k] = train_vocab.get(k, 0) + v
    return train_vocab

In [None]:
# char-level baselines
dir_path="data/generated/bng_chr_50/"
oov_rates = {}
for i in range(1,8):
    # Files are named after powers of ten: 10, 100, ..., 10**7.
    file=10**i
    print(file)
    # The *_desegmented.txt inputs are presumably produced by the call below
    # — uncomment to regenerate them (TODO confirm).
    #sentpiece.desegmentation(dir_path+str(file)+".txt", "bng_train_50")
    add_vocab=get_vocab(dir_path+str(file)+"_desegmented.txt")
    # augment_vocab modifies its first argument, so hand it a copy: each file
    # is then measured against the original training vocabulary plus that one
    # file, and re-running this cell gives the same numbers.
    augmented_vocab = augment_vocab(train_vocab.copy(), add_vocab)
    oov_rates[file] = compute_oov_rate(augmented_vocab, test_vocab)
print(oov_rates)

In [None]:
import matplotlib.pyplot as plt

def plot_oov_rates(oov_rates) -> None:
    """Plot the values of ``oov_rates`` against its keys on log-log axes.

    Args:
        oov_rates: Mapping whose keys are the x values and whose values are
            the corresponding OOV rates, plotted in insertion order.
    """
    fig, ax = plt.subplots()
    # Label the curve so that the legend has an entry to show; a legend
    # requested without any labelled artist only triggers a warning.
    ax.loglog(list(oov_rates.keys()), list(oov_rates.values()), label="OOV rate")
    # NOTE(review): in this notebook the keys are the 10**i file names rather
    # than vocabulary sizes — confirm the x-axis label.
    ax.set_xlabel("vocab size")
    ax.set_ylabel("OOV rate")
    ax.legend()
    plt.show()

plot_oov_rates(oov_rates)

# Analysis