In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path
# Make the local "python_lib" directory importable. The path is relative, so
# this assumes the kernel's working directory contains "python_lib".
# Guarded so that re-running the cell does not append duplicate entries.
if "python_lib" not in sys.path:
    sys.path.append("python_lib")

In [3]:
import torch
from torch import nn
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import nnsplit
from nnsplit import train, utils, models

In [4]:
# Root directory for cached intermediate artifacts (tokenized paragraphs,
# model checkpoints).
cache_dir = Path("cache")

# Later cells open files under the per-language subdirectories for writing but
# never create them, which fails on a fresh checkout. Create them up front;
# parents=True also creates cache_dir itself, exist_ok=True keeps this cell
# safe to re-run.
for lang_dir in ("de_data", "en_data"):
    (cache_dir / lang_dir).mkdir(parents=True, exist_ok=True)

# Prepare data

## German

In [5]:
# Extract paragraphs from the German Wikipedia corpus dump.
# max_n_paragraphs is presumably an upper bound on how many are read.
paragraphs = train.xml_to_paragraphs(
    "train_data/dewiki-20180920-corpus.xml",
    max_n_paragraphs=3_000_000,
)

HBox(children=(FloatProgress(value=0.0, max=3000000.0), HTML(value='')))

In [6]:
# Build the SoMaJo-based tokenizer for German ("de"); the next cell uses it to
# tokenize the extracted paragraphs.
tokenizer = nnsplit.tokenizer.SoMaJoTokenizer("de")

In [7]:
# Write the tokenizer output to the German cache file as a stream of
# consecutive pickles: every item yielded by tokenizer.split is pickled and
# written separately, so the file has to be read back with repeated
# pickle.load calls rather than a single one.
tokenized_path = cache_dir / "de_data" / "tokenized_paragraphs.pkl"
with open(tokenized_path, "wb") as out_file:
    for tokenized in tokenizer.split(paragraphs, verbose=True):
        out_file.write(pickle.dumps(tokenized))

HBox(children=(FloatProgress(value=0.0, max=3000000.0), HTML(value='')))

## English

In [None]:
# Extract paragraphs from the English Wikipedia corpus dump.
# max_n_paragraphs is presumably an upper bound on how many are read.
# Note: this rebinds `paragraphs`, replacing the German paragraphs.
paragraphs = train.xml_to_paragraphs(
    "train_data/enwiki-20181001-corpus.xml",
    max_n_paragraphs=3_000_000,
)

In [None]:
# Build the SoMaJo-based tokenizer for English ("en"); this rebinds `tokenizer`,
# replacing the German one created earlier.
tokenizer = nnsplit.tokenizer.SoMaJoTokenizer("en")

In [None]:
# Write the tokenizer output to the English cache file as a stream of
# consecutive pickles: every item yielded by tokenizer.split is pickled and
# written separately, so the file has to be read back with repeated
# pickle.load calls rather than a single one.
tokenized_path = cache_dir / "en_data" / "tokenized_paragraphs.pkl"
with open(tokenized_path, "wb") as out_file:
    for tokenized in tokenizer.split(paragraphs, verbose=True):
        out_file.write(pickle.dumps(tokenized))

# Train model (German)

In [12]:
# Read the cached German tokenization and turn it into the `sentences` and
# `labels` that are split and passed to train.train in the following cells.
sentences, labels = train.prepare_tokenized_paragraphs(
    cache_dir / "de_data" / "tokenized_paragraphs.pkl",
    "de",
)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Faulty paragraph:
[[Token(text='.', whitespace='')], [Token(text='GLOBAL', whitespace=' '), Token(text='_set_float', whitespace=''), Token(text='Extend', whitespace=''), Token(text=';', whitespace=' '), Token(text=';', whitespace=' '), Token(text='Sprunglabel', whitespace=' '), Token(text='global', whitespace=' '), Token(text='sichtbar', whitespace=' '), Token(text='_set_float', whitespace=''), Token(text='Extend', whitespace=''), Token(text=':', whitespace=' '), Token(text=';', whitespace=' '), Token(text='Sprunglabel', whitespace=' '), Token(text='angeben', whitespace=''), Token(text=',', whitespace=' '), Token(text='das', whitespace=' '), Token(text='ist', whitespace=' '), Token(text='der', whitespace=' '), Token(text='Name', whitespace=' '), Token(text='des', whitespace=' '), Token(text='Unterprogramms', whitespace=''), Token(text=',', whitespace=' '), Token(text=';', whitespace=' '), Token(text='aus', whitespace=' '), Token(text='C', whitespace=' '), Token(text='ohne', whitespace=

In [17]:
# Hold out 10% of the German examples for validation; the fixed random_state
# makes the split reproducible across re-runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    sentences,
    labels,
    test_size=0.1,
    random_state=1234,
)

In [18]:
# Train the German model for 15 epochs, using the held-out split for validation.
de_model = train.train(
    x_train,
    y_train,
    x_valid,
    y_valid,
    n_epochs=15,
)

epoch,train_loss,valid_loss,time
0,0.03182,0.031392,02:04


In [8]:
# Checkpoint the whole trained German model object under the cache directory.
# Built from cache_dir rather than a second hard-coded "cache/..." path so the
# location stays in sync with the rest of the notebook.
torch.save(de_model, cache_dir / "de_data" / "model.pt")

In [9]:
# Export the German model to "data/de" with nnsplit's utils.store_model; the
# Test section reads it back with utils.load_model("data/de").
utils.store_model(de_model, "data/de")

  return h5py.File(h5file)


## Evaluate

In [14]:
# Reload the German checkpoint written by the torch.save cell above.
# Built from cache_dir rather than a second hard-coded "cache/..." path so the
# location stays in sync with the rest of the notebook.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you
# created yourself. Recent PyTorch releases default to weights_only=True, which
# rejects a fully pickled model object like this one; if loading fails after an
# upgrade, pass weights_only=False explicitly.
de_model = torch.load(cache_dir / "de_data" / "model.pt")

In [15]:
# Evaluate the German model on the validation split, on the GPU in half
# precision. For an nn.Module, .cuda() and .half() convert the model in place,
# so de_model itself stays on the GPU in fp16 after this cell.
train.evaluate(
    de_model.cuda().half(),
    x_valid,
    y_valid,
)

HBox(children=(FloatProgress(value=0.0, max=1172.0), HTML(value='')))


Target: Tokenize 

F1: 0.9976900240406937
Precision: 0.997930734517165
Recall: 0.9974494296595764



Target: Sentencize 

F1: 0.9585695555203936
Precision: 0.937686412832653
Recall: 0.9804040596674962





In [9]:
# Same half-precision GPU evaluation as in the preceding cell. The saved
# outputs of the two cells show different progress-bar totals (1172 vs. 977),
# so they were presumably produced from different runs/validation data.
train.evaluate(
    de_model.cuda().half(),
    x_valid,
    y_valid,
)

HBox(children=(FloatProgress(value=0.0, max=977.0), HTML(value='')))


Target: Tokenize 

F1: 0.9985709185426798
Precision: 0.9977352694599613
Recall: 0.9994079685876922



Target: Sentencize 

F1: 0.9581387972891665
Precision: 0.9362705478411474
Recall: 0.9810530203414978





In [11]:
# Dynamically quantize the LSTM and Linear layers of the German model to int8.
# The model is first converted back to float32 on the CPU, undoing the
# .cuda().half() applied for the GPU evaluation.
quantized_model = torch.quantization.quantize_dynamic(
    de_model.float().cpu(),
    {nn.LSTM, nn.Linear},
    dtype=torch.qint8,
)

In [12]:
# Evaluate the int8-quantized model on the same validation split, to compare
# against the half-precision GPU results.
train.evaluate(quantized_model, x_valid, y_valid)

HBox(children=(FloatProgress(value=0.0, max=977.0), HTML(value='')))


Target: Tokenize 

F1: 0.9985806618134485
Precision: 0.9977847594261771
Recall: 0.9993778349483429



Target: Sentencize 

F1: 0.9581120292762628
Precision: 0.9363237215520647
Recall: 0.9809385261100492





# Train model (English)

In [5]:
# Read the cached English tokenization and turn it into the `sentences` and
# `labels` that are split and passed to train.train in the following cells.
# This rebinds both names, replacing the German data.
sentences, labels = train.prepare_tokenized_paragraphs(
    cache_dir / "en_data" / "tokenized_paragraphs.pkl",
    "en",
)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Faulty paragraph:
[[Token(text='.', whitespace='')], [Token(text='Info', whitespace=' '), Token(text='in', whitespace=' '), Token(text='1987', whitespace=' '), Token(text='gave', whitespace=' '), Token(text='the', whitespace=' '), Token(text='Commodore', whitespace=' '), Token(text='64', whitespace=' '), Token(text='version', whitespace=' '), Token(text='four', whitespace=' '), Token(text='stars', whitespace=' '), Token(text='out', whitespace=' '), Token(text='of', whitespace=' '), Token(text='five', whitespace=''), Token(text=',', whitespace=' '), Token(text='describing', whitespace=' '), Token(text='it', whitespace=' '), Token(text='as', whitespace=' '), Token(text='"', whitespace=''), Token(text='fun', whitespace=' '), Token(text='to', whitespace=' '), Token(text='play', whitespace=''), Token(text=',', whitespace=' '), Token(text='though', whitespace=' '), Token(text='Infocom', whitespace=' '), Token(text='has', whitespace=' '), Token(text='produced', whitespace=' '), Token(text='mo

In [6]:
# Hold out 10% of the English examples for validation; the fixed random_state
# makes the split reproducible across re-runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    sentences,
    labels,
    test_size=0.1,
    random_state=1234,
)

In [7]:
# Train the English model for 15 epochs, using the held-out split for validation.
en_model = train.train(
    x_train,
    y_train,
    x_valid,
    y_valid,
    n_epochs=15,
)

epoch,train_loss,valid_loss,time
0,0.040443,0.038892,20:00
1,0.032624,0.031846,20:00
2,0.030703,0.030959,20:01
3,0.030231,0.030827,20:01
4,0.029231,0.030218,20:02
5,0.028824,0.029291,20:02
6,0.029228,0.028684,20:02
7,0.028699,0.027927,20:03
8,0.026893,0.027063,20:03
9,0.027493,0.026111,20:03


In [8]:
# Checkpoint the whole trained English model object under the cache directory.
# Built from cache_dir rather than a second hard-coded "cache/..." path so the
# location stays in sync with the rest of the notebook.
torch.save(en_model, cache_dir / "en_data" / "model.pt")

In [10]:
# Export the English model to "data/en" with nnsplit's utils.store_model; the
# Test section reads it back with utils.load_model("data/en").
utils.store_model(en_model, "data/en")

  return h5py.File(h5file)


# Test

In [7]:
from nnsplit import NNSplit

In [8]:
# Load the exported German model from "data/de", convert it with .float(), and
# wrap it in an NNSplit splitter.
splitter = NNSplit(utils.load_model("data/de").float())

In [9]:
# Smoke test: the input holds two German sentences with no punctuation between
# them; the displayed result shows them split into two token lists.
splitter.split(["Das ist ein Test Das ist noch ein Test."])

[[[Token(text='Das', whitespace=' '),
   Token(text='ist', whitespace=' '),
   Token(text='ein', whitespace=' '),
   Token(text='Test', whitespace=' ')],
  [Token(text='Das', whitespace=' '),
   Token(text='ist', whitespace=' '),
   Token(text='noch', whitespace=' '),
   Token(text='ein', whitespace=' '),
   Token(text='Test', whitespace=''),
   Token(text='.', whitespace='')]]]

In [10]:
# Load the exported English model from "data/en", convert it with .float(), and
# wrap it in an NNSplit splitter. This rebinds `splitter`, replacing the German one.
splitter = NNSplit(utils.load_model("data/en").float())

# Tune model

In [63]:
# Tuning experiments run on the first `n` cached German examples.
# NOTE(review): no visible cell writes all_sentences.pt / all_labels.pt —
# confirm where these files are produced.
n = 1_000_000
de_cache = cache_dir / "de_data"
x = torch.load(de_cache / "all_sentences.pt")[:n]
y = torch.load(de_cache / "all_labels.pt")[:n]

In [73]:
# Hold out 10% of the tuning examples for validation. random_state is fixed
# (same seed as the German and English training splits) so that every re-run
# of this cell yields the same validation set and the tuning results below
# remain comparable with each other.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=1234,
)

In [78]:
# Quick tuning run: train for a single epoch on the tuning split.
model = train.train(
    x_train,
    y_train,
    x_valid,
    y_valid,
    n_epochs=1,
)

epoch,train_loss,valid_loss,time
0,0.035032,0.035142,01:51


In [74]:
# Same single-epoch tuning call as the preceding cell. The saved outputs of
# the two cells differ by about 10x in loss (0.035142 vs. 0.002637 valid_loss),
# so the training code or data presumably changed between the runs — TODO
# confirm which result is current.
model = train.train(
    x_train,
    y_train,
    x_valid,
    y_valid,
    n_epochs=1,
)

epoch,train_loss,valid_loss,time
0,0.002735,0.002637,01:54


In [46]:
# Dynamically quantize the LSTM and Linear layers of the tuned model to int8,
# after converting it to float32 on the CPU.
quantized_model = torch.quantization.quantize_dynamic(
    model.float().cpu(),
    {nn.LSTM, nn.Linear},
    dtype=torch.qint8,
)

In [47]:
# Evaluate the dynamically quantized tuning model on the validation split.
train.evaluate(quantized_model, x_valid, y_valid)


  0%|          | 0/98 [00:00<?, ?it/s][A
  1%|          | 1/98 [00:00<01:09,  1.41it/s][A
  2%|▏         | 2/98 [00:01<01:09,  1.39it/s][A
  3%|▎         | 3/98 [00:02<01:12,  1.32it/s][A
  4%|▍         | 4/98 [00:03<01:22,  1.14it/s][A
  5%|▌         | 5/98 [00:04<01:28,  1.06it/s][A
  6%|▌         | 6/98 [00:05<01:19,  1.16it/s][A
  7%|▋         | 7/98 [00:06<01:28,  1.03it/s][A
  8%|▊         | 8/98 [00:07<01:27,  1.03it/s][A
  9%|▉         | 9/98 [00:08<01:25,  1.04it/s][A
 10%|█         | 10/98 [00:09<01:27,  1.01it/s][A
 11%|█         | 11/98 [00:10<01:24,  1.03it/s][A
 12%|█▏        | 12/98 [00:11<01:19,  1.09it/s][A
 13%|█▎        | 13/98 [00:12<01:17,  1.10it/s][A
 14%|█▍        | 14/98 [00:13<01:23,  1.01it/s][A
 15%|█▌        | 15/98 [00:14<01:23,  1.00s/it][A
 16%|█▋        | 16/98 [00:15<01:28,  1.08s/it][A
 17%|█▋        | 17/98 [00:16<01:23,  1.03s/it][A
 18%|█▊        | 18/98 [00:17<01:23,  1.04s/it][A
 19%|█▉        | 19/98 [00:18<01:25,  1.08s/it]

Target: Tokenize 

F1: 0.9982945732838129
Precision: 0.9974475067007087
Recall: 0.9991430798056675



Target: Sentencize 

F1: 0.897711766558203
Precision: 0.8391659852820932
Recall: 0.9650393049234588





In [77]:
# Evaluate the tuning model on the validation split, on the GPU in half
# precision. For an nn.Module, .cuda() and .half() convert the model in place,
# so `model` itself stays on the GPU in fp16 after this cell.
train.evaluate(
    model.cuda().half(),
    x_valid,
    y_valid,
)


  0%|          | 0/98 [00:00<?, ?it/s][A
 11%|█         | 11/98 [00:00<00:00, 109.64it/s][A
 22%|██▏       | 22/98 [00:00<00:00, 107.66it/s][A
 35%|███▍      | 34/98 [00:00<00:00, 109.46it/s][A
 49%|████▉     | 48/98 [00:00<00:00, 116.24it/s][A
 63%|██████▎   | 62/98 [00:00<00:00, 120.74it/s][A
 77%|███████▋  | 75/98 [00:00<00:00, 121.12it/s][A
100%|██████████| 98/98 [00:00<00:00, 122.83it/s][A


Target: Tokenize 

F1: 0.9973287289215611
Precision: 0.9961537390852487
Recall: 0.9985064938947623



Target: Sentencize 

F1: 0.9055118110236221
Precision: 0.8516719990905105
Recall: 0.9666181268548045



