### Load the corpus

In [1]:
import numpy as np

# Build the training and test corpora from 'br-text.txt'.
#
# Each corpus entry is a pair [unsegmented, segmented]:
#   unsegmented -- list of sentences with the spaces removed (one string each)
#   segmented   -- the same sentences as lists of space-separated tokens
#
# Sampling configuration.
SEED = 0                 # fixes shuffling and sampling so a re-run rebuilds the same corpus
N_ARTICLES = 400         # number of training articles to draw
DRAWS_PER_ARTICLE = 200  # sentence indices drawn (with replacement) per article
TEST_START = 0.9         # test set = unique sentences from this fraction of the list onward

rng = np.random.RandomState(SEED)

corpus_brtext = []
corpus_brtext_test = []

# Deduplicate sentences while keeping first-seen file order. A plain set of
# strings is iterated in an order that changes between interpreter sessions,
# which made the "last 10%" test split different on every kernel restart.
# Blank lines are skipped so they do not turn into empty "sentences".
with open('br-text.txt') as f:
    sents_set = list(dict.fromkeys(
        line.rstrip('\n') for line in f if line.strip()
    ))

if not sents_set:
    raise ValueError("'br-text.txt' contains no sentences")

# Seeded shuffle: keeps the test split an arbitrary sample of the corpus (as
# the set ordering used to provide) but makes it repeatable.
rng.shuffle(sents_set)

sents1 = [sentence.split(' ') for sentence in sents_set]

# NOTE(review): training articles are sampled from ALL unique sentences, so
# sentences in the test split below can also occur in training -- confirm
# this overlap is intended.
for _ in range(N_ARTICLES):
    # Duplicate draws are collapsed, so an article holds at most
    # DRAWS_PER_ARTICLE distinct sentences; np.unique also sorts the indices.
    picked = np.unique(rng.choice(len(sents1), DRAWS_PER_ARTICLE))
    article_tokens = [sents1[i] for i in picked]
    corpus_brtext.append(
        [[''.join(tokens) for tokens in article_tokens], article_tokens]
    )

test_start = int(len(sents_set) * TEST_START)
sents2 = [sentence.split(' ') for sentence in sents_set[test_start:]]
sents2 = [[''.join(tokens) for tokens in sents2], sents2]
corpus_brtext_test.append(sents2)

### Run LiB

In [2]:
import model
import importlib
importlib.reload(model)  # re-execute model.py so edits made since the first import take effect

# Module-level settings of `model`, applied in the order listed.
# NOTE(review): what each setting controls is defined in model.py, which is
# not visible from this notebook -- check there before tuning.
model_settings = {
    'life': 10,
    'max_len': 12,
    'memory_in': 0.25,
    'memory_out': 0.0001,
    'update_rate': 0.2,
    'mini_gap': 1,
    'use_skip': False,
}
for setting_name, setting_value in model_settings.items():
    setattr(model, setting_name, setting_value)

memory = model.TrieList()

corpus_train, corpus_test = corpus_brtext, corpus_brtext_test

# init the Lexicon memory with some unigrams in corpus
# (first training article, unsegmented sentences)
model.init(memory, corpus_train[0][0])

n_epochs = 5001
for epoch_id in range(n_epochs):
    model.run(epoch_id, memory, corpus_train, corpus_test)

0	  MemLength: 131
[B] Precision: 34.67% 	 Recall: 95.18% 	 F1: 50.83%
[L] Precision: 8.20% 	 Recall: 22.52% 	 F1: 12.02%

100	  MemLength: 710
[B] Precision: 65.59% 	 Recall: 91.90% 	 F1: 76.55%
[L] Precision: 41.72% 	 Recall: 58.45% 	 F1: 48.69%

200	  MemLength: 899
[B] Precision: 70.42% 	 Recall: 91.44% 	 F1: 79.56%
[L] Precision: 48.25% 	 Recall: 62.65% 	 F1: 54.52%

300	  MemLength: 994
[B] Precision: 74.84% 	 Recall: 91.28% 	 F1: 82.25%
[L] Precision: 56.06% 	 Recall: 68.39% 	 F1: 61.62%

400	  MemLength: 1096
[B] Precision: 74.87% 	 Recall: 91.36% 	 F1: 82.30%
[L] Precision: 54.61% 	 Recall: 66.63% 	 F1: 60.02%

500	  MemLength: 1162
[B] Precision: 74.61% 	 Recall: 90.29% 	 F1: 81.70%
[L] Precision: 54.23% 	 Recall: 65.63% 	 F1: 59.39%

600	  MemLength: 1215
[B] Precision: 74.96% 	 Recall: 89.95% 	 F1: 81.77%
[L] Precision: 53.11% 	 Recall: 63.72% 	 F1: 57.93%

700	  MemLength: 1256
[B] Precision: 76.73% 	 Recall: 90.37% 	 F1: 82.99%
[L] Precision: 57.42% 	 Recall: 67.62% 	 F1:

### See the top entries in the Lexicon memory

In [3]:
# Display the first 50 entries of the lexicon memory built by the training cell.
# NOTE(review): presumably the memory is ordered so that the head holds the
# strongest entries -- confirm against model.TrieList.
memory[:50]

['you',
 'for',
 'and',
 'canyou',
 'the',
 'what',
 'wanna',
 "what's",
 'yeah',
 'it',
 'your',
 'we',
 'he',
 'that',
 'put',
 'thisis',
 'feel',
 'youcan',
 "that's",
 'with',
 'this',
 'his',
 "he's",
 'see',
 'okay',
 'now',
 'to',
 "there's",
 'can',
 'look',
 'youwanna',
 'youwant',
 'is',
 'here',
 'isit',
 'in',
 "don't",
 'at',
 "here's",
 "let's",
 "you're",
 "'s",
 'on',
 'no',
 'have',
 'not',
 'thoseare',
 'get',
 'my',
 'gonna']

### See the chunk and subchunk segmentation results

In [5]:
# Show how the trained memory segments ten sentences of the third training
# article, first without and then with decompose=True.
article, article_raw = corpus_train[2]  # (unsegmented sentences, token lists)
onset, end = 10, 20

for banner, decompose_flag in (
    ('---\nchunks\n---', False),
    ('---\nsubchunks\n---', True),
):
    print(banner)
    # Slice inside the loop so every call receives fresh list objects.
    model.show_result(
        memory,
        article_raw[onset:end],
        article[onset:end],
        decompose=decompose_flag,
    )

---
chunks
---
 	 can you 	 make 	 a tower 	 with 	 what you 	 have 	
 	 canyou 	 make 	 atower 	 with 	 whatyou 	 have 	

you can 	 get down 	 by 	 yourself 	 see she has 	 her 	
youcan 	 getdown 	 b y 	 your self 	 seeshehas 	 her 	

pajamas 	 on 	 don't 	 honey 	 you'll 	 break 	 it 	
p a j am as 	 on 	 don't 	 honey 	 you 'll 	 br eak 	 it 	

numbers 	 are those 	 slippers 	 no what 	 that's a 	
numbers 	 arethose 	 s l i pper s 	 now hat 	 that'sa 	

---
subchunks
---
 	 can you 	 make 	 a tower 	 with 	 what 	 you 	
 	 canyou 	 make 	 atower 	 with 	 what 	 you 	

have 	 you can 	 get 	 down 	 by 	 yourself 	
have 	 youcan 	 get 	 down 	 b y 	 your self 	

see she has 	 her 	 pajamas 	 on 	 don't 	 honey 	
seeshehas 	 her 	 p a j am as 	 on 	 don't 	 honey 	

you'll 	 break 	 it 	 numbers 	 are 	 those 	 slippers 	
you 'll 	 br eak 	 it 	 numbers 	 are 	 those 	 s l i p per s 	

no what 	 that's 	 a 	 bird 	 which 	 color 	
now h at 	 that's 	 a 	 bird 	 which 	 color 	

