# __Plant science word frequency__

Goal
- Evaluate plant science corpus to see how best to tokenize the corpus

In [42]:
import spacy
import pandas as pd
from pathlib import Path
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from tqdm import tqdm
from transformers import BertTokenizerFast
from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer, \
                       CharBPETokenizer

In [26]:
work_dir    = Path.home() / "projects/plantbert"
corpus_file = work_dir / "corpus_with_topics.tsv.gz"
train_file  = work_dir / "train.txt"

tokenizer_dir = work_dir / "tokenizers"
tokenizer_dir.mkdir(parents=True, exist_ok=True)

In [28]:
train_file.is_file()

True

## ___Corpus word frequencies___

### Read corpus

In [6]:
corpus = pd.read_csv(corpus_file, sep="\t", compression="gzip")
corpus.head(2)

Unnamed: 0.1,Unnamed: 0,Index_1385417,PMID,Date,Journal,Title,Abstract,Initial filter qualifier,Corpus,reg_article,Text classification score,Preprocessed corpus,Topic
0,0,3,61,1975-12-11,Biochimica et biophysica acta,Identification of the 120 mus phase in the dec...,After a 500 mus laser flash a 120 mus phase in...,spinach,Identification of the 120 mus phase in the dec...,1,0.716394,identification 120 mus phase decay delayed flu...,52
1,1,4,67,1975-11-20,Biochimica et biophysica acta,Cholinesterases from plant tissues. VI. Prelim...,Enzymes capable of hydrolyzing esters of thioc...,plant,Cholinesterases from plant tissues. VI. Prelim...,1,0.894874,cholinesterases plant tissues . vi . prelimina...,48


In [7]:
docs = corpus["Corpus"].tolist()
len(docs)

421307

### Determine frequenies of words in corpus

See:
- [Word freq](https://www.educative.io/answers/text-summarization-in-spacy-and-nltk) 
- [Multiprocessing pool](https://superfastpython.com/multiprocessing-pool-for-loop/)
- [Fill up a dictionary in parallel with multiprocessing](https://python-forum.io/thread-8587.html)

In [8]:
nlp = spacy.blank("en")

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
word_freq = {}
for doc in tqdm(docs):
  doc = nlp(doc)
  for token in doc:
    if token.text not in STOP_WORDS and token.text not in punctuation:
      if token.text not in word_freq:
        word_freq[token.text] = 1
      else:
        word_freq[token.text] += 1

100%|██████████| 421307/421307 [04:09<00:00, 1691.19it/s]


In [13]:
len(word_freq)

908607

In [19]:
#https://www.freecodecamp.org/news/sort-dictionary-by-value-in-python/

word_freq_sorted = sorted(word_freq.items(), key=lambda x:x[1])
word_freq_sorted[:20]

[('284c51', 1),
 ('14,200+/-900', 1),
 ('trypsin[EC', 1),
 ('K+/2e', 1),
 ('acrA.', 1),
 ('gal+', 1),
 ('20mum', 1),
 ('inhilating', 1),
 ('wasnot', 1),
 ('Sepharose-6B.', 1),
 ('Proteindisulphide', 1),
 ('nott', 1),
 ('trypsinmodified', 1),
 ('pH-7.6', 1),
 ('O2(18', 1),
 ('Stoichacis', 1),
 ('E.C.1.8.5.1', 1),
 ('I-21A', 1),
 ('feremented', 1),
 ('Iwaasa', 1)]

### Check examples

In [21]:
word_freq["SA"], word_freq["JA"], word_freq["auxin"], word_freq["ethylene"], word_freq["ABA"]

(13530, 11186, 28007, 23572, 33303)

In [22]:
word_freq["FLC"], word_freq["FLC"]

1339

## ___Train different tokenizer___

Tokenizers to try
- BertWordPieceTokenizer
- ByteLevelBPETokenizer
- CharBPETokenizer
- 

### Setup

In [29]:
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"]

# training the tokenizer on the training set
files = [str(train_file)]

# 30,522 vocab is BERT's default vocab size
vocab_size = 30_522

# maximum sequence length, lowering will result to faster training (when increasing batch size)
max_length = 512

In [43]:
def train_tokenizer(tokenizer_type, train_file, vocab_size, max_length):

  tokenizer = None
  if tokenizer_type == "bwp":
    tokenizer = BertWordPieceTokenizer()
  elif tokenizer_type == "blbpe":
    tokenizer = ByteLevelBPETokenizer()
  elif tokenizer_type == "chbpe":
    tokenizer = CharBPETokenizer()
  else:
    print("Unknown tokenizer type:", tokenizer_type)
    return 0

  # train the tokenizer
  tokenizer.train(files=str(train_file), vocab_size=vocab_size, 
                      special_tokens=special_tokens)

  # enable truncation up to the maximum 512 tokens
  tokenizer.enable_truncation(max_length=max_length)

  # save the tokenizer
  save_dir = tokenizer_dir / tokenizer_type
  save_dir.mkdir(parents=True, exist_ok=True)
  tokenizer.save_model(str(save_dir))

  # when the tokenizer is trained and configured, load it as BertTokenizerFast
  #btz_tokenizer = BertTokenizerFast.from_pretrained(save_dir)  

### Tokenizer training

In [40]:
# BertWordPieceTokenizer
train_tokenizer("bwp", train_file, vocab_size, max_length)






In [44]:
# ByteLevelBPETokenizer
train_tokenizer("blbpe", train_file, vocab_size, max_length)






In [45]:
# CharBPETokenizer
train_tokenizer("chbpe", train_file, vocab_size, max_length)




