In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from glob import glob
from multiprocessing import Pool
from random import sample
from functools import partial
from itertools import product
import os

!pip install --upgrade matplotlib
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

print('Loading transformers lib...')
!pip install transformers
from transformers import AutoTokenizer, BertTokenizer
import logging
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
# So ti wont display the following:
# "Token indices sequence length is longer than the specified maximum sequence length 
# for this model (619 > 512). Running this sequence through the model will result 
# in indexing errors"

import sys
def i1():# helper function for fork printing
  sys.stdout.write(' ')
  sys.stdout.flush()

original_path = "/home/stratos/Desktop/Bert-final/"

Requirement already up-to-date: matplotlib in ./anaconda3/lib/python3.7/site-packages (3.4.2)
Loading transformers lib...


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
def read_file(fi, tokenizer, maxx=0):
  """ helper function for loading txts 
      if max!=0 it reads ~maxx bytes"""
  block_size = 2048
  total = 0
  total_tokens = 0
  total_words = 0
  with open(fi, 'r') as f:
    def read_block():
      return f.read(block_size)
    if maxx!=0:
      iterator = tqdm(iter(read_block, ''), leave=False, 
                        desc='loading big txt', total=int(maxx/block_size-100))
      for block in iterator:
        total_words += len(block.split()) 
        encoded = tokenizer(block, add_special_tokens = False, padding = False,
                            truncation = False, return_attention_mask = False, 
                            return_token_type_ids = False)['input_ids']
        total_tokens += len(encoded)
        total += block_size
        if total >= maxx:
          iterator.close()
          break
    else:
      for block in iter(read_block, ''):
        total_words += len(block.split())
        encoded = tokenizer(block, add_special_tokens = False, padding = False,
                            truncation = False, return_attention_mask = False, 
                            return_token_type_ids = False)['input_ids']
        total_tokens += len(encoded)
  return (total_tokens, total_words)


def read_df(fi, tokenizer):
  total_tokens = 0
  total_words = 0
  df = pd.read_pickle(fi)
  # delete usless stuff
  df = df.drop(['title', 'type', 'year', 'law_id', 'leg_uri'], axis=1)
  for _, row in tqdm(df.iterrows(), desc='loading dataframe',
                     leave=True, total=len(df)):
    text = row['header'] + ' ' + row['articles']
    total_words += len(text.split())
    encoded = tokenizer(text, add_special_tokens = False, padding = False,
                        truncation = False, return_attention_mask = False, 
                        return_token_type_ids = False)['input_ids']
    total_tokens += len(encoded)
  return (total_tokens, total_words)


def count(path, tokenizer):
  total_tokens = 0
  total_words = 0

  if 'Legal-Bert' in path or 'Greek-Bert' in path:
    if 'Legal-Bert' in path:
      name = 'Legal'
      files = sample(glob(path+'*/*/*.txt'), int(98043/7/4))
      maxx = 0
    elif 'Greek-Bert' in path:
      name = 'Greek'
      files = glob(path+'europal/*')
      files += sample(glob(path+'oscar/*'), 4)# choose a random file from oscar
      # files += glob(path+'wiki/*')
      maxx = 2000000000/32/4
    with Pool(processes=8, initializer=i1) as pool:
      results = list(tqdm(pool.imap(partial(read_file, maxx=maxx, tokenizer=tokenizer), files),
                          total=len(files), desc='tokenizing '+name, leave=True))
  elif 'RAPTARCHIS' in path:
    files = path+'train.pkl'
    # with Pool(processes=3, initializer=i1) as pool:
    #   results = list(tqdm(pool.imap(partial(read_df, tokenizer=tokenizer), files),
    #                       total=len(files), desc='tokenizing RAPTARCHIS', leave=True))
    results = [read_df(fi=files, tokenizer=tokenizer)]
  total_tokens += sum(i for i, _ in results)
  total_words += sum(j for _, j in results)
      
  return (total_tokens, total_words)

In [3]:
print('Loading Greek_Bert tokenizer...')
tokenizer_greek = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1",
                                                use_fast=True)
print('Loading Legal_Bert tokenizer...')
tokenizer_legal = BertTokenizer(original_path+"datasets/Legal_Bert_vocab.txt",
                                    use_fast=True)
print('Loading M_Bert tokenizer...')
tokenizer_m = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased",
                                            use_fast=True)

rapt = original_path+"datasets/RAPTARCHIS/"
greek = original_path+"datasets/Greek-Bert/normalized/"
legal = original_path+"datasets/Legal-Bert/normalized/"

tokenizers = [tokenizer_legal, tokenizer_greek, tokenizer_m]
datasets = [legal, greek, rapt]
tockenizers_ = ['Legal-Bert', 'Greek-Bert', 'M-Bert']
datasets_ = ['legal', 'greek', 'rapt']
params = product(tokenizers, datasets)
keys_ = product(tockenizers_, datasets_)

results = {}
for param, key in zip(params, keys_):
  key = key[0]+'->'+key[1]
  results[key] = count(param[1], param[0])

Loading Greek_Bert tokenizer...
Loading Legal_Bert tokenizer...
Loading M_Bert tokenizer...
        

tokenizing Legal:   0%|          | 0/3501 [00:00<?, ?it/s]

        

tokenizing Greek:   0%|          | 0/5 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading dataframe:   0%|          | 0/28536 [00:00<?, ?it/s]

        

tokenizing Legal:   0%|          | 0/3501 [00:00<?, ?it/s]

        

tokenizing Greek:   0%|          | 0/5 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading dataframe:   0%|          | 0/28536 [00:00<?, ?it/s]

        

tokenizing Legal:   0%|          | 0/3501 [00:00<?, ?it/s]

        

tokenizing Greek:   0%|          | 0/5 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading big txt:   0%|          | 0/7529 [00:00<?, ?it/s]

loading dataframe:   0%|          | 0/28536 [00:00<?, ?it/s]

In [4]:
print("{:<20} {:<20}".format('Model->dataset','Fragmentation ratio'))
for k, v in results.items():
  ratio = round(v[0]/v[1],3)
  print("{:<20} {:<20}".format(k, ratio))

Model->dataset       Fragmentation ratio 
Legal-Bert->legal    1.06                
Legal-Bert->greek    1.277               
Legal-Bert->rapt     1.513               
Greek-Bert->legal    1.154               
Greek-Bert->greek    1.151               
Greek-Bert->rapt     1.646               
M-Bert->legal        2.059               
M-Bert->greek        2.176               
M-Bert->rapt         2.968               
