In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from glob import glob
from multiprocessing import Pool
from random import sample
from functools import partial
from itertools import product
import os

!pip install --upgrade matplotlib
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

print('Loading transformers lib...')
!pip install transformers
from transformers import AutoTokenizer
import logging
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
# So ti wont display the following:
# "Token indices sequence length is longer than the specified maximum sequence length 
# for this model (619 > 512). Running this sequence through the model will result 
# in indexing errors"

import sys
def i1():# helper function for fork printing
  sys.stdout.write(' ')
  sys.stdout.flush()

original_path = "/mnt/D21005A6100592A1/ΕΚΠΑ/πτυχιακή/Bert-final/"

Loading transformers lib...


In [2]:
def read_file(fi, tokenizer, maxx=0):
  """ helper function for loading txts 
      if max!=0 it reads ~maxx bytes"""
  block_size = 2048
  total = 0
  total_tokens = 0
  total_words = 0
  with open(fi, 'r') as f:
    def read_block():
      return f.read(block_size)
    if maxx!=0:
      iterator = tqdm(iter(read_block, ''), leave=False, 
                        desc='loading big txt', total=int(maxx/block_size-100))
      for block in iterator:
        total_words += len(block.split()) 
        encoded = tokenizer.tokenize(block, add_special_tokens = False, padding = False,
                            truncation = False, return_attention_mask = False, 
                            return_token_type_ids = False)
        total_tokens += len(encoded)
        total += block_size
        if total >= maxx:
          iterator.close()
          break
    else:
      for block in iter(read_block, ''):
        total_words += len(block.split())
        encoded = tokenizer.tokenize(block, add_special_tokens = False, padding = False,
                            truncation = False, return_attention_mask = False, 
                            return_token_type_ids = False)
        total_tokens += len(encoded)
  return (total_tokens, total_words)


def read_df(fi, tokenizer):
  total_tokens = 0
  total_words = 0
  df = pd.read_pickle(fi)
  # delete usless stuff
  df = df.drop(['title', 'type', 'year', 'law_id', 'leg_uri'], axis=1)
  for _, row in tqdm(df.iterrows(), desc='loading dataframe',
                     leave=False, total=len(df)):
    text = row['header'] + ' ' + row['articles']
    total_words += len(text.split())
    encoded = tokenizer.tokenize(text, add_special_tokens = False, padding = False,
                        truncation = False, return_attention_mask = False, 
                        return_token_type_ids = False)
    total_tokens += len(encoded)
  return (total_tokens, total_words)


def count(path, tokenizer):
  total_tokens = 0
  total_words = 0

  if 'Legal-Bert' in path or 'Greek-Bert' in path:
    if 'Legal-Bert' in path:
      name = 'Legal'
      files = sample(glob(path+'*/*/*.txt'),4500)
      maxx = 0
    elif 'Greek-Bert' in path:
      name = 'Greek'
      files = glob(path+'europal/*')
      files += sample(glob(path+'oscar/*'),1)# choose a random file from oscar
      # files += glob(path+'wiki/*')
      maxx = 37000000/2
    with Pool(processes=8, initializer=i1) as pool:
      results = list(tqdm(pool.imap(partial(read_file, maxx=maxx, tokenizer=tokenizer), files),
                          total=len(files), desc='tokenizing '+name, leave=True))
    total_tokens += sum(i for i, _ in results)
    total_words += sum(j for _, j in results)
  elif 'RAPTARCHIS' in path:
    files = glob(path+'*.pkl')
    with Pool(processes=8, initializer=i1) as pool:
      results = list(tqdm(pool.imap(partial(read_df, tokenizer=tokenizer), files),
                          total=len(files), desc='tokenizing RAPTARCHIS', leave=True))
    total_tokens += sum(i for i, _ in results)
    total_words += sum(j for _, j in results)
      
  return (total_tokens, total_words)

In [3]:
print('Loading Greek_Bert tokenizer...')
tokenizer_greek = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1",
                                                use_fast=True)
# print('Loading Legal_Bert tokenizer...')
# tokenizer_legal = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased",
                                                  # use_fast=True)
print('Loading M_Bert tokenizer...')
tokenizer_m = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased",
                                            use_fast=True)

rapt = original_path+"datasets/RAPTARCHIS/"
greek = original_path+"datasets/Greek-Bert/normalized/"
legal = original_path+"datasets/Legal-Bert/normalized/"

tockenizers = [tokenizer_greek, tokenizer_m]
datasets = [greek, rapt, legal]
tockenizers_ = ['Greek-Bert', 'M-Bert']
datasets_ = ['greek', 'rapt', 'legal']
params = product(tockenizers, datasets)
keys_ = product(tockenizers_, datasets_)

results = {}
for param, key in zip(params, keys_):
  key = key[0]+'->'+key[1]
  results[key] = count(param[1], param[0])

Loading Greek_Bert tokenizer...
Loading M_Bert tokenizer...
        

HBox(children=(IntProgress(value=0, description='tokenizing Greek', max=2, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='loading big txt', max=8933, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='loading big txt', max=8933, style=ProgressStyle(description_w…


        

HBox(children=(IntProgress(value=0, description='tokenizing RAPTARCHIS', max=3, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='loading dataframe', max=9511, style=ProgressStyle(description…

HBox(children=(IntProgress(value=0, description='loading dataframe', max=9516, style=ProgressStyle(description…

HBox(children=(IntProgress(value=0, description='loading dataframe', max=28536, style=ProgressStyle(descriptio…


        

HBox(children=(IntProgress(value=0, description='tokenizing Legal', max=4500, style=ProgressStyle(description_…


        

HBox(children=(IntProgress(value=0, description='tokenizing Greek', max=2, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='loading big txt', max=8933, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='loading big txt', max=8933, style=ProgressStyle(description_w…


        

HBox(children=(IntProgress(value=0, description='tokenizing RAPTARCHIS', max=3, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='loading dataframe', max=9511, style=ProgressStyle(description…

HBox(children=(IntProgress(value=0, description='loading dataframe', max=9516, style=ProgressStyle(description…

HBox(children=(IntProgress(value=0, description='loading dataframe', max=28536, style=ProgressStyle(descriptio…


        

HBox(children=(IntProgress(value=0, description='tokenizing Legal', max=4500, style=ProgressStyle(description_…




In [7]:
print("{:<20} {:<20}".format('Model->dataset','Fragmentation ratio'))
for k, v in results.items():
  ratio = round(v[0]/v[1],3)
  print("{:<20} {:<20}".format(k, ratio))

Model->dataset       Fragmentation ratio 
Greek-Bert->greek    1.131               
Greek-Bert->rapt     1.647               
Greek-Bert->legal    1.15                
M-Bert->greek        2.199               
M-Bert->rapt         2.967               
M-Bert->legal        2.062               
