In [None]:
# Mount Google Drive at /content/drive (Colab-only import) so files stored on Drive
# can be read and written through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Input pickles and the plain-text files generated from them.
# NOTE(review): the "test" input points at dev.pkl — confirm this is intended.
train_file = "/content/drive/MyDrive/thesis/dataset/train.pkl"
test_file = "/content/drive/MyDrive/thesis/dataset/dev.pkl"
output_train = "/content/drive/MyDrive/thesis/dataset/raptarchis_for_pretraining_train.txt"
output_test = "/content/drive/MyDrive/thesis/dataset/raptarchis_for_pretraining_test.txt"

# Length thresholds, all measured in characters (they are compared against len() of strings).
min_sent_len = 50  # an output line is closed once its accumulated sentence length exceeds this
max_sent_len = 200  # lines longer than this are split further
min_next_sent_len = 11  # fragments at most this long are glued to the preceding text

import pandas as pd
from tqdm.notebook import tqdm
from nltk import tokenize
import nltk
nltk.download('punkt')  # Punkt models used by tokenize.sent_tokenize
from string import digits  # NOTE(review): not used in the cells visible here
from statistics import median

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Make txt
Split the text into sentences, one per line.

<!-- Create a huge txt file from the raptarchis training data using the following format from [here](https://github.com/google-research/bert/blob/master/create_pretraining_data.py)

Input file format:
  (1) One sentence per line. These should ideally be actual sentences, not
  entire paragraphs or arbitrary spans of text. (Because we use the
  sentence boundaries for the "next sentence prediction" task).
  (2) Blank lines between documents. Document boundaries are needed so
  that the "next sentence prediction" task doesn't span between documents. -->

### Funcs

In [None]:
def split_str(seq, chunk):
  """Split a string into pieces of at least `chunk` characters, cutting only at spaces.

  Each cut is made at the first space found at or after `chunk` characters from the
  start of the current piece. The space stays at the beginning of the following piece,
  so concatenating the returned pieces always reproduces `seq` exactly.

  Args:
    seq: the text to split (str).
    chunk: minimum length of every piece except possibly the last; must be >= 1.

  Returns:
    A list of pieces in order. An empty `seq` yields an empty list. The final piece
    may be shorter than `chunk`, or longer when no space is left to cut at.

  Raises:
    ValueError: if `chunk` is smaller than 1 (no progress could be made).
  """
  if chunk < 1:
    raise ValueError("chunk must be a positive integer")
  pieces = []
  start = 0
  total = len(seq)
  # Iterative scan: no recursion limit and no repeated re-slicing of the remainder.
  while total - start >= chunk:
    cut = seq.find(' ', start + chunk)
    if cut == -1:
      # No space left to cut at: keep the remainder whole rather than
      # chopping off its last character.
      break
    pieces.append(seq[start:cut])
    start = cut
  if start < total:
    pieces.append(seq[start:])
  return pieces

In [None]:
def _group_sentences(sentences):
  """Merge the tokenized sentences of one document into output lines.

  Sentences are accumulated into a line until their combined length exceeds
  `min_sent_len`; any directly following sentences of at most `min_next_sent_len`
  characters are then appended to that line before it is closed. Leftover text that
  never reaches the threshold becomes the last line. Every sentence is followed by a
  single space; the returned lines carry no newline.
  """
  lines = []
  current = ""
  current_len = 0
  idx = 0
  total = len(sentences)
  while idx < total:
    sentence = sentences[idx]
    current += sentence + ' '
    current_len += len(sentence)
    idx += 1
    if current_len > min_sent_len:
      # Glue very short trailing fragments onto the line that is being closed.
      while idx < total and len(sentences[idx]) <= min_next_sent_len:
        current += sentences[idx] + ' '
        idx += 1
      lines.append(current)
      current = ""
      current_len = 0
  if current:
    lines.append(current)
  return lines


def _split_long_line(line):
  """Break one line longer than `max_sent_len` into newline-terminated pieces.

  The line is cut at spaces into roughly equal parts via `split_str`; a last part of
  at most `min_next_sent_len` characters is merged into the part before it. Parts
  that are still too long are split again. Lines within the limit, and lines that
  cannot be split (no usable space), are returned unchanged.
  """
  if len(line) <= max_sent_len:
    return [line]
  done = []
  pending = [line]  # stack; pieces are pushed reversed so they pop in reading order
  while pending:
    current = pending.pop()
    if len(current) <= max_sent_len:
      done.append(current)
      continue
    n_parts = int(len(current) / max_sent_len + 1)
    target = int(len(current) / n_parts)
    pieces = split_str(current, target)
    if len(pieces) > 1 and len(pieces[-1]) <= min_next_sent_len:
      pieces[-2] += pieces[-1]
      del pieces[-1]
    # Only pieces that lost their newline get one. The last piece still carries the
    # line's own '\n'; adding a second one would write a blank line, i.e. a false
    # document boundary in the middle of a document.
    pieces = [p if p.endswith('\n') else p + '\n' for p in pieces]
    if len(pieces) == 1 or any(len(p) >= len(current) for p in pieces):
      # Splitting made no progress; emit as-is instead of looping forever.
      done.extend(pieces)
    else:
      pending.extend(reversed(pieces))
  return done


def create_txt(input_df, output_file):
  """Write a pickled DataFrame as a text file: one sentence per line, blank line between documents.

  Args:
    input_df: path to a pickle readable by `pd.read_pickle`; the frame must provide
      the string columns 'header' and 'articles'.
    output_file: path of the text file to create (overwritten if it exists).

  The first pass tokenizes each row into Greek sentences and groups short ones
  (see `_group_sentences`); the second pass re-reads the file and splits lines longer
  than `max_sent_len` (see `_split_long_line`). Rows that yield no sentences are
  skipped.
  """
  # NOTE(review): unpickling executes arbitrary code — only load trusted files.
  df = pd.read_pickle(input_df)
  # Optional shuffle: df = df.sample(frac = 1)

  #first pass. deal with small sentences
  # Explicit UTF-8 so the Greek text does not depend on the platform default encoding.
  with open(output_file, "w", encoding="utf-8") as f:
    for _, row in tqdm(df.iterrows(), total=len(df), desc="1st_pass"):
      raw = row['header'] + " " + row['articles']
      text = raw.replace('\n', ' ').replace('\r', '')
      sentences = tokenize.sent_tokenize(text, language='greek')
      doc_lines = _group_sentences(sentences)
      if not doc_lines:
        continue
      # One line per group, then an empty line that terminates the document.
      f.write('\n'.join(doc_lines) + '\n\n')

  #second pass. deal with large sentences
  with open(output_file, "r", encoding="utf-8") as f:
    data = f.readlines()
  # Build a fresh list instead of mutating `data` while iterating over it.
  resized = []
  for line in tqdm(data, total=len(data), desc="2nd_pass"):
    resized.extend(_split_long_line(line))
  with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(resized)

In [None]:
def print_stats(input_txt):
  """Print line and document statistics for a one-sentence-per-line text file.

  Documents are runs of non-empty lines separated by blank lines; repeated blank
  lines are ignored, and a final document is counted even when the file does not end
  with a blank line. Line lengths are `len(line)` as read from the file, so they
  include the trailing newline; a line counts as over-long when its length exceeds
  `max_sent_len + 2`.

  Args:
    input_txt: path of the text file to analyse.

  For a file without any sentences only the document count (0) is printed.
  """
  line_lengths = []
  lines_per_doc = []
  lines_in_doc = 0
  over_max = 0
  # Explicit UTF-8 so character counts do not depend on the platform default encoding.
  with open(input_txt, "r", encoding="utf-8") as f:
    for line in f:
      if line == '\n':
        if lines_in_doc:
          lines_per_doc.append(lines_in_doc)
          lines_in_doc = 0
        continue
      line_lengths.append(len(line))
      lines_in_doc += 1
      if len(line) > max_sent_len + 2:
        over_max += 1
  # Close a last document that is not followed by a blank line.
  if lines_in_doc:
    lines_per_doc.append(lines_in_doc)

  no_doc = len(lines_per_doc)
  no_lines = len(line_lengths)
  print(f"total documents {no_doc}")
  if no_doc == 0:
    # Nothing to summarise; the means, medians and extrema below are undefined.
    return
  sum_sent = sum(line_lengths)
  shortest = min(line_lengths)
  print(f"mean number of chars per sentense {sum_sent/no_lines}")
  print(f"mean number of sentenses per document {no_lines/no_doc}")
  print(f"median number of chars per sentense {median(line_lengths)}")
  print(f"median number of sentenses per document {median(lines_per_doc)}")
  print(f"min number of chars per sentense {shortest}")
  print(f"min number of sentenses per document {min(lines_per_doc)}")
  print(f"max number of chars per sentense {max(line_lengths)}")
  print(f"max number of sentenses per document {max(lines_per_doc)}")
  print(f"number of documents with only one sentese {lines_per_doc.count(1)}")
  print(f"number of sentenses with only min chars {line_lengths.count(shortest)}")
  print(f"number of sentenses with more than max chars {over_max}")

### Results

In [None]:
# Uncomment to (re)generate the file; left disabled here, presumably because it already exists on Drive.
# create_txt(test_file, output_test)
print_stats(output_test)

total documents 75152
mean number of chars per sentense 126.80087526637121
mean number of sentenses per document 4.0712688950393865
median number of chars per sentense 128.0
median number of sentenses per document 3.0
min number of chars per sentense 14
min number of sentenses per document 1
max number of chars per sentense 507
max number of sentenses per document 199
number of documents with only one sentese 2149
number of sentenses with only min chars 63
number of sentenses with more than max chars 804


In [None]:
# Uncomment to (re)generate the file; left disabled here, presumably because it already exists on Drive.
# create_txt(train_file, output_train)
print_stats(output_train)

total documents 235244
mean number of chars per sentense 127.00422870589117
mean number of sentenses per document 4.086340140449916
median number of chars per sentense 128
median number of sentenses per document 3.0
min number of chars per sentense 14
min number of sentenses per document 1
max number of chars per sentense 331
max number of sentenses per document 217
number of documents with only one sentese 6248
number of sentenses with only min chars 137
number of sentenses with more than max chars 2437
