In [1]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
train_file = "/content/drive/MyDrive/thesis/dataset/train.pkl"
test_file = "/content/drive/MyDrive/thesis/dataset/dev.pkl"
output_train = "/content/drive/MyDrive/thesis/dataset/raptarchis_sentences_train.txt"
output_test = "/content/drive/MyDrive/thesis/dataset/raptarchis_sentences_test.txt"

max_sent_len = 200

import pandas as pd
from tqdm.notebook import tqdm
from nltk import tokenize
import nltk
nltk.download('punkt')
from string import digits
from statistics import median
import re
import string
import unicodedata

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


###make txt
slpit to sentences. one per line

<!-- Create a huge txt file from the raptarchis training data using the folowing format from [here](https://github.com/google-research/bert/blob/master/create_pretraining_data.py)

Input file format:
  (1) One sentence per line. These should ideally be actual sentences, not
  entire paragraphs or arbitrary spans of text. (Because we use the
  sentence boundaries for the "next sentence prediction" task).
  (2) Blank lines between documents. Document boundaries are needed so
  that the "next sentence prediction" task doesn't span between documents. -->

###funcs

In [3]:
def split_str(seq, chunk):
  lst = []
  if chunk <= len(seq):
    i=0
    for i, txt in enumerate(seq[chunk:]):
      if seq[chunk+i]==' ':
        break
    lst.extend([seq[:chunk+i]])
    lst.extend(split_str(seq[chunk+i:], chunk))
  elif seq:
    lst.extend([seq])
  return lst

In [4]:
def create_txt(input_df, output_file):
  df = pd.read_pickle(input_df)

  lines = []
  for _, row in tqdm(df.iterrows(), total=len(df),
                      desc=f'loading {input_df[-10:]}', leave=True):
    text = row['header'] + ' ' + row['articles']
    lines.append(text.encode("utf-8", "ignore").decode().replace('\n',' '))

  df = pd.DataFrame(lines, columns=['text'])

  with open(output_file,"w+") as f:
    for _, row in tqdm(df.iterrows(), total=len(df), desc="1st_pass"):
      tmp = row['text']
      if not (tmp.strip().endswith('.') or tmp.strip().endswith('!') or tmp.strip().endswith(';')):
        tmp+='.'
      tmp = re.findall('.*?[.!;]\s*',tmp)
      old_toc = tmp[0].strip('\n')
      new = [old_toc]
      for toc in tmp[1:]:
        toc.strip('\n')
        if len(old_toc)<8 or not re.search('[α-ωΑ-Ω]', old_toc) or len(old_toc.split()[-1])<6:
          new[-1] = new[-1] + toc
        else:
          new.append(toc)
        old_toc = toc
      for sent in new:
        f.write(sent + '\n')
      f.write('\n')

In [5]:
def print_stats(input_txt):
  sum_sent = 0
  sum_sent_list = []
  no_lines = 0
  no_lines_list = []
  no_doc = 0
  lines_per_doc = 0
  one_sent_docs = 0
  no_max_chars=0
  with open(input_txt,"r") as f:
    for line in f:
      if line == '\n':
        if lines_per_doc == 0:
          continue
        no_doc += 1
        no_lines_list.append(lines_per_doc)
        if lines_per_doc == 1:
          one_sent_docs += 1
        lines_per_doc = 0
        continue
      sum_sent += len(line)
      sum_sent_list.append(len(line))
      no_lines += 1
      lines_per_doc += 1
      if len(line)>max_sent_len+2:
        no_max_chars += 1
  print(f"total documents {no_doc}")
  print(f"mean number of chars per sentense {sum_sent/no_lines}")
  print(f"mean number of sentenses per document {no_lines/no_doc}")
  print(f"median number of chars per sentense {median(sum_sent_list)}")
  print(f"median number of sentenses per document {median(no_lines_list)}")
  print(f"min number of chars per sentense {min(sum_sent_list)}")
  print(f"min number of sentenses per document {min(no_lines_list)}")
  print(f"max number of chars per sentense {max(sum_sent_list)}")
  print(f"max number of sentenses per document {max(no_lines_list)}")
  print(f"number of documents with only one sentese {one_sent_docs}")
  print(f"number of sentenses with only min chars {sum_sent_list.count(min(sum_sent_list))}")
  print(f"number of sentenses with more than max chars {no_max_chars}")
  return no_lines

In [6]:
def _is_punctuation(char):
  """Checks whether `chars` is a punctuation character."""
  cp = ord(char)
  # We treat all non-letter/number ASCII as punctuation.
  # Characters such as "^", "$", and "`" are not in the Unicode
  # Punctuation class but we treat them as punctuation anyways, for
  # consistency.
  if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
      (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
    return True
  cat = unicodedata.category(char)
  if cat.startswith("P"):
    return True
  return False

def _run_split_on_punc(text):
  """Splits punctuation on a piece of text."""
  chars = list(text)
  i = 0
  start_new_word = True
  output = []
  while i < len(chars):
    char = chars[i]
    if _is_punctuation(char):
      output.append([char])
      start_new_word = True
    else:
      if start_new_word:
        output.append([])
      start_new_word = False
      output[-1].append(char)
    i += 1

  return ["".join(x) for x in output]

def strip_accents_and_lowercase(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
                if unicodedata.category(c) != 'Mn').lower()

def normalize(filename, total_file_lines=0):
  new = []
  with open(filename,"r") as f:
    for line in tqdm(f, total=total_file_lines, desc=f'loading {filename[-10:]}', leave=True):
      tokens = line.lower().split()
      splited_tokens = []
      for token in tokens:
        splited_tokens.extend(_run_split_on_punc(token))
      line = ' '.join(splited_tokens)
      line = strip_accents_and_lowercase(line)
      new.append(line+'\n')
      # if line.endswith('\n'):
      #   new.append(line)
      # else:
      #   new.append(line+'\n')
  with open(filename,"w+") as f:
    for l in new:
      f.write(l)

###results

In [7]:
create_txt(test_file, output_test)
total_file_lines = print_stats(output_test)
print('\n\n')
normalize(output_test, total_file_lines)
print_stats(output_test)

HBox(children=(FloatProgress(value=0.0, description='loading et/dev.pkl', max=9511.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='1st_pass', max=9511.0, style=ProgressStyle(description_wi…


total documents 9511
mean number of chars per sentense 223.10497008266177
mean number of sentenses per document 18.239722426663864
median number of chars per sentense 169.0
median number of sentenses per document 4
min number of chars per sentense 4
min number of sentenses per document 1
max number of chars per sentense 14281
max number of sentenses per document 3258
number of documents with only one sentese 1807
number of sentenses with only min chars 4
number of sentenses with more than max chars 71257





HBox(children=(FloatProgress(value=0.0, description='loading s_test.txt', max=173478.0, style=ProgressStyle(de…


total documents 9511
mean number of chars per sentense 232.25347882728644
mean number of sentenses per document 18.239722426663864
median number of chars per sentense 176.0
median number of sentenses per document 4
min number of chars per sentense 2
min number of sentenses per document 1
max number of chars per sentense 11191
max number of sentenses per document 3258
number of documents with only one sentese 1807
number of sentenses with only min chars 4
number of sentenses with more than max chars 74438


173478

In [8]:
create_txt(train_file, output_train)
print_stats(output_train)
print('\n\n')
normalize(output_test, total_file_lines)
print_stats(output_test)

HBox(children=(FloatProgress(value=0.0, description='loading /train.pkl', max=28536.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='1st_pass', max=28536.0, style=ProgressStyle(description_w…


total documents 28536
mean number of chars per sentense 222.9068392211985
mean number of sentenses per document 19.146972245584525
median number of chars per sentense 170.0
median number of sentenses per document 4.0
min number of chars per sentense 4
min number of sentenses per document 1
max number of chars per sentense 24532
max number of sentenses per document 6156
number of documents with only one sentese 5410
number of sentenses with only min chars 9
number of sentenses with more than max chars 225910





HBox(children=(FloatProgress(value=0.0, description='loading s_test.txt', max=173478.0, style=ProgressStyle(de…


total documents 9511
mean number of chars per sentense 232.25347882728644
mean number of sentenses per document 18.239722426663864
median number of chars per sentense 176.0
median number of sentenses per document 4
min number of chars per sentense 2
min number of sentenses per document 1
max number of chars per sentense 11191
max number of sentenses per document 3258
number of documents with only one sentese 1807
number of sentenses with only min chars 4
number of sentenses with more than max chars 74438


173478