## Convert formats of Fon's corpus into text transcripts

## Data source

The data source is the by-seq files originally in `lago/resources`.  
I work with the folder decompressed from `seqs.zip`, which was provided by CYY.  
The size of `seqs.zip` is 2.5G, and its SHA1sum is `0e9c2c`

In [1]:
def sliding_window(seq_lens, window_size):
  """Yield overlapping (start_idx, end_idx) windows over a run of item lengths.

  Items are accumulated from ``start_idx`` until the running total of their
  lengths reaches ``window_size``; at that point ``(start_idx, idx)`` is
  yielded, where ``idx`` is the item that pushed the total to the limit.
  Used as slice bounds (``items[start_idx:end_idx]``) the pair therefore
  excludes that item. The next window starts at a trailing "stride" index
  that follows ``idx`` at a distance of about ``window_size // 2`` length
  units, so consecutive windows overlap.

  Args:
    seq_lens: Indexable sequence of item lengths (a list or a 1-D NumPy
      array both work).
    window_size: Total length at which a window is closed.

  Yields:
    ``(start_idx, end_idx)`` tuples of item indices. A final
    ``(start_idx, len(seq_lens))`` pair is always emitted for the trailing
    items, so an empty ``seq_lens`` yields ``(0, 0)``.

  Raises:
    ValueError: If a single item is longer than ``window_size // 2``. The
      check is made lazily, when the offending item is reached.
  """
  stride_size = window_size // 2

  start_idx = 0      # first item of the window currently being filled
  seq_counter = 0    # total length of items start_idx..idx (inclusive)
  stride_idx = 0     # candidate start index for the next window
  stride_count = 0   # total length of items stride_idx..idx (inclusive)

  for idx, len_x in enumerate(seq_lens):
    if len_x > stride_size:
      raise ValueError(f"len_x ({len_x}) > stride_size ({stride_size})")

    seq_counter += len_x

    if seq_counter >= window_size:
      yield (start_idx, idx)
      start_idx = stride_idx
      # At this point stride_count still covers stride_idx..idx-1, so adding
      # the current item gives the total for the new window start_idx..idx.
      seq_counter = stride_count + len_x

    stride_count += len_x
    # Advance the trailing index until less than stride_size remains behind
    # the current item (but never past the current item itself).
    while stride_count >= stride_size and stride_idx < idx:
      stride_count -= seq_lens[stride_idx]
      stride_idx += 1

  yield start_idx, len(seq_lens)

## Load data

In [2]:
import numpy as np
from transformers import AutoTokenizer
# Load the pretrained tokenizer for the "ckip-joint/bloom-3b-zh" checkpoint.
# NOTE(review): add_prefix_space=True is presumably needed because the tokenizer
# is fed pre-split word lists (is_split_into_words=True) — confirm against the docs.
tokenizer = AutoTokenizer.from_pretrained("ckip-joint/bloom-3b-zh", add_prefix_space=True)

In [3]:
from pathlib import Path
from tqdm.auto import tqdm

# Root folder; each child is treated as one speaker holding *.lab transcripts.
seq_dir = Path("~/lago/resources/fon_seqs").expanduser()


def debom(x):
  """Strip surrounding whitespace, then any leading/trailing BOM (U+FEFF)."""
  return x.strip().strip("\ufeff")


# speaker id -> list of window records (start_idx, end_idx, word sequences)
decodes = {}

# iterdir() yields children in arbitrary order; sort so the speakers are
# processed and stored in a deterministic order on every run.
for speaker_dir in tqdm(sorted(seq_dir.iterdir())):
  speaker_id = speaker_dir.name
  lab_files = sorted(speaker_dir.glob("*.lab"))
  # Read explicitly as UTF-8 rather than relying on the platform default.
  transcripts = [debom(lab_file.read_text(encoding="utf-8"))
                 for lab_file in lab_files]
  # One word list per transcript: split on single spaces, append "\n" as the
  # last word, and drop the empty strings left by repeated spaces.
  seq_words = [[w for w in x.split(" ") + ["\n"] if w]
               for x in transcripts]
  seq_tokens = [tokenizer(x, is_split_into_words=True)["input_ids"]
                for x in seq_words]
  # Token count per transcript drives the windowing below.
  seq_lens = np.array([len(x) for x in seq_tokens])

  decode_seqs = []
  for start_idx, end_idx in sliding_window(seq_lens, 1000):
    decode_seqs.append({
      "start_idx": start_idx,
      "end_idx": end_idx,
      "sequences": seq_words[start_idx:end_idx]
      })
  decodes[speaker_id] = decode_seqs


0it [00:00, ?it/s]

## Export artifact

In [4]:
import json
# Destination of the exported windows (relative to the working directory).
decode_path = Path("../../data/surprisal/decode_seqs.json")
# ensure_ascii=False leaves non-ASCII text unescaped in the JSON string, so
# pin the file encoding to UTF-8 instead of relying on the platform default.
decode_path.write_text(json.dumps(decodes, ensure_ascii=False), encoding="utf-8")

2956117

In [5]:
## Ignore the tokenizer parallelism warning
!sha1sum $decode_path

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
6b6873ae3f441b3cca2e88d943ea91fb93965273  ../../data/surprisal/decode_seqs.json
