In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so the later cells
# can read and write the project files stored under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install langdetect datasets

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Switch to the project directory on Drive; presumably this is what makes the
# local `data.utils` module imported in the next cell resolvable (confirm).
%cd /content/drive/MyDrive/ParsBART/bart-ir

In [None]:
from datasets import load_dataset
import random
import re
import langdetect
import pandas as pd
import timeit
import torch
import numpy as np
from data.utils import sentence_permutation, document_rotation
from data.utils import token_infilling, token_masking, token_deletion
from transformers import BartTokenizer

In [None]:
# Open the two resume-checkpoint files read-only as UTF-8 text.
# The handles are consumed and closed by the cell that follows.
cleaned_row_idx_file = open(
    '/content/drive/MyDrive/ParsBART/cleaned_data/cleaned_row_idx.txt',
    mode="r",
    encoding="utf-8",
)
cleaned_row_counter_file = open(
    '/content/drive/MyDrive/ParsBART/cleaned_data/cleaned_row_counter.txt',
    mode="r",
    encoding="utf-8",
)

In [None]:
# Restore the resume state written by the previous run.
# Using the handles as context managers guarantees both files are closed even
# when a file's content cannot be parsed as an integer.
with cleaned_row_idx_file, cleaned_row_counter_file:
    # Number of raw dataset rows to skip, i.e. the index of the next row to process.
    skiped_rows = int(cleaned_row_idx_file.read())
    print(skiped_rows)
    # Number of cleaned documents already saved by earlier runs.
    saved_clean_rows = int(cleaned_row_counter_file.read())
    print(saved_clean_rows)

125844577
1800000


In [None]:
# ---- Cleaning configuration -------------------------------------------------

# Not referenced by the cells visible in this notebook; kept for compatibility.
max_mem_size = 4

# A run of sentence-final marks (captured) followed by spaces/newlines.
# Raw string: "\." and "\?" are invalid escape sequences in a normal string
# literal and trigger a warning on newer Python versions. The regex itself is
# unchanged ("\n" is resolved to a newline by the `re` module).
sentence_sep_pattern = r"([!\.\?⸮؟]+)[ \n]+"

# A document needs at least this many sentences, before and after filtering.
min_line_per_doc = 4
# Sentences with fewer whitespace-separated words than this are dropped.
min_word_per_line = 5
# A sentence is kept only if it ends with one of these marks.
end_marks = ("!",".","?","⸮","؟")

# Any sentence containing one of these substrings is dropped
# (site boilerplate, tag/archive labels, URLs, ellipses).
words_should_be_filtered_out = [
    "برچسب:",
    "برچسب ها :",
    "» مطالب و مقالات »",
    "برچسبها:",
    "برچسب :",
    "برچسبها :",
    "برچسب ها:",
    "http://",
    "حقوق این وب سایت محفوظ",
    "فراموشی رمز ورود",
    "https://",
    "ورود به سایت",
    "... ادامه خبر",
    "بیشتر بدانید ...",
    "...",
    "کلید واژه ها:",
    "کلید واژه ها :",
    "کلید واژهها:",
    "archive :",
    "archives :",
    "archive:",
    "archives:",
    "Archive :",
    "Archives :",
    "این مطلب را به اشتراک بگذارید",
    "کلیه حقوق مادی و معنوی",
    "هیچ نظری هنوز ثبت نشده است",
    "گر قبلا در بیان ثبت نام کرده اید می توانید ابتدا وارد شوید",
    "پست الکترونيک شما میتوانید از این تگهای html استفاده کنید",
    "پست الکترونیک برای عموم قابل مشاهده باشد اخطار!",
    "Archive:",
    "Archives:",
    "(نظر، انتقاد، پیشنهاد ...)",
    ". . ."
]

# Not referenced by the cells visible in this notebook; kept for compatibility.
batch_num = 0
# Batch size of the cleaning `map`; also used to reconstruct raw row indices.
map_batch_size = 5000
# Number of map batches processed so far (incremented by add_clean_text).
map_batch_num = 0
# Hashes of every sentence seen so far in this session (in-memory only).
duplicate_set = set()
# Batch size of the `filter` step that drops rejected documents.
filter_batch_size = 1000


def clean_text(text):
    """Clean one raw document and return it as a single line of Persian text.

    The text is split into sentences after every run of end marks. A sentence
    is then dropped when it

    * was already seen earlier in this session (its hash is in
      ``duplicate_set``),
    * has fewer than ``min_word_per_line`` whitespace-separated words,
    * does not end with one of ``end_marks``,
    * contains "javascript", "lorem ipsum", "{" or any entry of
      ``words_should_be_filtered_out``.

    Args:
        text: Raw document text.

    Returns:
        The surviving sentences joined by single spaces, or ``None`` when the
        document has fewer than ``min_line_per_doc`` sentences before or after
        filtering, or when langdetect does not classify the result as "fa"
        (including when detection fails).

    Side effects:
        Adds the hash of every not-yet-seen sentence to ``duplicate_set``.
    """
    # Put a line break after each run of end marks, then split on line breaks.
    text = re.sub(sentence_sep_pattern, r'\1\n', text)
    # Pieces produced by split('\n') cannot contain a newline themselves.
    sentences = [piece.strip() for piece in text.split('\n') if piece.strip()]
    if len(sentences) < min_line_per_doc:
        return None

    final_sentences = []
    for sentence in sentences:
        # Only the hash is stored, not the sentence text.
        sentence_hash = hash(sentence)
        if sentence_hash in duplicate_set:
            continue
        # Registered before the remaining checks: a sentence rejected below
        # still counts as "seen" for later documents.
        duplicate_set.add(sentence_hash)
        if len(sentence.split()) < min_word_per_line:
            continue
        if not sentence.endswith(end_marks):
            continue
        if "javascript" in sentence:
            continue
        if "lorem ipsum" in sentence:
            continue
        if "{" in sentence:
            continue
        if any(filter_word in sentence for filter_word in words_should_be_filtered_out):
            continue
        final_sentences.append(sentence)

    if len(final_sentences) < min_line_per_doc:
        return None

    cleaned = " ".join(final_sentences)
    try:
        if langdetect.detect(cleaned) != "fa":
            return None
    except Exception:
        # Best effort: a document whose language cannot be detected is
        # rejected. `Exception` (not a bare `except`) lets KeyboardInterrupt
        # and SystemExit propagate, so the long-running loop can be stopped.
        return None
    return cleaned


# Stream the "train" split of SLPL/naab instead of downloading it up front.
train_streaming_dataset = load_dataset("SLPL/naab", split="train", streaming=True)
# Resume after the rows that earlier runs already processed.
train_streaming_dataset = train_streaming_dataset.skip(skiped_rows)
# Torch formatting; the main loop later calls `.item()` on the yielded `idx`.
train_streaming_dataset = train_streaming_dataset.with_format(type="torch")


def add_clean_text(example):
    """Batched ``map`` callback that cleans every text of one batch.

    Adds two columns to the batch:

    * ``cleaned_text``: the result of ``clean_text`` for each entry of
      ``example['text']``, or the string ``"None"`` when the document was
      rejected.
    * ``idx``: a row index computed as ``map_batch_num * map_batch_size +
      position in batch + skiped_rows``.

    Increments the global ``map_batch_num`` once per call and returns the
    same ``example`` object.
    """
    global map_batch_num
    example["cleaned_text"] = []
    example["idx"] = []
    for position, raw_text in enumerate(example['text']):
        result = clean_text(raw_text)
        # Rejected documents are marked with the string sentinel "None".
        example["cleaned_text"].append("None" if result is None else result)
        example["idx"].append(map_batch_num * map_batch_size + position + skiped_rows)
    map_batch_num += 1
    return example

def filter_cleaned_text(example):
    """Batched ``filter`` callback: keep rows whose text was not rejected.

    Returns a list with one boolean per entry of ``example['cleaned_text']``:
    ``False`` for the ``"None"`` sentinel, ``True`` for everything else.
    """
    return [text != "None" for text in example['cleaned_text']]


# Stage 1: clean every document and attach its row index, one batch at a time.
output = train_streaming_dataset.map(
    add_clean_text,
    batched=True,
    batch_size=map_batch_size,
)
# Stage 2: drop the documents that the cleaning stage marked as rejected.
output_filtered = output.filter(
    filter_cleaned_text,
    batched=True,
    batch_size=filter_batch_size,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


The repository for SLPL/naab contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/SLPL/naab.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [None]:
tokenizer = BartTokenizer.from_pretrained("/content/drive/MyDrive/ParsBART/bart-ir/tokenizer_bart_ir")
# Maximum number of tokens kept per document when tokenizing in collate_fn.
MAX_POSITION_EMBEDDINGS = 256

# Perturbations that collate_fn calls with a plain text string.
perturbations_text_domain = [
    document_rotation,
    sentence_permutation,
]

# Perturbations that collate_fn calls with a tensor of token ids.
perturbations_token_domain = [
    token_infilling,
    token_masking,
    token_deletion,
]

# Every available perturbation: text-level ones first, then token-level ones.
perturbations = perturbations_text_domain + perturbations_token_domain

def collate_fn(cleaned_text):
    """Apply one randomly chosen perturbation to a cleaned document.

    The function is picked uniformly from ``perturbations``. In both cases the
    document is first truncated to ``MAX_POSITION_EMBEDDINGS`` tokens.

    Args:
        cleaned_text: The cleaned document text.

    Returns:
        A tuple ``(perturbed_text, name)`` where ``name`` is the ``__name__``
        of the perturbation function that was applied.
    """
    perturbation_function = random.choice(perturbations)

    if perturbation_function not in perturbations_token_domain and perturbation_function in perturbations_text_domain:
        # Text-level perturbation: truncate through a tokenize/decode round
        # trip (special tokens dropped), then perturb the resulting string.
        encoded = tokenizer(cleaned_text, truncation=True, max_length=MAX_POSITION_EMBEDDINGS)
        text_truncated = tokenizer.decode(encoded["input_ids"], skip_special_tokens=True)
        return perturbation_function(text_truncated), perturbation_function.__name__

    # Token-level perturbation: perturb the id tensor, then decode it
    # (special tokens are kept in the decoded text).
    encoded = tokenizer(
        cleaned_text, return_tensors="pt", truncation=True, max_length=MAX_POSITION_EMBEDDINGS
    )
    original_input_ids = encoded["input_ids"][0]
    perturbed_input_ids = perturbation_function(
        tokenized_sequence=original_input_ids,
        mask_token_id=tokenizer.mask_token_id,
        mask_probability=0.15,
        list_special_tokens=tokenizer.all_special_ids,
    )
    return tokenizer.decode(perturbed_input_ids), perturbation_function.__name__

In [None]:
from tqdm import tqdm

# Bytes of perturbed text produced in THIS session (starts at 0 on every run).
total_size = 0
# Rows collected since the last CSV flush.
data={'idx':[],'text':[], 'perturbed_text':[], 'perturbation_function':[]}
for counter, batch_example in tqdm(enumerate(output_filtered)):
  perturbed_text, perturbation_function = collate_fn(batch_example['cleaned_text'])
  row_idx = batch_example['idx'].item()
  data['idx'].append(row_idx)
  data['text'].append(batch_example['cleaned_text'])
  data['perturbed_text'].append(perturbed_text)
  data['perturbation_function'].append(perturbation_function)
  total_size += len(perturbed_text.encode('utf-8'))

  # Cleaned documents saved so far across all runs, and the raw row to resume from.
  cleaned_total = counter + 1 + saved_clean_rows
  next_row_idx = row_idx + 1

  if (counter + 1) % 100000 == 0:
    # Flush the collected rows to their own CSV file first, then update the
    # resume checkpoints.
    # NOTE(review): true division makes the file name end in e.g. "19.0";
    # kept as is so the naming matches files written by earlier runs.
    df = pd.DataFrame.from_dict(data)
    df.to_csv(f'/content/drive/MyDrive/ParsBART/cleaned_data/df{cleaned_total / 100000}.csv', index=False)

    # Drop the reference to the flushed frame and empty the buffers.
    df = df.iloc[0:0]
    for column in data.values():
      column.clear()

    progress = f"Cleaned documetns: {cleaned_total} from: {next_row_idx} rows ---- Total size: {total_size/(1024*1024)} MB"
    # `with` closes each file even if the write fails.
    with open('/content/drive/MyDrive/ParsBART/cleaned_data/cleaned_row_idx.txt', "w", encoding="utf-8") as cleaned_row_idx_file:
      cleaned_row_idx_file.write(str(next_row_idx))
    with open('/content/drive/MyDrive/ParsBART/cleaned_data/cleaned_row_counter.txt', "w", encoding="utf-8") as cleaned_row_counter_file:
      cleaned_row_counter_file.write(str(cleaned_total))
    with open('/content/drive/MyDrive/ParsBART/cleaned_data/cleaning_log.txt', "a", encoding="utf-8") as cleaning_log:
      # Fixed: the entry used to end with the literal text "/n" instead of a
      # newline, so all log entries ended up on one line.
      cleaning_log.write(progress + "\n")
    print(progress)

  # NOTE(review): rows collected since the last flush are not written when the
  # loop stops here (or when the dataset is exhausted) - confirm this is intended.
  if total_size > 5.1 * 1024 * 1024 * 1024:
    print(f"out of total_size range!!!!!!!!!! ----  Cleaned documetns: {cleaned_total} from: {next_row_idx} rows ---- Total size: {total_size/(1024*1024)} MB")
    break