In [20]:
import importlib
import json
import logging
from pathlib import Path
from typing import Callable, Mapping
from sklearn.model_selection import train_test_split

from datasets import Dataset,load_dataset, DatasetDict
from transformers import AutoTokenizer, PreTrainedTokenizer
import pickle
import config
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /home/sa6981/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [32]:
importlib.reload(config)

<module 'config' from '/scratch/sa6981/llm_unlearn/finetune_copyright/config.py'>

In [3]:
LOGGER = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [15]:
def prepare_dataset(dataset_path: Path, min_length: int, context_length: int, 
                    test_size: float, shuffle: bool) -> None:
    """Prepare dataset for training
    """
    tokenizer =  AutoTokenizer.from_pretrained(config.model_name)
    LOGGER.info(f'Start preparing dataset from {dataset_path}')
    text = preprocess_data(dataset_path=dataset_path, min_length=min_length, tokenizer=tokenizer)
    dataset = Dataset.from_dict({'text': [text]})
    tokenized_dataset = dataset.map(tokenize, batched=True, fn_kwargs={'tokenizer': tokenizer, 'context_length': context_length},
                                         remove_columns=dataset.column_names)
    LOGGER.info(f'The tokenized dataset is composed of {tokenized_dataset.num_rows} elements, each one composed of {context_length} tokens.')
    tokenized_dataset_dict = tokenized_dataset.train_test_split(test_size=test_size, shuffle=shuffle)
    LOGGER.info(f'The training dataset is composed of {tokenized_dataset_dict["train"].num_rows} elements, the test dataset is composed of {tokenized_dataset_dict["test"].num_rows} elements.')
#     tokenized_dataset_dict.push_to_hub(hf_repo)
    LOGGER.info(f'Preparing dataset finished.')
    return tokenized_dataset_dict

In [3]:
def preprocess_data(dataset_path: Path, min_length: int, tokenizer: PreTrainedTokenizer) -> str:
    """Prepare dataset for training from the jsonl file.

    Args:
        dataset_path (Path): Extracted text from the book
        min_length (int): Filter pages without text
        tokenizer (PreTrainedTokenizer): HuggingFace tokenizer

    Yields:
        str: text of the pages
    """
    with open(dataset_path, 'r') as f:
        grouped_text = ""
        for line in f:
            elt = json.loads(line)
            text: str = list(elt.values())[0]
            if len(text) > min_length:
                grouped_text += text
        # End of paragraphs defined by ".\n is transformed into EOS token"
        grouped_text = grouped_text.replace(".\n", "." + tokenizer.eos_token)
        return preprocess_text(grouped_text)

In [22]:
def preprocess_text(text: str) -> str:
    text = text.replace('\n', ' ')
    return text

In [8]:
def tokenize(element: Mapping, tokenizer: Callable, context_length: int) -> str:
    inputs = tokenizer(element['text'], truncation=True, return_overflowing_tokens=True, 
                       return_length=True, max_length=context_length)
    inputs_batch = []
    for length, input_ids in zip(inputs['length'], inputs['input_ids']):
        if length == context_length: # We drop the last input_ids that are shorter than max_length
            inputs_batch.append(input_ids)
    return {"input_ids": inputs_batch}


In [33]:
if __name__ == '__main__':

    df_dict = prepare_dataset(
        dataset_path=config.extraction_path, 
        min_length=config.min_length,
        context_length=config.context_length,
        test_size=config.test_size,
        shuffle=config.shuffle,
#         hf_repo=config.hf_repo
    )

INFO:__main__:Start preparing dataset from /scratch/sa6981/llm_unlearn/finetune_copyright/extracted_text.jsonl


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:__main__:The tokenized dataset is composed of 305 elements, each one composed of 2048 tokens.
INFO:__main__:The training dataset is composed of 274 elements, the test dataset is composed of 31 elements.
INFO:__main__:Preparing dataset finished.


In [34]:
df_dict

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 274
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 31
    })
})

In [35]:
import pickle

# Assuming your DatasetDict is named dataset_dict
with open('dataset_dict.pkl', 'wb') as f:
    pickle.dump(df_dict, f)

## Create dataset for unlearning

In [3]:
data_dict = {'texts': [], 'labels': []}

In [14]:
def preprocess_data_unlearn(dataset_path: Path, min_length: int, tokenizer: PreTrainedTokenizer) -> str:
    """Prepare dataset for training from the jsonl file.

    Args:
        dataset_path (Path): Extracted text from the book
        min_length (int): Filter pages without text
        tokenizer (PreTrainedTokenizer): HuggingFace tokenizer

    Yields:
        str: text of the pages
    """
    with open(dataset_path, 'r') as f:
        for line in f:
            elt = json.loads(line)
            text: str = list(elt.values())[0]
            # Count words
            text = text.replace(".\n", ".")
            num_words = count_words(text)
            x = " ".join(text.split()[:num_words * 1 // 5])
            y = " ".join(text.split()[num_words * 1 // 5:])
            
            # Append to the dictionaries
            data_dict['texts'].append(x)
            data_dict['labels'].append(y)
        return data_dict

In [9]:
tokenizer =  AutoTokenizer.from_pretrained(config.model_name)

In [4]:
dataset_path=config.extraction_path

In [5]:
def count_words(text):
    words = text.split()
    return len(words)

In [18]:
unlearn_dataset_dict = preprocess_data_unlearn(dataset_path=dataset_path, min_length=100, tokenizer=tokenizer)

In [23]:
# Split the data_dict into train and test
train_data_dict, test_data_dict = {}, {}
for key, values in unlearn_dataset_dict.items():
    train_values, test_values = train_test_split(values, test_size=0.2, random_state=42)
    train_data_dict[key] = train_values
    test_data_dict[key] = test_values
    
# Create Datasets
train_dataset = Dataset.from_dict(train_data_dict)
test_dataset = Dataset.from_dict(test_data_dict)
# Create a DatasetDict
unlearned_DatasetDict = DatasetDict({"train": train_dataset, "test": test_dataset})

In [24]:
unlearned_DatasetDict

DatasetDict({
    train: Dataset({
        features: ['texts', 'labels'],
        num_rows: 808
    })
    test: Dataset({
        features: ['texts', 'labels'],
        num_rows: 203
    })
})

In [28]:
# Assuming your DatasetDict is named dataset_dict
with open('unlearned_DatasetDict.pkl', 'wb') as f:
    pickle.dump(unlearned_DatasetDict, f)

## create smaller dataset

In [25]:
def preprocess_data_unlearn_small(dataset_path: Path, min_length: int, tokenizer: PreTrainedTokenizer) -> str:
    """Prepare dataset for training from the jsonl file.

    Args:
        dataset_path (Path): Extracted text from the book
        min_length (int): Filter pages without text
        tokenizer (PreTrainedTokenizer): HuggingFace tokenizer

    Yields:
        str: text of the pages
    """
    with open(dataset_path, 'r') as f:
        for line in f:
            elt = json.loads(line)
            text: str = list(elt.values())[0]
#             print(text,"line over")
            # Count words
            text = text.replace(".\n", ".")
            x,y = split_text_into_sentences(text, max_chars=200)
            
            # Append to the dictionaries
            data_dict['texts'].extend(x)
            data_dict['labels'].extend(y)
            
        return data_dict

In [26]:
unlearn_dataset_dict = preprocess_data_unlearn_small(dataset_path=dataset_path, min_length=100, tokenizer=tokenizer)

In [30]:
len(unlearn_dataset_dict['labels'])

22390

In [31]:
# Split the data_dict into train and test
train_data_dict, test_data_dict = {}, {}
for key, values in unlearn_dataset_dict.items():
    train_values, test_values = train_test_split(values, test_size=0.1, random_state=42)
    train_data_dict[key] = train_values
    test_data_dict[key] = test_values
    
# Create Datasets
train_dataset = Dataset.from_dict(train_data_dict)
test_dataset = Dataset.from_dict(test_data_dict)
# Create a DatasetDict
unlearned_DatasetDict = DatasetDict({"train": train_dataset, "test": test_dataset})

In [32]:
unlearned_DatasetDict

DatasetDict({
    train: Dataset({
        features: ['texts', 'labels'],
        num_rows: 20151
    })
    test: Dataset({
        features: ['texts', 'labels'],
        num_rows: 2239
    })
})

In [23]:
def split_text_into_sentences(orig_text, max_chars=200):
    # Tokenize the original text into sentences
    sentences = sent_tokenize(orig_text)
    
    # Initialize variables to store text and label sentences
    text = []
    label = []

    # Process each sentence
    for sentence in sentences:
        sentence = preprocess_text(sentence)
        if len(sentence) <= max_chars:
            # If the sentence is shorter than or equal to max_chars, consider it as "text"
            text.append(sentence)
            label.append('')
        else:
            # If the sentence is longer, split it into "text" and "label"
            text.append(sentence[:max_chars])
            label.append(sentence[max_chars:])
            
    return text, label

In [33]:
# Assuming your DatasetDict is named dataset_dict
with open('unlearned_DatasetDict_small.pkl', 'wb') as f:
    pickle.dump(unlearned_DatasetDict, f)

In [34]:
df = pd.DataFrame(unlearn_dataset_dict)

In [36]:
df.shape

(22390, 2)

In [37]:
df.to_csv("bad_dataset_small.csv")