In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pdf2image import convert_from_path
import pytesseract
import re
import os
import glob

In [4]:
def clean_text(text):
    #replacing multiple spaces and newlines with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    #fixing hyphenation issues
    text = re.sub(r'-\s+', '', text)
    return text

all_docs_text = ""
pdf_folder_path = "./data/"

pdf_files = glob.glob(os.path.join(pdf_folder_path, "*.pdf"))
print(f"Found {len(pdf_files)} PDF files to process.")

for pdf_path in pdf_files:
    print(f"Processing: {os.path.basename(pdf_path)}")
    try:
        doc_images = convert_from_path(pdf_path)
        for page_data in doc_images:
            all_docs_text += pytesseract.image_to_string(page_data) + "\n"
    except Exception as e:
        print(f"      Error processing {os.path.basename(pdf_path)}: {e}")

print("\nCleaning the combined text from all documents...")
cleaned_text = clean_text(all_docs_text)

print("Splitting the combined text into chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
chunks = text_splitter.split_text(cleaned_text)

# 6. Save the processed chunks to the file ONCE
with open('processed_chunks.txt', 'w') as f:
    for chunk in chunks:
        f.write(chunk + '\n')

print(f"\nSuccessfully processed {len(pdf_files)} PDFs and saved {len(chunks)} chunks to processed_chunks.txt")

Found 8 PDF files to process.
Processing: 8.pdf
Processing: 6.pdf
Processing: 7.pdf
Processing: 5.pdf
Processing: 4.pdf
Processing: 1.pdf
Processing: 3.pdf
Processing: 2.pdf

Cleaning the combined text from all documents...
Splitting the combined text into chunks...

Successfully processed 8 PDFs and saved 96 chunks to processed_chunks.txt


In [5]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output",
    # eval_strategy="epoch",
    push_to_hub=False,
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [7]:
data = chunks

max_len = max([len(i) for i in data])
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

def tokenize(examples):
    return tokenizer(examples, padding='max_length', truncation=True, max_length=max_len)


data = [tokenize(i) for i in data]


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
