In [None]:
# !pip install PyPDF2
# !pip install datasets
# !pip install evaluate
# !pip install transformers[torch]

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/232.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3

In [None]:

import os
import pandas as pd
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import DatasetDict, Dataset
from PyPDF2 import PdfReader
from transformers import pipeline
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import evaluate
import numpy as np



In [None]:
#definition to generate dataset
def generate_dataset(dataset_dir, label_encoder_path, test_size):
    """Build a train/test ``DatasetDict`` from a directory tree of PDF files.

    ``dataset_dir`` is expected to contain one sub-directory per category;
    every ``*.pdf`` file inside a sub-directory becomes one example whose
    ``label_text`` is the sub-directory name.

    Args:
        dataset_dir: Root folder holding one sub-folder per category.
        label_encoder_path: Existing folder in which the fitted
            ``LabelEncoder`` is pickled as ``label_encoder.pkl``.
        test_size: Passed straight to ``train_test_split`` (fraction or count
            of examples reserved for the test split).

    Returns:
        A ``DatasetDict`` with ``train`` and ``test`` splits, each with the
        columns ``text``, ``label_text`` and ``label`` (integer class id).

    Raises:
        ValueError: If no PDF file is found under ``dataset_dir``.
    """
    import pickle

    def extract_text_from_pdf(pdf_path):
        """Return the stripped, concatenated text of every page of ``pdf_path``."""
        with open(pdf_path, "rb") as file:
            reader = PdfReader(file)
            # `or ""` guards against a page that yields no text (e.g. a scanned
            # image page), so the concatenation can never fail on a non-string.
            page_texts = [page.extract_text() or "" for page in reader.pages]
        return "".join(page_texts).strip()

    # Walk <dataset_dir>/<category>/<file>.pdf. Sorting both listings makes the
    # row order independent of the filesystem, so the seeded split below is
    # reproducible from one machine/run to the next.
    records = []
    for category in sorted(os.listdir(dataset_dir)):
        category_dir = os.path.join(dataset_dir, category)
        if not os.path.isdir(category_dir):
            continue
        for file_name in sorted(os.listdir(category_dir)):
            if not file_name.endswith(".pdf"):
                continue
            pdf_path = os.path.join(category_dir, file_name)
            records.append(
                {"text": extract_text_from_pdf(pdf_path), "label_text": category}
            )

    if not records:
        # Fail early with a clear message instead of an opaque error from the splitter.
        raise ValueError(f"No PDF files found under {dataset_dir!r}")

    df = pd.DataFrame(records, columns=["text", "label_text"])

    # Encode the category names as integer ids and persist the encoder so the
    # ids can be mapped back to names at inference time.
    label_encoder = LabelEncoder()
    df["label"] = label_encoder.fit_transform(df["label_text"])
    with open(os.path.join(label_encoder_path, "label_encoder.pkl"), "wb") as f:
        pickle.dump(label_encoder, f)

    df_train, df_test = train_test_split(df, test_size=test_size, random_state=42)

    # preserve_index=False keeps the shuffled pandas index out of the datasets,
    # so no '__index_level_0__' column has to be removed afterwards.
    return DatasetDict({
        "train": Dataset.from_pandas(df_train, preserve_index=False),
        "test": Dataset.from_pandas(df_test, preserve_index=False),
    })



In [None]:
#Tokenization and train,test generation
def tokenize_dataset(dataset, model_name='bert-base-uncased'):
    """Tokenize the ``text`` column of both splits and return shuffled copies.

    Args:
        dataset: Mapping-like object with ``train`` and ``test`` splits, each
            containing a ``text`` column (as produced by ``generate_dataset``).
        model_name: Checkpoint whose tokenizer is loaded. Defaults to
            ``'bert-base-uncased'``, the value that was previously hard-coded.

    Returns:
        ``(tokenizer, train_dataset, eval_dataset)`` where both datasets are
        tokenized and shuffled with a fixed seed of 42.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)

    def tokenize_function(examples):
        # Pad every example to the tokenizer's maximum length and truncate
        # longer documents, so all rows end up with the same sequence length.
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # shuffle() already returns the whole split in the new order; the former
    # `.select(range(len(...)))` selected every row again and changed nothing.
    train_dataset = tokenized_datasets["train"].shuffle(seed=42)
    eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

    return tokenizer, train_dataset, eval_dataset

In [None]:
#Model Training
def model_training(output_dir , model, train_dataset,eval_dataset):
    """Fine-tune ``model`` on ``train_dataset`` and return the fitted ``Trainer``.

    Args:
        output_dir: Folder handed to ``TrainingArguments`` as ``output_dir``.
        model: The sequence-classification model to train.
        train_dataset: Tokenized training split.
        eval_dataset: Tokenized evaluation split, registered on the trainer.

    Returns:
        The ``Trainer`` instance after ``train()`` has completed.
    """
    from transformers import Trainer, TrainingArguments

    accuracy = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        # eval_pred unpacks into (logits, labels); the predicted class is the
        # index of the largest logit along the last axis.
        scores, gold = eval_pred
        return accuracy.compute(
            predictions=np.argmax(scores, axis=-1),
            references=gold,
        )

    # One example per device step with gradients accumulated over 4 steps.
    # No evaluation strategy is set here; the option is left at its default.
    hyperparams = dict(
        output_dir=output_dir,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
    )

    fitted = Trainer(
        model=model,
        args=TrainingArguments(**hyperparams),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    fitted.train()
    return fitted

In [None]:
#Start

# Everything produced by this run (encoder, checkpoints, final model) lives here.
result_directory_path = "/content/drive/MyDrive/bert-80-20"
# exist_ok=True replaces the check-then-create pair and is a no-op when the
# folder is already there.
os.makedirs(result_directory_path, exist_ok=True)

dataset_dir = r"/content/drive/MyDrive/dataset/dataset_pdf"
output_dir = os.path.join(result_directory_path, "test_trainer")
label_encoder_path = result_directory_path
model_path = os.path.join(result_directory_path, "saved_bert_model_80_20")

test_size = 0.20
dataset = generate_dataset(dataset_dir, label_encoder_path, test_size)
tokenizer, train_dataset, eval_dataset = tokenize_dataset(dataset)

# Count classes over BOTH splits: the label ids were assigned on the full
# corpus, so a class that happens to land only in the test split would
# otherwise make the classification head too small for its label id.
num_labels = len(set(dataset["train"]["label"]) | set(dataset["test"]["label"]))

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1780 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
 trainer = model_training(output_dir , model, train_dataset,eval_dataset)

Step,Training Loss


In [None]:

# Keep the evaluation result: a bare `trainer.evaluate()` that is not the last
# expression of the cell is computed and then silently thrown away.
eval_metrics = trainer.evaluate()

# Store model and tokenizer side by side so the folder can be reloaded as one unit.
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# Last expression of the cell, so the notebook displays the metrics.
eval_metrics