In [1]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
from PyPDF2 import PdfWriter,PdfReader
from io import StringIO, BytesIO
import PyPDF2
import requests
import re
from openpyxl import load_workbook
import os
import pandas as pd
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
from datasets import Dataset

import torch

In [3]:
excel_file = '/kaggle/input/parspec-dataset/Parspec_dataset.xlsx'

In [4]:
train = pd.read_excel(excel_file,'train_data')
test = pd.read_excel(excel_file,'test_data')
train.shape,test.shape

((2570, 2), (400, 2))

In [5]:
# For train and test, replace 'target_col' from str to int
key_pairs = {'cable': 0, 'fuses':1, 'lighting':2, 'others':3}
train['target_col'] = train['target_col'].map(key_pairs)
test['target_col'] = test['target_col'].map(key_pairs)
train.shape,test.shape

((2570, 2), (400, 2))

In [6]:
train.head()

Unnamed: 0,datasheet_link,target_col
0,https://lfillumination.com/files/specsheets/EF...,2
1,https://lfillumination.com/files/specsheets/EF...,2
2,https://lfillumination.com/files/specsheets/EF...,2
3,https://www.waclighting.com/storage/waclightin...,2
4,https://www.acuitybrands.com/api/products/geta...,2


In [7]:
def get_text(urls:list[str],filename: str,batch_size = 8):
    pdf_texts = []
    pdf_io_bytes = ""
    l = len(urls)
    batches = [urls[i:min(i+batch_size,l)] for i in range(0,l,batch_size)]
    for batch in tqdm(batches):
         # # write to new pdf
         for i,url in enumerate(batch):   
            try:
                response = requests.get(url, timeout=5.0)    
                pdf_io_bytes = BytesIO(response.content)
                pdf_text = PyPDF2.PdfReader(pdf_io_bytes).pages[0].extract_text()
                pdf_texts.append( url + pdf_text)
            except Exception as e:
                pdf_texts.append(url)
            if i%10 == 9:
                pd.DataFrame({'urls':urls[:len(pdf_texts)],'text':pdf_texts}).to_csv(f'{filename}.csv')
    return pdf_texts

In [8]:
train["data_text"] = get_text(train['datasheet_link'],"train")
test["data_text"] = get_text(test['datasheet_link'],"test")

100%|██████████| 322/322 [42:00<00:00,  7.83s/it]
100%|██████████| 50/50 [04:24<00:00,  5.28s/it]


In [9]:
train = train[train['data_text'] != "  "].reset_index()
print(train.shape)
train.head()

(2570, 4)


Unnamed: 0,index,datasheet_link,target_col,data_text
0,0,https://lfillumination.com/files/specsheets/EF...,2,https://lfillumination.com/files/specsheets/EF...
1,1,https://lfillumination.com/files/specsheets/EF...,2,https://lfillumination.com/files/specsheets/EF...
2,2,https://lfillumination.com/files/specsheets/EF...,2,https://lfillumination.com/files/specsheets/EF...
3,3,https://www.waclighting.com/storage/waclightin...,2,https://www.waclighting.com/storage/waclightin...
4,4,https://www.acuitybrands.com/api/products/geta...,2,https://www.acuitybrands.com/api/products/geta...


In [10]:
test[test['data_text'] != "  "].shape

(400, 3)

In [11]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Convert datasets to tokenized format

def tokenize_data(data):
    return tokenizer(data["data_text"], padding=True, truncation=True, return_tensors="pt")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [12]:
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

train_dataset = train_dataset.map(tokenize_data, batched=True)
val_dataset = test_dataset.map(tokenize_data, batched=True)


Map:   0%|          | 0/2570 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [13]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)

In [14]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)
optimizer = AdamW(model.parameters(), lr=2e-5)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0
    
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
        # Move data to the device
        input_ids = torch.stack(batch["input_ids"],dim = 1).to(device)
        attention_mask = torch.stack(batch["attention_mask"],dim = 1).to(device)
        labels = batch["target_col"].to(device)
        
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        
        # Backward pass
        loss.backward()
        optimizer.step()

        # Track loss and accuracy
        running_loss += loss.item()
        _, preds = torch.max(logits, dim=1)
        correct_preds += torch.sum(preds == labels)
        total_preds += labels.size(0)
    
    epoch_loss = running_loss / len(train_dataloader)
    epoch_accuracy = correct_preds.double() / total_preds
    print(f"Epoch {epoch+1} Loss: {epoch_loss:.4f} Accuracy: {epoch_accuracy:.4f}")


Epoch 1/3: 100%|██████████| 161/161 [01:12<00:00,  2.21it/s]


Epoch 1 Loss: 0.5262 Accuracy: 0.8191


Epoch 2/3: 100%|██████████| 161/161 [01:12<00:00,  2.22it/s]


Epoch 2 Loss: 0.0988 Accuracy: 0.9747


Epoch 3/3: 100%|██████████| 161/161 [01:12<00:00,  2.22it/s]

Epoch 3 Loss: 0.0455 Accuracy: 0.9891





In [16]:
model.eval()
correct_preds = 0
total_preds = 0

for batch in tqdm(val_dataloader, desc="Evaluating"):
    input_ids = torch.stack(batch["input_ids"],dim = 1).to(device)
    attention_mask = torch.stack(batch["attention_mask"],dim = 1).to(device)
    labels = batch["target_col"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    _, preds = torch.max(logits, dim=1)
    correct_preds += torch.sum(preds == labels)
    total_preds += labels.size(0)

val_accuracy = correct_preds.double() / total_preds
print(f"Validation Accuracy: {val_accuracy:.4f}")


Evaluating: 100%|██████████| 25/25 [00:02<00:00,  8.54it/s]


Validation Accuracy: 0.9225


In [17]:
model.save_pretrained("./distilbert-classification")
tokenizer.save_pretrained("./distilbert-classification")

('./distilbert-classification/tokenizer_config.json',
 './distilbert-classification/special_tokens_map.json',
 './distilbert-classification/vocab.txt',
 './distilbert-classification/added_tokens.json')

In [18]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the saved model and tokenizer
load_model = DistilBertForSequenceClassification.from_pretrained("/kaggle/working/distilbert-classification")
load_tokenizer = DistilBertTokenizer.from_pretrained("/kaggle/working/distilbert-classification")
