In [5]:
! pip install transformers torch sklearn pandas
! pip install pdfplumber  

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag




[notice] A new release of pip available: 22.2.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import pdfplumber
import os

def pdf_to_text(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

def load_dataset(root_dir):
    texts = []
    labels = []
    label_map = {}
    for label_dir in os.listdir(root_dir):
        class_path = os.path.join(root_dir, label_dir)
        if os.path.isdir(class_path):
            label_index = len(label_map)
            label_map[label_dir] = label_index
            for filename in os.listdir(class_path):
                if filename.lower().endswith('.pdf'):
                    file_path = os.path.join(class_path, filename)
                    text = pdf_to_text(file_path)
                    if text:  
                        texts.append(text)
                        labels.append(label_index)
    return texts, labels, label_map

texts, labels, label_map = load_dataset('./data')


In [7]:
from sklearn.model_selection import train_test_split

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)


In [8]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")

train_encodings = encode_texts(train_texts)
test_encodings = encode_texts(test_texts)


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
from torch.utils.data import Dataset, DataLoader
import torch

class ResumeDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ResumeDataset(train_encodings, train_labels)
test_dataset = ResumeDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [10]:
from transformers import BertForSequenceClassification, AdamW

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

def train(model, data_loader):
    model.train()
    total_loss = 0
    for batch in data_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    return total_loss / len(data_loader)

for epoch in range(3):
    loss = train(model, train_loader)
    print(f'Epoch {epoch+1}, Loss: {loss}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1, Loss: 2.0453323869101974
Epoch 2, Loss: 0.8170448834278498
Epoch 3, Loss: 0.5861756590355831


In [11]:
from sklearn.metrics import accuracy_score

def evaluate(model, data_loader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=-1).tolist())
            true_labels.extend(batch['labels'].tolist())
    return accuracy_score(true_labels, predictions)

accuracy = evaluate(model, test_loader)
print(f'Test Accuracy: {accuracy}')


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.7907444668008048


In [20]:
import pdfplumber
from transformers import BertTokenizer, BertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def pdf_to_text(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() if page.extract_text() else ""
    return text

def prepare_input(text):
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    return encoded_input

def predict(model, encoded_input):
    model.eval() 
    with torch.no_grad():
        outputs = model(**encoded_input.to(device))
        pred = torch.argmax(outputs.logits, dim=1).item()
    return pred

text = """
We are looking for an Entry Level Tax Accountant to join our team in Metro Atlanta! We are looking for someone who is self-motivated and able to work in a fast-paced environment. All qualified individuals are encouraged to apply

RESPONSIBILITIES -
• Prepare Business, Real Estate Partnership
• Prepare multi-State Income Tax Preparation
• Prepare sales tax return preparation
• Conduct tax research and form meaningful conclusions for specific projects
• Respond to tax notices

SKILLS -
• Bachelor's Degree Required, in Accounting/Finance or related degree
• 6 Months + of related experience (internships will be considered)
• Strong interpersonal skills. Candidate must have the ability to work in teams and help promote a team environment as well as professionally manage client relationships
• Ability to use Microsoft Offices, Outlook, and various accounting applications
"""
encoded_input = prepare_input(text)
profession_index = predict(model, encoded_input)
profession_name = {v: k for k, v in label_map.items()}[profession_index]  

print("Predicted Profession:", profession_name)


Predicted Profession: BANKING
