In [None]:
pip install accelerate -U

In [None]:
pip install transformers -U

In [None]:
import inspect

import evaluate
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, TrainingArguments, Trainer
from transformers import get_scheduler

In [7]:
import pandas

In [8]:
# Relative path to the labelled CSV; later cells read its 'PayeeNote' and 'Label' columns.
file_path = './data/data1.csv'

# Load the full file into a DataFrame.
data = pd.read_csv(file_path)

In [None]:
# Preview the first rows to check that the file loaded as expected.
data.head()

In [5]:
# Category names listed in id order: a name's position in this list is the
# integer class id assigned to it (0 through 8).
label_mapping = {
    category: class_id
    for class_id, category in enumerate([
        'inventory, supplies and equipment',
        'professional services',
        'transportation and travel',
        'utilities',
        'employee benefits and compensation',
        'meals and entertainment',
        'tax payments',
        'legal and compliance fees',
        'business development and investment',
    ])
}

In [6]:
# Encode the string categories in 'Label' as integer class ids.
# - Values that are already valid ids are kept, so re-running this cell is
#   harmless (a plain Series.map(label_mapping) on an already-encoded column
#   would turn every value into NaN).
# - Anything else (typo, different casing, missing value) raises right here
#   instead of silently becoming NaN and failing later during training.
_valid_label_ids = set(label_mapping.values())


def _encode_label(value):
    """Return the integer class id for one raw 'Label' cell, or raise ValueError."""
    if value in label_mapping:
        return label_mapping[value]
    if value in _valid_label_ids:
        return int(value)
    raise ValueError(f"Unknown label in 'Label' column: {value!r}")


data['Label'] = data['Label'].map(_encode_label).astype('int64')

In [7]:
# Pull the model inputs ('PayeeNote') and the targets ('Label') out of the
# DataFrame as plain Python lists, row-aligned with each other.
text, labels = (data[column].tolist() for column in ('PayeeNote', 'Label'))

In [8]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    text,
    labels,
    test_size=0.2,
    random_state=42,
)

# Sanity check: print the size of each part, one per line
# (train texts, val texts, train labels, val labels).
print(
    *(len(part) for part in (train_texts, val_texts, train_labels, val_labels)),
    sep="\n",
)

1527
382
1527
382


In [21]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

# Size the classification head from label_mapping rather than a hard-coded 9,
# so the head cannot drift out of sync with the label table. The id <-> name
# tables are stored in the model config so predictions carry readable names.
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(label_mapping),
    id2label={class_id: name for name, class_id in label_mapping.items()},
    label2id=dict(label_mapping),
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to that device; the returned model is this cell's displayed output.
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [23]:
# custom dataset
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one text on access and pairs it with its class id.

    Args:
        text: Indexable collection of raw texts; each item is passed through ``str()``.
        labels: Indexable collection of integer class ids, aligned with ``text``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=max_len, return_tensors='pt')``; its result must provide
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every example is truncated or padded to.
    """

    def __init__(self, text, labels, tokenizer, max_len=20):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for example ``idx``.

        Raises:
            ValueError: If the stored label is NaN or not a whole number.
        """
        text = str(self.text[idx])

        # Labels coming out of pandas can be floats (e.g. 3.0 once a column has
        # held NaN). A float label tensor is not treated as a class index by
        # the classification loss, so force an integer tensor here. int() also
        # rejects NaN with a ValueError instead of passing it on.
        raw_label = self.labels[idx]
        label_id = int(raw_label)
        if label_id != raw_label:
            raise ValueError(f"Label at index {idx} is not an integer class id: {raw_label!r}")
        labels = torch.tensor(label_id, dtype=torch.long)

        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')

        return {
            # The tokenizer output is flattened so each example is a 1-D tensor.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': labels
        }

In [24]:
# Wrap each split in a CustomDataset (default max_len). The validation split is
# stored under the name test_dataset.
train_dataset = CustomDataset(text=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = CustomDataset(text=val_texts, labels=val_labels, tokenizer=tokenizer)

In [25]:
# Load the accuracy metric from the `evaluate` library; compute_metrics below uses it.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Pair ``(logits, labels)``. The index of the largest logit
            along the last axis is taken as the predicted class.

    Returns:
        Whatever ``metric.compute`` returns for those predictions and labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # The keyword is `references` (plural). The previous `reference=` left the
    # ground truth unset, so the metric could not be computed.
    return metric.compute(predictions=predictions, references=labels)

In [26]:
# training arguments
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases, and this notebook installs transformers unpinned. Use whichever
# keyword the installed TrainingArguments accepts so the cell runs on both.
_eval_strategy_kw = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Checkpoints and logs go to ./training_arguments; evaluate once per epoch.
training_args = TrainingArguments(
    output_dir="training_arguments",
    **{_eval_strategy_kw: "epoch"},
)

In [27]:
# Assemble the Hugging Face Trainer from the model, arguments and datasets built above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # Despite its name, test_dataset is built from the validation split.
    eval_dataset=test_dataset,
    # Turns each evaluation pass's (logits, labels) into the accuracy metric.
    compute_metrics=compute_metrics,
)

In [None]:
# train the model
train_output = trainer.train()

# Persist the fine-tuned weights and the tokenizer files to the directory the
# notebook's closing note points readers to; nothing else writes them there.
trainer.save_model("fine-tuned-model")
tokenizer.save_pretrained("fine-tuned-model")

# Keep the training summary as the cell's displayed output.
train_output

The end. The fine-tuned model and tokenizer files are in the `fine-tuned-model` directory.