In [23]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


Using device: cuda


In [24]:
import pandas as pd
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AdamW, get_scheduler
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorWithPadding
from torch.cuda.amp import autocast, GradScaler
from tqdm.auto import tqdm


In [25]:
import pandas as pd
import numpy as np

df = pd.read_csv("train.csv")

# Replace empty strings in sub_category with 'NULL'
df['sub_category'] = df['sub_category'].replace('', 'NULL')


In [26]:
model_name = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Adjust `num_labels` to match categories
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [27]:
from sklearn.preprocessing import LabelEncoder

# Encode the category column
category_encoder = LabelEncoder()
df['category_label'] = category_encoder.fit_transform(df['category'])

# Encode the sub_category column
sub_category_encoder = LabelEncoder()
df['sub_category_label'] = sub_category_encoder.fit_transform(df['sub_category'])

# If using just one label for training, select one (e.g., `category_label` or `sub_category_label`)
# Here, we'll use `category_label` for this example
df = df[['crimeaditionalinfo', 'category_label']]


In [28]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

model_name = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=len(df['category_label'].unique()))
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [29]:
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorWithPadding

class CrimeDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        inputs = self.tokenizer(
            item['crimeaditionalinfo'],
            truncation=True,
            max_length=128,
            padding="max_length",
            return_tensors="pt"
        )
        label = torch.tensor(item['category_label'])
        inputs["labels"] = label
        return {key: val.squeeze(0) for key, val in inputs.items()}


In [30]:
train_dataset = CrimeDataset(df, tokenizer)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=data_collator)


In [31]:
from transformers import AdamW, get_scheduler
from torch.cuda.amp import autocast, GradScaler
from tqdm.auto import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 150  # Increase if needed
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
scaler = GradScaler()
progress_bar = tqdm(range(num_training_steps))


  scaler = GradScaler()
  0%|          | 250/58560 [07:42<29:57:11,  1.85s/it]


In [34]:
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with autocast():  # Mixed precision for faster computation
            outputs = model(**batch)
            loss = outputs.loss
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        lr_scheduler.step()
        progress_bar.update(1)
        
        # Optionally print loss every few steps
        if progress_bar.n % 100 == 0:
            print(f"Step {progress_bar.n}, Loss: {loss.item()}")


  with autocast():  # Mixed precision for faster computation


Step 100, Loss: 0.4648323059082031




Step 200, Loss: 0.9719181060791016




ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

