In [1]:
pip install transformers pandas


Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
print(torch.cuda.is_available())  # Should return True

True


In [4]:
import logging
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
import torch
import pandas as pd
from torch.utils.data import Dataset


  from .autonotebook import tqdm as notebook_tqdm





In [10]:
# Suppress specific warnings from transformers
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# Load and preprocess training data
train_data = pd.read_csv('train.csv')  # Ensure 'train.csv' is in your working directory

# Fill missing values in columns
train_data['information'].fillna("Unknown information", inplace=True)
train_data['sub_category'].fillna("Unknown sub_category", inplace=True)

# Encode labels for train data
train_data['sub_category_label'] = train_data['sub_category'].astype('category').cat.codes


In [11]:
train_labels = torch.tensor(train_data['sub_category_label'].values)


In [12]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(train_data['sub_category_label'].unique())
)


In [13]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [14]:
def tokenize_data(text_series):
    return tokenizer(list(text_series), padding=True, truncation=True, max_length=128, return_tensors="pt")

# Tokenize training data
train_encodings = tokenize_data(train_data['information'])


In [15]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Create dataset
train_dataset = TextDataset(train_encodings, train_labels)


In [17]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(train_data['sub_category_label'].unique())
)


In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)


cuda


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [22]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # Disable evaluation
    save_strategy="epoch",    # Save after each epoch
    logging_strategy="epoch", # Log after each epoch
    per_device_train_batch_size=8,
    num_train_epochs=5,       # Set number of epochs
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",         # Disable logging to external systems
)




In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)


In [24]:
trainer.train()


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
 20%|██        | 11711/58555 [18:58<1:15:15, 10.37it/s]

{'loss': 1.474, 'grad_norm': 6.256118297576904, 'learning_rate': 4e-05, 'epoch': 1.0}


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
 40%|████      | 23422/58555 [38:05<55:20, 10.58it/s]  

{'loss': 1.2936, 'grad_norm': 8.041215896606445, 'learning_rate': 3e-05, 'epoch': 2.0}


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
 60%|██████    | 35133/58555 [57:18<41:53,  9.32it/s]  

{'loss': 1.1565, 'grad_norm': 6.1970744132995605, 'learning_rate': 2e-05, 'epoch': 3.0}


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
 80%|████████  | 46844/58555 [1:16:33<18:44, 10.41it/s]

{'loss': 0.9776, 'grad_norm': 13.413317680358887, 'learning_rate': 1e-05, 'epoch': 4.0}


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
100%|██████████| 58555/58555 [1:35:45<00:00, 10.39it/s]

{'loss': 0.8006, 'grad_norm': 20.735576629638672, 'learning_rate': 0.0, 'epoch': 5.0}


100%|██████████| 58555/58555 [1:35:46<00:00, 10.19it/s]

{'train_runtime': 5746.7397, 'train_samples_per_second': 81.512, 'train_steps_per_second': 10.189, 'train_loss': 1.140459106956067, 'epoch': 5.0}





TrainOutput(global_step=58555, training_loss=1.140459106956067, metrics={'train_runtime': 5746.7397, 'train_samples_per_second': 81.512, 'train_steps_per_second': 10.189, 'total_flos': 1.552233202246656e+16, 'train_loss': 1.140459106956067, 'epoch': 5.0})

In [26]:
# Save the final model and tokenizer
model.save_pretrained("./results/sc_final_model")
tokenizer.save_pretrained("./results/sc_final_model")

# Save the model weights separately
torch.save(model.state_dict(), "./results/sc_final_model_weights.pth")


