In [None]:
import pandas as pd

# Both splits are plain-text files with one record per line, laid out as
#   ID ::: TITLE ::: GENRE ::: DESCRIPTION
TRAIN_PATH = './data/Genre Classification Dataset/train_data.txt'
TEST_PATH = './data/Genre Classification Dataset/test_data_solution.txt'

In [None]:
# Read the data
COLUMNS = ['id', 'title', 'genre', 'description']
TEXT_COLUMNS = ['title', 'genre', 'description']


def _read_split(path):
    """Load one ':::'-delimited split into a DataFrame with trimmed text fields.

    The delimiter in the files is padded with spaces (' ::: '), so splitting on
    ':::' alone leaves leading/trailing blanks on every text field. They are
    stripped here so genre names (and the keys of ``genre_map``) are clean.
    """
    frame = pd.read_csv(path, sep=':::', names=COLUMNS, engine='python')
    for column in TEXT_COLUMNS:
        frame[column] = frame[column].str.strip()
    return frame


train_full = _read_split(TRAIN_PATH)
test = _read_split(TEST_PATH)

from sklearn.model_selection import train_test_split

# Encode the labels and save the mapping.
# A single factorize call is the source of truth for both the integer codes and
# the genre -> id mapping, so the two can never disagree (Series.unique() keeps
# NaN while factorize() skips it, which would shift the ids).
genre_codes, genre_names = pd.factorize(train_full['genre'])
genre_map = {genre: i for i, genre in enumerate(genre_names)}
train_full['genre'] = genre_codes.astype('int')

# The test labels must go through the same mapping; left as raw strings they
# cannot be turned into label tensors later on.
unknown_genres = test.loc[~test['genre'].isin(list(genre_map)), 'genre'].unique()
if len(unknown_genres) > 0:
    raise ValueError(f'Test split contains genres absent from train: {unknown_genres.tolist()}')
test['genre'] = test['genre'].map(genre_map).astype('int')

# Split the data into train and validation (stratified so every genre keeps
# its share in both parts).
train, val = train_test_split(
    train_full, test_size=0.2, random_state=42, stratify=train_full['genre']
)

In [None]:
# Load the pre-trained checkpoint via the transformers library and tokenize every split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

CHECKPOINT = 'bert-base-uncased'

# One output unit per distinct genre id present in the training split
num_labels = train['genre'].nunique()

# Tokenizer and classification model both come from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=num_labels)


def _encode_descriptions(frame):
    """Tokenize the 'description' column of ``frame`` with truncation and padding enabled."""
    texts = frame['description'].tolist()
    return tokenizer(texts, truncation=True, padding=True)


# Tokenize the data
train_encodings = _encode_descriptions(train)
val_encodings = _encode_descriptions(val)
test_encodings = _encode_descriptions(test)

# Create the dataset
import torch
import os

class GenreDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-sample sequence and
    ``labels`` is an indexable collection aligned with those samples.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the labels collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the target under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with its genre column as a GenreDataset
train_dataset, val_dataset, test_dataset = (
    GenreDataset(split_encodings, split_frame['genre'].tolist())
    for split_encodings, split_frame in (
        (train_encodings, train),
        (val_encodings, val),
        (test_encodings, test),
    )
)

# Train the model
def compute_metrics(eval_pred):
    """Return classification accuracy for one Trainer evaluation pass.

    ``eval_pred.predictions`` holds the model logits and ``eval_pred.label_ids``
    the reference label ids; the predicted class is the arg-max over the last
    axis. Without this hook ``trainer.evaluate()`` reports the loss only.
    """
    predicted = eval_pred.predictions.argmax(axis=-1)
    return {'accuracy': float((predicted == eval_pred.label_ids).mean())}


training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics      # adds accuracy to the evaluation output
)

trainer.train()


In [None]:
# Evaluate the model on the trainer's configured evaluation dataset and show the metrics
eval_metrics = trainer.evaluate()
eval_metrics