In [1]:
pip install transformers datasets streamlit matplotlib


Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
print(torch.cuda.is_available())  # Should return True


True


In [3]:
import pandas as pd

data = pd.read_csv('train.csv')  # Ensure 'train.csv' is in your working directory


In [4]:
from transformers import DistilBertTokenizer
from sklearn.preprocessing import LabelEncoder

# Initialize tokenizer and encoders
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
category_encoder = LabelEncoder()
sub_category_encoder = LabelEncoder()

# Fill missing values in the 'information', 'category', and 'sub_category' columns
data['information'].fillna("Unknown information", inplace=True)
data['category'].fillna("Unknown category", inplace=True)
data['sub_category'].fillna("Unknown sub-category", inplace=True)

# Encode category and sub_category columns, even if there are "Unknown" values
data['category_label'] = category_encoder.fit_transform(data['category'])
data['sub_category_label'] = sub_category_encoder.fit_transform(data['sub_category'])

# Tokenize information column, ensuring 'Unknown information' is processed
def tokenize_text(text):
    return tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")

data['inputs'] = data['information'].apply(tokenize_text)



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from torch.utils.data import Dataset

class TextClassificationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

# Split dataset for category and sub-category tasks
category_dataset = TextClassificationDataset(
    dict(input_ids=[x['input_ids'].squeeze() for x in data['inputs']],
         attention_mask=[x['attention_mask'].squeeze() for x in data['inputs']]),
    data['category_label'].tolist()
)

sub_category_dataset = TextClassificationDataset(
    dict(input_ids=[x['input_ids'].squeeze() for x in data['inputs']],
         attention_mask=[x['attention_mask'].squeeze() for x in data['inputs']]),
    data['sub_category_label'].tolist()
)


In [6]:
pip install transformers


Note: you may need to restart the kernel to use updated packages.


In [7]:
from transformers import AutoModelForSequenceClassification, TrainingArguments


In [14]:
# Ensure necessary imports
import logging
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
import torch
import pandas as pd
from torch.utils.data import Dataset

# Suppress specific warnings from transformers
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# Load and preprocess training data
train_data = pd.read_csv('train.csv')  # Ensure 'train.csv' is in your working directory

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Fill missing values in columns
train_data['information'].fillna("Unknown information", inplace=True)
train_data['category'].fillna("Unknown category", inplace=True)

# Encode labels for train data
train_data['category_label'] = train_data['category'].astype('category').cat.codes

# Tokenize data
def tokenize_data(text_series):
    return tokenizer(list(text_series), padding=True, truncation=True, max_length=128, return_tensors="pt")

train_encodings = tokenize_data(train_data['information'])

# Convert labels to tensors
train_labels = torch.tensor(train_data['category_label'].values)

# Define custom dataset
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Create dataset
train_dataset = TextDataset(train_encodings, train_labels)

# Define model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(train_data['category_label'].unique())
)

# Check if GPU is available and move the model to CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up training arguments (without evaluation)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # Disable evaluation
    save_strategy="epoch",    # Save after each epoch
    logging_strategy="epoch", # Log after each epoch
    per_device_train_batch_size=8,
    num_train_epochs=5,       # Reduced to 5 epochs
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",         # Disable any logging reports to external systems
)

# Initialize Trainer with only the train dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Save the final model and tokenizer
model.save_pretrained("./results/final_model")
tokenizer.save_pretrained("./results/final_model")

# Save the model weights separately
torch.save(model.state_dict(), "./results/final_model_weights.pth")


  0%|          | 1044/1756650 [09:22<262:50:48,  1.86it/s]
  1%|          | 3175/292775 [07:49<11:53:35,  6.76it/s]
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
 20%|██        | 11711/58555 [21:28<1:58:48,  6.57it/s]

{'loss': 0.7392, 'grad_norm': 1.6270290613174438, 'learning_rate': 4e-05, 'epoch': 1.0}


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
 40%|████      | 23422/58555 [45:43<58:43,  9.97it/s]   

{'loss': 0.6537, 'grad_norm': 5.032623291015625, 'learning_rate': 3e-05, 'epoch': 2.0}


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
 60%|██████    | 35133/58555 [1:05:28<36:56, 10.57it/s]  

{'loss': 0.5809, 'grad_norm': 1.1534992456436157, 'learning_rate': 2e-05, 'epoch': 3.0}


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
 80%|████████  | 46844/58555 [1:24:39<18:42, 10.43it/s]  

{'loss': 0.4961, 'grad_norm': 4.197898864746094, 'learning_rate': 1e-05, 'epoch': 4.0}


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
100%|██████████| 58555/58555 [1:43:57<00:00, 10.44it/s]

{'loss': 0.4174, 'grad_norm': 17.624574661254883, 'learning_rate': 0.0, 'epoch': 5.0}


100%|██████████| 58555/58555 [1:43:59<00:00,  9.39it/s]


{'train_runtime': 6238.9944, 'train_samples_per_second': 75.081, 'train_steps_per_second': 9.385, 'train_loss': 0.5774774217349073, 'epoch': 5.0}
