In [None]:
# Install necessary libraries
!pip install -q datasets transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/471.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import inspect
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    set_seed,
)

In [None]:
# Select the compute device: use the GPU when CUDA is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Using CUDA device" if use_cuda else "CUDA device not found, using CPU")

In [None]:
def _read_split(csv_path):
    """Read one CSV split and expose its ``target`` column as integer ``labels``."""
    frame = pd.read_csv(csv_path).reset_index(drop=True)
    frame = frame.rename(columns={"target": "labels"})
    frame['labels'] = frame['labels'].astype(int)
    return frame


# Both splits go through exactly the same preparation.
train_df = _read_split('train.csv')
test_df = _read_split('test.csv')

# Wrap the frames as Hugging Face datasets and group them under named splits.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
# Seed Python, NumPy and PyTorch *before* the model is built: the classification
# head is not part of the pretrained checkpoint, so it is randomly initialised
# when the model is loaded. Without a seed here every run starts from different
# head weights. 42 matches the default `seed` of TrainingArguments.
set_seed(42)

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
# Enable gradient checkpointing (recompute activations in the backward pass to save memory)
model.gradient_checkpointing_enable()
model.to(device)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``review`` field of a batch with the module-level tokenizer.

    Args:
        examples: Mapping-like batch that provides a ``"review"`` entry.

    Returns:
        The tokenizer's output for those reviews, truncated to at most
        128 tokens per sequence. No padding is requested here.
    """
    review_texts = examples["review"]
    return tokenizer(review_texts, truncation=True, max_length=128)

# Tokenize every split in batches, then drop the raw text column, which is no
# longer needed once the token ids exist.
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True).remove_columns('review')
# Have the datasets return PyTorch tensors.
tokenized_datasets.set_format("torch")
# Padding is applied per batch by the collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Compute metrics
def compute_metrics(eval_pred):
    """Turn raw model outputs into classification metrics.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use ``average="binary"``; a metric whose
        denominator is zero is reported as 0.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predicted = np.argmax(scores, axis=1)
    prf = precision_recall_fscore_support(
        gold, predicted, average="binary", zero_division=0
    )
    return {
        "accuracy": accuracy_score(gold, predicted),
        "precision": prf[0],
        "recall": prf[1],
        "f1": prf[2],
    }

In [None]:
# Define training arguments with CUDA support
# The per-epoch evaluation argument is named `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. The library is installed
# unpinned, so pick whichever name the installed TrainingArguments accepts.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint on the same schedule so the best checkpoint can be
    # restored at the end of training.
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # `no_cuda` is deliberately left unset so the Trainer uses CUDA when available.
    disable_tqdm=False,
    gradient_checkpointing=True,
    **{eval_strategy_kwarg: "epoch"},
)

# Initialize the Trainer
# Newer transformers releases take the tokenizer through `processing_class` and
# deprecate the `tokenizer` argument; older releases only know `tokenizer`.
# Use whichever name the installed Trainer accepts.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# NOTE(review): the "test" split is used for per-epoch evaluation, and the
# training arguments select the best checkpoint by its F1, so the final test
# metrics are optimistic. Consider a separate validation split for model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# Fine-tune the model, keeping the value returned by the training run, then
# write the resulting model to ./saved_model.
train_result = trainer.train()
trainer.save_model(output_dir="./saved_model")

In [None]:
# Run a final evaluation pass with the trainer.
metrics = trainer.evaluate()

# Report the evaluation metrics, then persist them and the trainer state.
trainer.log_metrics(split="eval", metrics=metrics)
trainer.save_metrics(split="eval", metrics=metrics)
trainer.save_state()

# Export everything the trainer recorded in its log history to CSV.
train_losses = trainer.state.log_history
loss_df = pd.DataFrame(data=train_losses)
loss_df.to_csv(path_or_buf="training_loss.csv", index=False)
print(f"Final evaluation metrics: {metrics}")

In [None]:
# Recursively archive the saved model directory into a single zip for download.
# NOTE(review): the model was saved to the relative path ./saved_model, so this
# absolute path assumes the working directory is /content — confirm outside Colab.
!zip -r /content/distilbert.zip /content/saved_model

  adding: content/saved_model/ (stored 0%)
  adding: content/saved_model/model.safetensors (deflated 8%)
  adding: content/saved_model/vocab.txt (deflated 53%)
  adding: content/saved_model/training_args.bin (deflated 51%)
  adding: content/saved_model/special_tokens_map.json (deflated 42%)
  adding: content/saved_model/tokenizer_config.json (deflated 75%)
  adding: content/saved_model/config.json (deflated 46%)


In [None]:
# Hand the zipped model to the Colab file-download helper.
# NOTE(review): `google.colab` is presumably only importable inside Colab, which
# would explain why this import lives here rather than in the top import cell.
from google.colab import files
files.download("/content/distilbert.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>