#### Import Necessary Libraries

In [40]:
from pathlib import Path

import evaluate
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments

#### Load and Preprocess the Data

In [41]:
# Directory holding the cleaned, gzip-compressed CSV splits. It is defined once
# so the notebook can be pointed at another copy of the data by editing one line.
DATA_DIR = Path('E:\\Vocational\\Lighthouse Labs\\Flex Course\\Projects\\P05_Large Language Models\\llm_project\\data')

# Share of each split kept for fine-tuning, and the seed that makes the draw repeatable.
SUBSET_FRACTION = 0.01
SEED = 42

# Load cleaned data
train_df = pd.read_csv(DATA_DIR / 'cleaned_train.csv.gz', compression='gzip')
test_df = pd.read_csv(DATA_DIR / 'cleaned_test.csv.gz', compression='gzip')

# Select a subset of the data (1% of each split): train_test_split keeps
# SUBSET_FRACTION of the rows in its first return value; the remainder is discarded.
train_subset, _ = train_test_split(train_df, test_size=1 - SUBSET_FRACTION, random_state=SEED)
test_subset, _ = train_test_split(test_df, test_size=1 - SUBSET_FRACTION, random_state=SEED)

# Convert pandas DataFrame to Hugging Face Dataset.
# preserve_index=False keeps the (now non-contiguous) pandas index from being
# stored as an extra `__index_level_0__` column in the dataset.
train_dataset = Dataset.from_pandas(train_subset, preserve_index=False)
test_dataset = Dataset.from_pandas(test_subset, preserve_index=False)

#### Tokenize the Data

In [42]:
# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples):
    """Tokenize the ``clean_text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens.

    NOTE(review): fixed-length padding makes every batch 512 tokens wide
    regardless of the actual text length; dynamic padding would likely be
    much faster, but it changes the tensor shapes, so it is left as-is here.
    """
    texts = examples["clean_text"]
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512,
    )


# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the label, as PyTorch tensors.
torch_columns = ['input_ids', 'attention_mask', 'label']
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=torch_columns)



Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

#### Initialize the Model and Metrics

In [43]:
# Pretrained DistilBERT encoder with a 2-class classification head
# (the head weights are newly initialized, as the load-time message reports).
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Accuracy metric object used by compute_metrics.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with the class axis at dim 1; ``labels`` holds the
            reference class ids. NumPy arrays and torch tensors are both
            accepted.

    Returns:
        The result of ``accuracy_metric.compute`` for the arg-max class of
        each row against ``labels``.
    """
    predictions, labels = p
    # torch.as_tensor handles NumPy arrays and tensors alike, so this function
    # no longer depends on a NumPy alias (`np`) that the notebook never imports.
    predictions = torch.as_tensor(predictions)
    labels = torch.as_tensor(labels)
    # Highest-scoring class per example.
    predicted_classes = torch.argmax(predictions, dim=1)
    return accuracy_metric.compute(predictions=predicted_classes, references=labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Define Training Arguments and Trainer

In [44]:
# Fine-tuning configuration. Evaluation and checkpointing both run once per epoch.
training_args = TrainingArguments(
    # Output locations
    output_dir='my_model',
    logging_dir='./logs',
    push_to_hub=False,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Reporting cadence
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Trainer wiring: model + arguments, the two tokenized splits, the tokenizer,
# and the accuracy callback.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


#### Train the Model

In [45]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
train_output = trainer.train()
train_output

  0%|          | 0/48 [00:00<?, ?it/s]

{'loss': 0.6978, 'learning_rate': 1.5833333333333333e-05, 'epoch': 0.62}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.6831721663475037, 'eval_accuracy': 0.6451612903225806, 'eval_runtime': 248.2705, 'eval_samples_per_second': 0.999, 'eval_steps_per_second': 0.064, 'epoch': 1.0}
{'loss': 0.6878, 'learning_rate': 1.1666666666666668e-05, 'epoch': 1.25}
{'loss': 0.6747, 'learning_rate': 7.500000000000001e-06, 'epoch': 1.88}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.6659445762634277, 'eval_accuracy': 0.6088709677419355, 'eval_runtime': 1372.8028, 'eval_samples_per_second': 0.181, 'eval_steps_per_second': 0.012, 'epoch': 2.0}
{'loss': 0.6489, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.5}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.6499881148338318, 'eval_accuracy': 0.7540322580645161, 'eval_runtime': 577.4453, 'eval_samples_per_second': 0.429, 'eval_steps_per_second': 0.028, 'epoch': 3.0}
{'train_runtime': 5304.2853, 'train_samples_per_second': 0.141, 'train_steps_per_second': 0.009, 'train_loss': 0.6700204710165659, 'epoch': 3.0}


TrainOutput(global_step=48, training_loss=0.6700204710165659, metrics={'train_runtime': 5304.2853, 'train_samples_per_second': 0.141, 'train_steps_per_second': 0.009, 'train_loss': 0.6700204710165659, 'epoch': 3.0})

#### Save the Model

In [46]:
# Write the fine-tuned weights and the tokenizer files to the same directory
# so the pair can be reloaded together with from_pretrained.
save_dir = "my_model"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

('my_model\\tokenizer_config.json',
 'my_model\\special_tokens_map.json',
 'my_model\\vocab.txt',
 'my_model\\added_tokens.json',
 'my_model\\tokenizer.json')

#### Evaluate the Model

In [47]:
# Reload the model and tokenizer from the directory they were saved to.
saved_dir = 'my_model'
model = DistilBertForSequenceClassification.from_pretrained(saved_dir)
tokenizer = DistilBertTokenizerFast.from_pretrained(saved_dir)

# Build an evaluation-only Trainer around the reloaded model
# (no train_dataset is supplied).
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Score the reloaded model on the test subset and print the metrics dict.
results = trainer.evaluate()
print(results)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.6499881148338318, 'eval_accuracy': 0.7540322580645161, 'eval_runtime': 587.2083, 'eval_samples_per_second': 0.422, 'eval_steps_per_second': 0.027}
