<a href="https://colab.research.google.com/github/prajapatkavitha/4213-Project4/blob/main/Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install transformers datasets accelerate -U

import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score



In [9]:
data = {
    "text": [
        "The software update completely crashed my system, losing all my data.",
        "Installation was seamless, and the device performs exactly as advertised.",
        "Waited over an hour for the technician only for them to cancel the appointment.",
        "The mobile app is intuitive and fast, making it a joy to use.",
        "I found the documentation incomplete and utterly useless.",
        "Five stars! The battery life on this laptop is incredible.",
        "This warranty claim process is unnecessarily complicated and frustrating.",
        "My issue was resolved immediately by a very professional support agent.",
        "The packaging was flimsy, and the item arrived dented.",
        "An outstanding value for the price, definitely purchasing again."
    ] * 10,
    "label": [1, 0, 1, 0, 0, 1, 0, 1, 0, 1] * 10
}
df = pd.DataFrame(data).sample(frac=1).reset_index(drop=True)
X_train, X_temp, y_train, y_temp = train_test_split(df['text'], df['label'], test_size=0.3, random_state=42, stratify=df['label'])
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
train_dataset = Dataset.from_dict({'text': X_train.tolist(), 'label': y_train.tolist()})
val_dataset = Dataset.from_dict({'text': X_val.tolist(), 'label': y_val.tolist()})
test_dataset = Dataset.from_dict({'text': X_test.tolist(), 'label': y_test.tolist()})
print(f"Train samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")

Train samples: 70, Validation samples: 15


In [10]:
MODEL_CHECKPOINT = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_CHECKPOINT)
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_train.set_format("torch", columns=['input_ids', 'attention_mask', 'label'])
tokenized_val.set_format("torch", columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='binary')
    return {"accuracy": acc, "f1": f1}
model = DistilBertForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)
output_dir = './results'
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
print("\nEvaluating on Test Set...")
tokenized_test_raw = test_dataset.map(tokenize_function, batched=True)
tokenized_test_raw.set_format("torch", columns=['input_ids', 'attention_mask', 'label'])
test_results = trainer.evaluate(tokenized_test_raw)
print("Test Results:", test_results)
final_model_path = trainer.model.save_pretrained('./final_sentiment_model')
final_tokenizer_path = tokenizer.save_pretrained('./final_sentiment_model')
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model='./final_sentiment_model',
    tokenizer='./final_sentiment_model',
    device=0
)
test_sentences = [
    "The customer support was excellent and solved my problem instantly.",
    "I've never been more frustrated with a piece of equipment.",
    "The installation guide was decent, though a little unclear."
]
print("\nPipeline Inference Test:")
predictions = sentiment_pipeline(test_sentences)
for sentence, pred in zip(test_sentences, predictions):
    sentiment = "Positive" if pred['label'] == 'LABEL_1' else "Negative"
    print(f"Text: '{sentence}' -> {sentiment} (Score: {pred['score']:.4f})")


Evaluating on Test Set...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Test Results: {'eval_loss': 0.6913252472877502, 'eval_model_preparation_time': 0.0024, 'eval_accuracy': 0.5333333333333333, 'eval_f1': 0.0, 'eval_runtime': 0.0962, 'eval_samples_per_second': 155.974, 'eval_steps_per_second': 20.797}


Device set to use cuda:0



Pipeline Inference Test:
Text: 'The customer support was excellent and solved my problem instantly.' -> Negative (Score: 0.5568)
Text: 'I've never been more frustrated with a piece of equipment.' -> Negative (Score: 0.5709)
Text: 'The installation guide was decent, though a little unclear.' -> Negative (Score: 0.5762)
