In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data: read the labelled comments CSV from the working directory.
comments_csv = "youtube-comments-sentiment.csv"
df = pd.read_csv(comments_csv)

In [3]:
# Normalize sentiment to lowercase so the label_map lookup below is case-insensitive.
df['Sentiment'] = df['Sentiment'].str.lower()
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
# Keep only rows whose sentiment is a known key. The explicit .copy() makes the
# filtered frame independent of the original, so adding the 'label' column
# below does not write onto a slice (pandas' SettingWithCopyWarning case).
df = df[df['Sentiment'].isin(label_map.keys())].copy()
df['label'] = df['Sentiment'].map(label_map)

In [4]:
# Only keep positive and negative sentiments.
# .copy() detaches the filtered rows from the original frame so the 'label'
# assignment below does not write onto a slice (pandas' SettingWithCopyWarning case).
df = df[df['Sentiment'].isin(['positive', 'negative'])].copy()

# Map to numeric labels: negative=0, positive=1
label_map = {'negative': 0, 'positive': 1}
df['label'] = df['Sentiment'].map(label_map)

In [5]:
# # Map sentiment to numeric labels
# label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
# df = df[df['Sentiment'].isin(label_map.keys())]
# df['label'] = df['Sentiment'].map(label_map)

In [22]:
# Use only 50% of the data: draw a seeded random half, then renumber the rows from 0.
# Note: this rebinds `df`, so re-running the cell halves the data again.
half_of_rows = df.sample(frac=0.5, random_state=42)
df = half_of_rows.reset_index(drop=True)

In [23]:
# Split data: 90% train / 10% test, stratified on the numeric label, with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df['label'],
)

In [24]:
# Convert to HuggingFace Dataset, keeping only the text and the numeric label.
# preserve_index=False: the split frames carry a shuffled (non-Range) index,
# which from_pandas would otherwise store as an extra '__index_level_0__' column.
dataset_columns = ['CommentText', 'label']
train_dataset = Dataset.from_pandas(train_df[dataset_columns], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[dataset_columns], preserve_index=False)

In [25]:
# Tokenizer: loaded from the same checkpoint name that the model cell uses.
tokenizer_checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_checkpoint)

def tokenize(batch):
    """Tokenize the 'CommentText' entries of a batch with the notebook-level tokenizer.

    Every text is truncated and padded to exactly 128 tokens, so all encoded
    rows have the same length.

    Args:
        batch: mapping with a 'CommentText' key holding the texts to encode.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts.
    """
    comment_texts = batch['CommentText']
    return tokenizer(
        comment_texts,
        padding='max_length',
        truncation=True,
        max_length=128,
    )

# Run the tokenizer over both splits in batched mode and rebind the results to the same names.
train_dataset = train_dataset.map(function=tokenize, batched=True)
test_dataset = test_dataset.map(function=tokenize, batched=True)

Map: 100%|██████████| 310226/310226 [00:19<00:00, 15900.26 examples/s]
Map: 100%|██████████| 34470/34470 [00:02<00:00, 15646.23 examples/s]


In [26]:
# Sanity check: per-class row counts, first by text sentiment and then by numeric label.
for column_name in ('Sentiment', 'label'):
    print(df[column_name].value_counts())

Sentiment
negative    173268
positive    171428
Name: count, dtype: int64
label
0    173268
1    171428
Name: count, dtype: int64


In [27]:
# # Tokenizer
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# def tokenize(batch):
#     return tokenizer(batch['CommentText'], padding='max_length', truncation=True, max_length=128)

# train_dataset = train_dataset.map(tokenize, batched=True)
# test_dataset = test_dataset.map(tokenize, batched=True)

In [28]:
# Set format for PyTorch: expose only the listed columns, in 'torch' format, on both splits.
torch_columns = ['input_ids', 'attention_mask', 'label']
for split_dataset in (train_dataset, test_dataset):
    # Pass a fresh list per call so the two datasets never share one list object.
    split_dataset.set_format('torch', columns=list(torch_columns))

In [33]:
# Model: sequence classifier with two output labels, loaded from a pretrained checkpoint.
model_checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'  # or 'distilbert-base-uncased'
model = DistilBertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

In [34]:
# # Model
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

In [35]:
# Record the installed transformers version alongside the results.
import transformers
transformers_version = transformers.__version__
print(transformers_version)

4.52.4


In [36]:
# Record the installed torch build alongside the results.
import torch
torch_version = torch.__version__
print(torch_version)

2.7.0+cpu


In [None]:
# Training arguments
# Evaluation and checkpointing both happen once per epoch. load_best_model_at_end
# needs the two strategies to match, and it is what makes metric_for_best_model
# take effect; without an eval strategy the eval_dataset and compute_metrics
# given to the Trainer are never used during training.
# `eval_strategy` is the current name of the deprecated `evaluation_strategy` argument.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # lower is better for a loss metric
    logging_dir='./logs',
    logging_steps=50,
)

In [43]:
# Compute metrics
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and binary F1.

    Args:
        eval_pred: a pair that unpacks into (logits, labels). The predicted
            class for each row is the argmax of `logits` over its last axis.

    Returns:
        dict with keys 'accuracy' and 'f1'.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    accuracy = accuracy_score(labels, predicted_classes)
    f1 = f1_score(labels, predicted_classes, average='binary')
    return {'accuracy': accuracy, 'f1': f1}

In [44]:
# # Compute metrics
# from sklearn.metrics import accuracy_score, f1_score

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     preds = logits.argmax(axis=-1)
#     return {
#         'accuracy': accuracy_score(labels, preds),
#         'f1': f1_score(labels, preds, average='weighted')
#     }

In [45]:
# Trainer: bundle the model, its training arguments, both splits and the metric function.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

In [46]:
# Train
# Alternative kept for reference; presumably continues from the latest checkpoint
# under the Trainer's output directory instead of starting over -- confirm before use.
# trainer.train(resume_from_checkpoint=True)
# Left as a bare call (not assigned) so the notebook displays the returned value as the cell output.
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Save model and tokenizer side by side in one directory.
export_dir = 'distilbert-finetuned-youtube'
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)