In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf

In [2]:
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))
print("GPU Devices:", tf.config.list_physical_devices('GPU'))

Num GPUs Available: 1
GPU Devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# Load data
df = pd.read_csv("youtube-comments-sentiment.csv")
df.head()

Unnamed: 0,CommentID,VideoID,VideoTitle,AuthorName,AuthorChannelID,CommentText,Sentiment,Likes,Replies,PublishedAt,CountryCode,CategoryID
0,UgyRjrEdJIPrf68uND14AaABAg,mcY4M9gjtsI,They killed my friend.#tales #movie #shorts,@OneWhoWandered,UC_-UEXaBL1dqqUPGkDll49A,Anyone know what movie this is?,Neutral,0,2,2025-01-15 00:54:55,NZ,1
1,UgxXxEIySAwnMNw8D7N4AaABAg,2vuXcw9SZbA,Man Utd conceding first penalty at home in yea...,@chiefvon3068,UCZ1LcZESjYqzaQRhjdZJFwg,The fact they're holding each other back while...,Positive,0,0,2025-01-13 23:51:46,AU,17
2,UgxB0jh2Ur41mcXr5IB4AaABAg,papg2tsoFzg,Welcome to Javascript Course,@Abdulla-ip8qr,UCWBK35w5Swy1iF5xIbEyw3A,waiting next video will be?,Neutral,1,0,2020-07-06 13:18:16,IN,27
3,UgwMOh95MfK0GuXLLrF4AaABAg,31KTdfRH6nY,Building web applications in Java with Spring ...,@finnianthehuman,UCwQ2Z03nOcMxWozBb_Cv66w,Thanks for the great video.\n\nI don't underst...,Neutral,0,1,2024-09-18 12:04:12,US,27
4,UgxJuUe5ysG8OSbABAl4AaABAg,-hV6aeyPHPA,After a new engine her car dies on her way hom...,@ryoutubeplaylistb6137,UCTTcJ0tsAKQokmHB2qVb1qQ,Good person helping good people.\nThis is how ...,Positive,3,1,2025-01-10 19:39:03,US,2


In [4]:
# Normalize sentiment to lowercase
df['Sentiment'] = df['Sentiment'].str.lower()
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df = df[df['Sentiment'].isin(label_map.keys())]
df['label'] = df['Sentiment'].map(label_map)

In [5]:
# Only keep positive and negative sentiments
df = df[df['Sentiment'].isin(['positive', 'negative'])]

# Map to numeric labels: negative=0, positive=1
label_map = {'negative': 0, 'positive': 1}
df['label'] = df['Sentiment'].map(label_map)

In [6]:
# # Map sentiment to numeric labels
# label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
# df = df[df['Sentiment'].isin(label_map.keys())]
# df['label'] = df['Sentiment'].map(label_map)

In [None]:
# Use only 50% of the data
df = df.sample(frac=0.5, random_state=42).reset_index(drop=True)

In [12]:
# Split data
train_df, val_df = train_test_split(df, test_size=0.1, stratify=df['label'], random_state=42)

In [13]:
# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

def encode_texts(texts, tokenizer, max_len=128):
    return tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='tf'
    )



In [14]:
# Encode datasets
train_encodings = encode_texts(train_df['CommentText'], tokenizer)
val_encodings = encode_texts(val_df['CommentText'], tokenizer)

train_labels = tf.convert_to_tensor(train_df['label'].values)
val_labels = tf.convert_to_tensor(val_df['label'].values)

In [15]:
# Build TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(1000).batch(32)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
)).batch(32)

In [16]:
# Load model
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english', num_labels=2)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [17]:
# Compile model
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [18]:
# Train
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=1,
    callbacks=[tf.keras.callbacks.ModelCheckpoint('trained_model.keras')]
)



NotImplementedError: Saving the model to HDF5 format requires the model to be a Functional model or a Sequential model. It does not work for subclassed models, because such models are defined via the body of a Python method, which isn't safely serializable. Consider saving to the Tensorflow SavedModel format (by setting save_format="tf") or using `save_weights`.

In [20]:
# Save model and tokenizer
# model.save('trained_model.keras')
model.save_pretrained('distilbert-finetuned-youtube-tf')
tokenizer.save_pretrained('distilbert-finetuned-youtube-tf')


('distilbert-finetuned-youtube-tf\\tokenizer_config.json',
 'distilbert-finetuned-youtube-tf\\special_tokens_map.json',
 'distilbert-finetuned-youtube-tf\\vocab.txt',
 'distilbert-finetuned-youtube-tf\\added_tokens.json')

In [21]:
model.evaluate(val_dataset)


116/647 [====>.........................] - ETA: 1:33 - loss: 0.2991 - accuracy: 0.8669

KeyboardInterrupt: 