In [None]:
# 1. Install the necessary packages
# NOTE: Since transformers is already installed, this line can be commented out.
!pip install transformers

# 2. Import the required libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, TFGPT2ForSequenceClassification, AdamWeightDecay

# 3. Load your dataset
# NOTE: Make sure you have uploaded your dataset to your Google Colab session
df = pd.read_csv('/testdata.csv')

# Drop rows with a missing text or label: a missing label would otherwise be
# one-hot encoded as an all-zero row, and a missing text is not a string and
# would break the length computation and tokenization later on.
df = df.dropna(subset=['clean_text', 'sentiment_type'])

# 4. Split the data into training and testing sets
# Inputs are the cleaned texts; targets are the sentiment labels.
X = df['clean_text']
y = df['sentiment_type']
# One-hot encode the labels (one column per distinct label value). The dtype is
# requested explicitly because the default of get_dummies differs between
# pandas versions; float32 is what the loss ultimately computes with.
y = pd.get_dummies(y, dtype=np.float32).values
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Tokenize the data using the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Reuse the end-of-sequence token as the padding token so that the padded
# tokenizer calls below have a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

# Sequence-length budget: mean training text length plus a small margin.
# NOTE(review): len() counts characters, whereas max_length is applied to
# tokens, so this acts as a loose upper bound rather than a mean token count.
average_length = int(X_train.apply(len).mean())
# Never ask for more positions than the tokenizer reports as its maximum.
max_length = min(average_length + 2, tokenizer.model_max_length)

# padding="max_length" makes both splits exactly max_length wide. With
# padding=True each call pads only to its own longest sequence, so the two
# splits could differ from each other and from the Input shape of the model.
train_encodings = tokenizer(list(X_train), truncation=True, padding="max_length", max_length=max_length, return_tensors="tf")
test_encodings = tokenizer(list(X_test), truncation=True, padding="max_length", max_length=max_length, return_tensors="tf")

# Convert tokenized encodings to TensorFlow datasets
# Batch size shared by both datasets; the steps-per-epoch computation used for
# training divides the training-set size by this same value.
BATCH_SIZE = 8


def _make_dataset(encodings, labels):
    """Pair the model inputs taken from `encodings` with `labels`.

    Returns an unbatched tf.data.Dataset whose elements are
    ({"input_ids": ..., "attention_mask": ...}, label) tuples.
    """
    features = {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
    }
    return tf.data.Dataset.from_tensor_slices((features, labels))


# Training stream: the shuffle buffer covers the whole training set so examples
# are mixed uniformly rather than only within a sliding window of 1000 items;
# repeat() makes the stream endless, so training must set steps_per_epoch.
train_dataset = _make_dataset(train_encodings, y_train).shuffle(len(y_train)).batch(BATCH_SIZE).repeat()
# Evaluation stream: kept in its original order so predictions line up with y_test.
test_dataset = _make_dataset(test_encodings, y_test).batch(BATCH_SIZE)

# 6. Fine-tune a GPT-2 model on the dataset
base_model = TFGPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=3)


def _masked_mean(tensors):
    """Average per-token outputs over the sequence axis, ignoring padding.

    `tensors` is a pair (token_outputs, mask): token_outputs is indexed as
    (batch, sequence, features) and mask as (batch, sequence), with non-zero
    entries marking real tokens. Returns a (batch, features) tensor.
    """
    token_outputs, mask = tensors
    weights = tf.cast(tf.expand_dims(mask, axis=-1), token_outputs.dtype)
    total = tf.reduce_sum(token_outputs * weights, axis=1)
    # Clamp the token count so a row consisting only of padding yields zeros
    # instead of a division by zero.
    count = tf.maximum(tf.reduce_sum(weights, axis=1), 1.0)
    return total / count


# Wrap the pretrained model in a Keras functional model with explicit inputs.
input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
# NOTE(review): pooling over axis 1 assumes element 0 of the model output is
# rank 3 (batch, sequence, features) — confirm for the installed transformers
# version.
embeddings = base_model(input_ids, attention_mask=attention_mask)[0]
# Global average pooling over the sequence, weighted by the attention mask so
# that padded positions do not dilute the average.
pooled = tf.keras.layers.Lambda(_masked_mean, name="masked_average_pooling")([embeddings, attention_mask])
# Final classifier: softmax probabilities over the 3 classes.
outputs = tf.keras.layers.Dense(3, activation="softmax")(pooled)
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)

# Optimizer and loss: the model ends in a softmax and the targets are one-hot
# rows, hence categorical cross-entropy.
# NOTE(review): only learning_rate is set on AdamWeightDecay; confirm whether
# the library's default weight-decay rate is what is intended here.
optimizer = AdamWeightDecay(learning_rate=5e-5)
loss = tf.keras.losses.CategoricalCrossentropy()
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

# Train with Keras' built-in fit loop. The training dataset repeats without
# end, so the number of batches that make up one epoch is passed explicitly
# (8 is the batch size the datasets were built with).
steps_per_epoch = len(X_train) // 8
fit_options = dict(
    epochs=3,
    steps_per_epoch=steps_per_epoch,
    validation_data=test_dataset,
)
model.fit(train_dataset, **fit_options)

# 7. Test the model's performance
# evaluate() returns the loss followed by the compiled metrics (accuracy only).
results = model.evaluate(test_dataset)
test_loss, test_accuracy = results
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)




Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFGPT2ForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Test Loss: 0.41513437032699585
Test Accuracy: 0.8698152899742126


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score

# Predict class probabilities for the test dataset (one row per example)
y_pred_probs = model.predict(test_dataset)
# Reduce the one-hot targets and the predicted probabilities to class indices
y_true_classes = y_test.argmax(axis=1)
y_pred_classes = y_pred_probs.argmax(axis=1)

# Calculate metrics
# Label-based scores compare true and predicted class indices; the per-class
# scores are combined with a 'weighted' average.
truth_and_prediction = (y_true_classes, y_pred_classes)
accuracy = accuracy_score(*truth_and_prediction)
precision = precision_score(*truth_and_prediction, average='weighted')
recall = recall_score(*truth_and_prediction, average='weighted')
f1 = f1_score(*truth_and_prediction, average='weighted')
# ROC AUC is computed from the predicted probabilities, one-vs-rest per class.
roc_auc = roc_auc_score(
    y_true_classes,
    y_pred_probs,
    average='weighted',
    multi_class='ovr',
)

# Display metrics, one per line, rounded to four decimals
metric_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
    f"ROC AUC Score: {roc_auc:.4f}",
]
print("\n".join(metric_lines))

# Display confusion matrix (rows: true class index, columns: predicted class index)
conf_matrix = confusion_matrix(y_true=y_true_classes, y_pred=y_pred_classes)
print("\nConfusion Matrix:\n", conf_matrix)

# Cross-Validation Scores
# Note: You'll need a classifier and data prepared in a suitable format for cross-validation.
# The following code assumes a classifier named 'clf' and data 'X' and 'y'.
# Uncomment and modify accordingly to your use case.

# scores = cross_val_score(clf, X, y, cv=5)  # 5-fold cross-validation
# print("\nCross-Validation Scores:", scores)
# print("Average Cross-Validation Score:", scores.mean())


Accuracy: 0.8698
Precision: 0.8805
Recall: 0.8698
F1-Score: 0.8725
ROC AUC Score: 0.9609

Confusion Matrix:
 [[ 476   37   60]
 [  36  907   34]
 [ 188   75 1490]]
