In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import tensorflow as tf
from sklearn.metrics import classification_report


data = pd.read_csv("Training_Essay_Data.csv")


# Ensure that the labels are integers
data['generated'] = data['generated'].astype(int)

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(data['text'], data['generated'], test_size=0.2, random_state=42)

# Load the pre-trained LLM and tokenizer
model_name = "distilbert-base-uncased"  # Example model, you can choose any LLM model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the input texts
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

# TensorFlow dataset creation
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels)).batch(8)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels)).batch(8)

# Model compilation
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Model training
history = model.fit(train_dataset, validation_data=val_dataset, epochs=3, batch_size=8)

bert_history = history

# Evaluate the model
loss, accuracy = model.evaluate(val_dataset)
print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)

val_predictions = model.predict(val_dataset)

# Get the predicted labels
predicted_labels = np.argmax(val_predictions.logits, axis=1)

# Compute precision, recall, and F1 score
report = classification_report(val_labels, predicted_labels)
print(report)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3
Epoch 3/3
Validation Loss: 0.6701749563217163
Validation Accuracy: 0.6071367263793945
              precision    recall  f1-score   support

           0       0.61      1.00      0.76      3539
           1       0.00      0.00      0.00      2290

    accuracy                           0.61      5829
   macro avg       0.30      0.50      0.38      5829
weighted avg       0.37      0.61      0.46      5829



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


This code uses the Hugging Face transformers library to load the pre-trained RoBERTa tokenizer and model. It preprocesses the data, tokenizes the input texts using the tokenizer, and creates TensorFlow datasets for training and validation. Finally, it initializes and compiles the RoBERTa model for sequence classification and trains it on the dataset.


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
import tensorflow as tf
from sklearn.metrics import classification_report


# Load the dataset

df = pd.read_csv("Training_Essay_Data.csv")


# Preprocess the data
df['generated'] = df['generated'].apply(lambda x: 1 if x == 'ai' else 0)

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['generated'], test_size=0.2, random_state=42)

# Load the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenize the input texts
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels.values
)).shuffle(len(train_texts)).batch(8)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels.values
)).batch(8)

# Initialize the RoBERTa model
model = TFRobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Compile the model
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

# Train the model
history = model.fit(train_dataset, validation_data=val_dataset, epochs=2, batch_size=16)

roberta_history = history

# Evaluate the model
loss, accuracy = model.evaluate(val_dataset)
print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)

val_predictions = model.predict(val_dataset)

# Get the predicted labels
predicted_labels = np.argmax(val_predictions.logits, axis=1)

# Compute precision, recall, and F1 score
report = classification_report(val_labels, predicted_labels)
print(report)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

Epoch 1/2
Epoch 2/2
Validation Loss: 0.0
Validation Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5829

    accuracy                           1.00      5829
   macro avg       1.00      1.00      1.00      5829
weighted avg       1.00      1.00      1.00      5829

