In [1]:
# Required imports
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification, TFTrainingArguments
from transformers.trainer_tf import  TFTrainer

# # Mount Google Drive and read the dataset
# from google.colab import drive
# drive.mount('/content/drive')
file_path = "stress_data_updated.csv"

# Read the data
df = pd.read_csv(file_path)

# Drop unnecessary columns
df.drop(columns=["Unnamed: 0"], inplace=True)

# Convert labels from string to integer
df["Type"].replace({"Stressed": 0, "Relaxed": 1}, inplace=True)

# Function to preprocess text data
def preprocess(text):
    if text is None:
        return ''
    text = re.sub(r"[^A-Za-z0-9\s]", "", text).lower()
    return text

df["clean_sentence"] = df["Sentence"].apply(preprocess)

# Check for missing values and handle them
df["clean_sentence"].fillna("", inplace=True)  # Replace missing values in Sentence with empty strings
df["Type"].fillna(0, inplace=True)  # Replace missing values in Type with 0

# Convert labels to integer type
df["Type"] = df["Type"].astype(int)

# Separate the data into input features and labels
X = df["clean_sentence"]
y = df["Type"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Initialize tokenizer and tokenize the input data
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, return_tensors="tf")
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, return_tensors="tf")

# Convert encodings to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train.astype(np.int32)))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test.astype(np.int32)))

# Define training arguments
training_args = TFTrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=300,
    weight_decay=0.0,  # Temporarily set weight decay to zero
    logging_dir="./logs",
    logging_steps=10,  # Log at every 10 steps for debugging
    evaluation_strategy="steps",  # Set evaluation strategy to steps
    eval_steps=100,  # Evaluate every 100 steps; adjust as necessary
)

# Initialize model and trainer
with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model




Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [2]:
trainer.train()

In [3]:
trainer.evaluate(test_dataset)
pred=trainer.predict(test_dataset)[1]
from sklearn.metrics import classification_report

print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       395
           1       1.00      1.00      1.00       384

    accuracy                           1.00       779
   macro avg       1.00      1.00      1.00       779
weighted avg       1.00      1.00      1.00       779



In [7]:

## Save the model
# model.save_pretrained({PATH})


In [13]:
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizerFast
import tensorflow as tf

def predict_stress_level(custom_text, model_path="bert_stress_classifier"):
    # Load the saved model
    loaded_model = TFDistilBertForSequenceClassification.from_pretrained(model_path)

    # Load the tokenizer
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    # Tokenize the custom text
    encoded_input = tokenizer(custom_text, truncation=True, padding=True, return_tensors='tf')

    # Generate predictions
    output = loaded_model(encoded_input)
    predicted_class = tf.argmax(output.logits, axis=1).numpy()[0]

    # Map predicted class to label
    predicted_label = "Stressed" if predicted_class == 0 else "Relaxed"

    return predicted_label


In [11]:
predict_stress_level("""As I sit quietly in my favorite spot, listening to the gentle rustle of leaves and feeling the warmth of the sun on my skin,
I can't help but feel a sense of calm wash over me, 
easing away the tensions of the day.""")

Some layers from the model checkpoint at bert_stress_classifier were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at bert_stress_classifier and are newly initialized: ['dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'Relaxed'

In [12]:
predict_stress_level("""
As deadlines loom and tasks pile up, I feel the weight of responsibility bearing down on me,
my heart racing with anxiety
as I struggle to keep up with the demands of work and life""")

Some layers from the model checkpoint at bert_stress_classifier were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at bert_stress_classifier and are newly initialized: ['dropout_99']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'Stressed'