# Stress Type Classifier from Daily Journal Entries
This notebook trains and compares several classifiers that assign each daily journal entry to one of five stress types (School, Relationship, Health, Financial, Work): Logistic Regression and a linear SVM on TF-IDF features, a soft-voting ensemble of those two models, and a fine-tuned DistilBERT model.

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

In [None]:
# Fetch the NLTK stopword corpus, then keep the English list as a set
# so the per-word membership checks in the cleaning step stay cheap
nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Seed NumPy's global RNG and PyTorch's default generator with the same value
# so that repeated runs draw the same random numbers
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7c716e37e0b0>

In [None]:
# Load dataset (assuming it's uploaded to Colab)
# For Colab, upload 'stress_journal_dataset.csv' manually or adjust path if using Google Drive
data = pd.read_csv('/content/stress_journal_dataset.csv')

# Build the label mapping: every distinct stress type receives an integer id,
# numbered in the order the types first appear in the file
stress_types = data['stress_type'].unique()
label_map = dict(zip(stress_types, range(len(stress_types))))
data['label'] = data['stress_type'].map(label_map)
print('Label Mapping:', label_map)

Label Mapping: {'School Stress': 0, 'Relationship Stress': 1, 'Health Stress': 2, 'Financial Stress': 3, 'Work Stress': 4}


In [None]:
# Text preprocessing function
def clean_text(text):
    """Normalise a journal entry into lowercase, punctuation-free content words.

    Steps: lowercase, split contractions at the apostrophe, strip remaining
    punctuation, then drop every word found in the notebook-level
    ``stop_words`` set.

    Args:
        text: Raw journal entry. Non-string values (for example a NaN that
            stands in for an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned text as a single space-separated string ('' when nothing
        is left or the input was not a string).
    """
    # Missing entries carry no text; returning '' avoids an AttributeError on .lower()
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Split contractions at straight or curly apostrophes instead of deleting the
    # apostrophe. Deleting it glued the halves together ("can't" -> "cant",
    # "I'll" -> "ill"), which slipped past the stopword filter and, in the case of
    # "ill", produced a misleading real word. After the split, pieces such as
    # "i", "ll", "t" and "m" are ordinary tokens that the stopword set is expected
    # to contain (NLTK's English list includes these fragments).
    text = re.sub(r"['’‘]", ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Run the cleaner over every journal entry and store the result in a new column,
# then preview the first raw entries
data['clean_text'] = data['journal_entry'].map(clean_text)
raw_preview = data['journal_entry'].head()
print(raw_preview)

0    Exams are piling up, and I can't keep up with ...
1    My partner and I had another argument about ou...
2    The doctor said my blood pressure is high agai...
3    Bills are due, and my account is almost empty....
4    I bombed my presentation today. I’m so behind ...
Name: journal_entry, dtype: object


In [None]:
# Preview the first cleaned entries
cleaned_preview = data['clean_text'].head()
print(cleaned_preview)

0    exams piling cant keep readings stayed 3 studying
1    partner another argument future plans feel dra...
2    doctor said blood pressure high im worried health
3    bills due account almost empty dont know ill m...
4    bombed presentation today im behind course ove...
Name: clean_text, dtype: object


In [None]:
# Preview the first stress-type labels as they appear in the dataset
stress_type_preview = data['stress_type'].head()
print(stress_type_preview)

0          School Stress
1    Relationship Stress
2          Health Stress
3       Financial Stress
4          School Stress
Name: stress_type, dtype: object


In [None]:
# Hold out 20% of the entries as a test split, stratified on the label so both
# splits keep the same class proportions
X = data['clean_text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# TF-IDF over unigrams and bigrams: the vocabulary is learned from the training
# split only, and the test split is projected onto that same vocabulary
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Oversample the training features with SMOTE to balance the classes;
# the test split is left as-is
smote = SMOTE(random_state=42)
resampled_features, resampled_labels = smote.fit_resample(X_train_tfidf, y_train)
X_train_tfidf_smote, y_train_smote = resampled_features, resampled_labels

In [None]:
# Logistic Regression: pick the regularisation strength C by 5-fold grid search,
# scored with macro-averaged F1
# NOTE(review): the training matrix was oversampled with SMOTE before this
# cross-validation, so validation folds can contain synthetic points derived
# from training-fold samples; the CV scores are probably optimistic.
lr_params = {'C': [0.1, 1, 10, 100]}
lr_grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=lr_params,
    scoring='f1_macro',
    cv=5,
)
lr_grid.fit(X_train_tfidf_smote, y_train_smote)
lr_model = lr_grid.best_estimator_
print('Best Logistic Regression Params:', lr_grid.best_params_)

Best Logistic Regression Params: {'C': 1}


In [None]:
# Linear-kernel SVM: pick C by 5-fold grid search, scored with macro-averaged F1.
# probability=True is required so the model can take part in soft voting later.
# NOTE(review): the training matrix was oversampled with SMOTE before this
# cross-validation, so validation folds can contain synthetic points derived
# from training-fold samples; the CV scores are probably optimistic.
svm_params = {'C': [0.1, 1, 10], 'kernel': ['linear']}
svm_grid = GridSearchCV(
    estimator=SVC(probability=True),
    param_grid=svm_params,
    scoring='f1_macro',
    cv=5,
)
svm_grid.fit(X_train_tfidf_smote, y_train_smote)
svm_model = svm_grid.best_estimator_
print('Best SVM Params:', svm_grid.best_params_)

Best SVM Params: {'C': 1, 'kernel': 'linear'}


In [None]:
# DistilBERT Fine-Tuning
# Load the tokenizer that belongs to the pretrained uncased DistilBERT checkpoint
distilbert_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(distilbert_checkpoint)

class StressDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes journal texts on access and pairs them with labels.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class ids, aligned index-by-index with ``texts``.
        tokenizer: Optional callable used to encode one text. When omitted, the
            notebook-level ``tokenizer`` is looked up each time an item is
            fetched, which matches the original behaviour.
        max_length: Length every encoding is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=128):
        # Fail early: a mismatch would otherwise surface as an IndexError mid-training
        # or silently ignore the surplus labels
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, got {len(texts)} and {len(labels)}'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded tensors for entry ``idx`` plus its ``labels`` tensor."""
        # Fall back to the notebook-level tokenizer when none was injected
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = encode(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # squeeze(0) removes only the leading batch axis added by return_tensors='pt';
        # a bare squeeze() would also collapse any other axis of size 1
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Carve a validation split out of the training data. The per-epoch evaluation and
# load_best_model_at_end both pick a checkpoint based on eval_dataset, so passing the
# test split there would select the model on the very data it is later scored on.
# The test split is therefore kept untouched until the final evaluation.
bert_train_texts, bert_val_texts, bert_train_labels, bert_val_labels = train_test_split(
    X_train.tolist(),
    y_train.tolist(),
    test_size=0.2,
    random_state=42,
    stratify=y_train.tolist(),
)
train_dataset = StressDataset(bert_train_texts, bert_train_labels)
val_dataset = StressDataset(bert_val_texts, bert_val_labels)
test_dataset = StressDataset(X_test.tolist(), y_test.tolist())

# Size the classification head from the label mapping instead of a hard-coded 5
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=len(label_map)
)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    report_to='none'  # Disable W&B logging to avoid API key prompt
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset  # validation split, not the held-out test split
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.5968,1.551168
2,1.4142,1.379124
3,1.1508,1.244509


TrainOutput(global_step=30, training_loss=1.387271563212077, metrics={'train_runtime': 20.2966, 'train_samples_per_second': 11.825, 'train_steps_per_second': 1.478, 'total_flos': 7948469145600.0, 'train_loss': 1.387271563212077, 'epoch': 3.0})

In [None]:
# Ensemble Model
# Soft-voting ensemble over the tuned Logistic Regression and SVM,
# fitted on the same SMOTE-balanced TF-IDF features as its members
base_estimators = [('lr', lr_model), ('svm', svm_model)]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')
ensemble_model.fit(X_train_tfidf_smote, y_train_smote)

# Evaluate all models
def evaluate_model(name, model, X_test_tfidf, y_test):
    """Print a classification report and a confusion matrix for one model.

    Args:
        name: Display name used in the printed header. The special value
            'DistilBERT' routes prediction through the notebook-level ``trainer``
            and ``test_dataset`` instead of ``model``.
        model: Fitted estimator exposing ``predict``; unused for 'DistilBERT'.
        X_test_tfidf: Feature matrix handed to ``model.predict``; unused for
            'DistilBERT'.
        y_test: True integer labels for the evaluated samples.
    """
    if name == 'DistilBERT':
        predictions = trainer.predict(test_dataset).predictions.argmax(axis=1)
    else:
        predictions = model.predict(X_test_tfidf)

    # Pass the full label set explicitly. Without it, the report and matrix are built
    # only from labels that occur in y_test/predictions, so a class that is absent
    # there would misalign target_names or shrink the matrix.
    class_ids = list(label_map.values())
    class_names = list(label_map.keys())

    print(f'\n{name} Results:')
    print(classification_report(y_test, predictions, labels=class_ids, target_names=class_names))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, predictions, labels=class_ids))

# Report each model in turn; the DistilBERT entry carries no estimator because
# evaluate_model predicts through the trainer for that name
models_to_report = [
    ('Logistic Regression', lr_model),
    ('SVM', svm_model),
    ('DistilBERT', None),
    ('Ensemble', ensemble_model),
]
for model_name, fitted_model in models_to_report:
    evaluate_model(model_name, fitted_model, X_test_tfidf, y_test)


Logistic Regression Results:
                     precision    recall  f1-score   support

      School Stress       1.00      0.75      0.86         4
Relationship Stress       0.80      0.80      0.80         5
      Health Stress       0.50      0.25      0.33         4
   Financial Stress       0.50      1.00      0.67         4
        Work Stress       1.00      0.67      0.80         3

           accuracy                           0.70        20
          macro avg       0.76      0.69      0.69        20
       weighted avg       0.75      0.70      0.69        20

Confusion Matrix:
[[3 0 0 1 0]
 [0 4 1 0 0]
 [0 1 1 2 0]
 [0 0 0 4 0]
 [0 0 0 1 2]]

SVM Results:
                     precision    recall  f1-score   support

      School Stress       0.60      0.75      0.67         4
Relationship Stress       1.00      0.80      0.89         5
      Health Stress       0.50      0.25      0.33         4
   Financial Stress       0.57      1.00      0.73         4
        Work S


DistilBERT Results:
                     precision    recall  f1-score   support

      School Stress       0.67      1.00      0.80         4
Relationship Stress       1.00      0.60      0.75         5
      Health Stress       1.00      0.75      0.86         4
   Financial Stress       0.57      1.00      0.73         4
        Work Stress       1.00      0.33      0.50         3

           accuracy                           0.75        20
          macro avg       0.85      0.74      0.73        20
       weighted avg       0.85      0.75      0.74        20

Confusion Matrix:
[[4 0 0 0 0]
 [1 3 0 1 0]
 [0 0 3 1 0]
 [0 0 0 4 0]
 [1 0 0 1 1]]

Ensemble Results:
                     precision    recall  f1-score   support

      School Stress       1.00      0.75      0.86         4
Relationship Stress       0.75      0.60      0.67         5
      Health Stress       0.50      0.25      0.33         4
   Financial Stress       0.50      1.00      0.67         4
        Work Stres

In [None]:
# Predict sample text
sample_text = "I’m freaking out about my exams next week."
cleaned_sample = clean_text(sample_text)
sample_tfidf = tfidf.transform([cleaned_sample])

# Invert the label mapping once so integer predictions map straight back to names
id_to_stress = {idx: stress for stress, idx in label_map.items()}

# Ensemble prediction
ensemble_pred = ensemble_model.predict(sample_tfidf)[0]
ensemble_stress = id_to_stress[int(ensemble_pred)]

# DistilBERT prediction
# The model was fine-tuned on the cleaned text (X_train comes from data['clean_text']),
# so the sample is encoded in that same cleaned form rather than as raw text.
sample_encoding = tokenizer(cleaned_sample, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
model.eval()  # make inference mode explicit instead of relying on an earlier trainer.predict call
with torch.no_grad():
    outputs = model(**{k: v.to(model.device) for k, v in sample_encoding.items()})
    distilbert_pred = outputs.logits.argmax(dim=-1).item()
distilbert_stress = id_to_stress[distilbert_pred]

print(f'Sample Text: {sample_text}')
print(f'Cleaned Text: {cleaned_sample}')
print(f'Ensemble Predicted Stress Type: {ensemble_stress} (Label: {ensemble_pred})')
print(f'DistilBERT Predicted Stress Type: {distilbert_stress} (Label: {distilbert_pred})')

Sample Text: I’m freaking out about my exams next week.
Cleaned Text: im freaking exams next week
Ensemble Predicted Stress Type: School Stress (Label: 0)
DistilBERT Predicted Stress Type: School Stress (Label: 0)


In [None]:
# Predict sample text
sample_text = "I’m worried about my body i ate too much fast food last night"
cleaned_sample = clean_text(sample_text)
sample_tfidf = tfidf.transform([cleaned_sample])

# Invert the label mapping once so integer predictions map straight back to names
id_to_stress = {idx: stress for stress, idx in label_map.items()}

# Ensemble prediction
ensemble_pred = ensemble_model.predict(sample_tfidf)[0]
ensemble_stress = id_to_stress[int(ensemble_pred)]

# DistilBERT prediction
# The model was fine-tuned on the cleaned text (X_train comes from data['clean_text']),
# so the sample is encoded in that same cleaned form rather than as raw text.
sample_encoding = tokenizer(cleaned_sample, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
model.eval()  # make inference mode explicit instead of relying on an earlier trainer.predict call
with torch.no_grad():
    outputs = model(**{k: v.to(model.device) for k, v in sample_encoding.items()})
    distilbert_pred = outputs.logits.argmax(dim=-1).item()
distilbert_stress = id_to_stress[distilbert_pred]

print(f'Sample Text: {sample_text}')
print(f'Cleaned Text: {cleaned_sample}')
print(f'Ensemble Predicted Stress Type: {ensemble_stress} (Label: {ensemble_pred})')
print(f'DistilBERT Predicted Stress Type: {distilbert_stress} (Label: {distilbert_pred})')

Sample Text: I’m worried about my body i ate too much fast food last night
Cleaned Text: im worried body ate much fast food last night
Ensemble Predicted Stress Type: Health Stress (Label: 2)
DistilBERT Predicted Stress Type: Health Stress (Label: 2)


In [None]:
# Predict sample text
sample_text = "I’m conserened about my girl friend i cheated on her"
cleaned_sample = clean_text(sample_text)
sample_tfidf = tfidf.transform([cleaned_sample])

# Invert the label mapping once so integer predictions map straight back to names
id_to_stress = {idx: stress for stress, idx in label_map.items()}

# Ensemble prediction
ensemble_pred = ensemble_model.predict(sample_tfidf)[0]
ensemble_stress = id_to_stress[int(ensemble_pred)]

# DistilBERT prediction
# The model was fine-tuned on the cleaned text (X_train comes from data['clean_text']),
# so the sample is encoded in that same cleaned form rather than as raw text.
sample_encoding = tokenizer(cleaned_sample, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
model.eval()  # make inference mode explicit instead of relying on an earlier trainer.predict call
with torch.no_grad():
    outputs = model(**{k: v.to(model.device) for k, v in sample_encoding.items()})
    distilbert_pred = outputs.logits.argmax(dim=-1).item()
distilbert_stress = id_to_stress[distilbert_pred]

print(f'Sample Text: {sample_text}')
print(f'Cleaned Text: {cleaned_sample}')
print(f'Ensemble Predicted Stress Type: {ensemble_stress} (Label: {ensemble_pred})')
print(f'DistilBERT Predicted Stress Type: {distilbert_stress} (Label: {distilbert_pred})')

Sample Text: I’m conserened about my girl friend i cheated on her
Cleaned Text: im conserened girl friend cheated
Ensemble Predicted Stress Type: Relationship Stress (Label: 1)
DistilBERT Predicted Stress Type: Relationship Stress (Label: 1)
