In [28]:
import os
os.environ["USE_TF"] = "0"
os.environ["TRANSFORMERS_NO_TF"] = "1"


In [29]:
import pandas as pd

# Correct path based on your screenshot
df = pd.read_csv("/Users/walidelmasri/Downloads/261project/Sentiment_analysis_dataset.csv")

# Check dataset size
print(f"Dataset contains {df.shape[0]} samples and {df.shape[1]} columns.")


Dataset contains 37130 samples and 2 columns.


In [30]:
# Peek at the first few rows
print(df.head())

# Column names
print(df.columns)

# Check for missing values
print(df.isnull().sum())

# Check class distribution
print(df.value_counts(df.columns[1]))


                                           Statement      Status
0  life doesn’t feel worth it that’s kind of it? ...  Depression
1  This life sucks and if it were for my religiou...  Depression
2  its been 9 months now for our marriage and she...      Stress
3  I do not feel particularly sad or anxious or a...  Depression
4  I am taking Venlafaxine. it is an SSRI. does n...  Depression
Index(['Statement', 'Status'], dtype='object')
Statement    254
Status         0
dtype: int64
Status
Normal                  11446
Depression              10783
Suicidal                 7457
Anxiety                  2721
Bipolar                  2014
Stress                   1868
Personality disorder      841
Name: count, dtype: int64


clean the text

In [31]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/walidelmasri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # remove URLs
    text = re.sub(r'\@w+|\#','', text)  # remove mentions and hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove punctuation and numbers
    text = " ".join([word for word in text.split() if word not in stop_words])  # remove stopwords
    return text

df['clean_text'] = df['Statement'].fillna("").apply(clean_text)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/walidelmasri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Encode labels


In [33]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['encoded_label'] = le.fit_transform(df['Status'])


# Optional: see label mappings
label_map = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_map)


{'Anxiety': np.int64(0), 'Bipolar': np.int64(1), 'Depression': np.int64(2), 'Normal': np.int64(3), 'Personality disorder': np.int64(4), 'Stress': np.int64(5), 'Suicidal': np.int64(6)}


Train/test split

In [34]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df['encoded_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


Tf/idf vectorization

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


smote here after splitting, applied only to the test set

In [36]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)


Train and evaluate models


In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
nb_pred = nb.predict(X_test_tfidf)
nb_acc = accuracy_score(y_test, nb_pred)

# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train_tfidf, y_train)
dt_pred = dt.predict(X_test_tfidf)
dt_acc = accuracy_score(y_test, dt_pred)

# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_tfidf, y_train)
knn_pred = knn.predict(X_test_tfidf)
knn_acc = accuracy_score(y_test, knn_pred)

print(f"Naive Bayes Accuracy: {nb_acc:.4f}")
print(f"Decision Tree Accuracy: {dt_acc:.4f}")
print(f"KNN Accuracy: {knn_acc:.4f}")


Naive Bayes Accuracy: 0.6582
Decision Tree Accuracy: 0.6567
KNN Accuracy: 0.3225


Now we will use BERT

In [None]:
# STEP 0: Run this in your terminal first if not done:
# pip install transformers datasets torch scikit-learn

import os

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Avoid TensorFlow-related errors
os.environ["USE_TF"] = "0"

# STEP 1: Encode text labels (Status column → integers)
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['Status'])

# STEP 2: HuggingFace Dataset expects "text" and "label" columns
dataset = Dataset.from_pandas(
    df[['clean_text', 'encoded_label']].rename(columns={'clean_text': 'text', 'encoded_label': 'label'})
)

# STEP 3: Split and reduce size for quick testing
dataset = dataset.train_test_split(test_size=0.2)
dataset["train"] = dataset["train"].shuffle(seed=42).select(range(1000))
dataset["test"] = dataset["test"].shuffle(seed=42).select(range(300))

# STEP 4: Tokenize text
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def tokenize_function(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=128)

dataset = dataset.map(tokenize_function, batched=True)
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# STEP 5: Load BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=7)
model.config.return_dict = True  # Important to avoid Trainer errors

# STEP 6: Accuracy metric 
def compute_metrics(eval_pred):
    predictions = torch.argmax(torch.tensor(eval_pred.predictions), axis=1)
    return {"accuracy": accuracy_score(eval_pred.label_ids, predictions)}

# STEP 7: Training arguments (1 epoch for mini test but use 3 for fully testing the model although it takes time)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="no",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs"
)

# STEP 8: Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics
)

# STEP 9: Train & evaluate
trainer.train()
eval_results = trainer.evaluate()

# STEP 10: Print result
print(f"\n✅ BERT Accuracy on test set: {eval_results['eval_accuracy']:.4f}")



  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 1000/1000 [00:00<00:00, 2274.68 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 2959.04 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.221381,0.56



✅ BERT Accuracy on test set: 0.5600


Now we will see the model with highest accuracy


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import torch

# Split data
X = df['clean_text']
y = df['encoded_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# TF-IDF Vectorization for classical models
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 1. Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
nb_acc = accuracy_score(y_test, nb.predict(X_test_tfidf))

# 2. Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train_tfidf, y_train)
dt_acc = accuracy_score(y_test, dt.predict(X_test_tfidf))

# 3. KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_tfidf, y_train)
knn_acc = accuracy_score(y_test, knn.predict(X_test_tfidf))

# 4. BERT
dataset = Dataset.from_pandas(df[['clean_text', 'encoded_label']].rename(columns={'clean_text': 'text', 'encoded_label': 'label'}))
dataset = dataset.train_test_split(test_size=0.2)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize(batch): return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
dataset = dataset.map(tokenize, batched=True)
dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7)

def compute_metrics(pred):
    preds = torch.argmax(torch.tensor(pred.predictions), axis=1)
    return {'accuracy': accuracy_score(pred.label_ids, preds)}

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics
)

trainer.train()
bert_results = trainer.evaluate()
bert_acc = bert_results['eval_accuracy']

# Print comparison
print("\n📊 Model Accuracy Comparison:")
print(f"Multinomial Naive Bayes: {nb_acc:.4f}")
print(f"Decision Tree:            {dt_acc:.4f}")
print(f"K-Nearest Neighbors:      {knn_acc:.4f}")
print(f"BERT:                     {bert_acc:.4f}")

# Best Model
accuracies = {
    "Multinomial Naive Bayes": nb_acc,
    "Decision Tree": dt_acc,
    "K-Nearest Neighbors": knn_acc,
    "BERT": bert_acc
}

best_model = max(accuracies, key=accuracies.get)
print(f"\n🏆 Best Model: {best_model} with {accuracies[best_model]:.4f} accuracy.")


Map: 100%|██████████| 29704/29704 [00:12<00:00, 2471.19 examples/s]
Map: 100%|██████████| 7426/7426 [00:02<00:00, 2498.43 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6288,0.590669,0.771748
2,0.4467,0.549216,0.785887
3,0.2833,0.62455,0.793563



📊 Model Accuracy Comparison:
Multinomial Naive Bayes: 0.6582
Decision Tree:            0.6576
K-Nearest Neighbors:      0.3225
BERT:                     0.7936

🏆 Best Model: BERT with 0.7936 accuracy.


now we save the best model

In [12]:
# Save fine-tuned BERT model and tokenizer
trainer.save_model("best_model_bert/")
tokenizer.save_pretrained("best_model_bert/")


('best_model_bert/tokenizer_config.json',
 'best_model_bert/special_tokens_map.json',
 'best_model_bert/vocab.txt',
 'best_model_bert/added_tokens.json')

now to load it later on:

In [13]:
from transformers import BertForSequenceClassification, BertTokenizer

model = BertForSequenceClassification.from_pretrained("best_model_bert/")
tokenizer = BertTokenizer.from_pretrained("best_model_bert/")
