# Modified SVM-mentalBERT Simulation

## Text Preparation

In [7]:
# add required libraries
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import model_selection, naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [8]:
!pip install -q nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Seed NumPy's global random number generator so repeated runs give the same result
SEED = 500
np.random.seed(SEED)

In [14]:
# Read the corpus CSV into a DataFrame, decoding the file as Latin-1
CORPUS_CSV = "500_Reddit_users_posts_labels.csv"
Corpus = pd.read_csv(CORPUS_CSV, encoding='latin-1')

## Data Preprocessing

In [15]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from collections import defaultdict

# Make sure every NLTK resource needed for preprocessing is available locally
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource)

# Normalise every post to lower case
Corpus['Post'] = [post.lower() for post in Corpus['Post']]

# Shared objects used for tokenization and lemmatization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Map the leading letter of a POS tag to a WordNet part of speech;
# any letter not listed here falls back to noun.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)

def preprocess_text(text):
    """Tokenize ``text``, filter the tokens and return the lemmas as one string.

    Tokens that are not purely alphabetic or that appear in ``stop_words`` are
    dropped. Every remaining token is lemmatized using the WordNet part of
    speech that ``tag_map`` associates with the first character of its POS tag.

    Args:
        text: The string to clean.

    Returns:
        The surviving lemmas, in their original order, joined by single spaces.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = [
        lemmatizer.lemmatize(token, tag_map[pos[0]])
        for token, pos in tagged_tokens
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(lemmas)

# Run the cleaning routine on every post and keep the result in a new column
Corpus['cleaned_post'] = Corpus['Post'].map(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Using MentalBERT Model to Improve Text Feature Representation

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load the MentalBERT tokenizer and model.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# rather than being hard-coded in the notebook.
# SECURITY: a real token was previously embedded in this cell; treat it as leaked
# and revoke it in the Hugging Face account settings.
import os

# None when HF_TOKEN is unset; the libraries are then expected to fall back to a
# locally cached Hugging Face login, if one exists.
HF_TOKEN = os.environ.get("HF_TOKEN")
MENTALBERT_CHECKPOINT = "mental/mental-bert-base-uncased"

# NOTE(review): newer transformers releases name this argument `token`;
# `use_auth_token` is kept here to match the version this notebook was written for.
tokenizer = AutoTokenizer.from_pretrained(MENTALBERT_CHECKPOINT, use_auth_token=HF_TOKEN)
model = AutoModel.from_pretrained(MENTALBERT_CHECKPOINT, use_auth_token=HF_TOKEN)

# Helper that turns text into MentalBERT CLS embeddings
def get_mentalbert_embeddings(text):
    """Encode ``text`` with the module-level tokenizer/model and return its CLS vector(s).

    The input is tokenized into PyTorch tensors (padded, truncated to at most
    512 tokens) and passed through ``model`` with gradient tracking disabled.

    Args:
        text: The text to embed, passed straight to ``tokenizer``.

    Returns:
        A NumPy array holding, for each encoded sequence, the last hidden state
        at sequence position 0 (the CLS token).
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    cls_vectors = hidden_states[:, 0, :]  # first position of every sequence
    return cls_vectors.cpu().numpy()

# Embed every cleaned post and store the resulting array in a new column
Corpus['embeddings'] = Corpus['cleaned_post'].apply(get_mentalbert_embeddings)



config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Using MentalBERT's Embeddings to Resolve Data Imbalance (Solution 2)

In [None]:
from imblearn.over_sampling import SMOTE
import numpy as np

# Stack the per-post MentalBERT embeddings into a single feature matrix for SMOTE
X_embeddings = np.vstack(Corpus['embeddings'].values)
y = Corpus['Label'].values  # Labels for classification

# Split the data into training and test sets.
# stratify=y keeps the class proportions the same in both splits, so minority
# classes are not left under-represented in the training set that SMOTE resamples
# or in the test set used for evaluation.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings, y, test_size=0.3, random_state=42, stratify=y
)

# Apply SMOTE to the training data only; the test set keeps its original distribution
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print(f"Resampled dataset shape: {X_resampled.shape}, {y_resampled.shape}")

## Dimensionality Reduction (Solution 3)

In [None]:
from sklearn.decomposition import PCA

# Optionally apply PCA to reduce dimensionality (e.g., reduce to 100 dimensions).
# random_state is pinned, like the split/SMOTE/SVC steps, because PCA may select a
# randomized SVD solver for inputs of this size; without it the projection could
# differ between runs.
pca = PCA(n_components=100, random_state=42)
X_resampled_pca = pca.fit_transform(X_resampled)  # fit the projection on training data only
X_test_pca = pca.transform(X_test)  # reuse the training projection for the test set

print(f"Reduced dimensions: {X_resampled_pca.shape}, {X_test_pca.shape}")

## Input Preprocessed Data to SVM and Classify

In [None]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

# Build the SVM and wrap it in a one-vs-rest classifier.
# The estimator is constructed inline instead of being bound to the name `svm`,
# which would overwrite the `sklearn.svm` module imported at the top of the notebook.
# NOTE(review): probability=True is kept from the original; only predict() is
# called in this notebook, so confirm whether probability estimates are needed.
ovr_classifier = OneVsRestClassifier(SVC(probability=True, random_state=42))

# Train the classifier on resampled data (with or without PCA)
ovr_classifier.fit(X_resampled_pca, y_resampled)  # Use X_resampled if not applying PCA

# Predict on the test data
y_pred = ovr_classifier.predict(X_test_pca)  # Use X_test if not applying PCA

## Evaluate the Results (Precision, Recall, F1-Score, Confusion Matrix)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Classification report (per-class precision, recall and F1)
print(classification_report(y_test, y_pred))

# Confusion matrix.
# The sorted set of labels seen in either array is passed explicitly so that the
# matrix rows/columns and the plot's tick labels show the real class names
# instead of the default 0..n-1 indices.
class_labels = np.unique(np.concatenate([y_test, y_pred]))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap=plt.cm.Blues)
plt.show()

## Hyperparameter Tuning (if necessary)

In [None]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid.
# The grid is split per kernel: `gamma` only affects the 'rbf' and 'poly' kernels,
# so sweeping it for the linear kernel would just repeat identical fits.
C_VALUES = [0.1, 1, 10]
param_grid = [
    {
        'estimator__kernel': ['linear'],
        'estimator__C': C_VALUES,
    },
    {
        'estimator__kernel': ['rbf', 'poly'],
        'estimator__C': C_VALUES,
        'estimator__gamma': [0.1, 1, 'scale', 'auto'],
    },
]

# Perform grid search with OVR classifier.
# NOTE(review): cross-validation runs on data that was already SMOTE-resampled, so
# validation folds contain synthetic samples; the CV scores may be optimistic.
grid = GridSearchCV(ovr_classifier, param_grid, refit=True, verbose=3)
grid.fit(X_resampled_pca, y_resampled)  # Use resampled PCA data

# Evaluate the best model (refit on the full resampled training set) on the test data
print("Best Hyperparameters:", grid.best_params_)
y_pred_best = grid.predict(X_test_pca)

# Classification report for the best model
print(classification_report(y_test, y_pred_best))

## Final Evaluation with Tuned Model

In [None]:
# Final evaluation with the best model.
# The sorted set of labels seen in either array is passed explicitly so that the
# matrix rows/columns and the plot's tick labels show the real class names
# instead of the default 0..n-1 indices.
tuned_labels = np.unique(np.concatenate([y_test, y_pred_best]))
cm_tuned = confusion_matrix(y_test, y_pred_best, labels=tuned_labels)
disp_tuned = ConfusionMatrixDisplay(confusion_matrix=cm_tuned, display_labels=tuned_labels)
disp_tuned.plot(cmap=plt.cm.Blues)
plt.show()

# Final classification report
print(classification_report(y_test, y_pred_best))