# Modified SVM-MentalBERT Simulation

## Text Preparation

In [7]:
# add required libraries
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [8]:
!pip install -q nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Fix NumPy's global random state so repeated runs give the same results
np.random.seed(seed=500)

In [14]:
# Read the corpus CSV into a DataFrame, decoding the file as Latin-1
Corpus = pd.read_csv(
    filepath_or_buffer="500_Reddit_users_posts_labels.csv",
    encoding="latin-1",
)

## Data Preprocessing

In [15]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from collections import defaultdict

# Make sure every NLTK data package needed by this cell is available locally
for nltk_package in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(nltk_package)

# Lowercasing
# Use the vectorised string accessor instead of a Python-level loop. Missing
# posts (NaN) are replaced by empty strings first, because calling .lower() on
# a float NaN raises AttributeError and would abort the whole cell.
Corpus['Post'] = Corpus['Post'].fillna('').astype(str).str.lower()

# Tokenization and Lemmatization
# Shared helpers used by preprocess_text.
# tag_map translates the first letter of a POS tag into a WordNet POS constant;
# any letter other than J, V or R falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN, {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenize ``text``, filter the tokens, and return the lemmas as one string.

    A token is kept only if it is purely alphabetic and is not a member of
    ``stop_words`` (membership is tested on the token exactly as tokenized).
    Each kept token is lemmatized with the WordNet part of speech that
    ``tag_map`` associates with the first character of its POS tag.

    Args:
        text: The string to clean.

    Returns:
        The lemmas of the kept tokens, in original order, joined by single spaces.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = [
        lemmatizer.lemmatize(token, tag_map[pos[0]])
        for token, pos in tagged_tokens
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(lemmas)

# Clean every post element-wise and store the result in a new column
Corpus['cleaned_post'] = Corpus['Post'].map(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Using MentalBERT Model to Improve Text Feature Representation

In [16]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load MentalBERT tokenizer and model
# Both come from the same Hugging Face Hub checkpoint. The repository is gated:
# access has to be granted on the Hub and the session authenticated, otherwise
# from_pretrained fails with "OSError: You are trying to access a gated repo."
MENTALBERT_CHECKPOINT = "mental/mental-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MENTALBERT_CHECKPOINT)
model = AutoModel.from_pretrained(MENTALBERT_CHECKPOINT)

# Function to get CLS embeddings from MentalBERT
def get_mentalbert_embeddings(text):
    """Encode ``text`` with MentalBERT and return the CLS-position hidden state.

    The text is tokenized into PyTorch tensors (padded, truncated to at most
    512 tokens) and passed through ``model`` with gradient tracking disabled.

    Args:
        text: Input passed straight to ``tokenizer``.

    Returns:
        A NumPy array holding the last-layer hidden state at sequence
        position 0. The leading axis of the model output is kept.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    cls_vectors = hidden_states[:, 0, :]  # CLS token sits at position 0
    return cls_vectors.cpu().numpy()

# Compute one embedding per cleaned post and keep it in a new column
Corpus['embeddings'] = Corpus['cleaned_post'].apply(get_mentalbert_embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mental/mental-bert-base-uncased.
401 Client Error. (Request ID: Root=1-66fba723-19c4f98c26ddf8d12d94cde5;94f51242-fe8f-4796-bca9-ef94f87f4be0)

Cannot access gated repo for url https://huggingface.co/mental/mental-bert-base-uncased/resolve/main/config.json.
Access to model mental/mental-bert-base-uncased is restricted. You must have access to it and be authenticated to access it. Please log in.

## Using MentalBERT's Embeddings to Resolve Data Imbalance (Solution 2)

## Dimensionality Reduction (Solution 3)

## Input Preprocessed Data to SVM and Classify

## Evaluate the Results (Precision, Recall, F1-Score, Confusion Matrix)

## Hyperparameter Tuning (if necessary)

## Final Evaluation with Tuned Model