In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader

# Reset index after dropping NA values
data = pd.read_csv('/kaggle/input/combined-data-balanced/combined_data_balanced.csv')
ndata = data.dropna().reset_index(drop=True)  # Reset index here

text = ndata['text']
labels = ndata['mental_state'].values 
label_map = {emotion: idx for idx, emotion in enumerate(np.unique(labels))}
num_labels = len(label_map)
y_encoded = np.array([label_map[l] for l in labels])

2025-07-08 04:26:25.196191: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751948785.378074      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751948785.432872      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [12]:
# Split data
X_train, X_val, y_train, y_val = train_test_split(
    text, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

In [13]:
from tqdm import tqdm
MAX_LEN  = 4095
# STEP 2: PRE-TOKENIZE DATA (Replace EmotionDataset)
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
def pre_tokenize(texts, labels, tokenizer, max_len):
    input_ids = []
    attention_masks = []
    for text in tqdm(texts):
        enc = tokenizer.encode_plus(
            text,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True
        )
        input_ids.append(enc['input_ids'])
        attention_masks.append(enc['attention_mask'])
    return {
        'input_ids': torch.tensor(input_ids),
        'attention_mask': torch.tensor(attention_masks),
        'labels': torch.tensor(labels)
    }

# Apply to train/val
train_data = pre_tokenize(X_train.tolist(), y_train, tokenizer, MAX_LEN)
val_data = pre_tokenize(X_val.tolist(), y_val, tokenizer, MAX_LEN)

100%|██████████| 6892/6892 [00:09<00:00, 728.67it/s]
100%|██████████| 1724/1724 [00:02<00:00, 755.12it/s]


In [15]:
from transformers import LongformerForSequenceClassification,LongformerTokenizer
# Load model
model = LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096',
    num_labels=num_labels
).to(device)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
!pip install transformers lightgbm sentencepiece
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import LongformerTokenizer, LongformerModel
import torch
import lightgbm as lgb
from tqdm import tqdm
import time
from sklearn.metrics import classification_report

# Load your dataset
# df = pd.read_csv('your_data.csv')  # Should have 'text' and 'label' columns

# Initialize Longformer (handles up to 4096 tokens)
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model.eval()

# Truncation function (Longformer max is 4096 tokens)
def truncate_text(text):
    tokens = tokenizer.tokenize(text)
    return tokenizer.convert_tokens_to_string(tokens[:4095])  # 4096 - [CLS] token

# Embedding generation
def get_longformer_embeddings(texts, batch_size=8):
    embeddings = []
    
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size].tolist()
        
        # Truncate long texts
        batch = [truncate_text(text) for text in batch]
        
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=4096,
            return_tensors="pt",
            return_attention_mask=True
        ).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Attention-weighted mean pooling
        attention_mask = inputs['attention_mask']
        last_hidden = outputs.last_hidden_state
        embeddings_batch = (last_hidden * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(1).unsqueeze(-1)
        embeddings.append(embeddings_batch.cpu().numpy())
    
    return np.vstack(embeddings)

# Preprocess data
print("Preprocessing texts...")
ndata['text'] = ndata['text'].apply(lambda x: x[:100000] if len(x) > 100000 else x)  # Limit extreme lengths

# Generate embeddings
print("Generating Longformer embeddings...")
start_time = time.time()
embeddings = get_longformer_embeddings(ndata['text'])
print(f"Embeddings generated in {time.time()-start_time:.2f} seconds")



Preprocessing texts...
Generating Longformer embeddings...


100%|██████████| 1077/1077 [05:40<00:00,  3.16it/s]

Embeddings generated in 340.55 seconds





In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(ndata['mental_state'])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, y_encoded, test_size=0.2, random_state=42
)

  

def train_evaluate_xgboost():
    from xgboost import XGBClassifier
    
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=len(le.classes_),
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        eval_metric='mlogloss',
        early_stopping_rounds=20,
        use_label_encoder=False,
        reg_alpha= 0.8,   # L1 regularization (alpha)
        reg_lambda= 0.5,  # L2 regularization (lambda)
        gamma= 0.1,       # Minimum loss reduction for split

        # Additional tree parameters
        min_child_weight= 3,
        max_delta_step= 2
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )
    
    y_pred = model.predict(X_test)
    return model, y_pred



print("\nTraining XGBoost...")
xgb_model, xgb_pred = train_evaluate_xgboost()
print("XGBoost Classification Report:")
print(classification_report(y_test, xgb_pred, target_names=le.classes_))




Training XGBoost...
XGBoost Classification Report:
                      precision    recall  f1-score   support

             anxiety       0.62      0.66      0.64       227
             bipolar       0.61      0.54      0.57       229
          depression       0.56      0.52      0.54       213
              lonely       0.67      0.77      0.72       200
              normal       0.86      0.91      0.88       227
personality_disorder       0.61      0.60      0.60       210
                ptsd       0.70      0.67      0.69       209
              stress       0.63      0.63      0.63       209

            accuracy                           0.66      1724
           macro avg       0.66      0.66      0.66      1724
        weighted avg       0.66      0.66      0.66      1724

