# CONSTANTS

In [90]:
# --- Data ---------------------------------------------------------------
# Cleaned CSV consumed by the dataset-loading cell.
DATASET_SOURCE = "datasets/extracted/dataset-cleaned.csv"

# --- Training configuration ---------------------------------------------
SEED = 7984           # shared by the data split, the sample shuffle and the Trainer
EPOCHS = 5
BATCH_SIZE = 6        # used for both the per-device train and eval batch size
LEARNING_RATE = 1e-5

# --- Output locations ---------------------------------------------------
# NOTE(review): "sucidality" looks like a misspelling of "suicidality"; it is
# kept as-is because these strings are on-disk paths.
MODEL_SAVE_PATH = "models/sucidality"
MODEL_CHECKPOINT_PATH = f"{MODEL_SAVE_PATH}-checkpoint"
MODEL_LOGGING_PATH = f"{MODEL_CHECKPOINT_PATH}/logs"


# Setup

## Install dependencies

In [3]:
!pip install transformers transformers[torch] datasets numpy pandas scikit-learn 

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/1f/ad/9799aabeabcb9a293c87b6f96cc78655b8abc7d35560cd99007093b5d445/scikit_learn-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached scikit_learn-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.5.0 from https://files.pythonhosted.org/packages/b8/46/1d255bb55e63de02f7b2f3a2f71b59b840db21d61ff7cd41edbfc2da448a/scipy-1.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached scipy-1.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (59 kB)
Collecting joblib>=1.1.1 (from scikit-learn)
  Obtaining dependency information for joblib>=1.1.1 from https://files.pythonhosted.org/packages/10/40/d551139c85db202f1f384ba8bcf96aca2f329440a844f924c8a0040b6d02/joblib-1.3.2

## Import all dependencies

In [7]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric

## Load cleaned dataset

In [8]:
# Read the cleaned CSV, keep only the cleaned text and its class, expose them
# under the `text` / `label` column names used by the rest of the notebook,
# and drop rows with missing values.
ds = (
    pd.read_csv(DATASET_SOURCE, lineterminator='\n')
    .loc[:, ['cleaned', 'class']]
    .rename(columns={'cleaned': 'text', 'class': 'label'})
    .dropna()
)

ds

Unnamed: 0,text,label
0,teenager nintendo,0
1,suicide default option abandonment issue issue...,1
2,honestly not know anymore ina new school award...,1
3,want flip fucking chandler highway overpass fa...,0
4,lao life ruin figure loser deserve die no read...,1
...,...,...
262313,kind come party friend dog people no thank,0
262314,see human film watch religiously help write es...,0
262315,coffee make sleepy suck,0
262316,no pain pussy want die try kill not kill hang ...,1


## Split the dataset into training, validation, and test sets

In [9]:
# First hold out 20% of the rows, then halve that hold-out into the
# validation and test sets. Both splits are stratified on the label and use
# the same seed.
train, temp = train_test_split(
    ds,
    test_size=0.2,
    stratify=ds['label'],
    random_state=SEED,
)
val, test = train_test_split(
    temp,
    test_size=0.5,
    stratify=temp['label'],
    random_state=SEED,
)

train, temp, val, test

(                                                     text  label
 156594  consider buy gun want die live state get gun e...      1
 116160  game thingie explain happen game lose billion ...      0
 128769                                 feel sad feel well      0
 1202    not understand college coach not run ball dump...      0
 240691  want know bear pain want normal die tired way ...      1
 ...                                                   ...    ...
 142070  jewish new year eve year tough wish good new y...      0
 215373  vegan teacher got ban celebrate no soggy tit s...      0
 85      wish relate read post help feel like write hop...      1
 179933     get commitment issue try fix tell somebody get      0
 153753             spineless moronic fucking matter tired      1
 
 [209613 rows x 2 columns],
                                                      text  label
 169025  fight boyfriend stay parent house not want sta...      1
 138009  mon pass away past summer good friend

# Setup Model & tokenize the input texts.

In [10]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
# Tokenizer for the same pretrained checkpoint that the classification model
# in this notebook is loaded from.
tokenizer = AutoTokenizer.from_pretrained(
    "gooohjy/suicidal-electra",
)

In [12]:
def dataset_conversion(train, test, val):
    """Convert the three pandas splits into a single ``DatasetDict``.

    Each frame is re-indexed from 0 on a copy before conversion, so the
    DataFrames passed in by the caller are left untouched (no in-place
    ``reset_index``).

    Args:
        train: DataFrame holding the training split.
        test: DataFrame holding the test split.
        val: DataFrame holding the validation split.

    Returns:
        A ``DatasetDict`` with the keys ``"train"``, ``"test"`` and ``"val"``,
        in that order.
    """
    splits = {"train": train, "test": test, "val": val}

    return DatasetDict({
        name: Dataset.from_pandas(frame.reset_index(drop=True))
        for name, frame in splits.items()
    })

# Bundle the three pandas splits into one DatasetDict.
raw_datasets = dataset_conversion(train=train, test=test, val=val)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 209613
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 26202
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 26202
    })
})

In [13]:
def tokenize_function(dataset):
    """Tokenize the ``"text"`` entry of ``dataset`` with the notebook tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; whatever it returns is passed straight back.
    """
    texts = dataset["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded
    
# Tokenize in batches rather than one row per call: `tokenize_function`
# forwards the whole "text" column slice to the tokenizer, so the resulting
# columns are the same while far fewer Python-level calls are made.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 209613/209613 [01:00<00:00, 3483.06 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26202/26202 [00:07<00:00, 3459.69 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26202/26202 [00:07<00:00, 3534.24 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 209613
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 26202
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 26202
    })
})

# Training

## Load the model & trainer methods

In [14]:
# Sequence-classification model loaded from the same checkpoint as the
# tokenizer, configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "gooohjy/suicidal-electra",
    num_labels=2,
)

In [34]:
# Define custom metrics for computation

# Metrics reported by `compute_metrics`, in the order they appear in its result.
_METRIC_NAMES = ("accuracy", "recall", "precision", "f1")

# Metric objects are loaded once and reused, instead of being reloaded on
# every evaluation call.
_metric_cache = {}


def _get_metric(name):
    """Return the cached metric object for ``name``, loading it on first use."""
    if name not in _metric_cache:
        _metric_cache[name] = load_metric(name)
    return _metric_cache[name]


def compute_metrics(eval_pred):
    """Compute accuracy, recall, precision and F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        A dict with the keys ``'accuracy'``, ``'recall'``, ``'precision'``
        and ``'f1'``, each holding the value reported by the metric of the
        same name.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return {
        name: _get_metric(name).compute(predictions=predictions, references=labels)[name]
        for name in _METRIC_NAMES
    }

In [91]:
# Define model and training parameters

def get_trainer(datasets, epochs=EPOCHS):
    """Build a ``Trainer`` over the given dataset splits.

    Every trainer receives its own deep copy of the notebook-level ``model``.
    Without the copy, all trainers would update the same model object, so a
    quick run on a small sample would carry its fine-tuned weights over into
    the later full training run.

    Args:
        datasets: Mapping with ``'train'`` and ``'val'`` entries; ``'train'``
            is used for training and ``'val'`` as the evaluation dataset.
        epochs: Number of training epochs; defaults to ``EPOCHS``.

    Returns:
        The configured ``Trainer``.
    """
    # Function-scope import: only this helper needs it.
    import copy

    # No evaluation strategy is configured here; metrics are produced by the
    # explicit `trainer.evaluate()` calls made elsewhere in the notebook.
    training_args = TrainingArguments(
        output_dir=MODEL_CHECKPOINT_PATH,
        overwrite_output_dir=True,
        learning_rate=LEARNING_RATE,
        num_train_epochs=epochs,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        seed=SEED,
        logging_dir=MODEL_LOGGING_PATH,
        save_strategy="steps",
        save_steps=1500,
    )

    trainer = Trainer(
        # Private copy: keeps the notebook-level `model` untouched by training.
        model=copy.deepcopy(model),
        args=training_args,
        train_dataset=datasets['train'],
        eval_dataset=datasets['val'],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    return trainer

## Train on a small sample (smoke test)

In [92]:
SAMPLE_SIZE = 20

# For each split: seeded shuffle, then keep the first SAMPLE_SIZE rows.
sample = {
    split: tokenized_datasets[split].shuffle(seed=SEED).select(range(SAMPLE_SIZE))
    for split in ('train', 'test', 'val')
}

sample

{'train': Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 20
 }),
 'test': Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 20
 }),
 'val': Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 20
 })}

In [94]:
# Run the whole training loop on the small sample to check the pipeline.
trainer = get_trainer(datasets=sample)
trainer.train()

Step,Training Loss


TrainOutput(global_step=20, training_loss=1.2119610346417175e-07, metrics={'train_runtime': 3.1428, 'train_samples_per_second': 31.819, 'train_steps_per_second': 6.364, 'total_flos': 26311105536000.0, 'train_loss': 1.2119610346417175e-07, 'epoch': 5.0})

In [95]:
# `evaluate()` is called without arguments, so it scores the eval dataset the
# trainer was built with (the sample's 'val' entry).
result = trainer.evaluate()

pd.DataFrame(result, index=['SAMPLE TEST RESULTS']).T

Unnamed: 0,SAMPLE TEST RESULTS
eval_loss,0.842932
eval_accuracy,0.95
eval_recall,0.909091
eval_precision,1.0
eval_f1,0.952381
eval_runtime,0.4789
eval_samples_per_second,41.76
eval_steps_per_second,8.352
epoch,5.0


## Train model with the complete dataset

In [None]:
# Build a trainer over the complete tokenized splits and run the full training.
trainer = get_trainer(datasets=tokenized_datasets)
trainer.train()



Step,Training Loss
500,0.2135
1000,0.2667
1500,0.3541
2000,0.3421


In [None]:
# Score the model trained on the complete dataset. `evaluate()` uses the
# trainer's eval dataset (the 'val' split), so the column is labelled as the
# full validation results rather than with the label copied from the sample run.
result = trainer.evaluate()
pd.DataFrame(data=result, index=['FULL VALIDATION RESULTS']).transpose()

## Save model

In [None]:
# Persist the trainer's current model under the final save location.
trainer.save_model(output_dir=MODEL_SAVE_PATH)