## Constants

List of all constants to be used throughout the application.

In [1]:
PROJECT_PATH = '.'


# Dataset Labels
LABEL_NON_SUICIDAL = 0
LABEL_SUICIDAL = 1


# Root Datasets Path
DATASETS_PATH = f'{PROJECT_PATH}/datasets'


# Source Datasets

DATASETS_UPLOAD_PATH = f'{DATASETS_PATH}/uploads'
DATASETS_DOWNLOAD_PATH = f'{DATASETS_PATH}/downloads'
DATASETS_EXPORT_PATH = f'{DATASETS_PATH}/export'

TDA_DATASET_URL = "https://www.kaggle.com/datasets/thedevastator/c-ssrs-labeled-suicidality-in-500-anonymized-red"
TDA_DATASET_EXPORT_PATH = f'{DATASETS_EXPORT_PATH}/tda-dataset.csv'

AG_DATASET_URL = "https://www.kaggle.com/datasets/amangoyl/reddit-dataset-for-multi-task-nlp"
AG_DATASET_EXPORT_PATH = f'{DATASETS_EXPORT_PATH}/ag-dataset.csv'

IMS_DATASET_URL = "https://www.kaggle.com/datasets/imeshsonu/suicideal-phrases"
IMS_DATASET_EXPORT_PATH = f'{DATASETS_EXPORT_PATH}/ims-dataset.csv'

LAX_DATASET_URL = "https://raw.githubusercontent.com/laxmimerit/twitter-suicidal-intention-dataset/master/twitter-suicidal_data.csv"
LAX_DATASET_EXPORT_PATH = f'{DATASETS_EXPORT_PATH}/lax-dataset.csv'

MSH_DATASET_URL = "https://www.kaggle.com/datasets/mohanedmashaly/suicide-notes"
MSH_DATASET_EXPORT_PATH = f'{DATASETS_EXPORT_PATH}/msh-dataset.csv'


NTL_DATASET_URL = "https://www.kaggle.com/datasets/natalialech/suicidal-ideation-on-twitter"
NTL_DATASET_EXPORT_PATH = f'{DATASETS_EXPORT_PATH}/ntl-dataset.csv'

SOURCE_DATASETS = [
  TDA_DATASET_EXPORT_PATH,
  AG_DATASET_EXPORT_PATH,
  IMS_DATASET_EXPORT_PATH,
  LAX_DATASET_EXPORT_PATH,
  MSH_DATASET_EXPORT_PATH,
  NTL_DATASET_EXPORT_PATH
]

# Final Cleaned Dataset
CLEANED_DATASET_PATH = f'{DATASETS_EXPORT_PATH}/final-cleaned-dataset.csv'

## Setup Environment

In [2]:
import os

# Disable tensorflow warnings.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

try:
    for path in [PROJECT_PATH, DATASETS_PATH, DATASETS_DOWNLOAD_PATH, DATASETS_EXPORT_PATH]:
        os.makedirs(path, exist_ok=True)
    
    os.chdir(PROJECT_PATH)
    print(f"WORKING PATH: {PROJECT_PATH}")
except OSError:
    print("Error: Can't change the Current Working Directory")

WORKING PATH: .


In [24]:
!pip install typing-extensions pydantic opendatasets \
    pandas swifter unidecode contractions pyspellchecker \
    wordninja symspellpy spacy matplotlib seaborn \
    keras tensorflow

!pip install transformers
!pip install tensorrt

!pip3 install torch torchvision torchaudio

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m112.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
import warnings
warnings.simplefilter(action='ignore')

import opendatasets as od
import pandas as pd
import numpy as np
import ast
import spacy
import unidecode
import contractions as contract
import re
import wordninja
import collections
import pkg_resources
from spellchecker import SpellChecker
from symspellpy import SymSpell, Verbosity
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
import seaborn as sns
import transformers
from transformers import pipeline
from tqdm import tqdm
from datetime import datetime

tqdm.pandas()

from swifter import set_defaults
set_defaults(
    dask_threshold=1,
    scheduler="processes",
    allow_dask_on_strings=True,
    progress_bar=True,
)

transformers.logging.set_verbosity(transformers.logging.ERROR)


## Data Source: thedevastator

https://www.kaggle.com/datasets/thedevastator/c-ssrs-labeled-suicidality-in-500-anonymized-red

### Pull Data

In [5]:
od.download(TDA_DATASET_URL, DATASETS_DOWNLOAD_PATH)
tda_data = pd.read_csv(f'{DATASETS_DOWNLOAD_PATH}/c-ssrs-labeled-suicidality-in-500-anonymized-red/500_anonymized_Reddit_users_posts_labels - 500_anonymized_Reddit_users_posts_labels.csv')
tda_data

Downloading c-ssrs-labeled-suicidality-in-500-anonymized-red.zip to ./datasets/downloads/c-ssrs-labeled-suicidality-in-500-anonymized-red


100%|██████████| 1.28M/1.28M [00:00<00:00, 21.6MB/s]







Unnamed: 0,User,Post,Label
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive
1,user-1,['It can be hard to appreciate the notion that...,Ideation
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior
3,user-3,['I tried to kill my self once and failed badl...,Attempt
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation
...,...,...,...
495,user-495,"['Its not the end, it just feels that way. Or ...",Supportive
496,user-496,"['It was a skype call, but she ended it and Ve...",Indicator
497,user-497,['That sounds really weird.Maybe you were Dist...,Supportive
498,user-498,['Dont know there as dumb as it sounds I feel ...,Attempt


### Analyze data based on the labels

In [6]:
tda_posts = tda_data.copy(deep=True)

def tda_fix_post(text):
  if text.endswith("']") or text.endswith('"]'): return text
  return text + "']"

tda_posts['Post'] = tda_posts['Post'].map(tda_fix_post)
tda_posts['Post'] = tda_posts['Post'].apply(ast.literal_eval)
tda_posts['Post'] = tda_posts['Post'].map(lambda post: " ".join( post ))


tda_labels = tda_posts['Label'].unique()
for tda_label in tda_labels:
  items = tda_posts[tda_posts['Label']==tda_label]
  items = items.sample(n = 10)

  print(f'LABEL: {tda_label}')
  for index in items['Post'].index:
    print(f"- {items['Post'][index]}\n")

  print()

tda_posts

LABEL: Supportive
- Dont feel guilty. Better than doing nothing, get someone(his family or a friend) to show him that he is cared for. If you get a chance you could go personally and help him. For now keep him encouraged. Try to cheer him up and distract him from these thoughts. I understand how you feel. I also get such thoughts. If you need a friend, Ill be there for you(you can PM me). Dont waste yourself. You are a blessing to this world.Your family will feel Tired sad. BTW, its better you try taking that anti-depressing medication. Please throw away the gun.  True. Everyone has a guardian angel. 

- Im feeling pretty emotionally low just now too, so all I can say is -- a lot of life insurance policies wont pay after a Suicide. So just consider that as a practical matter.On a brighter note.. student loan forgiveness is a real idea that might actually happen someday.. you might think about getting involved in organizations that are pushing for it, if you have the energy and time. Ot

Unnamed: 0,User,Post,Label
0,user-0,"Its not a viable option, and youll be leaving ...",Supportive
1,user-1,It can be hard to appreciate the notion that y...,Ideation
2,user-2,"Hi, so last night i was sitting on the ledge o...",Behavior
3,user-3,I tried to kill my self once and failed badly ...,Attempt
4,user-4,Hi NEM3030. What sorts of things do you enjoy ...,Ideation
...,...,...,...
495,user-495,"Its not the end, it just feels that way. Or at...",Supportive
496,user-496,"It was a skype call, but she ended it and Vent...",Indicator
497,user-497,That sounds really weird.Maybe you were Distra...,Supportive
498,user-498,Dont know there as dumb as it sounds I feel Hy...,Attempt


### Findings

- Based on the above samples, we may conclude that the "Attempt" label clearly gives the indication of sucidilaty.
- Also, it looks like the "Ideation" label is for the state of a person when he/she is tending towards sucidilaty. (Ideation is just before attempt)

So, for this dataset we'll use both "Attempt" & "Ideation" as suicidal. And the rest are non suicidal.

### Primary Cleanup & Save Dataset

We'll clean the dataset and finally transform it to support as a training data for **suicidal-electra** model.

In [7]:
tda_export = tda_posts[['Post', 'Label']].copy()

tda_export['Label'] = tda_export['Label'].map(lambda label: LABEL_SUICIDAL if (label == 'Ideation' or label == 'Attempt') else LABEL_NON_SUICIDAL)
tda_export = tda_export.rename(columns={"Post": "text", "Label": "class"})

tda_export.to_csv(TDA_DATASET_EXPORT_PATH, index=False)

tda_export

Unnamed: 0,text,class
0,"Its not a viable option, and youll be leaving ...",0
1,It can be hard to appreciate the notion that y...,1
2,"Hi, so last night i was sitting on the ledge o...",0
3,I tried to kill my self once and failed badly ...,1
4,Hi NEM3030. What sorts of things do you enjoy ...,1
...,...,...
495,"Its not the end, it just feels that way. Or at...",0
496,"It was a skype call, but she ended it and Vent...",0
497,That sounds really weird.Maybe you were Distra...,0
498,Dont know there as dumb as it sounds I feel Hy...,1


## Data Source: ag

https://www.kaggle.com/datasets/amangoyl/reddit-dataset-for-multi-task-nlp

### Pull Data

In [8]:
od.download(AG_DATASET_URL, DATASETS_DOWNLOAD_PATH)
ag_data = pd.read_csv(f'{DATASETS_DOWNLOAD_PATH}/reddit-dataset-for-multi-task-nlp/Dataset_Suicidal_Sentiment.csv')
ag_data

Downloading reddit-dataset-for-multi-task-nlp.zip to ./datasets/downloads/reddit-dataset-for-multi-task-nlp


100%|██████████| 56.7M/56.7M [00:01<00:00, 39.7MB/s]





Unnamed: 0.1,Unnamed: 0,Post,Suicidal_label,Sentiment_label
0,0,Ex Wife Threatening SuicideRecently I left my ...,0,0
1,1,Am I weird I don t get affected by compliments...,1,1
2,2,Finally is almost over So I can never hear ...,1,0
3,3,i need helpjust help me im crying so hard,0,0
4,4,I m so lostHello my name is Adam and I ve b...,0,0
...,...,...,...,...
226948,227680,I sound like a dudebro but I can t handle my f...,0,0
226949,227681,Fuck my sister She is such I fucking bitch and...,1,0
226950,227682,I ve been suicidal for years and no one knowsT...,0,1
226951,227683,My boyfriend is sick so I took some Polaroids ...,1,0


### Analyze data based on the labels

----

**Suicidal Labels**

**Suicidal** - 0

**Non-Suicidal** -1

----

**Sentiment Labels**

**Negative** - 0

**Positive** - 1

**Neutral** - 2

In [9]:
# Suicidal & Negative

print("Suicidal & Negative:\n")

ag_sample = ag_data[(ag_data['Suicidal_label']==0) & (ag_data['Sentiment_label']==0)].sample(n=10)

for index in ag_sample.index:
  print(f"- {ag_sample['Post'][index]}")



# Non-Suicidal & Negative

print("\n\nNon-Suicidal & Negative:\n")

ag_sample = ag_data[(ag_data['Suicidal_label']==1) & (ag_data['Sentiment_label']==0)].sample(n=10)

for index in ag_sample.index:
  print(f"- {ag_sample['Post'][index]}")



# Suicidal & Positive

print("\n\nSuicidal & Positive:\n")

ag_sample = ag_data[(ag_data['Suicidal_label']==0) & (ag_data['Sentiment_label']==1)].sample(n=10)

for index in ag_sample.index:
  print(f"- {ag_sample['Post'][index]}")



# Non-Suicidal & Positive

print("\n\nNon-Suicidal & Positive:\n")

ag_sample = ag_data[(ag_data['Suicidal_label']==1) & (ag_data['Sentiment_label']==1)].sample(n=10)

for index in ag_sample.index:
  print(f"- {ag_sample['Post'][index]}")



# Suicidal & Neutral

print("\n\nSuicidal & Neutral:\n")

ag_sample = ag_data[(ag_data['Suicidal_label']==0) & (ag_data['Sentiment_label']==2)].sample(n=10)

for index in ag_sample.index:
  print(f"- {ag_sample['Post'][index]}")



# Non-Suicidal & Neutral

print("\n\nNon-Suicidal & Neutral:\n")

ag_sample = ag_data[(ag_data['Suicidal_label']==1) & (ag_data['Sentiment_label']==2)].sample(n=10)

for index in ag_sample.index:
  print(f"- {ag_sample['Post'][index]}")

Suicidal & Negative:

- I have a longing to kill myself  or  what s the point I m home for a semester dealing with my meds and I don t have a job  just to give you some context into my monotonous existence I have a great mother and father who love me dearly  and I have some friends coming to town in about a week or so and another good friend visiting shortly after that  so I do have something here to look forward to  But they ll be gone eventually I have plenty of people who care about me  but I just see no point to existence  Currently  I m stuck in the nest with my folks  My extremely loving folks are wonderful and they tell me daily that I have so much to live for But I still want to die  I get to see my shrink this Thursday  and I still want to die  I guess my main problem is that I m so bored right now that I want to kill myself  But even when next semester rolls around  I don t really see the point in anything either  I want to be a rock star physicist  but I know at most I ll ju

### Findings

- Based on the above samples, we may conclude that the **Suicidal & Negative** is an absolute sucidal.

- Some of the other **Suicidal & Positive** & **Suicidal & Neutral** is not a complete suicidal. But, we may ignore.

So, for this dataset we'll use all items with `"Suicidal_label" = 0`. It also seems like the "sentiments" has very less corelation with suicidality. We can surely ignore it.

### Primary Cleanup & Save Dataset

We'll clean the dataset and finally transform it to support as a training data for **suicidal-electra** model.

In [10]:
ag_export = ag_data[['Post', 'Suicidal_label']].copy()
ag_export = ag_export.rename(columns={"Post": "text", "Suicidal_label": "class"})

ag_export['class'] = ag_export['class'].map(lambda label: LABEL_SUICIDAL if (label == 0) else LABEL_NON_SUICIDAL)

ag_export.to_csv(AG_DATASET_EXPORT_PATH, index=False)

ag_export

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,1
1,Am I weird I don t get affected by compliments...,0
2,Finally is almost over So I can never hear ...,0
3,i need helpjust help me im crying so hard,1
4,I m so lostHello my name is Adam and I ve b...,1
...,...,...
226948,I sound like a dudebro but I can t handle my f...,1
226949,Fuck my sister She is such I fucking bitch and...,0
226950,I ve been suicidal for years and no one knowsT...,1
226951,My boyfriend is sick so I took some Polaroids ...,0


## Data Source: imeshsonu

https://www.kaggle.com/datasets/imeshsonu/suicideal-phrases

### Pull Data

In [11]:
od.download(IMS_DATASET_URL, DATASETS_DOWNLOAD_PATH)
ims_data = pd.read_csv(f'{DATASETS_DOWNLOAD_PATH}/suicideal-phrases/Train_suicide1.csv')
ims_data

Downloading suicideal-phrases.zip to ./datasets/downloads/suicideal-phrases


100%|██████████| 66.9k/66.9k [00:00<00:00, 20.5MB/s]







Unnamed: 0,Tweet,Suicide
0,i hate myself so much i want to KILL myself ho...,Potential Suicide post
1,RT @DrugForumsBest: I woke up with a bag over ...,Potential Suicide post
2,i wanna fucking kill myself,Potential Suicide post
3,why do i destroy all my opportunities? am i go...,Potential Suicide post
4,u - understand what the fuck im doing wrong wi...,Potential Suicide post
...,...,...
1594,hi! just joined and i think it's brill! haha s...,Not Suicide post
1595,@WilliamSledd Love the videos William New to t...,Not Suicide post
1596,@itsanimesh I got an Openmoko FreeRunner and n...,Not Suicide post
1597,@PeoplemapsJulie I am looking for the berocca ...,Not Suicide post


### Analyze data based on the labels

In [12]:
ims_posts = ims_data.copy(deep=True)

ims_labels = ims_posts['Suicide'].unique()

for label in ims_labels:
  items = ims_posts[ims_posts['Suicide']==label]
  
  items = items.sample(n = 10)
  items = items.dropna()

  print(f'LABEL: {label}\n--------------------------------')
  for index in items.index:
    print(f"- {items['Tweet'][index]}")
    
  print()

ims_posts

LABEL: Potential Suicide post 
--------------------------------
- I thought I’ve hated myself before but now i really truly hate myself so god damn much.
- https://t.co/6MPqRunPeX https://t.co/6MPqRunPeX
- RT @iriswestallens: I just want to remind y’all that without Iris, Barry Allen would be dead right now. But she’s useless right? #TheFlash…
- Dunno why my phones done me so dirty and made videos of me and my ex’s Cos of face recognition with the background… https://t.co/x6KfbmgBP9
- RT @camphalfblood: Random commenter asked why I had such an extreme social warrior agenda in Magnus Chase. What he calls extreme social jus…
- maybe i think i deserve to die
- i don't want to be here anymore
- Like all I want to do is shove a knife into my fucking stomach I hate myself so god damn much
- I work hard ʰᵉ ʷᵒʳᵏˢ ʰᵃʳᵈ every day of my life I work 'til I ache in my bones, at the end ᴬᵀ ᵀᴴᴱ ᴱᴺᴰ ᴼᶠ ᵀᴴᴱ ᴰᴬᴬᴬᴬᴬᴬ… https://t.co/T8btgG2HTl
- @UnclaimedAna2 The worst part is I tire easily and I love nap

Unnamed: 0,Tweet,Suicide
0,i hate myself so much i want to KILL myself ho...,Potential Suicide post
1,RT @DrugForumsBest: I woke up with a bag over ...,Potential Suicide post
2,i wanna fucking kill myself,Potential Suicide post
3,why do i destroy all my opportunities? am i go...,Potential Suicide post
4,u - understand what the fuck im doing wrong wi...,Potential Suicide post
...,...,...
1594,hi! just joined and i think it's brill! haha s...,Not Suicide post
1595,@WilliamSledd Love the videos William New to t...,Not Suicide post
1596,@itsanimesh I got an Openmoko FreeRunner and n...,Not Suicide post
1597,@PeoplemapsJulie I am looking for the berocca ...,Not Suicide post


### Findings

- Based on the above samples, we may conclude that the "Potential Suicide post" label clearly gives an indication of suicidality.
- Also, it looks like the "Not Suicide post" label is not suicidal.

So, for this dataset, we'll use both "Attempt" & "Ideation" as suicidal. And the rest are nonsuicidal.

### Primary Cleanup & Save Dataset

We'll clean the dataset and finally transform it to support as a training data for **suicidal-electra** model.

In [13]:
ims_export = ims_posts[['Tweet', 'Suicide']].copy()

ims_export['Suicide'] = ims_export['Suicide'].map(lambda label: LABEL_SUICIDAL if (label == 'Potential Suicide post ') else LABEL_NON_SUICIDAL)

ims_export = ims_export.rename(columns={"Tweet": "text", "Suicide": "class"})

ims_export.to_csv(IMS_DATASET_EXPORT_PATH, index=False)
ims_export

Unnamed: 0,text,class
0,i hate myself so much i want to KILL myself ho...,1
1,RT @DrugForumsBest: I woke up with a bag over ...,1
2,i wanna fucking kill myself,1
3,why do i destroy all my opportunities? am i go...,1
4,u - understand what the fuck im doing wrong wi...,1
...,...,...
1594,hi! just joined and i think it's brill! haha s...,0
1595,@WilliamSledd Love the videos William New to t...,0
1596,@itsanimesh I got an Openmoko FreeRunner and n...,0
1597,@PeoplemapsJulie I am looking for the berocca ...,0


## Data Source: laxmimerit

https://raw.githubusercontent.com/laxmimerit/twitter-suicidal-intention-dataset/master/twitter-suicidal_data.csv

### Pull Data

In [14]:
od.download(LAX_DATASET_URL, DATASETS_DOWNLOAD_PATH)
lax_data = pd.read_csv(f'{DATASETS_DOWNLOAD_PATH}/twitter-suicidal_data.csv')
lax_data

Downloading https://raw.githubusercontent.com/laxmimerit/twitter-suicidal-intention-dataset/master/twitter-suicidal_data.csv to ./datasets/downloads/twitter-suicidal_data.csv


3629056it [00:00, 87488657.37it/s]


Unnamed: 0,tweet,intention
0,my life is meaningless i just want to end my l...,1
1,muttering i wanna die to myself daily for a fe...,1
2,work slave i really feel like my only purpose ...,1
3,i did something on the 2 of october i overdose...,1
4,i feel like no one cares i just want to die ma...,1
...,...,...
9114,have you ever laid on your bed at night and cr...,1
9115,the fault the blame the pain s still there i m...,1
9116,stop asking me to trust you when i m still cou...,1
9117,i never know how to handle sadness crying make...,1


### Analyze data based on the labels

In [15]:
lax_posts = lax_data.copy(deep=True)

lax_labels = lax_posts['intention'].unique()

for label in lax_labels:
  items = lax_posts[lax_posts['intention']==label]
  
  items = items.sample(n=10)
  items = items.dropna()

  print(f'LABEL: {label}\n--------------------------------')
  for index in items.index:
    print(f"- {items['tweet'][index]}")
    
  print()

lax_posts

LABEL: 1
--------------------------------
- am i selfish for wanting to live so i have a pretty good life tbf but i ama massive dickhead i mean reeeeaaaaal dick to those who are close to me and i dont fully appreciate how kind people are to me i always think that itd be a whole lot better for everyone if i didnt exist sapping all their love and giving nothing back or for future friends lovers children ect who will have to deal with me i have been thinking that i could just get rid of myself and make it alot easier for possibly dozens of people but i dont want to die i love life theres so much that i love doing but this just makes me feel even more selfish becausei amonly staying here because i have such a good time personally i dont know what responses i want if any i just needed to let someone know this becausei amusually quite a happy person and i doubt anyone would ever think that id ever even considered suicide thankyou for reading anywayedit just ignore this i dont really think i 

Unnamed: 0,tweet,intention
0,my life is meaningless i just want to end my l...,1
1,muttering i wanna die to myself daily for a fe...,1
2,work slave i really feel like my only purpose ...,1
3,i did something on the 2 of october i overdose...,1
4,i feel like no one cares i just want to die ma...,1
...,...,...
9114,have you ever laid on your bed at night and cr...,1
9115,the fault the blame the pain s still there i m...,1
9116,stop asking me to trust you when i m still cou...,1
9117,i never know how to handle sadness crying make...,1


### Findings

- Based on the above samples, we may conclude that the "1" label clearly gives an indication of suicidality.
- Also, it looks like the "0" label is not suicidal.

### Primary Cleanup & Save Dataset

We'll clean the dataset and finally transform it to support as a training data for **suicidal-electra** model.

In [16]:
lax_export = lax_posts[['tweet', 'intention']].copy()

lax_export['intention'] = lax_export['intention'].map(lambda label: LABEL_SUICIDAL if (label == 1) else LABEL_NON_SUICIDAL)

lax_export = lax_export.rename(columns={"tweet": "text", "intention": "class"})

lax_export.to_csv(LAX_DATASET_EXPORT_PATH, index=False)
lax_export

Unnamed: 0,text,class
0,my life is meaningless i just want to end my l...,1
1,muttering i wanna die to myself daily for a fe...,1
2,work slave i really feel like my only purpose ...,1
3,i did something on the 2 of october i overdose...,1
4,i feel like no one cares i just want to die ma...,1
...,...,...
9114,have you ever laid on your bed at night and cr...,1
9115,the fault the blame the pain s still there i m...,1
9116,stop asking me to trust you when i m still cou...,1
9117,i never know how to handle sadness crying make...,1


## Data Source: mohanedmashaly

https://www.kaggle.com/datasets/mohanedmashaly/suicide-notes

### Pull Data

In [17]:
od.download(MSH_DATASET_URL, DATASETS_DOWNLOAD_PATH)
msh_data = pd.read_csv(f'{DATASETS_DOWNLOAD_PATH}/suicide-notes/test.csv')
msh_data

Downloading suicide-notes.zip to ./datasets/downloads/suicide-notes


100%|██████████| 143k/143k [00:00<00:00, 14.6MB/s]







Unnamed: 0,id,text
0,1,I have to put this in writing somehow so I sti...
1,2,I've made my peace with the fact that I'm goin...
2,3,I really want help now i just cant so this any...
3,4,I can't think of any reason to stay. I have no...
4,5,
...,...,...
493,494,Since I was 9 I felt depressed and almost took...
494,495,I don't know why I'm reaching out now of all t...
495,496,"I dont believe in an afterlife, but i stay up ..."
496,497,"I don't think I actually will kill myself, but..."


### Findings

- The above are all suicide notes, which can be directly assigned as suicidal.

### Primary Cleanup & Save Dataset

We'll clean the dataset and finally transform it to support as a training data for **suicidal-electra** model.

In [18]:
msh_export = msh_data[['text']].copy()

msh_export['class'] = LABEL_SUICIDAL

msh_export.to_csv(MSH_DATASET_EXPORT_PATH, index=False)
msh_export

Unnamed: 0,text,class
0,I have to put this in writing somehow so I sti...,1
1,I've made my peace with the fact that I'm goin...,1
2,I really want help now i just cant so this any...,1
3,I can't think of any reason to stay. I have no...,1
4,,1
...,...,...
493,Since I was 9 I felt depressed and almost took...,1
494,I don't know why I'm reaching out now of all t...,1
495,"I dont believe in an afterlife, but i stay up ...",1
496,"I don't think I actually will kill myself, but...",1


## Data Source: natalialech (ntl)

https://www.kaggle.com/datasets/natalialech/suicidal-ideation-on-twitter

### Pull Data
natalialech data is pre-fetched & populated and uploaded to our uploads directory.

In [21]:
ntl_data = pd.read_csv(f'{DATASETS_UPLOAD_PATH}/twitter_updated_dataset.csv')
ntl_data

Unnamed: 0,id,label,dataset,screen_name,followers_count,full_text,lang,hashtags,type
0,1608974517421985793,4,training,UpdateResearch,2.0,New trending GIF tagged via Giphy https://t.c...,en,[],tweet
1,1519909372314341376,2,training,biatchuu,20067.0,good morning chuuya day 🥳🎂🍰🧁🎉 i'm on vacation ...,en,[],tweet
2,1608975908509192193,4,training,,,,,,
3,1608975532112384000,4,training,xxlululo,2109.0,a client just sent some coin bc I gave him a n...,en,[],tweet
4,1608975358275223553,4,training,TwiterlessGuru,2032.0,Damn. Can’t even pretend that this isn’t true…...,en,[],tweet
...,...,...,...,...,...,...,...,...,...
18755,1608975992307191808,4,training,TradingChromat1,402.0,$USO Awaiting Short Signal based off 22 signal...,en,[],tweet
18756,1608975404391608320,4,test,hoeiswho,119.0,avi update\n(still trying to get a life),en,[],tweet
18757,1608975257389658112,4,test,CONEJlTO,510.0,i miss u so much (mickey mouse waffle),en,[],tweet
18758,1608975782407462912,4,validation,,,,,,


### Analyze data based on labels


**Data Labels**
```
0 - active suicidal ideation
1 - passive suicidal ideation
2 - sarcasm regarding suicidal ideation
3 - suicide-related tweets (awareness, news, chatter about suicide)
4 - other
```

In [22]:
ntl_posts = ntl_data.copy(deep=True)

ntl_labels = sorted(ntl_posts['label'].unique())

for label in ntl_labels:
  items = ntl_posts[ntl_posts['label']==label]
  
  items = items.sample(n=10)
  items = items.dropna()

  print(f'LABEL: {label}\n--------------------------------')
  for index in items.index:
    print(f"- {items['full_text'][index]}")
    
  print()

LABEL: 0
--------------------------------
- TW // Suicide

I think the meds I'm taking are giving me more suicidal thoughts and I hate it😭 I feel so alone and overwhelmed
- Nothing in my life has ever made me want to take my life ,
 more than people’s reaction to me trying to take my life ❤️🙏🏻
- None of my friends care about my health and honestly that’s enough for me to want to unalive myself lol
- TW!!!! 

I have two sides, 
"I'm so fucking tired of existing, every second is agony I just wanna jump off a bridge and feel my bones crumble upon impact"
And
"Haha I'm a literal G O D"
- I wish I could just disappear right now like how I feel I wanna either hurt someone or hurt myself ! Smh
- my family wonders why I wanna kms but then they yell at me and tell me I’m a burden 🖤🖤
- the urge to hang myself up is getting bigger.
- i   think   im   going    to    kill   myself    soon
- boutta hang myself with the bb belt
- tw  //  suicide

the  thought  of  slitting  my  throat  is  really  ap

### Findings

After reading through multiple samples we conclude that Labels 0 & 1 are suicidal, whereas Labels 2 & 4 are non-suicidal.
But, for this case, we must exclude the news data (Label 3), as it seems this data may conflict with the training of suicidality.

### Primary Cleanup & Save Dataset

In [23]:
ntl_export = ntl_data[['full_text', 'label']].copy()
ntl_export = ntl_export[ntl_export['label'] != 3]
ntl_export = ntl_export.dropna()


ntl_export['label'] = ntl_export['label'].map(lambda label: LABEL_SUICIDAL if (label == 0 or label == 1) else LABEL_NON_SUICIDAL)

ntl_export = ntl_export.rename(columns={"full_text": "text", "label": "class"})

ntl_export.to_csv(NTL_DATASET_EXPORT_PATH, index=False)

ntl_export

Unnamed: 0,text,class
0,New trending GIF tagged via Giphy https://t.c...,0
1,good morning chuuya day 🥳🎂🍰🧁🎉 i'm on vacation ...,0
3,a client just sent some coin bc I gave him a n...,0
4,Damn. Can’t even pretend that this isn’t true…...,0
5,Multiple introductions of salmonid alphavirus ...,0
...,...,...
18753,"Hellbent Everglow (@hellbentb) is on, replac...",0
18754,yeonbin was at dior event what if i drown myse...,0
18755,$USO Awaiting Short Signal based off 22 signal...,0
18756,avi update\n(still trying to get a life),0


## Data Merging, Cleanup & Normalizing

### Define Pre-Processors

In [None]:
# Summarizer
summarize = pipeline("summarization", model="philschmid/bart-large-cnn-samsum", framework='pt', device=0)

# Create chunks

def chunks(l, n):
    # looping till length l
    for i in range(0, len(l), n):
        yield l[i:i + n]


# generate summary recursively

def recursive_summary(text, chunk_size=250, min_length=40, max_length=50):

    if len( text ) <= max_length:
      return text
  
    lines = list(map(lambda words: ' '.join(words), list(chunks(text.split(), chunk_size))))
    summaries = []
    for line in lines:

        if len(line) < 30:
            continue

        s = summarize(line, min_length=min_length, max_length=max_length)
        s = s[0]["summary_text"]
        summaries.append(s)

    text = ' '.join(summaries)

    if len(lines) <= 1:
        return text

    return recursive_summary(text, chunk_size=chunk_size, min_length=min_length, max_length=max_length)

In [None]:
# Defining Methods

nlp = spacy.load("en_core_web_sm")

vocab = collections.Counter()

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")

sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)


# Spell Check using Symspell
def fix_spelling(text):
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    correctedtext = suggestions[0].term # get the first suggestion, otherwise returns original text if nothing is corrected
    return correctedtext

# Remove some important words from stopwords list
deselect_stop_words = ['no', 'not']

for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False

# Remove extra whitespaces from text
def remove_whitespace(text):
    text = text.strip()
    return " ".join(text.split())

# Remove accented characters from text, e.g. café
def remove_accented_chars(text):
    text = unidecode.unidecode(text)
    return text

# Remove URL
def remove_url(text):
    return re.sub(r'http\S+', '', text)

# Removing symbols and digits
def remove_symbols_digits(text):
    return re.sub('[^a-zA-Z\s]', ' ', text)

# Removing special characters
def remove_special(text):
    return text.replace("\r", " ").replace("\n", " ").replace("    ", " ").replace('"', '')

# Fix word lengthening (characters are wrongly repeated)
def fix_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

def normalize_text(text, accented_chars=True, contractions=True, convert_num=True,
                       extra_whitespace=True, lemmatization=True, lowercase=True,
                       url=True, symbols_digits=True, special_chars=True,
                       stop_words=True, lengthening=True, spelling=True):
    """preprocess text with default option set to true for all steps"""
    if accented_chars == True: # remove accented characters
        text = remove_accented_chars(text)
    if contractions == True: # expand contractions
        text = contract.fix(text)
    if lowercase == True: # convert all characters to lowercase
        text = text.lower()
    if url == True: # remove URLs before removing symbols
        text = remove_url(text)
    if symbols_digits == True: # remove symbols and digits
        text = remove_symbols_digits(text)
    if special_chars == True: # remove special characters
        text = remove_special(text)
    if extra_whitespace == True: # remove extra whitespaces
        text = remove_whitespace(text)
    if lengthening == True: # fix word lengthening
        text = fix_lengthening(text)
    if spelling == True: # fix spelling
        text = fix_spelling(text)

    doc = nlp(text) # tokenise text

    clean_text = []

    # return text

    for token in doc:
        flag = True
        edit = token.text
        # remove stop words
        if stop_words == True and token.is_stop and token.pos_ != 'NUM':
            flag = False
        # exclude number words
        if convert_num == True and token.pos_ == 'NUM' and flag == True:
            flag = False
        # convert tokens to base form
        elif lemmatization == True and token.lemma_ != "-PRON-" and flag == True:
            edit = token.lemma_
        # append tokens edited and not removed to list
        if edit != "" and flag == True:
            clean_text.append(edit)
    return " ".join(clean_text)

### Load & merge all datasets

In [None]:
ds = pd.DataFrame()

for DATASET in SOURCE_DATASETS:
  try:
    source_ds = pd.read_csv(DATASET, lineterminator='\n').reset_index(drop=True)
    ds = pd.concat([ds, source_ds], axis=0)
  except:
    print(f"WARNING: Failed loading: {DATASET}")

    
print("UNCLEARED: ", ds.shape)

ds = ds.dropna(axis=0)
ds = ds.reset_index(drop=True)

print("CLEARED: ", ds.shape)

ds


### Text Progressing

We'll run the text processing pipeline for the dataset, to make sure all the textual contents are corrected and normalized, which can later be used to train the suicidal-electra model.

### Remove outliers
Text with large textual content can be ignored.

In [None]:
# Get word count of posts
text_len = [len(x.split()) for x in ds['text']]
pd.Series(text_len).hist(bins=80)
plt.show()
print(pd.Series(text_len).describe(percentiles=[.1,.25,.5,.6,.7,.8,.9, .95]))

In [None]:
# Subset dataset to obtain rows with less than or equal to 62 words

MAX_WORD_COUNT = 430

ds = ds[ds['text'].apply(lambda x: len(x.split()) <= MAX_WORD_COUNT)]
ds.reset_index(drop=True, inplace=True)

ds

#### Sample Preprocessing

Initially we'll run a test on a small sample to make sure everything is working as expected.

In [None]:
summary_threshold = 300
summary_chunk_size = 500
summary_min_length=50
summary_max_length=120

sample = ds.sample(n=10)

# apply summarizer to large text
print("STEP: RECURSIVE SUMMARIZE")
sample['summary'] = sample['text'].progress_apply(lambda text : text if len(text) <= summary_threshold else recursive_summary(text, chunk_size=summary_chunk_size, min_length=summary_min_length, max_length=summary_max_length) )

# apply text normalizer
print("STEP: NORMALIZE")
sample['cleaned'] = sample['summary'].swifter.apply(lambda row: normalize_text(row))

sample

In [None]:
for index in sample.index:
  print(f"RAW: {sample['text'][index]}\n")
  print(f"SMR: {sample['summary'][index]}\n")
  print(f"CLN: {sample['cleaned'][index]}\n")
  print("-------\n")

#### Full Dataset Pre-Processing

Preprocess the complete dataset.

In [None]:
export_ds = ds.copy()

start = datetime.now()
print(f'START: {start.strftime("%d/%m/%Y %H:%M:%S")}')

# WHEN DEVELOPING, USE SMALLER SAMPLE.
export_ds = export_ds.sample(100)

print("STEP: RECURSIVE SUMMARIZE")
export_ds['summary'] = export_ds['text'].progress_apply(lambda text : text if len(text) <= summary_threshold else recursive_summary(text, chunk_size=summary_chunk_size, min_length=summary_min_length, max_length=summary_max_length) )

print("STEP: NORMALIZE")
export_ds['cleaned'] = export_ds['summary'].swifter.apply(lambda row: normalize_text(row))


finish = datetime.now()
print(f'FINISH: {finish.strftime("%d/%m/%Y %H:%M:%S")}')
print(f'DURATION: {finish-start}')
print(f'PERFORMANCE: {(finish-start)/export_ds.shape[0]} sec/item, {export_ds.shape[0]/(finish-start).total_seconds()} items/sec')

export_ds

### Remove Irrelevant Words

In [None]:
# Cleanup of irrelevant words.

tokenizer = Tokenizer()
tokenizer.fit_on_texts(export_ds['cleaned'])
word_freq = pd.DataFrame(tokenizer.word_counts.items(), columns=['word','count']).sort_values(by='count', ascending=False)

In [None]:
# Plot bar graph for word frequency
plt.figure(figsize=(16, 8))
sns.barplot(x='count',y='word',data=word_freq.iloc[:30])
plt.title('Most Frequent Words')
plt.xlabel("Frequency")
plt.ylabel("Word")
plt.show()

### Remove Anomalous Words
Remove anomalous words that may adversely affect the model by unintended bias.

In [None]:
# Removed anomalous words
export_ds['cleaned'] = export_ds['cleaned'].replace('filler', '')

export_ds

### Remove 0 Word Text

In [None]:
# Remove rows with text length 0
export_ds = export_ds[export_ds['cleaned'].apply(lambda x: len(x.split())!=0)]
export_ds.reset_index(drop=True, inplace=True)

export_ds.shape

### Save Cleaned Dataset

Save the final cleaned dataset to a csv file.

In [None]:
export_ds.to_csv(CLEANED_DATASET_PATH, index=False)