In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [16]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, recall_score, accuracy_score, f1_score, precision_score
from collections import Counter
from prettytable import PrettyTable
import random

In [3]:
SEED = 4222
EPOCHS = 5

In [4]:
# Change to your own directory
try:
    os.chdir("/content/drive/MyDrive/SuicidePreventionProject")
    print("Directory changed")
except OSError:
    print("Error: Can't change the Current Working Directory")

Directory changed


In [5]:
# Load dataset
suicide_detection_df = pd.read_csv('Data/suicide_bad_final_cleaned.csv', header=0)
suicide_detection_df.drop(columns=['text'], axis=1, inplace=True)
suicide_detection_df = suicide_detection_df.rename(columns={"cleaned_text": "text"})
classes = {"suicide": 1, "non-suicide": 0}
suicide_detection_df = suicide_detection_df.replace({"class": classes})
suicide_detection_df.head()

Unnamed: 0,class,text
0,1,sex wife threaten suicide recently leave wife ...
1,0,weird not affect compliment come know girl fee...
2,0,finally hear bad year swear fucking god annoying
3,1,need help help cry hard
4,1,end tonight not anymore quit


In [6]:
# Split dataset into train, validation and test sets
train_text, test_text, train_labels, test_labels = train_test_split(suicide_detection_df['text'], suicide_detection_df['class'],
                                                                    random_state=SEED,
                                                                    test_size=0.2,
                                                                    stratify=suicide_detection_df['class'])

## Import vocab

In [7]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load the vocabulary
vocab_filename = 'Data/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

## Import Embeddings

In [8]:
# load embedding as a dict
def load_embedding(filename):
	# load embedding into memory, skip first line
	file = open(filename,'r')
	lines = file.readlines()[1:]
	file.close()
	# create a map of words to vectors
	embedding = dict()
	for line in lines:
		parts = line.split()
		# key is string word, value is numpy array for vector
		embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
	return embedding

### Removing out of vocab words

In [9]:
# clean each line
def clean_line(line, vocab):
  tokens = line.split()
  # filter out tokens not in vocab
  tokens_clean = [w for w in tokens if w in vocab]
  return [tokens_clean]

# clean entire dataset
def process_lines(data, vocab):
  lines = list()
  for i in data:
    line = clean_line(i, vocab)
    # add lines to list
    lines += line
  return lines

### Document Vector function

In [10]:
def document_vector(doc, embeddings):
    sentence = list()
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    doc = [word for word in doc if word in embeddings.keys()]
    for i in doc:
      word = embeddings[i]
      sentence.append(word)
    return np.mean(sentence, axis=0)

In [11]:
# function for all the data
def all_documents(df, labels_ori, embeddings):
  vec = list()
  labels = list()
  for i in range(len(df)):
    if len(df[i]) == 0:
      continue
    else:
      vec.append(document_vector(df[i], embeddings))
      labels.append(labels_ori.values[i])
  return vec, labels

## Word2Vec

In [12]:
word2vec = load_embedding('Data/embedding_word2vec.txt')

In [13]:
train_clean = process_lines(train_text, vocab)
test_clean = process_lines(test_text, vocab)
train_vec, train_labels_new = all_documents(train_clean, train_labels,word2vec)
test_vec, test_labels_new = all_documents(test_clean, test_labels, word2vec)

In [21]:
lr = LogisticRegression()
lr.fit(train_vec, train_labels_new)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
y_train_pred = lr.predict(train_vec)
print('Training set accuracy %s' % accuracy_score(train_labels_new, y_train_pred))
print(classification_report(train_labels_new, y_train_pred))

Training set accuracy 0.8803985308397011
              precision    recall  f1-score   support

           0       0.90      0.91      0.90     14477
           1       0.85      0.84      0.85      9210

    accuracy                           0.88     23687
   macro avg       0.87      0.87      0.87     23687
weighted avg       0.88      0.88      0.88     23687



In [23]:
y_test_pred = lr.predict(test_vec)
print('Test set accuracy %s' % accuracy_score(test_labels_new, y_test_pred))
print(classification_report(test_labels_new, y_test_pred))

Test set accuracy 0.8830883595201893
              precision    recall  f1-score   support

           0       0.90      0.91      0.90      3616
           1       0.85      0.84      0.85      2303

    accuracy                           0.88      5919
   macro avg       0.88      0.88      0.88      5919
weighted avg       0.88      0.88      0.88      5919



In [24]:
word2vec_test_accuracy_score = accuracy_score(test_labels_new, y_test_pred)
word2vec_test_precision_score = precision_score(test_labels_new, y_test_pred)
word2vec_test_recall_score = recall_score(test_labels_new, y_test_pred)
word2vec_test_f1_score = f1_score(test_labels_new, y_test_pred)

# GloVe

In [25]:
# load glove embedding from file
raw_embedding_glove = load_embedding('Data/glove.twitter.27B.100d.txt')

In [26]:
train_clean_glove = process_lines(train_text, raw_embedding_glove.keys())
test_clean_glove = process_lines(test_text, raw_embedding_glove.keys())
train_vec_glove, train_labels_glove_new = all_documents(train_clean_glove, train_labels, raw_embedding_glove)
test_vec_glove, test_labels_glove_new = all_documents(test_clean_glove, test_labels, raw_embedding_glove)

In [27]:
lr = LogisticRegression()
lr.fit(train_vec_glove, train_labels_glove_new)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
y_train_pred = lr.predict(train_vec_glove)
print('Training set accuracy %s' % accuracy_score(train_labels_glove_new, y_train_pred))
print(classification_report(train_labels_glove_new, y_train_pred))

Training set accuracy 0.8597246854150832
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     14473
           1       0.82      0.82      0.82      9209

    accuracy                           0.86     23682
   macro avg       0.85      0.85      0.85     23682
weighted avg       0.86      0.86      0.86     23682



In [29]:
y_test_pred = lr.predict(test_vec_glove)
print('Training set accuracy %s' % accuracy_score(test_labels_glove_new, y_test_pred))
print(classification_report(test_labels_glove_new, y_test_pred))

Training set accuracy 0.8533536070282143
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      3616
           1       0.81      0.81      0.81      2303

    accuracy                           0.85      5919
   macro avg       0.85      0.85      0.85      5919
weighted avg       0.85      0.85      0.85      5919



In [30]:
glove_test_accuracy_score = accuracy_score(test_labels_glove_new, y_test_pred)
glove_test_precision_score = precision_score(test_labels_glove_new, y_test_pred)
glove_test_recall_score = recall_score(test_labels_glove_new, y_test_pred)
glove_test_f1_score = f1_score(test_labels_glove_new, y_test_pred)

# Summary

In [31]:
table = PrettyTable()
table.field_names = ['Model - Logistic Regression', 'Accuracy', 'Precision', 'Recall', 'F1 Score']

table.add_row(['Word2Vec',
               format(word2vec_test_accuracy_score, '.4f'),
               format(word2vec_test_precision_score, '.4f'),
               format(word2vec_test_recall_score, '.4f'),
               format(word2vec_test_f1_score, '.4f')])

table.add_row(['GloVe',
               format(glove_test_accuracy_score, '.4f'),
               format(glove_test_precision_score, '.4f'),
               format(glove_test_recall_score, '.4f'),
               format(glove_test_f1_score, '.4f')])
print(table)

+-----------------------------+----------+-----------+--------+----------+
| Model - Logistic Regression | Accuracy | Precision | Recall | F1 Score |
+-----------------------------+----------+-----------+--------+----------+
|           Word2Vec          |  0.8831  |   0.8547  | 0.8428 |  0.8487  |
|            GloVe            |  0.8534  |   0.8116  | 0.8116 |  0.8116  |
+-----------------------------+----------+-----------+--------+----------+


In [20]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs', max_iter=3000)
lr.fit(train_vec, train_labels_new)
def is_suicidal(text, word_embeddings, lr):
    # Clean the input text
    clean_text = clean_line(text, vocab)
    # If the text is empty after cleaning, return None
    if not clean_text:
        return None
    # Compute the document vector for the input text
    doc_vector = document_vector(clean_text[0], word_embeddings)
    # Predict the label using the trained model
    prediction = lr.predict([doc_vector])[0]
    # Map the numerical label back to the original class label
    prevention_messages = ["I know things seem bleak now, but it can be hard to see possible solutions when you feel so overwhelmed.",
                       "I'm concerned about you because I care, and I want to offer support however I can. You can talk to me.",
                       "I'm always here if you feel like talking.",
                       "You can withstand any storm and when you are too tired to stand, I will hold you up. You are never alone.",
                       "You know I’m always here for you.",
                       "You’re allowed to have bad days, but remember tomorrow is a brand new day.",
                       "You’ve got a place here on Earth for a reason.",
                       "It's okay to have such thoughts but if they become overwhelming, don't keep it to yourself. I am here for you.",
                       "Everything is a season, and right now you’re in winter. It’s dark and cold and you can’t find shelter, but one day it’ll be summer, and you’ll look back and be grateful you stuck it out through winter.",
                       "I know you are going through a lot and you’re scared, but you will never lose me.",
                       "I know it feels like a lot right now. It’s OK to feel that way.",]

    helpline_message = "In times of severe distress where you need to speak with someone immediately, these are suicide hotline services available for you. You will be speaking with volunteers or professionals who are trained to deal with suicide crisis. 988 Suicide and Crisis Lifeline Available 24 hours."
    if(prediction==1):
      label = random.choice(prevention_messages)+"\n"+helpline_message
    else:
      label = "Glad to hear that\n"
    return label


text = "i love dogs"

# Determine if the text is suicidal
label = is_suicidal(text, word2vec, lr)
print(label)  #

Glad to hear that

