In [176]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


This file contains all the steps and code that is to be used in the preprocessing of the suicidal tendancy data.

# Functions

# 1. Import all the necessary libraries

In [177]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize
import string
table = str.maketrans('', '', string.punctuation)
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from nltk.stem import wordnet
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
word_lem = WordNetLemmatizer()
from textblob import TextBlob

import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dropout
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 2. Read the Raw Dataset

In [178]:
def read_dataset():
    data = pd.read_csv(r"/content/drive/MyDrive/Project SuicideWatch /data/data.csv", sep = '\t', header = None)
    data.columns = ['id', 'sentence']
    return data

Note - The current dataset contains 122277 rows and 2 columns. For the demonstration purpose, we will be using only first 10000 for the processing, but all the steps could be applied to all the values. 

Note - To apply the preprocessing on all the available data, don't run sample_data() function.

In [179]:
def sample_data(data, sample_value):
    data = data[:][:sample_value]
    return data

# 3. Data Cleaning and Processing 

Note- Before starting the preprocessing, convert all the sentences in the form of string.

In [180]:
def convert_to_string (data):
    for i in range(len(data)):
        data['sentence'][i] = str(data['sentence'][i])
    return data

def data_processing(data):
    for i in range(len(data)):
        # Remove Excape Sequences
        sentence = remove_excape_sequesces(data['sentence'][i])
        # Remove Links
        sentence = remove_links(sentence)
        # Forming the final paragraphs
        senetnce = form_paragraph(sentence)
        # Perform further NLP processing
        sentence = apply_NLP(sentence)

        data['sentence'][i] = sentence
    return data

def apply_NLP (sentence):
    sentences = sentence.split(sep = '.')
    for i in range(len(sentences)):
        # Tokenize the words
        tokens = word_tokenize(sentences[i])
        tokens = [word.lower() for word in tokens]
        # Remove puntuations
        no_punctuations = [word.translate(table) for word in tokens]
        # Remove all non-alphabetic characters.
        words = [word for word in no_punctuations if word.isalpha()]
        # Remove Stop words and convert the sentence to its base form
        words = [word_lem.lemmatize(w) for w in words if not w in stop_words]
        sentences[i] = " ".join(words)
    sentence = " .".join(sentences)
    return sentence+"."

def remove_excape_sequesces (sentence):
    excape_sequences = ['\n', '\t', '\r', '\\', '\a', '\f', '\o', '\v', '\b', '\"', '\'', '\newline']
    temp = ""
    sentence = str(sentence)
    for i in sentence:
        if i in excape_sequences:
            continue
        temp += i
    return temp

def remove_links (sentence):
    comment = sentence.split(sep = " ")
    for word in comment:
        if ('http' in word) or ('www' in word) or ('https' in word) or (len(word) == 0):
            comment.remove(word)
    sentence = " ".join(comment)
    return sentence

def form_paragraph (sentence):
    temp = nltk.sent_tokenize(sentence)
    for index in range(len(temp)):
        temp[index] = temp[index].replace('.', ' ')
    for index in range(len(temp)):
        d = []
        k = temp[index].split(sep = ' ')
        for z in range(len(k)):
            if len(k[z]) > 0:
                d.append(k[z]) 
        s = ''
        for j in range(len(d)):
            s = s + ' ' + d[j].strip()
        temp[index] = s.strip()      
    sentence = '. '.join(temp).lower()
    return sentence

# 4. Getting the labels for each post using TextBlob

In [181]:
def polarity_and_subjectivity(data):
    polarity_sentence  = []
    subjectivity_sentence = []

    for i in range(len(data)):
        
        # create temp variables
        sentence = data['sentence'][i]
        
        # form object and calculate polarity and subjectivity of sentence
        obj_sentence = TextBlob(sentence)
        polarity = obj_sentence.sentiment.polarity
        subjectivity = obj_sentence.sentiment.subjectivity
        
        # save the polarity and subjectivity of sentence
        polarity_sentence.append(polarity)
        subjectivity_sentence.append(subjectivity)

    data['polarity'] = polarity_sentence
    data['subjectivity'] = subjectivity_sentence
    
    return data

In [182]:
def filter_polarity_and_subjectivity (data):
    k = []
    for i in range(len(data)):
        if data['subjectivity'][i] <= subjectivity_threshold and len(str(data['sentence'][i])) > 10:
            k.append(data['id'][i])
    data = data.drop(np.array(k)-1)
    labels = []
    data = data.reset_index(drop=True)
    for i in range(len(data)):
        if data['polarity'][i] < polarity_threshold:
            labels.append(1)
        else:
            labels.append(0)
    data['labels'] = labels
    return data

In [183]:
data = read_dataset()
data = sample_data(data, 1000)
data = convert_to_string(data)
data = polarity_and_subjectivity(data)
polarity_threshold = 0          # range(-1 to +1)
subjectivity_threshold = 0.5    # range(0 to 1)
data = filter_polarity_and_subjectivity(data)
data = data_processing(data)
data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,id,sentence,polarity,subjectivity,labels
0,1,always waited thing get better .always waited ...,0.057035,0.612771,0
1,6,tonight night dont care fucking hurt . . . ..,-0.600000,0.800000,1
2,7,im tucking tired people pretend care say theyr...,0.123889,0.582778,0
3,9,sucidal intention turned cutting stop many cut...,0.162500,0.558333,0
4,10,slowly becoming mentally unstable show.,0.337500,0.512500,0
...,...,...,...,...,...
575,993,lucky friend boyfriend sometimes wish didnt li...,-0.183333,0.750000,1
576,995,graduated law school today single member famil...,-0.066071,0.653571,1
577,996,feel like im getting close end mess thing life...,0.101736,0.503704,0
578,997,ive emotional rollercoaster lately talking peo...,-0.042765,0.517769,1


# Calculate the Vocabulary size

In [77]:
dict_data = pd.read_csv(r"/content/drive/MyDrive/Project SuicideWatch /data/data.csv", sep = '\t', header = None)

In [82]:
dict_data.columns = ['id', 'text']
dict_data

Unnamed: 0,id,text
0,1,I always waited for things to get better. I al...
1,2,I’ve had rope under my dresser for a while now...
2,3,I’m just burnt out I have nothing left. My gf ...
3,4,I want to commit suicide.. I feel like I have ...
4,5,It's like being thrown in a pit of suffering t...
...,...,...
122272,122273,Hey y'all. Not really sure what I'm doing but ...
122273,122274,I’m just wondering if there’s anyone like me t...
122274,122275,Amber you don't matter no one loves you. Why a...
122275,122276,"Title explains it. I am going to sleep, and I ..."


In [136]:
def calculate_dictionary (data, subjectivity_threshold):
    vocabulary = set()
    excape_sequences = ['\n', '\t', '\r', '\\', '\a', '\f', '\o', '\v', '\b', '\"', '\'', '\newline']

    # Sample the data
    data = data[:][:1000]
    # Iterate over the data
    for i in range(len(data)):

        # Convert to string
        data['text'][i] = str(data['text'][i])

        # filter accotrding to subjectivity score
        obj_sentence = TextBlob(data['text'][i])
        if obj_sentence.sentiment.subjectivity > subjectivity_threshold and len(data['text'][i]) > 10:
            
            # Remove Excape Sequences
            sentence = ""
            for i in data['text'][i]:
                if i in excape_sequences:
                    continue
                sentence += i
            
            # Remove Links
            comment = sentence.split(sep = " ")
            for word in comment:
                if ('http' in word) or ('www' in word) or ('https' in word) or (len(word) == 0):
                    comment.remove(word)
            sentence = " ".join(comment)
            
            # Forming the final paragraphs
            temp = nltk.sent_tokenize(sentence)
            for index in range(len(temp)):
                temp[index] = temp[index].replace('.', ' ')
            for index in range(len(temp)):
                d = []
                k = temp[index].split(sep = ' ')
                for z in range(len(k)):
                    if len(k[z]) > 0:
                        d.append(k[z]) 
                s = ''
                for j in range(len(d)):
                    s = s + ' ' + d[j].strip()
                temp[index] = s.strip()      
            sentence = '. '.join(temp).lower()
            
            # Perform further NLP processing
            sentences = sentence.split(sep = '.')
            for i in range(len(sentences)):
                
                # Tokenize the words
                tokens = word_tokenize(sentences[i])
                tokens = [word.lower() for word in tokens]
                
                # Remove puntuations
                no_punctuations = [word.translate(table) for word in tokens]
                
                # Remove all non-alphabetic characters.
                words = [word for word in no_punctuations if word.isalpha()]
                
                # Remove Stop words and convert the sentence to its base form
                words = [word_lem.lemmatize(w) for w in words if not w in stop_words]

            # Add to set
            for j in range(len(words)):
                vocabulary.add(str(words[j]))
            
    return vocabulary

In [137]:
vocabulary =  calculate_dictionary(dict_data, 0.5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [138]:
len(vocabulary)

1731

In [139]:
vocab_list= pd.DataFrame({"vocabulary":list(vocabulary)})

In [140]:
vocab_list

Unnamed: 0,vocabulary
0,teenager
1,circumstance
2,drown
3,lived
4,ok
...,...
1726,f
1727,depressed
1728,edit
1729,bro


In [None]:
vocab_list.to_csv(r'vocabulary.csv', index = False, header = False, sep = '\t')

In [142]:
voc_size=len(vocabulary)
voc_size

1731

# Word Embedding and LSTM

In [184]:
embedding_vector_features=40
LSTM_layers = 100

In [185]:
data

Unnamed: 0,id,sentence,polarity,subjectivity,labels
0,1,always waited thing get better .always waited ...,0.057035,0.612771,0
1,6,tonight night dont care fucking hurt . . . ..,-0.600000,0.800000,1
2,7,im tucking tired people pretend care say theyr...,0.123889,0.582778,0
3,9,sucidal intention turned cutting stop many cut...,0.162500,0.558333,0
4,10,slowly becoming mentally unstable show.,0.337500,0.512500,0
...,...,...,...,...,...
575,993,lucky friend boyfriend sometimes wish didnt li...,-0.183333,0.750000,1
576,995,graduated law school today single member famil...,-0.066071,0.653571,1
577,996,feel like im getting close end mess thing life...,0.101736,0.503704,0
578,997,ive emotional rollercoaster lately talking peo...,-0.042765,0.517769,1


In [186]:
X = data['sentence']
corpus = data['sentence'].tolist()
y = data['labels']
y.value_counts()

1    329
0    251
Name: labels, dtype: int64

In [187]:
corpus

['always waited thing get better .always waited thing improve people change happy .tired waiting .nothing going get better anyways .family still gon na shitty tomorrow .past gon na change tomorrow .brain going change tomorrow .way extremely defective unloveable going change tomorrow .tomorrow interchangeable range time waiting year .remember time want give .sick .know deserve .matter good enough gon na okay ever .see ever older .want anymore ..',
 'tonight night dont care fucking hurt . . . ..',
 'im tucking tired people pretend care say theyre love know youre fucking lying make feel good one ever make effort see literally dont friend family dont feel loved deserving anything good .im jealous people happy content people care cause take granted much idea hard .point one tried make plan long cant make plan anyone either cause cant help feel unwanted even go fucking drink someone id feel like im wasting time actually wish dead oh god cant fucking dead.',
 'sucidal intention turned cutting

In [188]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[1604,
  439,
  1299,
  1219,
  364,
  1604,
  439,
  1299,
  137,
  304,
  703,
  801,
  574,
  1697,
  1327,
  1631,
  1219,
  364,
  983,
  1001,
  398,
  1072,
  279,
  1472,
  1464,
  1334,
  1072,
  279,
  703,
  1464,
  933,
  1631,
  703,
  1464,
  1223,
  697,
  866,
  23,
  1631,
  703,
  1464,
  1464,
  794,
  648,
  621,
  1697,
  233,
  1382,
  621,
  1631,
  14,
  1149,
  1523,
  255,
  1037,
  674,
  85,
  1072,
  279,
  1095,
  294,
  652,
  294,
  658,
  1631,
  1372],
 [630, 406, 1686, 1656, 195, 219],
 [115,
  505,
  574,
  304,
  1617,
  1656,
  711,
  333,
  798,
  1523,
  168,
  195,
  1343,
  1217,
  132,
  674,
  1585,
  294,
  1217,
  1252,
  652,
  802,
  1686,
  1696,
  1001,
  1686,
  132,
  1539,
  539,
  1242,
  674,
  115,
  656,
  304,
  801,
  37,
  304,
  1656,
  1537,
  277,
  1718,
  865,
  461,
  52,
  689,
  1585,
  882,
  1217,
  911,
  1686,
  287,
  1217,
  911,
  1556,
  1562,
  1537,
  287,
  1574,
  132,
  191,
  431,
  1727,
  195,
  1364,


### Embedding Representation

In [189]:
sent_length=0
k=0
for i in range(len(onehot_repr)):
    if len(onehot_repr[i])>sent_length:
        sent_length=len(onehot_repr[i])
        k=i
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)
print(embedded_docs)

[[1604  439 1299 ...    0    0    0]
 [ 630  406 1686 ...    0    0    0]
 [ 115  505  574 ...    0    0    0]
 ...
 [ 132  521  115 ...    0    0    0]
 [ 370  776  584 ...    0    0    0]
 [ 195  574 1219 ...    0    0    0]]


In [190]:
embedded_docs[0]

array([1604,  439, 1299, ...,    0,    0,    0], dtype=int32)

# Model Preparation 

In [191]:
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(LSTM_layers, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1158, 40)          69240     
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 125,741
Trainable params: 125,741
Non-trainable params: 0
_________________________________________________________________
None


In [192]:
len(embedded_docs),y.shape

(580, (580,))

In [193]:
X_final=np.array(embedded_docs)
y_final=np.array(y)
X_final.shape,y_final.shape

((580, 1158), (580,))

In [194]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=21)

### Model Training

In [195]:
model.fit(X_train,y_train, validation_split=0.4, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fd827c79a58>

In [196]:
y_pred=model.predict_classes(X_test)



In [197]:
confusion_matrix(y_test,y_pred)

array([[  0,  82],
       [  0, 110]])

In [198]:
accuracy_score(y_test,y_pred)

0.5729166666666666