In [1]:
#Imports
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import numpy as np
from transformers import pipeline
from sklearn.model_selection import train_test_split
import pickle

In [2]:
#original downloaded dataset, can download following the link in my paper
df = pd.read_csv("suicide.csv")

In [3]:
#clean up dataset and get rid of values that can't be processed
#Keep only rows whose Post is not a bare number; NaN is a float, so missing posts are dropped here too
df = df.loc[df.Post.apply(lambda x: not isinstance(x, (float, int)))]
#Lowercase every string cell. Column-wise Series.map is used instead of DataFrame.applymap,
#which is deprecated since pandas 2.1; this form behaves the same on older and newer pandas.
df = df.apply(lambda column: column.map(lambda x: x.lower() if isinstance(x, str) else x))
#Strip punctuation and digits from every string cell in a single regex pass
#(removing "not word/space" characters and digits character-by-character is order-independent)
df = df.replace(to_replace=r'[^\w\s]|\d', value='', regex=True)

In [4]:
#feature engineering for Post Length, and also tokenizing the original posts so we can work with it later
#Character count of each (already cleaned) post, in row order
lengthList = [len(post) for post in df["Post"]]
df["Post_length"] = lengthList
#Word-level tokens of each post, kept for the stopword/lemmatization steps below
df['Tokenized'] = df['Post'].map(word_tokenize)

In [5]:
#get rid of stopwords and apply that in the tokenized column 
stop_words = set(stopwords.words('english'))
#NOTE(review): punctuation was stripped from the posts earlier, so contractions show up as
#"isnt"/"ive"; apostrophised stopwords presumably never match them - confirm whether that is intended
df['Tokenized'] = df['Tokenized'].map(
    lambda tokens: list(filter(lambda token: token not in stop_words, tokens))
)

In [6]:
#Standard process for lemmatizing Tokens and getting the lemmas/root form of each word. This further reduces noise 
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
#Memo of word -> WordNet POS shared across calls. Every word is tagged on its own
#(pos_tag receives a one-element list), so its tag never depends on neighbouring words
#and can safely be reused for every later occurrence of the same word.
_wordnet_pos_cache = {}

def lemmatize_tokens(tokens):
    """Lemmatize each token using the WordNet POS derived from its own POS tag.

    Args:
        tokens: iterable of word strings.

    Returns:
        List with one lemma per input token, in the same order.
    """
    # Built once per call instead of once per token.
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    def get_wordnet_pos(word):
        pos = _wordnet_pos_cache.get(word)
        if pos is None:
            # First letter of the tag selects the WordNet class; anything else falls back to noun.
            tag = nltk.pos_tag([word])[0][1][0].upper()
            pos = tag_dict.get(tag, wordnet.NOUN)
            _wordnet_pos_cache[word] = pos
        return pos

    return [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

# apply lemmatization function to column of dataframe
df['lemmatized_messages'] = df['Tokenized'].apply(lemmatize_tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pranavsomani/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pranavsomani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
#To use GloVe embeddings, we need to import a corpus with a certain dimensionality. 
#I chose 100 as my dimensionality because it isn't too costly while still preserving a lot of information

def load_glove_embeddings(glove_file_path):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-blank line is read as a word followed by its float coefficients.

    Args:
        glove_file_path: path of the text file, opened as UTF-8.

    Returns:
        (embeddings_index, embedding_dim): a dict mapping each word to a float32
        numpy vector, and the length of the last vector that was read.

    Raises:
        ValueError: if the file contains no embedding lines.
    """
    embeddings_index = {}
    embedding_dim = None
    with open(glove_file_path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[values[0]] = coefs
            embedding_dim = len(coefs)
    if embedding_dim is None:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("no embeddings found in %r" % (glove_file_path,))
    return embeddings_index, embedding_dim

#make sure to download this file online by searching up GloVe embeddings...couldn't attach this in my submission due to size
glove_embeddings, embedding_dim = load_glove_embeddings('/Users/pranavsomani/Downloads/glove.6B/glove.6B.100d.txt')

In [9]:
#This function builds one embedding per post by averaging the GloVe vectors of its words (words missing from GloVe are skipped)
def sentence_to_embedding(sentence, embeddings_index, embedding_dim):
    """Average the embedding vectors of the words in ``sentence``.

    Args:
        sentence: iterable of word tokens.
        embeddings_index: mapping from word to its embedding vector.
        embedding_dim: length of the zero vector returned when no word is known.

    Returns:
        Element-wise mean of the vectors of all words found in ``embeddings_index``,
        or ``np.zeros(embedding_dim)`` when none of the words is found.
    """
    known_vectors = [embeddings_index[token] for token in sentence if token in embeddings_index]
    if not known_vectors:
        return np.zeros(embedding_dim)
    return np.mean(known_vectors, axis=0)

#Applies the embedding on the lemmatized messages column
df['sentence_embedding'] = df['lemmatized_messages'].apply(lambda x: sentence_to_embedding(x, glove_embeddings, embedding_dim))

In [10]:
#aggregates all the embedded values for a sentence so machine learning models can be supplied this feature
def reshape(vector):
    """Collapse an embedding vector to its overall mean, returned as a (1, 1) array.

    Args:
        vector: array-like of numbers.

    Returns:
        numpy array of shape (1, 1) holding the mean of all values in ``vector``.

    NOTE(review): averaging over every dimension keeps a single number per post and
    discards the rest of the embedding - confirm this is the intended feature.
    """
    return np.array(np.mean(vector)).reshape(-1, 1)

df["sentence_embedding"] = df['sentence_embedding'].apply(lambda x: reshape(x))

In [11]:
#Further feature engineering: implementing a highly-used sentiment analysis model on each post in the dataset
#This will serve as a feature to see how strongly negative/positive a post is
sentiment_pipeline = pipeline("sentiment-analysis", truncation=True)

def sentimentAnalysis(data):
    """Run the sentiment pipeline on one post.

    Args:
        data: the post text; any non-string value is treated as "no text".

    Returns:
        Whatever ``sentiment_pipeline`` returns for the text, or 0 when ``data``
        is not a string.
    """
    # isinstance also accepts str subclasses, which the exact type comparison rejected.
    if not isinstance(data, str):
        return 0

    return sentiment_pipeline(data)

#Implementing the model on each post in dataframe....takes very long so run at your own risk
df['sentimentAnalysis2'] = df['Post'].apply(lambda x: sentimentAnalysis(x))

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [12]:
#Takes the output of the sentimentAnalysis column and changes the format of it to simply be an integer
def sentimentNum(row):
    """Turn one sentiment result into a signed score.

    Args:
        row: either 0 (no result) or a sequence whose first item is a dict
            with "label" and "score" keys.

    Returns:
        0 when ``row`` is 0, the score for a 'POSITIVE' label, the negated score
        for a 'NEGATIVE' label, and None for any other label.
    """
    if row == 0:
        return 0
    first_result = row[0]
    label = first_result["label"]
    if label == 'POSITIVE':
        return first_result["score"]
    if label == 'NEGATIVE':
        return first_result["score"] * -1
    return None
    
df['SentAnalysis'] = df['sentimentAnalysis2'].apply(lambda x: sentimentNum(x))

In [13]:
with open('badwords.txt', 'r') as file:
    lines = [line.strip() for line in file]

def badWords(badwordList, txt):
    """Count the whitespace-separated words of ``txt`` that are bad words.

    Matching is case-insensitive: both the text and the word list are lowercased.

    Args:
        badwordList: iterable of bad-word strings.
        txt: the text to scan.

    Returns:
        Number of words in ``txt`` (repeats counted) found in ``badwordList``.
    """
    # The text is lowercased before matching, so an entry containing capitals could
    # never match unless the list is lowercased too. The set also makes each
    # membership check O(1) instead of a scan over the whole list.
    bad_lookup = {word.lower() for word in badwordList}
    return sum(1 for word in txt.lower().split() if word in bad_lookup)

df['badWordCount'] = df['Post'].apply(lambda x: badWords(lines, x))

In [14]:
df

Unnamed: 0,Title,Post,Label,Post_length,Tokenized,lemmatized_messages,sentence_embedding,sentimentAnalysis2,SentAnalysis,badWordCount
0,im looking for a girl ive met at the polish ai...,ive tried polish spotted pages but i guess she...,nonsuicidal,1022,"[ive, tried, polish, spotted, pages, guess, is...","[ive, try, polish, spot, page, guess, isnt, kn...",[[-0.020897085]],"[{'label': 'NEGATIVE', 'score': 0.695042848587...",-0.695043,1
1,i got a guy kicked off a domestic flight today,i was in a forward row as we were boarding for...,nonsuicidal,919,"[forward, row, boarding, hour, domestic, fligh...","[forward, row, boarding, hour, domestic, fligh...",[[-0.015430211]],"[{'label': 'NEGATIVE', 'score': 0.997833907604...",-0.997834,0
2,my youngest got in school suspension im so proud,so according to witness testimonies a boy grab...,nonsuicidal,355,"[according, witness, testimonies, boy, grabbed...","[accord, witness, testimony, boy, grabbed, say...",[[-0.008069504]],"[{'label': 'POSITIVE', 'score': 0.938526093959...",0.938526,1
3,im a cashier who switched from sirmam to my de...,so as said in the title im a cashier well that...,nonsuicidal,2434,"[said, title, im, cashier, well, thats, part, ...","[say, title, im, cashier, well, thats, part, j...",[[-0.023101069]],"[{'label': 'NEGATIVE', 'score': 0.986810863018...",-0.986811,0
4,my whole class complimented me and didnt reali...,today in class we were doing this activity cal...,nonsuicidal,869,"[today, class, activity, called, someone, basi...","[today, class, activity, call, someone, basica...",[[-0.033342287]],"[{'label': 'NEGATIVE', 'score': 0.908012390136...",-0.908012,0
...,...,...,...,...,...,...,...,...,...,...
15472,once i die i can finally be at rest,my past actions will no longer haunt me hopefu...,suicidal,97,"[past, actions, longer, haunt, hopefully, peop...","[past, action, longer, haunt, hopefully, peopl...",[[-0.025204448]],"[{'label': 'NEGATIVE', 'score': 0.995922803878...",-0.995923,0
15473,i just want to stop,i just want to stop living ive lost everything...,suicidal,580,"[want, stop, living, ive, lost, everything, im...","[want, stop, living, ive, lose, everything, im...",[[-0.016652748]],"[{'label': 'NEGATIVE', 'score': 0.997233331203...",-0.997233,0
15474,im still alive,why the fuck am i still alive why wont i just ...,suicidal,591,"[fuck, still, alive, wont, fucking, kill, alre...","[fuck, still, alive, wont, fuck, kill, already...",[[-0.015870886]],"[{'label': 'NEGATIVE', 'score': 0.998081088066...",-0.998081,4
15475,im lonely but i cant stand people,its a lot better online but irl i cant stand p...,suicidal,1260,"[lot, better, online, irl, cant, stand, people...","[lot, well, online, irl, cant, stand, people, ...",[[-0.021809079]],"[{'label': 'NEGATIVE', 'score': 0.980329394340...",-0.980329,1


In [15]:
#makes the y labels of the dataset binary
label_mapping = {'suicidal': 1, 'nonsuicidal': 0}
#Only map while the column still holds the text labels: re-running .map on labels that are
#already numeric would turn every value into NaN, because 0/1 are not keys of label_mapping
if not pd.api.types.is_numeric_dtype(df['Label']):
    df['Label'] = df['Label'].map(label_mapping)

In [16]:
#Made different datasets of both classes to be able to find any significant differences in post length and sentiment
#Boolean masks select the rows of each class (Label 1 = suicidal, 0 = nonsuicidal)
suicidal_df = df.loc[df["Label"].eq(1)]
nonsuicidal_df = df.loc[df["Label"].eq(0)]

In [17]:
suicidal_df.describe(), nonsuicidal_df.describe()

(        Label   Post_length  SentAnalysis  badWordCount
 count  6806.0   6806.000000   6806.000000   6806.000000
 mean      1.0    817.997943     -0.843184      0.904790
 std       0.0   1011.940750      0.503856     10.329561
 min       1.0      0.000000     -0.999812      0.000000
 25%       1.0    255.000000     -0.999182      0.000000
 50%       1.0    533.000000     -0.997978      0.000000
 75%       1.0   1020.000000     -0.992421      1.000000
 max       1.0  18413.000000      0.999878    837.000000,
         Label   Post_length  SentAnalysis  badWordCount
 count  7436.0   7436.000000   7436.000000   7436.000000
 mean      0.0    626.681818     -0.029710      0.157746
 std       0.0    674.648277      0.959588      0.601272
 min       0.0      0.000000     -0.999816      0.000000
 25%       0.0    218.000000     -0.993674      0.000000
 50%       0.0    422.000000     -0.650665      0.000000
 75%       0.0    808.000000      0.994332      0.000000
 max       0.0  13665.000000  

In [18]:
#Bootstrapping algorithm I made in pset5 to see whether or not there is a significant difference in the bad-word count of posts between those struggling with suicidal ideation and those who aren't
#Seeded generator so the p-value is reproducible from run to run
rng = np.random.default_rng(42)
N_RESAMPLES = 20000

suicidal_scores = df.loc[df["Label"] == 1, "badWordCount"].dropna().to_numpy()
nonsuicidal_scores = df.loc[df["Label"] == 0, "badWordCount"].dropna().to_numpy()

#Suicide Class size
N = len(suicidal_scores)
#Non-Suicide Class size
M = len(nonsuicidal_scores)

#Class means are computed from the data instead of being pasted from describe(), so they cannot go stale
average2 = suicidal_scores.mean()
average1 = nonsuicidal_scores.mean()
#Pool of both classes only: resampling from it simulates "no difference between the classes"
score = np.concatenate([suicidal_scores, nonsuicidal_scores])

observedDiff = abs(average2-average1)
count = 0

for i in range(N_RESAMPLES):
    act1_resample = rng.choice(score, size=N, replace=True)
    act2_resample = rng.choice(score, size=M, replace=True)

    npmean1 = np.mean(act1_resample)
    npmean2 = np.mean(act2_resample)

    #Count resamples whose difference is at least as extreme as the observed one
    diff = abs(npmean2 - npmean1)
    if diff >= observedDiff:
        count+=1

print(count)
pvalue = (count/N_RESAMPLES)
print(pvalue)

0
0.0


In [19]:
#Bootstrapping algorithm I made in pset5 to see whether or not there is a significant difference in the length of posts between those struggling with suicidal ideation and those who aren't
#Seeded generator so the p-value is reproducible from run to run
rng = np.random.default_rng(42)
N_RESAMPLES = 10000

suicidal_scores = df.loc[df["Label"] == 1, "Post_length"].dropna().to_numpy()
nonsuicidal_scores = df.loc[df["Label"] == 0, "Post_length"].dropna().to_numpy()

#Suicide Class size
N = len(suicidal_scores)
#Non-Suicide Class size
M = len(nonsuicidal_scores)

#Class means are computed from the data instead of being pasted from describe(), so they cannot go stale
average2 = suicidal_scores.mean()
average1 = nonsuicidal_scores.mean()
#Pool of both classes only: resampling from it simulates "no difference between the classes"
score = np.concatenate([suicidal_scores, nonsuicidal_scores])

observedDiff = abs(average2-average1)
count = 0

for i in range(N_RESAMPLES):
    act1_resample = rng.choice(score, size=N, replace=True)
    act2_resample = rng.choice(score, size=M, replace=True)

    npmean1 = np.mean(act1_resample)
    npmean2 = np.mean(act2_resample)

    #Count resamples whose difference is at least as extreme as the observed one
    diff = abs(npmean2 - npmean1)
    if diff >= observedDiff:
        count+=1

print(count)
pvalue = (count/N_RESAMPLES)
print(pvalue)

0
0.0


In [21]:
#Bootstrapping algorithm I made in pset5 to see whether or not there is a significant difference in the sentiment of posts between those struggling with suicidal ideation and those who aren't
#Seeded generator so the p-value is reproducible from run to run
rng = np.random.default_rng(42)
N_RESAMPLES = 10000

suicidal_scores = df.loc[df["Label"] == 1, "SentAnalysis"].dropna().to_numpy()
nonsuicidal_scores = df.loc[df["Label"] == 0, "SentAnalysis"].dropna().to_numpy()

#Suicide Class size
N = len(suicidal_scores)
#Non-Suicide Class size
M = len(nonsuicidal_scores)

#Class means are computed from the data instead of being pasted from describe(), so they cannot go stale
average2 = suicidal_scores.mean()
average1 = nonsuicidal_scores.mean()
#Pool of both classes only: resampling from it simulates "no difference between the classes"
score = np.concatenate([suicidal_scores, nonsuicidal_scores])

observedDiff = abs(average2-average1)
count = 0

for i in range(N_RESAMPLES):
    act1_resample = rng.choice(score, size=N, replace=True)
    act2_resample = rng.choice(score, size=M, replace=True)

    npmean1 = np.mean(act1_resample)
    npmean2 = np.mean(act2_resample)

    #Count resamples whose difference is at least as extreme as the observed one
    diff = abs(npmean2 - npmean1)
    if diff >= observedDiff:
        count+=1

print(count)
pvalue = count/N_RESAMPLES
print(pvalue)

0
0.0


In [45]:
#Training RF classifier on my features and output
from sklearn.ensemble import RandomForestClassifier
#Drop rows with any missing value; reassignment has the same effect as inplace=True without mutating in place
df = df.dropna()

feature_columns = ["sentence_embedding", "Post_length", "SentAnalysis", "badWordCount"]
X = df[feature_columns].copy()
#Each sentence_embedding cell holds a (1, 1) array; unwrap it to a plain float so X is a numeric
#matrix rather than an object array of nested arrays. .item() raises if a cell holds more than one value.
X["sentence_embedding"] = X["sentence_embedding"].map(lambda v: float(np.asarray(v).item()))
X = X.to_numpy(dtype=float)
y = np.array(df['Label'])

#80% training and 20% testing split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Specified depth to prevent overfitting; random_state makes the fitted forest and its scores reproducible
rf = RandomForestClassifier(max_depth=10, random_state=42)
rf.fit(X_train, y_train)

train_accuracy_rf = rf.score(X_train, y_train)
test_accuracy_rf = rf.score(X_test, y_test)

print("Train Accuracy RF:", train_accuracy_rf)
print("Test Accuracy RF:", test_accuracy_rf)

Train Accuracy RF: 0.8138330553848855
Test Accuracy RF: 0.7581607581607581


In [40]:
#Save model to disk
filename = 'SI_finalized_model.sav'
#Context managers close the file handles as soon as the dump/load finishes
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

#Can use model without having to train it again
#NOTE: pickle.load can execute arbitrary code - only load model files you created yourself
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)
model

In [32]:
#Serialize dataframe so everything doesn't become strings
#Round-trip the dataframe through a gzip-compressed pickle so list/array columns keep their Python types
pickle_path = 'pickle-file.pkl.gz'
df.to_pickle(pickle_path, compression='gzip')
df = pd.read_pickle(pickle_path, compression='gzip')

In [33]:
df

Unnamed: 0,Title,Post,Label,Post_length,Tokenized,lemmatized_messages,sentence_embedding,sentimentAnalysis2,SentAnalysis,badWordCount
0,im looking for a girl ive met at the polish ai...,ive tried polish spotted pages but i guess she...,0.0,1022,"[ive, tried, polish, spotted, pages, guess, is...","[ive, try, polish, spot, page, guess, isnt, kn...",[[-0.020897085]],"[{'label': 'NEGATIVE', 'score': 0.695042848587...",-0.695043,1
1,i got a guy kicked off a domestic flight today,i was in a forward row as we were boarding for...,0.0,919,"[forward, row, boarding, hour, domestic, fligh...","[forward, row, boarding, hour, domestic, fligh...",[[-0.015430211]],"[{'label': 'NEGATIVE', 'score': 0.997833907604...",-0.997834,0
2,my youngest got in school suspension im so proud,so according to witness testimonies a boy grab...,0.0,355,"[according, witness, testimonies, boy, grabbed...","[accord, witness, testimony, boy, grabbed, say...",[[-0.008069504]],"[{'label': 'POSITIVE', 'score': 0.938526093959...",0.938526,1
3,im a cashier who switched from sirmam to my de...,so as said in the title im a cashier well that...,0.0,2434,"[said, title, im, cashier, well, thats, part, ...","[say, title, im, cashier, well, thats, part, j...",[[-0.023101069]],"[{'label': 'NEGATIVE', 'score': 0.986810863018...",-0.986811,0
4,my whole class complimented me and didnt reali...,today in class we were doing this activity cal...,0.0,869,"[today, class, activity, called, someone, basi...","[today, class, activity, call, someone, basica...",[[-0.033342287]],"[{'label': 'NEGATIVE', 'score': 0.908012390136...",-0.908012,0
...,...,...,...,...,...,...,...,...,...,...
15472,once i die i can finally be at rest,my past actions will no longer haunt me hopefu...,1.0,97,"[past, actions, longer, haunt, hopefully, peop...","[past, action, longer, haunt, hopefully, peopl...",[[-0.025204448]],"[{'label': 'NEGATIVE', 'score': 0.995922803878...",-0.995923,0
15473,i just want to stop,i just want to stop living ive lost everything...,1.0,580,"[want, stop, living, ive, lost, everything, im...","[want, stop, living, ive, lose, everything, im...",[[-0.016652748]],"[{'label': 'NEGATIVE', 'score': 0.997233331203...",-0.997233,0
15474,im still alive,why the fuck am i still alive why wont i just ...,1.0,591,"[fuck, still, alive, wont, fucking, kill, alre...","[fuck, still, alive, wont, fuck, kill, already...",[[-0.015870886]],"[{'label': 'NEGATIVE', 'score': 0.998081088066...",-0.998081,4
15475,im lonely but i cant stand people,its a lot better online but irl i cant stand p...,1.0,1260,"[lot, better, online, irl, cant, stand, people...","[lot, well, online, irl, cant, stand, people, ...",[[-0.021809079]],"[{'label': 'NEGATIVE', 'score': 0.980329394340...",-0.980329,1
