<a href="https://colab.research.google.com/github/s1rens/Reddit-Suicidality-Monitor/blob/main/Building_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
import spacy
import pickle
import numpy as np
import re
import matplotlib.pyplot as plt 
import plotly.express as px
# Shared spaCy English pipeline; the preprocessing cell calls it on every post
# to lemmatise and to tokenise text.
nlp = spacy.load("en_core_web_sm")

In [None]:
# preprocess dataset
df = pd.read_csv('suicidality_dataset_raw_fixed.csv', header=0)

# Clean the raw post text. The cleaned Series is built with a method chain and
# assigned back in one step: calling `.replace(..., inplace=True)` on
# `df['postfixed']` mutates a temporary object and never reaches `df` when
# pandas Copy-on-Write is active.
df['postfixed'] = (
    df['post']
    .fillna('')                                 # nlp() below cannot process NaN
    .astype(str)
    .str.lower()
    .str.replace(r'http\S+', '', regex=True)    # remove URLs
    .str.replace(r'amp;\S+', '', regex=True)    # remove formatting
    .str.replace(r'\n', '', regex=True)         # remove newlines (also covers runs of them)
)

# lemmatise, dropping stop words and punctuation
df['goodlemma'] = df['postfixed'].apply(
    lambda text: " ".join(
        token.lemma_ for token in nlp(text) if not token.is_stop and not token.is_punct
    )
)

# Rank every TF-IDF term by its chi2 score against the `suicidal` label.
# NOTE(review): this ranking is fitted on the full dataset, before the
# train/test split that follows, so test rows influence which words are kept.
tf = TfidfVectorizer(lowercase=True, stop_words='english', strip_accents='ascii', min_df=10)
tf_df = tf.fit_transform(df['goodlemma'])
selector = SelectKBest(chi2)
selector.fit(tf_df, df['suicidal'])
chi2_scores = pd.DataFrame({
    # get_feature_names() was removed in scikit-learn 1.2; this needs scikit-learn >= 1.0
    'word': tf.get_feature_names_out(),
    'score': selector.scores_,
    'p-value': selector.pvalues_,
})

num_features = 3000 # number of most important features to keep
important_word_df = (
    chi2_scores
    .sort_values(by=['score'], ascending=False)
    .iloc[:num_features] # Takes most important features
)
print(important_word_df.head(10)) # prints top 10 most important features
print(important_word_df.tail(10)) # prints the 10 lowest-scoring features that were kept
important_word_list = important_word_df['word'].tolist()

# remove all words that aren't in important_word_list
important_word_set = set(important_word_list)   # O(1) membership test per token
df['goodlemma_most_important'] = df['goodlemma'].apply(
    # Only token texts are needed here, so run the tokenizer alone (make_doc)
    # instead of the whole tagging/parsing pipeline.
    lambda text: " ".join(
        token.text for token in nlp.make_doc(text) if token.text in important_word_set
    )
)
df.head()

In [None]:
# Hold out 20% of the rows as a test set. The shuffle is seeded so the same
# partition is produced on every run.
seed = 0
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=seed,
    shuffle=True,
)

In [None]:
# Build the TF-IDF matrices. The vectorizer `tf` is re-fitted here on the
# training documents only; the test documents are merely transformed with the
# vocabulary and IDF weights learned from the training split.
text_column = 'goodlemma_most_important'
train_tf = tf.fit_transform(train[text_column])
test_tf = tf.transform(test[text_column])

In [None]:
# Feature matrices: densify the sparse TF-IDF output into DataFrames.
Xtrain, Xtest = (pd.DataFrame(matrix.toarray()) for matrix in (train_tf, test_tf))

# Targets: taken by position from the fifth column of each split.
# NOTE(review): presumably column 4 is the `suicidal` label -- confirm against the CSV layout.
ytrain, ytest = (split.iloc[:, 4] for split in (train, test))

In [None]:
# evaluation
def fit_predict(model, Xtrain, ytrain, Xtest, ytest):
    """Train ``model`` and report how it performs on the test split.

    Fits ``model`` on ``(Xtrain, ytrain)``, predicts labels for ``Xtest`` and
    prints a classification report followed by a confusion matrix, both
    computed against ``ytest``.

    Returns:
        The predictions produced by ``model.predict(Xtest)``.
    """
    model.fit(Xtrain, ytrain)
    predicted = model.predict(Xtest)

    # Each entry pairs a printed heading with the function that scores the predictions.
    sections = (
        ('Classification report for', classification_report),
        ('Confusion matrix for', metrics.confusion_matrix),
    )
    for heading, scorer in sections:
        print(heading, model, ':')
        print(scorer(ytest, predicted))

    return predicted

# Fit each candidate classifier on the training split and print its test-set
# classification report and confusion matrix via fit_predict.

# logistic regression
lr = LogisticRegression()
lr_y_hat = fit_predict(lr, Xtrain, ytrain, Xtest, ytest)

# SVM
svm = SVC(kernel='linear')
svm_y_hat = fit_predict(svm, Xtrain, ytrain, Xtest, ytest)

# random forest
# Seeded with the same `seed` used for the train/test split; an unseeded forest
# produces a different model (and different metrics) on every run.
rf = RandomForestClassifier(random_state=seed)
rf_y_hat = fit_predict(rf, Xtrain, ytrain, Xtest, ytest)

In [None]:
# plot roc curves
# All three fitted classifiers are drawn on one shared axis for comparison.
# `metrics.plot_roc_curve` was removed in scikit-learn 1.2;
# `RocCurveDisplay.from_estimator` (scikit-learn >= 1.0) replaces it and takes
# the same (estimator, X, y, ax=...) arguments.
fig, ax = plt.subplots(figsize=(8, 5))
for fitted_model in (lr, svm, rf):
    metrics.RocCurveDisplay.from_estimator(fitted_model, Xtest, ytest, ax=ax)
ax.set_title('ROC curves on the test split');

In [None]:
# Interactive (plotly) version of the logistic-regression ROC curve shown above.
# This won't run on colab, but it's not important.
# citation for plotting ROC/AUC code: https://towardsdatascience.com/an-understandable-guide-to-roc-curves-and-auc-and-why-and-when-to-use-them-92020bc4c5c1

# Second column of predict_proba is used as the score for the ROC curve.
y_pred_proba = lr.predict_proba(Xtest)[:, 1]
fpr, tpr, thresh = metrics.roc_curve(ytest, y_pred_proba)

# One row per threshold, so hovering over the curve can show the cut-off.
roc_df = pd.DataFrame({"FPR": fpr, "TPR": tpr, "Threshold": thresh})
display(roc_df)

fig = px.area(roc_df, x='FPR', y='TPR', hover_data=['Threshold'])
fig.show()

In [None]:
# Persist the fitted models and the TF-IDF vectorizer as pickles, then write
# the selected vocabulary as plain text, one word per line.
pickled_artifacts = {
    'model_logistic_regression_3000.pkl': lr,
    'model_svm_3000.pkl': svm,
    'model_random_forest_3000.pkl': rf,
    'tfidf_3000.pkl': tf,
}
for filename, artifact in pickled_artifacts.items():
    with open(filename, 'wb') as file:
        pickle.dump(artifact, file)

with open('important_words_3000.txt', 'w') as file:
    file.write('\n'.join(important_word_list))