In [56]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from os import path
print('Libraries imported!')

Libraries imported!


In [58]:
data = pd.read_json(path.join("..","data", "reddit_cleaned.json"))

In [59]:
X = data["megatext_clean"] # inputs
y = data["is_suicide"] # output

In [124]:
max_features=70

# 60 40
# 70 30
# 80 20
# visualization
# KNN

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)

In [125]:
tvec_optimised = TfidfVectorizer(
    max_df=0.5, max_features=max_features, min_df=2, ngram_range=(1, 3), stop_words='english')
X_train_tvec = tvec_optimised.fit_transform(X_train).todense()
X_test_tvec = tvec_optimised.transform(X_test).todense()
print(tvec_optimised.fit(X_train).get_feature_names())

['amp', 'anymore', 'away', 'bad', 'best', 'better', 'care', 'college', 'come', 'country', 'dad', 'depressed', 'depression', 'die', 'end', 'exam', 'family', 'father', 'feel like', 'feeling', 'getting', 'girl', 'going', 'good', 'got', 'guy', 'ha', 'happy', 'hard', 'help', 'home', 'im', 'job', 'live', 'living', 'long', 'lot', 'love', 'make', 'mom', 'money', 'month', 'need', 'parent', 'people', 'person', 'point', 'problem', 'really', 'reason', 'said', 'say', 'school', 'shit', 'started', 'study', 'suicide', 'talk', 'tell', 'think', 'thought', 'told', 'tried', 'try', 'used', 'wanted', 'way', 'went', 'work', 'world']


In [126]:
hvec_optimised = HashingVectorizer(
    n_features=max_features, stop_words='english', alternate_sign=False)
X_train_hvec = hvec_optimised.fit_transform(X_train).todense()
X_test_hvec = hvec_optimised.transform(X_test).todense()

In [127]:
# MultinomialNB + TFIDF
mnb = MultinomialNB()
mnb.fit(X_train_tvec, y_train)
taccuracy = mnb.score(X_test_tvec, y_test)

In [128]:
# RF + Hashing
rfc = RandomForestClassifier(random_state=10)
rfc.fit(X_train_hvec, y_train)
haccuracy = rfc.score(X_test_hvec, y_test)

In [129]:
# TFIDF + RF
trfc = RandomForestClassifier(random_state=10)
trfc.fit(X_train_tvec, y_train)
trfcaccuracy = rfc.score(X_test_tvec, y_test)

In [130]:
# Hashing + Multinomial
hmnb = MultinomialNB()
hmnb.fit(X_train_tvec, y_train)
hmnbaccuracy = hmnb.score(X_test_tvec, y_test)

In [131]:
print(f"TfidfVectorizer + MultinomialNB : {taccuracy * 100}")
print(f"HashingVectorizer + RandomForestClassifier : {haccuracy * 100}")
print(f"TfidfVectorizer + RandomForestClassifier : {trfcaccuracy * 100}")
print(f"HashingVectorizer + MultinomialNB : {hmnbaccuracy * 100}")

TfidfVectorizer + MultinomialNB : 68.27586206896552
HashingVectorizer + RandomForestClassifier : 60.0
TfidfVectorizer + RandomForestClassifier : 52.41379310344828
HashingVectorizer + MultinomialNB : 68.27586206896552
