Feature extraction analysis using a dataset of Reddit comments. The goal is to determine the best feature-extraction method by comparing the accuracy of several methods on this dataset. The dataset has a 'text' column, which holds the raw text of each Reddit comment, and a 'label' column (the target), which indicates whether or not the author has depression.

In [None]:
# import libraries
import gensim
import pandas as pd
import numpy as np

In [None]:
# read in dataset
# NOTE(review): the path is relative, so "mental_health.csv" must sit in the
# notebook's working directory for this cell to run
df = pd.read_csv("mental_health.csv")
# preview the first rows to confirm the 'text' and 'label' columns loaded
df.head()

Unnamed: 0,text,label
0,dear american teens question dutch person hear...,0
1,nothing look forward lifei dont many reasons k...,1
2,music recommendations im looking expand playli...,0
3,im done trying feel betterthe reason im still ...,1
4,worried year old girl subject domestic physic...,1


In [None]:
# explore data: count the distinct values in each column
df.nunique()

text     27972
label        2
dtype: int64

In [None]:
# (rows, columns) of the frame. In the saved outputs there are 27977 rows but
# only 27972 distinct 'text' values, so a handful of rows are presumably
# duplicates (or missing values) -- TODO confirm
df.shape

(27977, 2)

In [None]:
# the raw comment text: this is the column the preprocessing below operates on
df["text"]

0        dear american teens question dutch person hear...
1        nothing look forward lifei dont many reasons k...
2        music recommendations im looking expand playli...
3        im done trying feel betterthe reason im still ...
4        worried  year old girl subject domestic physic...
                               ...                        
27972    posting everyday people stop caring  religion ...
27973    okay definetly need hear guys opinion ive pret...
27974    cant get dog think ill kill myselfthe last thi...
27975    whats point princess bridei really think like ...
27976    got nudes person might might know snapchat do ...
Name: text, Length: 27977, dtype: object

In [None]:
# look at the first raw comment; the next cell uses this same text to show
# what gensim's simple_preprocess does to it

df.text[0]

'dear american teens question dutch person heard guys get way easier things learn age us sooooo thth graders like  right guys learn math'

In [None]:
# demonstrate gensim's simple_preprocess on the first comment
# not strictly necessary here: the data is already cleaned (all lowercase, stop words gone, punctuation gone etc.)
# it is still useful because it splits each comment into a list of word tokens
# the input is read from the frame rather than pasted in as a literal, so the
# example cannot drift from the dataset
gensim.utils.simple_preprocess(df.text[0])

['dear',
 'american',
 'teens',
 'question',
 'dutch',
 'person',
 'heard',
 'guys',
 'get',
 'way',
 'easier',
 'things',
 'learn',
 'age',
 'us',
 'sooooo',
 'thth',
 'graders',
 'like',
 'right',
 'guys',
 'learn',
 'math']

In [None]:
# tokenize every comment: each string in the 'text' column becomes a list of tokens
processed_text = df["text"].apply(gensim.utils.simple_preprocess)

In [None]:
# display processed text: each entry is now a list of tokens rather than one string
processed_text

0        [dear, american, teens, question, dutch, perso...
1        [nothing, look, forward, lifei, dont, many, re...
2        [music, recommendations, im, looking, expand, ...
3        [im, done, trying, feel, betterthe, reason, im...
4        [worried, year, old, girl, subject, domestic, ...
                               ...                        
27972    [posting, everyday, people, stop, caring, reli...
27973    [okay, definetly, need, hear, guys, opinion, i...
27974    [cant, get, dog, think, ill, kill, myselfthe, ...
27975    [whats, point, princess, bridei, really, think...
27976    [got, nudes, person, might, might, know, snapc...
Name: text, Length: 27977, dtype: object

In [None]:
# create an untrained Word2Vec model; no corpus is supplied here, so the
# vocabulary is built and the model is trained explicitly in the cells below
w2v_model = gensim.models.Word2Vec(window=5, min_count=2, workers=4)

In [None]:
# build vocabulary (unique list of words) from the tokenized comments;
# progress_per controls how often gensim logs progress while scanning the corpus
w2v_model.build_vocab(processed_text, progress_per=1000)

In [None]:
# number of training epochs stored on the model; none was passed to the
# constructor, so this is the library default
w2v_model.epochs

5

In [None]:
# number of documents counted by build_vocab; in the saved output it equals
# the number of rows in df
w2v_model.corpus_count

27977

In [None]:
# train the model on the tokenized comments, reusing the document count
# recorded by build_vocab and the epoch count stored on the model
w2v_model.train(
    processed_text,
    total_examples=w2v_model.corpus_count,
    epochs=w2v_model.epochs,
)

(9085551, 9988215)

In [None]:
# save model to disk (relative path, so it is written to the working directory);
# it can be reloaded later with gensim.models.Word2Vec.load("word2vec.model")
w2v_model.save("word2vec.model")

In [None]:
# sanity-check the embeddings by eye: list the words whose vectors are closest to "sad"
# (the result is a list of (word, similarity score) pairs)
w2v_model.wv.most_similar("sad")

[('lonely', 0.789612889289856),
 ('angry', 0.7740035057067871),
 ('upset', 0.7641036510467529),
 ('depressed', 0.7409121990203857),
 ('genuinely', 0.723389744758606),
 ('confused', 0.722817599773407),
 ('happy', 0.7174826860427856),
 ('jealous', 0.7104412317276001),
 ('guilty', 0.7059646248817444),
 ('honestly', 0.7045464515686035)]

In [None]:
# similarity between two specific words; in the saved outputs this is the same
# score that most_similar("sad") reports for "depressed"
w2v_model.wv.similarity("sad", "depressed")

0.7409122

In [None]:
# show the learned word vector for "sad" (displayed as a numpy array of floats)
w2v_model.wv['sad']

array([ 1.0104666 , -0.93931943,  0.01189941, -0.94821906,  0.19007432,
       -0.2183634 ,  0.30052337, -1.4365295 ,  1.0659277 ,  1.0451627 ,
        0.3271546 , -0.55468476, -1.4237788 , -0.5777288 , -0.7728567 ,
        0.46439722,  0.5898041 ,  1.1547302 , -0.5790551 ,  0.49785003,
        0.07761248,  0.6651478 , -0.03519844,  0.1224975 ,  1.0650253 ,
       -0.62421256, -0.33112928,  0.4296295 ,  0.19804901,  0.20989165,
       -0.18744887,  0.76746124,  0.84889954,  0.7750336 , -1.8260312 ,
        0.41376936,  2.7407935 , -1.7676637 ,  0.8710339 ,  0.10614733,
        1.5761476 , -0.33825204,  0.51509243,  2.3070486 , -1.0409973 ,
       -1.3344456 , -0.50425285, -0.6565652 ,  0.26839003, -0.27152115,
       -0.5998513 , -0.24119103,  1.3804125 ,  0.5864011 , -0.6450711 ,
       -0.94272566,  0.90675145,  0.07676438, -0.20897162, -0.65528303,
        0.0808342 , -0.48816195,  0.92590714, -0.68288743,  0.30407584,
        1.009887  , -0.8856558 ,  0.2819892 ,  0.24609473, -1.03