## Depression in Tweets

In [2]:
# import nltk library
import nltk; nltk.download('punkt')
from nltk import sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.treebank import TreebankWordTokenizer

# import stopword libraries
nltk.download('stopwords'); from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

# import other libraries
import pandas as pd
import numpy as np
import string
#from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import *
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
#from sklearn.grid_search import GridSearchCV

# import word embedding library
#import glove_helper

# import helper libraries
import collections
from common import utils, vocabulary

#display multiple results per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#export models
from sklearn.externals import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/benthompson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/benthompson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [76]:
#read in reddit comments
df = pd.read_csv('../depression_subreddit_nondeleted_201801_06.csv')

In [77]:
#look at data
df.head()

Unnamed: 0,body
0,I was on 20mg fluoxetine for a while and it di...
1,"Wonderful. You are the poet amongst fools, you..."
2,&gt;Women dont like depressed n***ers \n\n\n\n...
3,Whoosh
4,I definitely hear this. I'm sorry you're feeli...


In [78]:
#how many non-distinct comments
len(df)

311132

In [79]:
#create column on 1's
x = [1]
x = x * len(df)
df['target'] = x

In [80]:
#make all lowercase
df['body'] = df['body'].str.lower()

In [81]:
df.head(5)

Unnamed: 0,body,target
0,i was on 20mg fluoxetine for a while and it di...,1
1,"wonderful. you are the poet amongst fools, you...",1
2,&gt;women dont like depressed n***ers \n\n\n\n...,1
3,whoosh,1
4,i definitely hear this. i'm sorry you're feeli...,1


## Bring in random Reddit Comments

In [82]:
#read in random comments
df_2 = pd.read_csv('../random_comments_20181011', usecols=[0], header=None)

In [83]:
#look at data
df_2.head()

Unnamed: 0,0
0,body
1,[removed]
2,Pussy
3,Ashamed to say I expected two girls here...
4,Arrested not prison


In [84]:
#how many
len(df_2)

668065

In [85]:
#give column a name
df_2.columns = ['body']

In [86]:
#how many distinct comments
len(df_2.body.unique())

614809

In [87]:
#Make dataframe of unique
df_2 = pd.DataFrame(df_2.body.unique())

#give column name
df_2.columns = ['body']

In [88]:
#make all comments lowercase
df_2['body'] = df_2['body'].str.lower()
df_2.columns = ['body']

In [89]:
df_2.head()

Unnamed: 0,body
0,body
1,[removed]
2,pussy
3,ashamed to say i expected two girls here...
4,arrested not prison


In [90]:
#how many a [removed]
df_2[df_2['body'] == '[removed]']

#drop those 3
df_2.drop(df_2[df_2['body'] == '[removed]'].index, inplace=True)

Unnamed: 0,body
1,[removed]
399697,[removed]
494744,[removed]


In [93]:
#remove nans
df_2.dropna(inplace=True)
df.dropna(inplace=True)

In [97]:
#check for comments that use depression
df_2[(df_2['body'].str.contains('depressed') | df_2['body'].str.contains('depression'))]

#drop them
df_2.drop(df_2[(df_2.body.str.contains('depressed')) | (df_2.body.str.contains('depression'))].index, inplace=True)

Unnamed: 0,body
1768,your tears will be fun on election night. i ho...
2244,"not op, but not everyone can work full time. w..."
4870,it feels like plastic but it could be impeccab...
4902,you're being much more civil than i thought yo...
6203,i've never gotten so depressed so quickly.
7003,"please, explain those reasons. i'm sure they'r..."
7898,she became extremely depressed upon entering t...
8289,you can't easily make a gun in your house. dru...
8758,&gt; the unfriending left rachael feeling depr...
11036,"well, depression is a mental disorder, so yes...."


In [98]:
#recheck length
len(df_2)

613962

In [99]:
#column of 0's
x = 0
x = x * len(df_2)

df_2['target'] = x

In [100]:
#balance classes
df_3 = df_2.sample(n=len(df))

In [101]:
df_3.shape

(311130, 2)

In [102]:
#combine dfs
df = pd.concat([df_3,df])

In [103]:
len(df)

622260

In [104]:
df.head()

Unnamed: 0,body,target
289723,&gt;respecting authority\n\nfucking statist,0
302695,mate... you need to calm down. seriously. they...,0
111893,in lane just play a bully and focus her. i tre...,0
117882,how about hockey? because i follow the rangers...,0
282836,even the black cops are not exempt from racism...,0


In [105]:
#preprocess tweets
example_text="""'RT @techreview: A neural network can 
detect depression and mania in bipolar subjects 
by analyzing how they hold and tap on their smartphone…'"""

# tokenize
def tokenize_text(input_text):
    """
    Args: 
    input_text: a string representing an 
    individual review
        
    Returns:
    input_token: a list containing stemmed 
    tokens, with punctutations removed, for 
    an individual review
        
    """
    input_tokens=[]
        
    # Split sentence
    sents=sent_tokenize(input_text)
            
    # Split word
    for sent in sents:
        input_tokens+=TreebankWordTokenizer().tokenize(sent)
        
    return input_tokens


# canonicalize
def canonicalize_tokens(input_tokens):
    """
    Args:
    input_tokens: a list containing tokenized 
    tokens for an individual review
    
    Returns:
    input_tokens: a list containing canonicalized 
    tokens for an individual review
    
    """
    input_tokens=utils.canonicalize_words(input_tokens)
    return input_tokens


# preprocessor 
def preprocessor(raw_text):
    """
    Args:
    raw_text: a string representing an
    individual review
    
    Returns:
    preprocessed_text: a string representing 
    a preprocessed individual review
    
    """
    # tokenize
    tokens=tokenize_text(raw_text)
    
    # canonicalize
    canonical_tokens=canonicalize_tokens(tokens)
    
    # rejoin string
    preprocessed_text=(" ").join(canonical_tokens) 
    return preprocessed_text

# example data
#input_tokens=tokenize_text(example_text)
#print(input_tokens)

#canonical_tokens=canonicalize_tokens(input_tokens)
#print(canonical_tokens)

preprocessed_text=preprocessor(example_text) 
print(preprocessed_text)

'rt @ techreview : a neural network can detect depression and mania in bipolar subjects by analyzing how they hold and tap on their smartphone… '


In [106]:
# examine stopwords

# sklearn stopwords (frozenset)
sklearn_stopwords=stop_words.ENGLISH_STOP_WORDS
print("number of sklearn stopwords: %d" %(len(sklearn_stopwords)))
#print(sklearn_stopwords)

# nltk stopwords (list)
nltk_stopwords=stopwords.words("english")
print("number of nltk stopwords: %d" %(len(nltk_stopwords)))
#print(nltk_stopwords)

# combined sklearn, nltk, other stopwords (set)
total_stopwords=set(list(sklearn_stopwords.difference(set(nltk_stopwords)))+nltk_stopwords)

other_stopwords=["DG", "DGDG", "@", "rt", "'rt", "'", ":", "depression", "depressed", "RT"]
for w in other_stopwords:
    total_stopwords.add(w)
    
print("number of total stopwords: %d" %(len(total_stopwords)))

number of sklearn stopwords: 318
number of nltk stopwords: 179
number of total stopwords: 388


In [107]:
#look at review w/o stop words
new_review = []
for i in preprocessed_text.split():
    if i in total_stopwords:
        continue
    else:
        new_review.append(i)
        
print(new_review)

['techreview', 'neural', 'network', 'detect', 'mania', 'bipolar', 'subjects', 'analyzing', 'hold', 'tap', 'smartphone…']


In [108]:
#reset index
df = df.reset_index(drop=True)

In [109]:
#split into test, train before sampling to belance
# using recoded labels
#create train, test data
df['is_train'] = np.random.uniform(0,1, len(df)) <= .8

train_data, test_data = df[df['is_train'] == True], df[df['is_train'] == False]

# examine train, test shapes
print("train, test set size: %d, %d" %(len(train_data), len(test_data)))
print("")

# examine train set examples
print("example:")
print("body: %s" %(train_data.get_value(10,'body')))
print("label: %s" %(train_data.get_value(10,'target')))

train, test set size: 497783, 124477

example:
body: oh man,
i'm a big fan of you by bad religion

label: 0


  
  from ipykernel import kernelapp as app


In [110]:
#check class balance
train_data['target'].value_counts()

1    248953
0    248830
Name: target, dtype: int64

In [111]:
print("example:")
print("body: %s" %(train_data.get_value(32,'body')))
print("label: %s" %(train_data.get_value(32,'target')))

example:
body: yea i don't really need her for my current team. i barely have servants that can really benefit from her.
label: 0


  
  This is separate from the ipykernel package so we can avoid doing imports until


## Logistic Regression

In [112]:
#build tf-idf model
vec=TfidfVectorizer(preprocessor=preprocessor, ngram_range=(1,3), stop_words=total_stopwords, max_features=10000)
vec_train_data=vec.fit_transform(train_data['body']) 
vec_test_data=vec.transform(test_data['body']) 

  sorted(inconsistent))


In [113]:
# train Logistic Regression
logit=LogisticRegression(penalty='l2')
logit.fit(vec_train_data, train_data['target'])
pred_labels=logit.predict(vec_test_data)
    
# assess model
f1=f1_score(test_data['target'], pred_labels, average="weighted") 
accuracy=accuracy_score(test_data['target'], pred_labels)
confusion=confusion_matrix(test_data['target'], pred_labels)
print("logistic regression f1 score: %.3f" %(f1))
print("logistic regression accuracy score: %.3f" %(accuracy))
print("logistic regression confusion matrix:")
print(confusion)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

logistic regression f1 score: 0.852
logistic regression accuracy score: 0.852
logistic regression confusion matrix:
[[54567  7733]
 [10668 51509]]


In [114]:
#get top words
#look at top 5 weights for each class
#get coefficients for all features
coef_sq = logit.coef_

#get index of top 5 absolute values for each class
weight_indx = np.argsort(coef_sq)[:, -20:]

#flatten so can use to look up wieghts
weight_indx = weight_indx.flatten()

#get coefficients based on index
weights = coef_sq[:, weight_indx]
 
#get words that match weights based on index
vocab = np.array(vec.get_feature_names())[weight_indx]

# make table
df = pd.DataFrame({'Weights of words that predict depression': weights[0]}
                  , index=vocab)
df

Unnamed: 0,Weights of words that predict depression
illness,4.845405
lonely,4.97243
medication,5.132374
feeling,5.180178
relatable,5.679322
depressive,5.791177
antidepressants,5.845671
psychiatrist,5.908124
happiness,5.969603
life,6.215687


In [115]:
#try to make up an example journal
journal = """Today was wonderful. I had a strange interaction at the store. 
The cashier seemed irratated. I'm not sure what's going on but it makes me feel weird"""

#score test journal
vec_test_example=vec.transform([journal]) 
print("probability of class 0 and 1: ",logit.predict_proba(vec_test_example))

#get words and weights from test journal
word_idx = np.nonzero(vec_test_example)[1]
vocab = np.array(vec.get_feature_names())[word_idx]
weights = coef_sq[:, word_idx]
df = pd.DataFrame({'Weights of words in sample Journal': weights[0]}
                  , index=vocab)
df.sort_values(by='Weights of words in sample Journal')

probability of class 0 and 1:  [[ 0.18187016  0.81812984]]


Unnamed: 0,Weights of words in sample Journal
store,-1.041158
sure going,-0.405577
weird,0.312819
strange,0.366745
going,0.545399
sure,0.555881
makes,0.574439
wonderful,0.83282
interaction,1.246184
today,1.419033


In [127]:
#export tfidf model
tfidf_file = 'tfidf_exported_model'
joblib.dump(vec, tfidf_file)

['tfidf_exported_model']

In [128]:
#export logistic regression
logistic_regression_file = 'logistic_regression_model'
joblib.dump(logit, logistic_regression_file)

['logistic_regression_model']

In [129]:
#test out exported models against prev sample journal
loaded_tfidf = joblib.load('tfidf_exported_model')
loaded_lr = joblib.load('logistic_regression_model')

#score test journal
export_test_example=loaded_tfidf.transform([journal]) 
print("probability of class 0 and 1: ",loaded_lr.predict_proba(export_test_example))


probability of class 0 and 1:  [[ 0.43736519  0.56263481]]
