## Depression in Tweets

In [1]:
# import nltk library
import nltk; nltk.download('punkt')
from nltk import sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.treebank import TreebankWordTokenizer

# import stopword libraries
nltk.download('stopwords'); from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

# import other libraries
import pandas as pd
import numpy as np
import string
#from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import *
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

# import word embedding library
#import glove_helper

# import helper libraries
import collections
from common import utils, vocabulary

#display multiple results per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#export models
from sklearn.externals import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/benthompson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/benthompson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [2]:
#read in tweets
df = pd.DataFrame.from_csv('/Users/benthompson/depression_tweets.csv', header=None, parse_dates=True, infer_datetime_format=True)

In [3]:
#add index
df = df.reset_index()

#set column names
df.columns = ['date','tweet_id', 'handle', 'id', 'tweet', 'language', 'device', 'notes', 'notes_2']

In [4]:
#look at data
df.head(5)

Unnamed: 0,date,tweet_id,handle,id,tweet,language,device,notes,notes_2
0,2018-04-05 19:14:48,981973445616525312,Haldol,816793117785542656,Currently I am on 150 mg of hydroxyzine for in...,en,Twitter for iPhone,,
1,2018-04-05 19:14:48,981973444723064832,Rick O,3192532759,Integrated behavioral health for POLICE. Treat...,en,Twitter for iPhone,,
2,2018-04-05 19:14:47,981973443988996096,olivia 🧝🏽‍♀️ボス,1321438920,RT @DevinnJay: I won’t allow depression to fuc...,en,Twitter for iPhone,,
3,2018-04-05 19:14:47,981973443154505728,LeFrenchNeuropsy,2887994266,RT @LePsylab: For science ! Un questionnaire p...,fr,Twitter Web Client,,
4,2018-04-05 19:14:45,981973435705421826,GEEZ,311289251,I lost my brova I fell deep in depression!,en,Twitter for Android,,


In [5]:
#how man non-distinct tweets
len(df)

2839435

In [6]:
#filter to english only
df = df[df['language'] == 'en']

In [7]:
#how many tweets now
len(df)

2576632

In [8]:
#any users w/lots of tweets that might skew model?
#not any that seem too high
df['handle'].value_counts().head(5)

.                    5508
Aiden Hatfield       3004
ً                    2653
In Music We Trust    2258
♡                    1700
Name: handle, dtype: int64

In [9]:
#how many distinct tweets
len(df.tweet.unique())

1037577

In [10]:
#make distinct tweets the df
df = pd.DataFrame(df.tweet.unique())

In [11]:
#rename columns
df.columns = ['tweets']

In [12]:
#export sample to check quality
# pd.options.display.max_colwidth = 1000
# df_sample = df.sample(n=100)
# df_sample.to_csv('../sample_100_depression_tweets.csv')

In [13]:
#look up specific tweet
pd.options.display.max_colwidth = 10000
df.iloc[45055]

tweets    Y’all be using mental illness as a way to justify your bitchy behavior and it’s honestly a no from fucking me. Depr… https://t.co/YzlhNSV9RA
Name: 45055, dtype: object

In [14]:
#create column on 1's
x = [1]
x = x * len(df)
df['target'] = x

In [15]:
df.head(5)

Unnamed: 0,tweets,target
0,Currently I am on 150 mg of hydroxyzine for insomnia. As well as 300 mg Effexor XR and 4 mg Fanapt for Psychotic Depression.,1
1,Integrated behavioral health for POLICE. Treat mind &amp; body. The organizational &amp; operational stressors have long te… https://t.co/MnWDFqckQB,1
2,RT @DevinnJay: I won’t allow depression to fuck me up &amp; set me back. Nah not again.,1
3,I lost my brova I fell deep in depression!,1
4,RT @peachesfrfr: so there i am depression all over my titties,1


## Bring in random tweets

In [16]:
#read in tweets
df_2 = pd.DataFrame.from_csv('../random_tweets.csv', header=None)

In [17]:
#look at data
df_2.head()

True but she still cancelled tho.
"RT @roxxxdoxxx: when she said ""i gotta ask first"" i felt that 😫😅😂 https://t.co/BGPZqFLb9v"
appreciate this perfectly timed pic of me and catto pls https://t.co/GE5poooRcF
one of Beyoncé’s most underrated looks is the one from Jealous. don’t @ me
"""Once you create a system for censoring speech on the grounds that it is 'fake news' (even if it's parody, or sarca… https://t.co/EpuSUaK0UC"


In [18]:
#how many
len(df_2)

135177

In [19]:
#give index
df_2 = df_2.reset_index()

#give column name
df_2.columns = ['tweets']

In [20]:
#how many distinct tweets
len(df_2.tweets.unique())

111985

In [21]:
#Make dataframe of unique
df_2 = pd.DataFrame(df_2.tweets.unique())

#give column name
df_2.columns = ['tweets']

In [22]:
#make all tweets lowercase
df_2['tweets'] = df_2['tweets'].str.lower()
df_2.columns = ['tweets']

In [23]:
df_2.head()

Unnamed: 0,tweets
0,true but she still cancelled tho.
1,"rt @roxxxdoxxx: when she said ""i gotta ask first"" i felt that 😫😅😂 https://t.co/bgpzqflb9v"
2,appreciate this perfectly timed pic of me and catto pls https://t.co/ge5pooorcf
3,one of beyoncé’s most underrated looks is the one from jealous. don’t @ me
4,"""once you create a system for censoring speech on the grounds that it is 'fake news' (even if it's parody, or sarca… https://t.co/epusuak0uc"


In [24]:
#check for tweets that use depression
df_2[(df_2['tweets'].str.contains('depressed') | df_2['tweets'].str.contains('depression'))]

#drop them
df_2.drop(df_2[(df_2.tweets.str.contains('depressed')) | (df_2.tweets.str.contains('depression'))].index, inplace=True)

Unnamed: 0,tweets
1769,"rt @nickhansonmn: hey sorry i’ve been distant lately , i’m just super depressed about the current state of my life and didn’t wanna have to…"
1918,rt @matrix_reioaded: ahh i’m depressed... but all my teachers in elementary school said i was a special boy... how could this happen.. at o…
2568,rt @caucasianjames: on tinder depressed
2763,rt @softyoonle: hi this would really help me alot since i am battling depression and i need something to inspire me or help me 💞 if this fl…
3507,rt @iamsofiadg: philippines 🇵🇭 \nneed someone to talk to?\nsuicide/depression cellphone number \n\n0917-558-4673 📱\n\ncan you rt to potentially…
4159,rt @depressionnote: warning signs of depression ⚠️\n\n⚠️ low self-esteem\n⚠️ guilt\n⚠️ feeling hopeless\n⚠️ tiredness\n⚠️ loss of interest in thi…
4268,"rt @fuxksalliemae: alot of nigerians are struggling with depression, and much of it is financially and economically induced. don't just che…"
4547,rt @daitonreed: you do not have to come this hard on every song. aint no damn reason you needa make me depressed for no reason. https://t.c…
5501,rt @sionesnow: when i read the first sentence i thought there was a thing called “depression fcking” weh lol https://t.co/yup73ftjqc
6143,"i can't help the fact that i make all my characters depressed or not human, because i just realized that alexandria… https://t.co/scwoudncxy"


In [25]:
#recheck length
len(df_2)

111847

In [26]:
#export to check quality
# df_2_sample = df_2.sample(n=100)
# df_2_sample.to_csv('../sample_100_random_tweets.csv')

In [27]:
#column of 0's
x = 0
x = x * len(df_2)

df_2['target'] = x

In [28]:
#balance classes
df_3 = df.sample(n=len(df_2))

In [29]:
# df_3.head()

In [30]:
#combine dfs
df = pd.concat([df_3,df_2])

In [31]:
len(df)

223694

In [32]:
#preprocess tweets
example_text="""'RT @techreview: A neural network can 
detect depression and mania in bipolar subjects 
by analyzing how they hold and tap on their smartphone…'"""

# tokenize
def tokenize_text(input_text):
    """
    Args: 
    input_text: a string representing an 
    individual review
        
    Returns:
    input_token: a list containing stemmed 
    tokens, with punctutations removed, for 
    an individual review
        
    """
    input_tokens=[]
        
    # Split sentence
    sents=sent_tokenize(input_text)
            
    # Split word
    for sent in sents:
        input_tokens+=TreebankWordTokenizer().tokenize(sent)
        
    return input_tokens


# canonicalize
def canonicalize_tokens(input_tokens):
    """
    Args:
    input_tokens: a list containing tokenized 
    tokens for an individual review
    
    Returns:
    input_tokens: a list containing canonicalized 
    tokens for an individual review
    
    """
    input_tokens=utils.canonicalize_words(input_tokens)
    return input_tokens


# preprocessor 
def preprocessor(raw_text):
    """
    Args:
    raw_text: a string representing an
    individual review
    
    Returns:
    preprocessed_text: a string representing 
    a preprocessed individual review
    
    """
    # tokenize
    tokens=tokenize_text(raw_text)
    
    # canonicalize
    canonical_tokens=canonicalize_tokens(tokens)
    
    # rejoin string
    preprocessed_text=(" ").join(canonical_tokens) 
    return preprocessed_text

# example data
#input_tokens=tokenize_text(example_text)
#print(input_tokens)

#canonical_tokens=canonicalize_tokens(input_tokens)
#print(canonical_tokens)

preprocessed_text=preprocessor(example_text) 
print(preprocessed_text)

'rt @ techreview : a neural network can detect depression and mania in bipolar subjects by analyzing how they hold and tap on their smartphone… '


In [33]:
# examine stopwords

# sklearn stopwords (frozenset)
sklearn_stopwords=stop_words.ENGLISH_STOP_WORDS
print("number of sklearn stopwords: %d" %(len(sklearn_stopwords)))
#print(sklearn_stopwords)

# nltk stopwords (list)
nltk_stopwords=stopwords.words("english")
print("number of nltk stopwords: %d" %(len(nltk_stopwords)))
#print(nltk_stopwords)

# combined sklearn, nltk, other stopwords (set)
total_stopwords=set(list(sklearn_stopwords.difference(set(nltk_stopwords)))+nltk_stopwords)

other_stopwords=["DG", "DGDG", "@", "rt", "'rt", "'", ":", "depression", "depressed", "RT"]
for w in other_stopwords:
    total_stopwords.add(w)
    
print("number of total stopwords: %d" %(len(total_stopwords)))

number of sklearn stopwords: 318
number of nltk stopwords: 179
number of total stopwords: 388


In [34]:
#look at review w/o stop words
new_review = []
for i in preprocessed_text.split():
    if i in total_stopwords:
        continue
    else:
        new_review.append(i)
        
print(new_review)

['techreview', 'neural', 'network', 'detect', 'mania', 'bipolar', 'subjects', 'analyzing', 'hold', 'tap', 'smartphone…']


In [35]:
#reset index
df = df.reset_index(drop=True)

In [36]:
#split into test, train before sampling to belance
# using recoded labels
#create train, test data
df['is_train'] = np.random.uniform(0,1, len(df)) <= .8

train_data, test_data = df[df['is_train'] == True], df[df['is_train'] == False]

# examine train, test shapes
print("train, test set size: %d, %d" %(len(train_data), len(test_data))) # train_data: 129023, test_data: 32256
print("")

# examine train set examples
print("example:")
print("tweet: %s" %(train_data.get_value(5,'tweets')))
print("label: %s" %(train_data.get_value(5,'target')))

train, test set size: 179006, 44688

example:
tweet: I know my urge to ghost on everything is the depression but there it is
label: 1


In [37]:
#check class balance
train_data['target'].value_counts()

1    89624
0    89382
Name: target, dtype: int64

In [39]:
print("example:")
print("tweet: %s" %(train_data.get_value(32,'tweets')))
print("label: %s" %(train_data.get_value(32,'target')))

example:
tweet: RT @kittytriplet: @thechew @ABCNetwork I just heard the #chew is being cancelled  I am so upset Please reconsider I am battling depression…
label: 1


## Logistic Regression

In [39]:
#build tf-idf model
vec=TfidfVectorizer(preprocessor=preprocessor, ngram_range=(1,3), stop_words=total_stopwords, max_features=10000)
vec_train_data=vec.fit_transform(train_data['tweets']) 
vec_test_data=vec.transform(test_data['tweets']) 

In [40]:
# train Logistic Regression
logit=LogisticRegression(penalty='l2')
logit.fit(vec_train_data, train_data['target'])
pred_labels=logit.predict(vec_test_data)
    
# assess model
f1=f1_score(test_data['target'], pred_labels, average="weighted") 
accuracy=accuracy_score(test_data['target'], pred_labels)
confusion=confusion_matrix(test_data['target'], pred_labels)
print("logistic regression f1 score: %.3f" %(f1))
print("logistic regression accuracy score: %.3f" %(accuracy))
print("logistic regression confusion matrix:")
print(confusion)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

logistic regression f1 score: 0.819
logistic regression accuracy score: 0.819
logistic regression confusion matrix:
[[19295  3112]
 [ 4950 17293]]


In [70]:
#try Keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [45]:
#create integer encoding of docs
vocab_size = 100
encoded_docs = [one_hot(d, vocab_size) for d in df['tweets']]

In [71]:
#try tokenizer instead
t = Tokenizer()
t.fit_on_texts(df['tweets'])
vocab_t_size = len(t.word_index) + 1

In [72]:
#create sequence
encoded_t_docs = t.texts_to_sequences(df['tweets'])

In [73]:
# pad docs to equals size
pad = 40
# padded_docs = pad_sequences(encoded_docs, maxlen=pad, padding='post')
padded_t_docs = pad_sequences(encoded_t_docs, maxlen=pad, padding='post')

In [74]:
padded_docs[11105]

array([85,  8, 87,  8, 14, 82, 55, 28, 34, 27, 27, 92, 81, 55, 62, 34, 30,
       11, 73, 47, 90, 56, 53, 82,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0], dtype=int32)

In [143]:
from sklearn.model_selection import train_test_split
# X_train,X_test,Y_train,Y_test = train_test_split(padded_docs, df['target'], test_size=.8)
X_train,X_test,Y_train,Y_test = train_test_split(padded_t_docs, df['target'], test_size=.8)

In [144]:
X_train.shape

(44738, 40)

In [141]:
# create the model
embedding_size = 32

model = Sequential()
# model.add(Embedding(vocab_size, embedding_size, input_length=pad))
model.add(Embedding(vocab_t_size, embedding_size, input_length=pad))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 40, 32)            11611520  
_________________________________________________________________
flatten_4 (Flatten)          (None, 1280)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 250)               320250    
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 251       
Total params: 11,932,021
Trainable params: 11,932,021
Non-trainable params: 0
_________________________________________________________________
None


In [142]:
# Fit the model
epochs=3
batch_size=128

history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.2)

# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Train on 35790 samples, validate on 8948 samples
Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 

In [129]:
keras_journal = ["Sometime I feel very alone and anxious"]

In [130]:
encoded_journal = t.texts_to_sequences(keras_journal)

In [131]:
encoded_journal

[[5874, 8, 85, 163, 332, 9, 2197]]

In [132]:
#pad
pad = 40
padded_journal = pad_sequences(encoded_journal, maxlen=pad, padding='post')

In [133]:
padded_journal

array([[5874,    8,   85,  163,  332,    9, 2197,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]], dtype=int32)

In [138]:
ynew = model.predict_proba(padded_journal)



In [140]:
ynew

array([[ 0.44699621]], dtype=float32)

In [51]:
#get top words
#look at top 5 weights for each class
#get coefficients for all features
coef_sq = logit.coef_

#get index of top 5 absolute values for each class
weight_indx = np.argsort(coef_sq)[:, -20:]

#flatten so can use to look up wieghts
weight_indx = weight_indx.flatten()

#get coefficients based on index
weights = coef_sq[:, weight_indx]
 
#get words that match weights based on index
vocab = np.array(vec.get_feature_names())[weight_indx]

# make table
df = pd.DataFrame({'Weights of words that predict depression': weights[0]}
                  , index=vocab)
df

Unnamed: 0,Weights of words that predict depression
naps,5.900655
battling,6.385033
cures,6.390737
suicidal,6.418737
nap,6.499706
sadness,6.534502
clinical,6.537225
mental,6.697283
postpartum,6.7384
suffering,6.996678


In [126]:
#try to make up an example journal
journal = """Today was wonderful. I had a strange interaction at the store. 
The cashier seemed irratated. I'm not sure what's going on but it makes me feel weird"""

#score test journal
vec_test_example=vec.transform([journal]) 
print("probability of class 0 and 1: ",logit.predict_proba(vec_test_example))

#get words and weights from test journal
word_idx = np.nonzero(vec_test_example)[1]
vocab = np.array(vec.get_feature_names())[word_idx]
weights = coef_sq[:, word_idx]
df = pd.DataFrame({'Weights of words in sample Journal': weights[0]}
                  , index=vocab)
df.sort_values(by='Weights of words in sample Journal')

probability of class 0 and 1:  [[ 0.43736519  0.56263481]]


Unnamed: 0,Weights of words in sample Journal
store,-1.171633
sure,-0.457797
interaction,-0.419084
wonderful,-0.374465
strange,0.326909
weird,0.392027
today,0.923644
makes feel,1.083485
going,1.660564
makes,1.798566


In [127]:
#export tfidf model
tfidf_file = 'tfidf_exported_model'
joblib.dump(vec, tfidf_file)

['tfidf_exported_model']

In [128]:
#export logistic regression
logistic_regression_file = 'logistic_regression_model'
joblib.dump(logit, logistic_regression_file)

['logistic_regression_model']

In [129]:
#test out exported models against prev sample journal
loaded_tfidf = joblib.load('tfidf_exported_model')
loaded_lr = joblib.load('logistic_regression_model')

#score test journal
export_test_example=loaded_tfidf.transform([journal]) 
print("probability of class 0 and 1: ",loaded_lr.predict_proba(export_test_example))


probability of class 0 and 1:  [[ 0.43736519  0.56263481]]
