In [1]:
# import nltk library
import nltk; nltk.download('punkt')
from nltk import sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.treebank import TreebankWordTokenizer

# import stopword libraries
nltk.download('stopwords'); from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

# import other libraries
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import *
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

# import word embedding library
#import glove_helper

# import helper libraries
import collections
from common import utils, vocabulary

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/benthompson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/benthompson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [2]:
#read in tweets
df = pd.DataFrame.from_csv('depression_tweets.csv', header=None, parse_dates=True, infer_datetime_format=True)

In [1]:
#add index
df = df.reset_index()

#set column names
df.columns = ['date','tweet_id', 'handle', 'id', 'tweet', 'language', 'device', 'notes', 'notes_2']

NameError: name 'df' is not defined

In [4]:
#look at data
df.head(5)

Unnamed: 0,date,tweet_id,handle,id,tweet,language,device,notes,notes_2
0,2018-04-05 19:14:48,981973445616525312,Haldol,816793117785542656,Currently I am on 150 mg of hydroxyzine for in...,en,Twitter for iPhone,,
1,2018-04-05 19:14:48,981973444723064832,Rick O,3192532759,Integrated behavioral health for POLICE. Treat...,en,Twitter for iPhone,,
2,2018-04-05 19:14:47,981973443988996096,olivia 🧝🏽‍♀️ボス,1321438920,RT @DevinnJay: I won’t allow depression to fuc...,en,Twitter for iPhone,,
3,2018-04-05 19:14:47,981973443154505728,LeFrenchNeuropsy,2887994266,RT @LePsylab: For science ! Un questionnaire p...,fr,Twitter Web Client,,
4,2018-04-05 19:14:45,981973435705421826,GEEZ,311289251,I lost my brova I fell deep in depression!,en,Twitter for Android,,


In [5]:
#how man non-distinct tweets
len(df)

29997

In [6]:
#filter to english only
df = df[df['language'] == 'en']

In [7]:
#how many tweets now
len(df)

28243

In [9]:
#any users w/lots of tweets that might skew model?
#not any that seem too high
df['handle'].value_counts().head(5)

.                       79
Aiden Hatfield          39
In Music We Trust       34
Allen Y. Tien MD MHS    26
✨                       25
Name: handle, dtype: int64

In [10]:
#how many distinct tweets
len(df.tweet.unique())

11912

In [11]:
#make distinct tweets the df
df = pd.DataFrame(df.tweet.unique())

In [12]:
#rename columns
df.columns = ['tweets']

In [13]:
df.head(5)

Unnamed: 0,tweets
0,Currently I am on 150 mg of hydroxyzine for in...
1,Integrated behavioral health for POLICE. Treat...
2,RT @DevinnJay: I won’t allow depression to fuc...
3,I lost my brova I fell deep in depression!
4,RT @peachesfrfr: so there i am depression all...


In [14]:
#get specific tweet
df.get_value(5,'tweets')

'RT @techreview: A neural network can detect depression and mania in bipolar subjects by analyzing how they hold and tap on their smartphone…'

In [15]:
#preprocess tweets
example_text="""'RT @techreview: A neural network can 
detect depression and mania in bipolar subjects 
by analyzing how they hold and tap on their smartphone…'"""

# tokenize
def tokenize_text(input_text):
    """
    Args: 
    input_text: a string representing an 
    individual review
        
    Returns:
    input_token: a list containing stemmed 
    tokens, with punctutations removed, for 
    an individual review
        
    """
    input_tokens=[]
        
    # Split sentence
    sents=sent_tokenize(input_text)
            
    # Split word
    for sent in sents:
        input_tokens+=TreebankWordTokenizer().tokenize(sent)
        
    return input_tokens


# canonicalize
def canonicalize_tokens(input_tokens):
    """
    Args:
    input_tokens: a list containing tokenized 
    tokens for an individual review
    
    Returns:
    input_tokens: a list containing canonicalized 
    tokens for an individual review
    
    """
    input_tokens=utils.canonicalize_words(input_tokens)
    return input_tokens


# preprocessor 
def preprocessor(raw_text):
    """
    Args:
    raw_text: a string representing an
    individual review
    
    Returns:
    preprocessed_text: a string representing 
    a preprocessed individual review
    
    """
    # tokenize
    tokens=tokenize_text(raw_text)
    
    # canonicalize
    canonical_tokens=canonicalize_tokens(tokens)
    
    # rejoin string
    preprocessed_text=(" ").join(canonical_tokens) 
    return preprocessed_text

# example data
#input_tokens=tokenize_text(example_text)
#print(input_tokens)

#canonical_tokens=canonicalize_tokens(input_tokens)
#print(canonical_tokens)

preprocessed_text=preprocessor(example_text) 
print(preprocessed_text)

'rt @ techreview : a neural network can detect depression and mania in bipolar subjects by analyzing how they hold and tap on their smartphone… '


In [16]:
# examine stopwords

# sklearn stopwords (frozenset)
sklearn_stopwords=stop_words.ENGLISH_STOP_WORDS
print("number of sklearn stopwords: %d" %(len(sklearn_stopwords)))
#print(sklearn_stopwords)

# nltk stopwords (list)
nltk_stopwords=stopwords.words("english")
print("number of nltk stopwords: %d" %(len(nltk_stopwords)))
#print(nltk_stopwords)

# combined sklearn, nltk, other stopwords (set)
total_stopwords=set(list(sklearn_stopwords.difference(set(nltk_stopwords)))+nltk_stopwords)

other_stopwords=["DG", "DGDG", "@", "rt", "'rt", "'", ":", "depression"]
for w in other_stopwords:
    total_stopwords.add(w)
    
print("number of total stopwords: %d" %(len(total_stopwords)))

number of sklearn stopwords: 318
number of nltk stopwords: 179
number of total stopwords: 386


In [17]:
#look at review w/o stop words
new_review = []
for i in preprocessed_text.split():
    if i in total_stopwords:
        continue
    else:
        new_review.append(i)
        
print(new_review)

['techreview', 'neural', 'network', 'detect', 'mania', 'bipolar', 'subjects', 'analyzing', 'hold', 'tap', 'smartphone…']


In [18]:
#use tf-idf as baseline for keywords
vec=TfidfVectorizer(preprocessor=preprocessor, ngram_range=(1,2), stop_words=total_stopwords, max_features=10000)
vec_train_data=vec.fit_transform(df['tweets']) 

In [19]:
#shape of train data
vec_train_data.shape

(11912, 10000)

In [20]:
#pull max ids, convert sparse matrix
#to be able to pull data from it
tf_idf_weights = vec_train_data.toarray()

In [21]:
#look at shape
tf_idf_weights.shape

(11912, 10000)

In [22]:
#get index of top N weights for each review
weight_indx = np.argsort(tf_idf_weights)[:, -5:]

In [23]:
#look at shape
weight_indx.shape

(11912, 5)

In [24]:
#convert vec.get_features() to dict for efficiency
look_up = collections.defaultdict()
for i,j in enumerate(vec.get_feature_names()):
    look_up[i] = j

In [25]:
#create lists to add to df
key_word_idx_one = []
key_word_idx_two = []
key_word_idx_three = []
key_word_idx_four = []
key_word_idx_five = []

for j in weight_indx:
    one = j[0]
    two = j[1]
    three = j[2]
    four = j[3]
    five = j[4]

    key_word_idx_one.append(one)
    key_word_idx_two.append(two)
    key_word_idx_three.append(three)
    key_word_idx_four.append(four)
    key_word_idx_five.append(five)
        
# print('keyword 1: ', key_word_idx_one)

key_word_one = []
key_word_two = []
key_word_three = []
key_word_four = []
key_word_five = []

for a,b,c,d,e in zip(key_word_idx_one, key_word_idx_two, key_word_idx_three, key_word_idx_four, key_word_idx_five):
    key_word_one.append(look_up[a])
    key_word_two.append(look_up[b])
    key_word_three.append(look_up[c])
    key_word_four.append(look_up[d])
    key_word_five.append(look_up[e])

In [26]:
df['keyword_1'] = key_word_one
df['keyword_2'] = key_word_two
df['keyword_3'] = key_word_three
df['keyword_4'] = key_word_four
df['keyword_5'] = key_word_five

In [27]:
#check columns
df.head(3)

Unnamed: 0,tweets,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5
0,Currently I am on 150 mg of hydroxyzine for in...,insomnia,currently,psychotic,DGDGDG,mg
1,Integrated behavioral health for POLICE. Treat...,police,te,amp,behavioral,te https
2,RT @DevinnJay: I won’t allow depression to fuc...,fuck,set,allow,nah,fuck amp


In [28]:
def get_random_review(df, rand_num):
    #pull specific example
    print('tweet: ', df['tweets'][rand_num])
    print('key words: ', df['keyword_1'][rand_num]
          , ',', df['keyword_2'][rand_num]
          , ',', df['keyword_3'][rand_num]
         , ',', df['keyword_4'][rand_num]
         , ',', df['keyword_5'][rand_num])

In [29]:
#get random reivew to examine tags
get_random_review(df, np.random.randint(0, len(df)))

tweet:  I'm a rockstar mama  Only fragrance on me Is the smell of marijuana  Got the drugs and the problems  I'm a rockstar… https://t.co/Ckk6hHBVHj
key words:  problems , drugs , marijuana , smell , rockstar


In [30]:
#create df of just keyword columns
count_words = df.iloc[:,1:7]

In [31]:
#pivot df and count
count_words.stack().value_counts().head(20)

영상편집                1411
https                997
fucking              858
fucking annoying     851
fucked               608
fucked workflow      460
fuckin               413
fucking ass          388
fuckdepression       353
anxiety              204
cured                177
fucken               137
fucking bitch        130
real                 119
like                 107
fucking cool          90
post                  90
people                89
amp                   89
know                  80
dtype: int64