In [1]:
import pickle
import numpy as np
from re import split
import math
from textblob import TextBlob as tb
import nltk
# nltk.download('punkt')

In [16]:
# Load the validation records. The file is opened in a `with` block so the
# handle is closed as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open("emotype_v10_val.p", "rb") as pickle_file:
    data = pickle.load(pickle_file)
print (data[0].keys())
print (len(data))

# m: size of the second axis of the first record's 'encoding'; n: number of records.
_,m = data[0]['encoding'].shape
n = len(data)
print (m)
print (n)

dict_keys(['text', 'outputs', 'label', 'encoding', 'prediction'])
800
128
800


In [18]:
# Map each label to an integer cluster id; ids follow the order of the list below.
label_cluster_dic = {
    label: cluster_id
    for cluster_id, label in enumerate([
        'addiction', 'anxiety', 'autism', 'bipolar',
        'conversation', 'depression', 'happy', 'schizophrenia',
    ])
}

# Tag every record with the cluster id that corresponds to its label.
for i in range(len(data)):
    record = data[i]
    record['cluster'] = label_cluster_dic[record['label']]

In [19]:
def find_text_with_cluster(cluster, records=None):
    '''Return the tokenized text of every record assigned to `cluster`.

    Each matching record's 'text' is stripped, lower-cased and split on runs of
    characters other than ASCII letters and apostrophes. Empty tokens, which
    split() emits when the text starts or ends with a separator (for example
    trailing punctuation), are dropped.

    Args:
        cluster: cluster id to select; compared against each record's 'cluster'.
        records: sequence of dicts with 'cluster' and 'text' keys. Defaults to
            the notebook-level `data`.

    Returns:
        A list holding one list of word tokens per matching record, in record order.
    '''
    if records is None:
        records = data
    return [
        # `if token` filters out the '' produced at a leading/trailing separator.
        [token for token in split("[^a-zA-Z']+", record['text'].strip().lower()) if token]
        for record in records
        if record['cluster'] == cluster
    ]

In [20]:
def tf(word, blob):
    '''computes "term frequency": the number of times word appears in the document blob,
       divided by the total number of words in blob.
       We use TextBlob for breaking up the text into words and getting the word counts.
       Returns 0.0 for a blob that contains no words.'''
    words = blob.words
    total_words = len(words)
    if total_words == 0:
        # An empty document holds no occurrences of any word; avoid dividing by zero.
        return 0.0
    return words.count(word) / total_words

def n_containing(word, bloblist):
    '''counts the documents in bloblist whose word list contains word.'''
    matches = 0
    for document in bloblist:
        if word in document.words:
            matches += 1
    return matches

def idf(word, bloblist):
    '''computes "inverse document frequency" of word over the documents in bloblist:
       the natural log of (number of documents) / (1 + number of documents containing word).
       The more documents contain the word, the lower its idf.
       The +1 keeps the divisor non-zero for a word that appears in no document; as a
       consequence the result is negative when word appears in every document.'''
    total_documents = len(bloblist)
    documents_with_word = n_containing(word, bloblist)
    return math.log(total_documents / (1 + documents_with_word))

def tfidf(word, blob, bloblist):
    '''computes the TF-IDF score of word for the document blob within the corpus bloblist,
       i.e. tf(word, blob) multiplied by idf(word, bloblist).'''
    term_frequency = tf(word, blob)
    inverse_document_frequency = idf(word, bloblist)
    return term_frequency * inverse_document_frequency

In [21]:
# Toy corpus: three overlapping three-word documents.
document1 = tb("one two three")
document2 = tb("two three four")
document3 = tb("three four five")
bloblist = [document1, document2, document3]

# Print the two highest-scoring words of each document.
for doc_number, blob in enumerate(bloblist, start=1):
    print("Top words in document {}".format(doc_number))
    scores = {}
    for word in blob.words:
        scores[word] = tfidf(word, blob, bloblist)
    # Sorting the keys by their score is stable, so ties keep insertion order.
    ranked_words = sorted(scores, key=scores.get, reverse=True)
    for word in ranked_words[:2]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(scores[word], 5)))

Top words in document 1
	Word: one, TF-IDF: 0.13516
	Word: two, TF-IDF: 0.0
Top words in document 2
	Word: two, TF-IDF: 0.0
	Word: four, TF-IDF: 0.0
Top words in document 3
	Word: five, TF-IDF: 0.13516
	Word: four, TF-IDF: 0.0


In [22]:
# Sanity check: 'one' is 1 of the 3 words in this document, so tf should be 1/3.
blob =  tb("one two three")
tf('one', blob)

0.3333333333333333

In [23]:
# Sanity check on the toy corpus: 'two' appears in document1 and document2 only, so expect 2.
n_containing('two', bloblist)

2

In [24]:
idf('three', bloblist) # log(3/(1+3)): 'three' occurs in all 3 toy documents

-0.2876820724517809

In [25]:
# Cross-check: idf of a word present in all 3 documents is log(3 / (1 + 3)) = log(3/4).
math.log(3/4)

-0.2876820724517809

In [26]:
# Build one TextBlob per diagnosis by concatenating every post of that cluster.
# `diagnosis` keeps the label names in the same order as `bloblist`.
bloblist = []
diagnosis = []
for cluster, cluster_id in label_cluster_dic.items():
    print(cluster)
    diagnosis.append(cluster)
    # Each post comes back as a list of tokens; re-join the tokens with spaces.
    list_of_posts = [' '.join(post) for post in find_text_with_cluster(cluster_id)]
    bloblist.append(tb(' '.join(list_of_posts)))

addiction
anxiety
autism
bipolar
conversation
depression
happy
schizophrenia


In [27]:
# Print the five highest TF-IDF words for each per-diagnosis document.
for i, blob in enumerate(bloblist):
    print("Top words in document {},{}".format(i + 1, diagnosis[i]))
    # Score each distinct word once. dict.fromkeys() de-duplicates while keeping
    # first-occurrence order, so `scores` (keys, values and tie order) is the same
    # as before, but repeated tokens no longer trigger another tfidf() call, each
    # of which scans every document in bloblist.
    scores = {word: tfidf(word, blob, bloblist) for word in dict.fromkeys(blob.words)}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:5]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in document 1,addiction
	Word: oxy, TF-IDF: 0.00258
	Word: heroin, TF-IDF: 0.00211
	Word: opiates, TF-IDF: 0.00211
	Word: dope, TF-IDF: 0.00187
	Word: addiction, TF-IDF: 0.00164
Top words in document 2,anxiety
	Word: panic, TF-IDF: 0.00127
	Word: attacks, TF-IDF: 0.00116
	Word: interview, TF-IDF: 0.00116
	Word: attack, TF-IDF: 0.00082
	Word: class, TF-IDF: 0.00081
Top words in document 3,autism
	Word: autism, TF-IDF: 0.00398
	Word: aspergers, TF-IDF: 0.00199
	Word: asperger, TF-IDF: 0.00174
	Word: autistic, TF-IDF: 0.00141
	Word: nts, TF-IDF: 0.00124
Top words in document 4,bipolar
	Word: bipolar, TF-IDF: 0.00363
	Word: bpd, TF-IDF: 0.00363
	Word: server, TF-IDF: 0.00204
	Word: hypomanic, TF-IDF: 0.00181
	Word: discord, TF-IDF: 0.00136
Top words in document 5,conversation
	Word: replies, TF-IDF: 0.00206
	Word: ice, TF-IDF: 0.00154
	Word: cream, TF-IDF: 0.00109
	Word: agent, TF-IDF: 0.00103
	Word: penguin, TF-IDF: 0.00103
Top words in document 6,depression
	Word: lonely, TF-ID