In [3]:
import pickle
import numpy as np
from re import split
# Load the pickled corpus.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only open pickles that come from a trusted source.
with open("emotype_v10.p", "rb") as corpus_file:  # context manager closes the handle
    data = pickle.load(corpus_file)
print(data[0].keys())
print(len(data))

# m: second dimension of the first record's 'encoding' array; n: number of records.
_, m = data[0]['encoding'].shape
n = len(data)
print(m)
print(n)

dict_keys(['label', 'outputs', 'text', 'encoding', 'prediction'])
64301
128
64301


In [4]:
# Topic label -> integer cluster id. The ids are consecutive, assigned in the
# order the labels are listed here.
label_cluster_dic = {
    label: cluster_id
    for cluster_id, label in enumerate((
        'addiction',
        'anxiety',
        'autism',
        'bipolar',
        'conversation',
        'depression',
        'happy',
        'schizophrenia',
    ))
}

In [5]:
def initialize_cluster_labels_with_ground_truth(data_dict, label_to_cluster=None):
    """Tag every OP record with the cluster id of its original topic label.

    Args:
        data_dict: Collection of OP records that supports ``len()`` and
            integer indexing over ``0 .. len-1`` (a list, or a dict keyed by
            those integers). Each record must have a ``'label'`` entry.
        label_to_cluster: Optional mapping from label to cluster id. When
            omitted, the module-level ``label_cluster_dic`` is used.

    Returns:
        The same ``data_dict`` object. Records are modified in place: each one
        gains a ``'cluster'`` entry.

    Raises:
        KeyError: If a record's label is missing from a dict mapping.
    """
    if label_to_cluster is None:
        label_to_cluster = label_cluster_dic
    # Index by position rather than iterating directly, so that an int-keyed
    # dict of records behaves the same as a list.
    for i in range(len(data_dict)):
        record = data_dict[i]
        record['cluster'] = label_to_cluster[record['label']]
    return data_dict

In [6]:
# Attach a ground-truth 'cluster' id to every record (the call also mutates its argument in place).
data = initialize_cluster_labels_with_ground_truth(data)

In [7]:
# Spot-check the raw text of one record.
data[2]['text']

'I allow non important things to rule my mind. I have been spending a lot of time trying to get rock band controllers, setting up internet, organizing, etc.In reality I should be focusing on school. Why is it so was for me to obsess about nonimmediate issues. The most important things I should be worried about right now is my health, my education and financial situation.Yet, I write this while I am sitting in math class.  '

In [8]:
# Number of records in the corpus.
len(data)

64301

A potentially smoother way to get a dictionary initialized with all the words present in the corpus...

In [7]:
# lexicon = set()
# for i in range(len(data)):
#     ith_phrase = data[i]['text'].strip().lower()  # Get text from ith OP
#     ith_phrase = ith_phrase.replace("'", "")  # Get rid of apostrophes (str.replace returns a new string)
#     list_of_words_in_phrase = split("[^a-zA-Z]+", ith_phrase)  
#     lexicon.update(list_of_words_in_phrase)  # Add all of the text's words to lexicon
# overall_word_freqs = {}.fromkeys(lexicon, 0)

In [8]:
# import random
# print(len(lexicon))
# some_words_in_the_lexicon = random.sample(sorted(lexicon), 10)  # Print 10 random words from our lexicon (random.sample needs a sequence, not a set)
# print(some_words_in_the_lexicon)
# print(len(overall_word_freqs.keys()))
# print(overall_word_freqs['i'])

In [11]:
# Tokenize every post: strip and lowercase the text, then split on runs of
# characters that are neither ASCII letters nor apostrophes.
all_text = []  # One token list per post
for post_idx in range(len(data)):
    normalized_text = data[post_idx]['text'].strip().lower()
    all_text.append(split("[^a-zA-Z']+", normalized_text))
print(len(all_text))
print(all_text[1])

# Build the lexicon: every distinct non-empty token maps to 0, inserted in
# order of first appearance.
dic = {}
for tokens in all_text:
    for token in tokens:
        # split() yields '' when the text starts or ends with a delimiter; skip those.
        if token:
            dic.setdefault(token, 0)
print(len(dic.keys()))
print(dic['i'])

64301
['i', 'wonder', 'if', 'it', 'is', 'related', 'to', 'lack', 'of', 'object', 'permanence', 'i', 'feel', 'like', 'every', 'time', 'i', 'look', 'in', 'the', 'mirror', 'i', 'have', 'a', 'hard', 'time', 'wrapping', 'my', 'head', 'around', 'what', 'i', 'see', 'i', 'know', "it's", 'me', 'and', 'i', 'look', 'the', 'same', 'but', 'somehow', 'something', "isn't", 'right', 'almost', 'like', 'maybe', 'this', 'time', 'it', 'will', 'make', 'sense', 'and', 'stick', 'i', "can't", 'get', 'a', 'sense', 'of', 'sameness', 'sometimes', 'i', 'feel', 'ok', 'about', 'what', 'i', 'look', 'like', 'and', 'sometimes', 'i', "can't", 'stand', 'it', 'everyday', 'i', 'see', 'something', 'different', 'i', "don't", 'like', 'and', 'later', 'on', 'it', 'wont', 'bother', 'me', 'anymore', 'or', 'i', 'will', 'notice', 'something', 'else', 'that', 'i', "don't", 'like', 'either', 'way', 'it', 'just', 'never', 'really', 'seems', 'like', 'me', '']
58576
0


In [12]:
# Freeze the lexicon into a list so that every word has a fixed position (column index).
word_list = list(dic)
# Peek at one entry.
word_list[1]

'do'

In [13]:
# Lexicon size: the number of distinct tokens.
len(word_list)

58576

In [14]:
def find_text_with_cluster(cluster):
    """Collect the tokenized text of every record assigned to ``cluster``.

    Reads the module-level ``data``. Each matching record's text is stripped,
    lowercased and split on runs of characters other than ASCII letters and
    apostrophes, so a token list may contain empty strings at either end.

    Returns:
        A list of token lists, one per matching record, in record order.
    """
    return [
        split("[^a-zA-Z']+", data[idx]['text'].strip().lower())
        for idx in range(len(data))
        if data[idx]['cluster'] == cluster
    ]

def count_word_of_cluster(texts):
    """Count how often each lexicon word occurs in ``texts``.

    Args:
        texts: Iterable of token lists. Empty-string tokens are ignored.

    Returns:
        A new dict with one entry per key of the module-level ``dic``, giving
        that word's number of occurrences in ``texts`` (0 if it never occurs).

    Raises:
        KeyError: If a non-empty token is not a key of ``dic``.
    """
    counts = dict.fromkeys(dic, 0)
    for tokens in texts:
        for token in tokens:
            if not token:
                continue
            counts[token] += 1
    return counts

def get_cluster_vectors(word_list, dic_cluster):
    """Look up each word's value in ``dic_cluster``, following ``word_list`` order.

    Args:
        word_list: Iterable of words that fixes the order of the result.
        dic_cluster: Mapping from word to value (here, a count).

    Returns:
        A list with ``dic_cluster[word]`` for each word of ``word_list``.

    Raises:
        KeyError: If a word is missing from a dict ``dic_cluster``.
    """
    return [dic_cluster[word] for word in word_list]

def extract_vectors(cluster):
    """Build the word-count vector of one cluster.

    Gathers the tokenized texts of ``cluster``, counts every lexicon word in
    them, and returns the counts ordered like the module-level ``word_list``.
    """
    return get_cluster_vectors(
        word_list,
        count_word_of_cluster(find_text_with_cluster(cluster)),
    )

In [15]:
# Every label in label_cluster_dic gets a row ('conversation' included).
num_clusters = len(label_cluster_dic)
num_words_in_lexicon = len(word_list)

# word_freq_mat[c, w] = number of occurrences of word_list[w] in the posts of cluster c
word_freq_mat = np.zeros((num_clusters, num_words_in_lexicon))
for cluster_label in label_cluster_dic.values():
    word_freq_mat[cluster_label] = np.array(extract_vectors(cluster_label))

In [16]:
# For each word, the number of clusters in which it occurs at least once; shape (1, num_words).
num_clusters_with_word_present = (word_freq_mat > 0).sum(axis=0, keepdims=True)
# Term frequency: each cluster's counts divided by that cluster's total token count.
tokens_per_cluster = word_freq_mat.sum(axis=1, keepdims=True)
tf_normalized_by_cluster = word_freq_mat / tokens_per_cluster
# Inverse document frequency, with the clusters playing the role of documents.
idf = np.log(num_clusters / num_clusters_with_word_present)

In [17]:
# Element-wise product; idf has a single row and broadcasts across the cluster rows.
tfidf = tf_normalized_by_cluster * idf

In [18]:
n = 20  # number of words to keep at each end of the ranking
# Rank each cluster's word indices once by ascending tf-idf, then slice both
# ends, instead of running the same argsort twice.
tfidf_rank_order = np.argsort(tfidf, axis=1)
top_n_tfidf = tfidf_rank_order[:, -n:]    # n highest scores per cluster, strongest last
bottom_n_tfidf = tfidf_rank_order[:, :n]  # n lowest scores per cluster

In [19]:
# Top-n tf-idf words for 'autism', in ascending order of score (strongest last).
[word_list[word_idx] for word_idx in top_n_tfidf[label_cluster_dic['autism']]]

['aba',
 'pddnos',
 'sensory',
 'neurotypical',
 'workingout',
 'raspergers',
 'stim',
 'stimming',
 'autistics',
 'nt',
 'nonverbal',
 'asperger',
 "asperger's",
 'autistic',
 'asd',
 'nts',
 'autism',
 'aspergers',
 'aspie',
 'aspies']

In [20]:
# Top-n tf-idf words for 'bipolar', in ascending order of score (strongest last).
[word_list[word_idx] for word_idx in top_n_tfidf[label_cluster_dic['bipolar']]]

['diagnosis',
 'quetiapine',
 'daylio',
 'borderlines',
 'pdoc',
 'hypomanic',
 'hypo',
 'bp',
 'abilify',
 'lithium',
 'manic',
 'seroquel',
 'mania',
 'lamotrigine',
 'latuda',
 'dbt',
 'bpd',
 'hypomania',
 'fp',
 'lamictal']

In [21]:
# Top-n tf-idf words for 'depression', in ascending order of score (strongest last).
[word_list[word_idx] for word_idx in top_n_tfidf[label_cluster_dic['depression']]]

['noose',
 'relapsed',
 'cared',
 'pills',
 'relapse',
 'razor',
 'loves',
 'numb',
 'killed',
 'pussy',
 'sertraline',
 'hurts',
 'empty',
 'selfharm',
 'harming',
 'killing',
 'alive',
 'pathetic',
 'worthless',
 'scars']

In [22]:
# Top-n tf-idf words for 'happy', in ascending order of score (strongest last).
[word_list[word_idx] for word_idx in top_n_tfidf[label_cluster_dic['happy']]]

['graph',
 'testament',
 'achieving',
 'jeff',
 'rohn',
 'pomodoro',
 'bennet',
 'beforeafter',
 'studybreak',
 'moviesvideos',
 'readout',
 'materialization',
 'pz',
 'olsenvideos',
 'youtuberbettermentbookclub',
 'bennetcreated',
 "shrink's",
 'selfpost',
 'utcfriday',
 'utc']

In [23]:
# Top-n tf-idf words for 'conversation', in ascending order of score (strongest last).
[word_list[word_idx] for word_idx in top_n_tfidf[label_cluster_dic['conversation']]]

['jew',
 'barman',
 'jewish',
 'billy',
 'replies',
 'nun',
 'pirate',
 'officer',
 'rabbi',
 'genie',
 'chief',
 'paddy',
 'exclaims',
 'tl',
 'priest',
 'farmer',
 'sir',
 'blonde',
 'johnny',
 'bartender']

In [24]:
# Top-n tf-idf words for 'anxiety', in ascending order of score (strongest last).
[word_list[word_idx] for word_idx in top_n_tfidf[label_cluster_dic['anxiety']]]

['withdrawal',
 'effexor',
 'cymbalta',
 'terrifies',
 'interaction',
 'nausea',
 'pains',
 'sweating',
 'ssris',
 'terrified',
 'dizziness',
 'paxil',
 'citalopram',
 'sertraline',
 'palpitations',
 'panicking',
 'buspar',
 'gad',
 'sa',
 'attack']

# Ignore this bit, I was playing around with weightings...

In [42]:
# Corpus-wide relative frequency of each word: its total count over all clusters
# divided by the total token count; shape (1, num_words).
tf_normalized_over_all_clusters = word_freq_mat.sum(axis=0, keepdims=True) / word_freq_mat.sum()
# Smallest corpus-wide frequency, shown to check that it is nonzero.
tf_normalized_over_all_clusters.min()

2.4402623770107763e-07

In [43]:
# eps keeps the argument of log strictly positive for words that never occur
# in a cluster (their per-cluster frequency is exactly 0).
eps = 1e-10
# gamma[c, w]: log-ratio of word w's relative frequency inside cluster c to its
# relative frequency over the whole corpus.
gamma = np.log((tf_normalized_by_cluster +  eps) / tf_normalized_over_all_clusters)

In [44]:
# Re-weight every tf-idf score by its gamma factor (element-wise product).
modified_tfidf = tfidf * gamma

In [45]:
# Smallest tf-idf score over all clusters and words.
print(tfidf.min())

0.0


In [46]:
# Largest tf-idf score over all clusters and words.
print(tfidf.max())

0.00160351124941


In [47]:
n = 20  # number of words to keep at each end of the ranking
# Rank each cluster's word indices once by ascending modified tf-idf, then slice
# both ends, instead of running the same argsort twice.
modified_rank_order = np.argsort(modified_tfidf, axis=1)
top_n_modified_tfidf = modified_rank_order[:, -n:]    # n highest scores per cluster, strongest last
bottom_n_modified_tfidf = modified_rank_order[:, :n]  # n lowest scores per cluster

In [49]:
# Top-n modified tf-idf words for 'autism', in ascending order of score (strongest last).
[word_list[word_idx] for word_idx in top_n_modified_tfidf[label_cluster_dic['autism']]]

['sensory',
 'meltdowns',
 'neurotypical',
 'pddnos',
 'stim',
 'raspergers',
 'workingout',
 'autistics',
 'nt',
 'stimming',
 'nonverbal',
 'asperger',
 "asperger's",
 'autistic',
 'nts',
 'asd',
 'autism',
 'aspergers',
 'aspie',
 'aspies']