In this notebook, we generate a lexicon that will be used in filtering tweets. The general approach is as follows:

(1) For a desired lexicon, generate a number of seed terms that should be within the lexicon.
(2) Find words that are similar to these seed terms by summing the dot products between the vector of a new term and the vectors of the seed terms, i.e. measure how similar a new term is to the cluster of seed terms. By iterating through a dictionary, we can rank every word by this score, inspect the top terms (e.g. the top 50, 100 or 200), and narrow down additional terms to add to the lexicon.
(3) Repeat (2) until the lexicon is of desired size.

words_alpha.txt can be downloaded from: https://github.com/dwyl/english-words/blob/master/words_alpha.txt

In [1]:
import spacy

In [2]:
# Load the large English spaCy pipeline; the vectors of its tokens are what
# the similarity scoring below is computed from.
nlp = spacy.load('en_core_web_lg')

In [6]:
# Read the dictionary file into a list, one entry per line, with the newline
# characters removed.
with open('words_alpha.txt', 'r') as f:
    words = [entry.strip('\n') for entry in f.readlines()]

In [7]:
# Raise the pipeline's maximum text length so the whole word list can be
# passed to it as a single space-joined string.
nlp.max_length = 4000000

In [8]:
# Only tokenization is needed: the next step reads just each token's text and
# vector. `make_doc` runs the tokenizer alone, skipping the remaining pipeline
# components, which are slow and memory-hungry on one multi-million-character
# text. NOTE(review): relies on token vectors coming from the vocabulary
# rather than from a pipeline component -- confirm for the installed model.
tokens = nlp.make_doc(" ".join(words))

In [9]:
# Map each token's text to its vector; if a text occurs more than once, the
# last occurrence wins.
vectors = dict((token.text, token.vector) for token in tokens)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random

In [11]:
def similarity_to_cluster(cluster_tokens, word_vectors=None):
    """Build a scoring function that rates words against a cluster of seeds.

    The score of a word is the sum of the (unnormalized) dot products between
    its vector and the vector of every seed term, so larger means "closer to
    the cluster as a whole".

    Args:
        cluster_tokens: Iterable of seed terms. Every seed must be a key of
            the vector mapping.
        word_vectors: Optional mapping from term to vector. Defaults to the
            module-level ``vectors`` mapping.

    Returns:
        A one-argument callable ``score(x)``. It returns the summed dot
        product for terms that have a vector and ``0`` for unknown terms
        (and for every term when ``cluster_tokens`` is empty).

    Raises:
        KeyError: If one or more seed terms have no vector. The message
            lists all of them, e.g. multi-word seeds such as 'thank you'.
    """
    lookup = vectors if word_vectors is None else word_vectors
    seeds = list(cluster_tokens)

    # Report every unusable seed at once instead of failing on the first one
    # with a bare KeyError from the dict lookup.
    missing = [token for token in seeds if token not in lookup]
    if missing:
        raise KeyError(f"seed terms without a vector: {missing}")

    if not seeds:
        # An empty cluster has nothing to be similar to; avoid the shape
        # error a matrix product with an empty array would raise.
        return lambda x: 0

    mat = np.array([lookup[token] for token in seeds])

    def score(x):
        if x not in lookup:
            return 0
        # One dot product per seed row, summed into a single score.
        return np.sum(mat @ lookup[x])

    return score

### Negative sharing-related lexicon

In [15]:
# Round 1: hand-picked seed terms for the negative-sharing lexicon.
negative_seeds = [
    'difficult',
    'challenging',
    'disappointed',
    'disappointing',
    'hurt',
    'unhappy',
]

In [16]:
# Build a scorer from the current seeds, then score every dictionary word.
# negative_sims[i] is the score of words[i].
similarity_to_negative_sharing = similarity_to_cluster(negative_seeds)
negative_sims = np.array(list(map(similarity_to_negative_sharing, words)))

In [17]:
# Print the 50 highest-scoring words, best first. Seed terms are not filtered
# out here. The negative-step slice walks the ascending argsort from its end.
for idx in np.argsort(negative_sims)[:-51:-1]:
    print(words[idx])

frustrating
disappointing
frustrated
difficult
disappointed
painful
emotionally
unhappy
uncomfortable
hurt
terribly
feeling
sad
stressful
upset
miserable
felt
worried
terrible
challenging
disappointment
badly
seemed
feel
angry
worse
somewhat
struggling
anxious
overly
embarrassing
horrible
feelings
unfair
embarrassed
awful
scared
situation
tired
tough
poor
honestly
seem
hurting
afraid
despite
depressing
frankly
quite
depressed


In [18]:
# Round 2: the round-1 seeds followed by terms taken from the first top-50
# listing.
negative_seeds = [
    'difficult', 'challenging', 'disappointed', 'disappointing', 'hurt', 'unhappy',
    'frustrated', 'frustrating', 'painful', 'stressful', 'upset', 'miserable',
    'anxious', 'angry', 'horrible', 'embarrassing', 'embarrassed', 'awful',
    'scared', 'tired', 'tough', 'depressing', 'depressed',
]

In [19]:
# Rebuild the scorer from the enlarged seed list and re-score every dictionary
# word. negative_sims[i] is the score of words[i].
similarity_to_negative_sharing = similarity_to_cluster(negative_seeds)
negative_sims = np.array(list(map(similarity_to_negative_sharing, words)))

In [20]:
# Walk the words from highest to lowest score and print the first 50 that are
# not already seed terms.
cnt = 0
for idx in np.argsort(negative_sims)[::-1]:
    if words[idx] not in negative_seeds:
        print(words[idx])
        cnt += 1
    if cnt == 50:
        break

uncomfortable
sad
emotionally
feeling
terrible
terribly
annoyed
annoying
unpleasant
awkward
worried
feelings
hopeless
stupid
horribly
fearful
selfish
worse
boring
irritated
pathetic
felt
afraid
nervous
upsetting
scary
overwhelmed
frightened
terrified
irritating
ugly
bitter
tiring
crying
disgusted
emotional
rude
feel
overly
badly
disgusting
desperate
frustration
pissed
sick
downright
ashamed
impatient
ignorant
suffering


In [21]:
# Round 3: the round-2 seeds followed by a further batch of hand-picked terms.
negative_seeds = [
    'difficult', 'challenging', 'disappointed', 'disappointing', 'hurt', 'unhappy',
    'frustrated', 'frustrating', 'painful', 'stressful', 'upset', 'miserable',
    'anxious', 'angry', 'horrible', 'embarrassing', 'embarrassed', 'awful',
    'scared', 'tired', 'tough', 'depressing', 'depressed',
    'uncomfortable', 'sad', 'terrible', 'unpleasant', 'awkward', 'hopeless',
    'cry', 'cried', 'disgusted', 'desperate', 'pissed', 'ashamed',
]

In [22]:
# Rebuild the scorer from the latest seed list and re-score every dictionary
# word. negative_sims[i] is the score of words[i].
similarity_to_negative_sharing = similarity_to_cluster(negative_seeds)
negative_sims = np.array(list(map(similarity_to_negative_sharing, words)))

In [23]:
# Walk the words from highest to lowest score and print the first 50 that are
# not already seed terms.
cnt = 0
for idx in np.argsort(negative_sims)[::-1]:
    if words[idx] not in negative_seeds:
        print(words[idx])
        cnt += 1
    if cnt == 50:
        break

feeling
emotionally
terribly
annoyed
crying
annoying
pathetic
stupid
frightened
terrified
disgusting
fearful
selfish
feelings
horribly
rude
afraid
ignorant
worried
ugly
felt
jealous
bitter
irritated
worse
sick
scary
irritating
boring
overwhelmed
nervous
whiny
shocked
feel
cruel
arrogant
downright
impatient
badly
complaining
lonely
frustration
upsetting
emotional
helpless
whining
childish
unbearable
cranky
hate


In [31]:
# Curated negative-sharing seed list, kept in alphabetical order.
negative_seeds = [
    'angry', 'annoyed', 'anxious', 'ashamed', 'awful', 'awkward',
    'bitter',
    'challenging', 'cried', 'cry',
    'depressed', 'depressing', 'desperate', 'difficult', 'disappointed',
    'disappointing', 'disgusted',
    'embarrassed', 'embarrassing',
    'frustrated', 'frustrating',
    'hopeless', 'horrible', 'hurt',
    'irritated',
    'miserable',
    'nervous',
    'overwhelmed',
    'painful', 'pissed',
    'sad', 'saddening', 'stressful',
    'terrible', 'tired', 'tough',
    'unbearable', 'uncomfortable', 'unhappy', 'unpleasant', 'upset', 'upsetting',
    'worried',
]

In [32]:
# Print the seed list in alphabetical order; `sorted` returns a new list and
# leaves negative_seeds itself untouched.
print(sorted(negative_seeds))

['angry', 'annoyed', 'anxious', 'ashamed', 'awful', 'awkward', 'bitter', 'challenging', 'cried', 'cry', 'depressed', 'depressing', 'desperate', 'difficult', 'disappointed', 'disappointing', 'disgusted', 'embarrassed', 'embarrassing', 'frustrated', 'frustrating', 'hopeless', 'horrible', 'hurt', 'irritated', 'miserable', 'nervous', 'overwhelmed', 'painful', 'pissed', 'sad', 'saddening', 'stressful', 'terrible', 'tired', 'tough', 'unbearable', 'uncomfortable', 'unhappy', 'unpleasant', 'upset', 'upsetting', 'worried']


### Gratitude lexicon

In [33]:
# Round 1: hand-picked seed terms for the gratitude lexicon.
gratitude_seeds = [
    'helpful',
    'appreciate',
    'nice',
    'thanks',
    'thank',
]

In [35]:
# Build a scorer from the gratitude seeds, then score every dictionary word.
# gratitude_sims[i] is the score of words[i].
similarity_to_gratitude = similarity_to_cluster(gratitude_seeds)
gratitude_sims = np.array(list(map(similarity_to_gratitude, words)))

In [36]:
# Walk the words from highest to lowest score and print the first 50 that are
# not already seed terms.
cnt = 0
for idx in np.argsort(gratitude_sims)[::-1]:
    if words[idx] not in gratitude_seeds:
        print(words[idx])
        cnt += 1
    if cnt == 50:
        break

wonderful
glad
thankyou
awesome
appreciated
great
lovely
grateful
good
amazing
congrats
very
happy
hope
informative
really
fantastic
congratulations
welcome
thankful
sorry
guys
love
excellent
beautiful
sharing
dear
honest
thoughtful
feel
loved
me
everyone
cool
fun
luck
interesting
enjoy
haha
loving
bless
hey
inspiring
enjoyed
cute
enjoyable
gracious
suggestions
wow
pretty


In [None]:
# Round 2: gratitude seeds extended with terms from the top-50 listing.
# NOTE(review): 'thank you' contains a space, while `vectors` is keyed by
# single token texts, so passing this list to similarity_to_cluster would
# fail on that entry -- confirm whether the phrase is meant for the final
# lexicon only.
gratitude_seeds = [
    'helpful',
    'appreciate',
    'nice',
    'thanks',
    'thank you',
    'awesome',
    'lovely',
    'grateful',
]