## Sentiment Analysis

In [1]:
import spacy
# The rest of the notebook relies on word vectors, so load the medium English
# pipeline (the small "sm" packages do not ship real word vectors).
SPACY_MODEL = 'en_core_web_md'
nlp = spacy.load(SPACY_MODEL)

In [2]:
# Run a single word through the pipeline and display the shape of its vector.
fox_doc = nlp(u'fox')
fox_doc.vector.shape

(300,)

In [3]:
# A multi-word Doc still exposes a single vector; display its shape.
greeting_doc = nlp(u'Hi How are you')
greeting_doc.vector.shape

(300,)

## Identifying similar vectors

In [4]:
# Parse three words into one Doc so each token can be compared with the others.
words_to_compare = u"lion cat pet"
tokens = nlp(words_to_compare)

In [9]:
# Print the full pairwise table: every token against every token, itself included.
# NOTE(review): the output saved under this cell lists like/love/hate, which does
# not match a `tokens` built from "lion cat pet" -- it looks stale (the cell was
# apparently run out of order). Re-run the notebook top to bottom to refresh it.
for left in tokens:
    for right in tokens:
        score = left.similarity(right)  # cosine similarity between the two vectors
        print(left.text, right.text, score)

like like 1.0
like love 0.657904
like hate 0.65746516
love like 0.657904
love love 1.0
love hate 0.63930994
hate like 0.65746516
hate love 0.63930994
hate hate 1.0


## Opposites are not necessarily different

## Words have similar vectors when they are used in similar contexts, even if their meanings are opposite

In [7]:
# Two words of similar meaning plus their opposite, parsed into one Doc for comparison.
sentiment_words = u"like love hate"
tokens = nlp(sentiment_words)

In [8]:
# Enumerate every ordered pair of tokens (first token varies slowest), then print
# each pair with the cosine similarity between the two token vectors.
token_pairs = [(first, second) for first in tokens for second in tokens]
for first, second in token_pairs:
    print(first.text, second.text, first.similarity(second))

like like 1.0
like love 0.657904
like hate 0.65746516
love like 0.657904
love love 1.0
love hate 0.63930994
hate like 0.65746516
hate love 0.63930994
hate hate 1.0


In [10]:
# Display the size of the loaded vector table (len() of spaCy's Vectors object).
vector_table = nlp.vocab.vectors
len(vector_table)

20000

In [11]:
# Two common words plus "nargle", used below to inspect vector availability per token.
probe_words = u"dog cat nargle"
tokens = nlp(probe_words)

In [12]:
# One line per token: text, has_vector flag, vector norm, and out-of-vocabulary flag.
for tok in tokens:
    row = (tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)
    print(*row)

dog True 7.0336733 False
cat True 6.6808186 False
nargle False 0.0 True


In [13]:
from scipy import spatial

def cosine_similarity(x, y):
    """Return the cosine similarity of vectors ``x`` and ``y``.

    SciPy provides cosine *distance*; similarity is its complement (1 - distance).
    """
    distance = spatial.distance.cosine(x, y)
    return 1 - distance

# Look up the vectors of the three words directly from the vocabulary.
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

# Word-vector arithmetic: "king" - "man" + "woman". The following cells search
# the vocabulary for the entries whose vectors are closest to this result.
new_vector = king - man + woman

In [15]:
# Score every vocabulary entry against `new_vector`, keeping only entries that
# have a vector and are lowercase and purely alphabetic. Each element is a
# (lexeme, similarity) pair.
computed_similarities = [
    (lexeme, cosine_similarity(new_vector, lexeme.vector))
    for lexeme in nlp.vocab
    if lexeme.has_vector and lexeme.is_lower and lexeme.is_alpha
]

In [16]:
def _descending_similarity(pair):
    """Sort key for (lexeme, similarity) pairs: negated score, so larger scores come first."""
    return -pair[1]


# Re-order the pairs from most similar to least similar.
computed_similarities = sorted(computed_similarities, key=_descending_similarity)

In [18]:
# Show the text of the ten highest-ranked vocabulary entries.
top_entries = computed_similarities[:10]
print([entry[0].text for entry in top_entries])

['king', 'queen', 'commoner', 'highness', 'prince', 'sultan', 'maharajas', 'princes', 'kumbia', 'kings']
