In [2]:
!pip install ipynbname




In [6]:
import os
os.chdir('/Users/jessicahong/NLP_study')

In [7]:
from utils import set_project_root
set_project_root()

✅ Project root set to (2 levels up): /Users/jessicahong/NLP_study


In [2]:
import spacy

In [3]:
#loads spaCy's large English language model, en_core_web_lg
nlp  = spacy.load('en_core_web_lg')

In [4]:
#.shape reveals the dimensionality of the vector, typically (300,), meaning the word is represented in a 300-dimensional space.
nlp(u'lion').vector.shape

(300,)

In [5]:
nlp(u'fox').vector.shape

(300,)

In [6]:
tokens = nlp(u'like love hate')

In [7]:
'''
nested loop calculates the semantic similarity between all pairs of tokens in the tokens object (a spaCy Doc or Span).
token1.similarity(token2) returns a cosine similarity score based on their word vectors.
Values close to 1.0 mean high similarity, and values closer to 0 mean low or no similarity.
'''


for token1 in tokens :
    for token2 in tokens :
        print (token1.text, token2.text, token1.similarity(token2))

like like 1.0
like love 0.6579040884971619
like hate 0.6574652194976807
love like 0.6579040884971619
love love 1.0
love hate 0.6393099427223206
hate like 0.6574652194976807
hate love 0.6393099427223206
hate hate 1.0


In [8]:
#checks how many word vectors are present
len(nlp.vocab.vectors)

342918

In [9]:
nlp.vocab.vectors.shape

(342918, 300)

In [10]:
tokens = nlp(u"dog cat nurrgle hyeon")

In [11]:
#token.text – The word itself.
#token.has_vector – Whether this word has a word vector (True or False).
#token.vector_norm – The norm (magnitude) of the word vector. A value of 0.0 usually means no vector is present.
#token.is_oov – Whether the token is out of vocabulary (not included in the model's pretrained word vectors).

for token in tokens :
     print (token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
nurrgle False 0.0 True
hyeon True 8.470639 False


In [15]:
#step 1 : imports the spatial module from scipy.
#step 2 :  cosine_similarity ->omputes the cosine similarity between two vectors.
#1 - spatial.distance.cosine(vec1, vec2) converts cosine distance to cosine similarity.
#A result closer to 1 means higher similarity between the two vectors.
#step 3 :It loads the large English SpaCy model (en_core_web_lg) that includes pretrained word vectors.

In [16]:
from scipy import spatial
cosine_similarity = lambda vec1, vec2: 1 - spatial.distance.cosine(vec1, vec2)

nlp = spacy.load('en_core_web_lg')

In [17]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

In [18]:
#king-man+women ----> new vector similar with queen, princess, queen, highness

In [19]:
new_vector = king-man+woman

In [20]:
computed_similarities = []
for word in nlp.vocab:
    if word.has_vector and word.is_lower and word.is_alpha:
        similarity = cosine_similarity(new_vector, word.vector)
        computed_similarities.append((word, similarity))  # 정확한 이름!


In [21]:
computed_similarities.sort(key=lambda x: -x[1])
print([t[0].text for t in computed_similarities[:10]])

['king', 'woman', 'she', 'who', 'when', 'dare', 'was', 'not', 'or', 'had']


In [22]:
print(len(computed_similarities))

107


In [23]:
# perform word vector arithmetic and find semantically similar words using SpaCy's pretrained vectors

In [24]:
#Step-by-step breakdown:
#Imports scipy.spatial and spacy.
#Loads the en_core_web_lg model, which includes pretrained word vectors.
#Defines a cosine similarity function using the cosine distance.
#Retrieves the vectors for "king", "man", and "woman".
#Computes a new vector via king - man + woman, which is a classic word analogy method (hoping for "queen").
#Iterates over the SpaCy vocabulary:
#Checks that the word has a vector, is lowercase, and contains only alphabetic characters.
#Calculates cosine similarity between each word's vector and the new vector.
Sorts the results by descending similarity.
Prints the top 10 most similar words — ideally, you'll see "queen" or similar concepts near the top.


'\nStep-by-step breakdown:\nImports scipy.spatial and spacy.\n\nLoads the en_core_web_lg model, which includes pretrained word vectors.\n\nDefines a cosine similarity function using the cosine distance.\n\nRetrieves the vectors for "king", "man", and "woman".\n\nComputes a new vector via king - man + woman, which is a classic word analogy method (hoping for "queen").\n\nIterates over the SpaCy vocabulary:\n\nChecks that the word has a vector, is lowercase, and contains only alphabetic characters.\n\nCalculates cosine similarity between each word\'s vector and the new vector.\n\nSorts the results by descending similarity.\n\nPrints the top 10 most similar words — ideally, you\'ll see "queen" or similar concepts near the top.\n'

In [70]:
from scipy import spatial
import spacy

nlp = spacy.load('en_core_web_lg')

cosine_similarity = lambda vec1, vec2: 1 - spatial.distance.cosine(vec1, vec2)

king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

new_vector = king - man + woman

computed_similarities = []
for word in nlp.vocab:
    if word.has_vector and word.is_lower and word.is_alpha:
        similarity = cosine_similarity(new_vector, word.vector)
        computed_similarities.append((word, similarity))

computed_similarities.sort(key=lambda x: -x[1])

print([t[0].text for t in computed_similarities[:10]])


['king', 'woman', 'she', 'who', 'when', 'dare', 'was', 'not', 'or', 'had']
