<a href="https://colab.research.google.com/github/radha2006krishna/NLP/blob/main/lab9_wordembeddings_2277.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gensim
!pip install matplotlib

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [2]:
# gensim is used to load and work with pre-trained word embedding models
# It provides Word2Vec, GloVe, FastText implementations
import gensim

# KeyedVectors is specifically used to load pre-trained word embeddings
# without loading the full training model
from gensim.models import KeyedVectors

# numpy is used for numerical operations on vectors
# Word embeddings are stored as numerical arrays
import numpy as np

# sklearn.metrics.pairwise is used to calculate similarity between vectors
# cosine_similarity helps measure semantic similarity between words
from sklearn.metrics.pairwise import cosine_similarity

# matplotlib is used to visualize word embeddings in 2D space
import matplotlib.pyplot as plt

In [3]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# Load pre-trained Word2Vec model (may take time on first download)
model = api.load("word2vec-google-news-300")

# Print vocabulary size
print("Vocabulary Size:", len(model.key_to_index))

# Display vector for a sample word
word = "king"
vector = model[word]

print("\nWord:", word)
print("Vector length:", len(vector))
print("First 10 values of the vector:\n", vector[:10])

Vocabulary Size: 3000000

Word: king
Vector length: 300
First 10 values of the vector:
 [ 0.12597656  0.02978516  0.00860596  0.13964844 -0.02563477 -0.03613281
  0.11181641 -0.19824219  0.05126953  0.36328125]


In [4]:
import gensim.downloader as api

# Load GloVe embeddings (100-dimensional)
model = api.load("glove-wiki-gigaword-100")

# Print vocabulary size
print("Vocabulary Size:", len(model.key_to_index))

# Display vector for a sample word
word = "king"
vector = model[word]

print("\nWord:", word)
print("Vector length:", len(vector))
print("First 10 values of the vector:\n", vector[:10])

Vocabulary Size: 400000

Word: king
Vector length: 100
First 10 values of the vector:
 [-0.32307 -0.87616  0.21977  0.25268  0.22976  0.7388  -0.37954 -0.35307
 -0.84369 -1.1113 ]


In [5]:
import gensim.downloader as api

# Load pre-trained GloVe model (100D)
model = api.load("glove-wiki-gigaword-100")

# Define word pairs
word_pairs = [
    ("doctor", "nurse"),
    ("cat", "dog"),
    ("car", "bus"),
    ("king", "queen"),
    ("man", "woman"),
    ("teacher", "student"),
    ("apple", "banana"),
    ("computer", "keyboard"),
    ("sun", "moon"),
    ("river", "water")
]

print("Word Similarity Scores:\n")

for w1, w2 in word_pairs:
    similarity = model.similarity(w1, w2)
    print(f"{w1} - {w2} : {similarity:.4f}")

Word Similarity Scores:

doctor - nurse : 0.7522
cat - dog : 0.8798
car - bus : 0.7373
king - queen : 0.7508
man - woman : 0.8323
teacher - student : 0.8083
apple - banana : 0.5054
computer - keyboard : 0.5418
sun - moon : 0.6138
river - water : 0.6306


In [6]:

import gensim.downloader as api

# Load pre-trained GloVe embeddings (100D)
model = api.load("glove-wiki-gigaword-100")

# Choose at least 5 words
chosen_words = ["king", "university", "doctor", "car", "music"]

for word in chosen_words:
    print(f"\nTop similar words for '{word}':\n")

    similar_words = model.most_similar(word, topn=5)

    for similar_word, score in similar_words:
        print(f"{similar_word} : {score:.4f}")


Top similar words for 'king':

prince : 0.7682
queen : 0.7508
son : 0.7021
brother : 0.6986
monarch : 0.6978

Top similar words for 'university':

college : 0.8294
harvard : 0.8156
yale : 0.8114
professor : 0.8104
graduate : 0.7993

Top similar words for 'doctor':

physician : 0.7673
nurse : 0.7522
dr. : 0.7175
doctors : 0.7081
patient : 0.7074

Top similar words for 'car':

vehicle : 0.8631
truck : 0.8598
cars : 0.8372
driver : 0.8186
driving : 0.7813

Top similar words for 'music':

musical : 0.8128
songs : 0.7978
dance : 0.7897
pop : 0.7863
recording : 0.7651


In [7]:
import gensim.downloader as api

# Load pre-trained Word2Vec (better for analogies)
model = api.load("word2vec-google-news-300")

# Analogy 1
result1 = model.most_similar(
    positive=["king", "woman"],
    negative=["man"],
    topn=5
)

# Analogy 2
result2 = model.most_similar(
    positive=["paris", "india"],
    negative=["france"],
    topn=5
)

# Analogy 3
result3 = model.most_similar(
    positive=["teacher", "hospital"],
    negative=["school"],
    topn=5
)

print("\nking - man + woman = ?")
print(result1)

print("\nparis - france + india = ?")
print(result2)

print("\nteacher - school + hospital = ?")
print(result3)


king - man + woman = ?
[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581)]

paris - france + india = ?
[('chennai', 0.5442505478858948), ('delhi', 0.5149926543235779), ('mumbai', 0.5024341344833374), ('hyderabad', 0.49932485818862915), ('gujarat', 0.48732805252075195)]

teacher - school + hospital = ?
[('Hospital', 0.6331106424331665), ('nurse', 0.6280134320259094), ('hopsital', 0.6217317581176758), ('intensive_care', 0.5683753490447998), ('Hosptial', 0.5647749304771423)]
