# 01-7: Embeddings visualization


## Word2Vec and GloVe

In [None]:
# Download GoogleNews-vectors-negative300.bin.gz from GCS
!wget --no-check-certificate http://storage.googleapis.com/miax12/GoogleNews-vectors-negative300.bin.gz -O /tmp/GoogleNews-vectors-negative300.bin.gz
!gunzip /tmp/GoogleNews-vectors-negative300.bin.gz

In [None]:
# Word2Vec with Gensim
import gensim

# Read Google's pretrained word2vec vectors (binary word2vec format) from the
# file extracted into /tmp by the download cell.
googlenews_path = '/tmp/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(googlenews_path, binary=True)

In [None]:
# TODO: Use the word "king"

In [None]:
# TODO: king - man + woman = queen

In [None]:
# Try GloVe word embeddings with spaCy
# !python3 -m spacy download en_core_web_lg
# !pip3 install spacy --user

In [None]:
import spacy
import en_core_web_lg

# Build the pipeline from the installed en_core_web_lg package
nlp = en_core_web_lg.load()

# Run the pipeline over one sentence; the resulting Doc is reused by later cells
doc = nlp("man king stands on the carpet and sees woman queen")

# 'king' is the second token (index 1): show the first 50 components of its vector
doc[1].vector[:50]

In [None]:
# TODO: Find similarity between King and Queen (higher value is better).

In [None]:
# TODO: Find similarity between King and carpet

In [None]:
# TODO: Check if king - man + woman = queen.

In [None]:
from scipy.spatial import distance
import numpy as np

# Stack the vector of every token in `doc` into one 2-D array (one row per
# token); these tokens are the candidate vocabulary for the search below.
vectors = np.array([token.vector for token in doc])

# NOTE: `v` is not defined in this cell. Presumably it is the
# king - man + woman vector built in the exercise cell above; run that first.
# cdist returns a (1, n_tokens) matrix of cosine distances; argmin picks the
# position of the closest token. int() turns the NumPy integer into a plain
# Python index before it is used on the Doc.
closest_index = int(
    distance.cdist(np.expand_dims(v, axis=0), vectors, metric='cosine').argmin()
)
output_word = doc[closest_index].text

# Show the result: a cell that ends in an assignment displays nothing
output_word

## Word2Vec and GloVe visualization with NLTK and Gensim


In [None]:
# Download glove.zip from GCS
!wget --no-check-certificate http://storage.googleapis.com/miax12/glove.zip -O /tmp/glove.zip
!unzip /tmp/glove.zip # this will unzip into glove.twitter.27B.25d.txt

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

# Fetch the NLTK resources used by preprocess_text: the tokenizer data behind
# word_tokenize and the stop-word lists.
# Newer NLTK releases look up 'punkt_tab' instead of 'punkt', so request both
# to keep the notebook working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    """Turn raw text into a list of stemmed content tokens.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``.
    Tokens that are not purely alphanumeric, or that appear in NLTK's English
    stop-word list, are dropped; the survivors are Porter-stemmed.

    Args:
        text: The raw input string.

    Returns:
        A list of stemmed tokens in their original order.
    """
    raw_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()

    stems = []
    for tok in raw_tokens:
        # Skip punctuation / mixed-symbol tokens and stop words
        if not tok.isalnum() or tok in english_stopwords:
            continue
        stems.append(porter.stem(tok))
    return stems


In [None]:
# Small sample corpus (three short lines) used for the rest of this section.
raw_text = """
I love machine learning. It is awesome!
Deep learning and natural language processing are very cool.
Artificial intelligence is the future.
"""

# Run the sample through preprocess_text and print the resulting token list;
# `preprocessed_tokens` is reused by the Word2Vec and plotting cells below.
preprocessed_tokens = preprocess_text(raw_text)
print(preprocessed_tokens)

In [None]:
# Parse the GloVe text file into a {word: vector} dictionary.
# Each line is expected to hold a token followed by its space-separated
# float components.
glove_embeddings = {}
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which is not UTF-8 everywhere (assumes the file is UTF-8, as is
# usual for GloVe releases; verify if decoding fails).
with open("glove.twitter.27B.25d.txt", "r", encoding="utf-8") as file:
    for line in file:
        # Split on the single-space separator only: a bare split() also splits
        # on every Unicode whitespace character, which would mangle any token
        # that itself is or contains such a character.
        values = line.rstrip().split(" ")
        if len(values) < 2:
            # Blank or vector-less line: nothing to store
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        glove_embeddings[word] = vector



In [None]:
# Train a small Word2Vec model on the preprocessed tokens, passed as a single
# "sentence". Note that this rebinds `model`, replacing the GoogleNews vectors
# loaded at the top of the notebook.
# workers=1 removes the thread-scheduling jitter that makes multi-worker
# training differ between runs; seed=1 is gensim's default, spelled out here.
# (Per the gensim docs, full run-to-run determinism also needs PYTHONHASHSEED
# to be fixed.)
model = Word2Vec([preprocessed_tokens], min_count=1, vector_size=50, workers=1, seed=1)

In [None]:
def visualize_embeddings(embeddings, words, perplexity=None):
    """Project word vectors to 2-D with t-SNE and draw a labelled scatter plot.

    Args:
        embeddings: Object indexable by word that returns that word's vector
            (this notebook passes a dict of NumPy arrays and ``model.wv``).
        words: Words to plot; each must be a valid key of ``embeddings``.
        perplexity: t-SNE perplexity. When ``None`` (the default) it is set to
            ``min(30, len(words) - 1)``: scikit-learn requires the perplexity
            to be smaller than the number of samples, and 30 is its own
            default for larger inputs.

    Raises:
        ValueError: If fewer than two words are supplied, since no valid
            perplexity exists in that case.
    """
    words = list(words)
    n_words = len(words)
    if n_words < 2:
        raise ValueError(
            f"visualize_embeddings needs at least 2 words, got {n_words}"
        )
    if perplexity is None:
        perplexity = min(30, n_words - 1)

    tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
    embedding_vectors = np.array([embeddings[word] for word in words])
    two_d_embeddings = tsne.fit_transform(embedding_vectors)

    plt.figure(figsize=(8, 8))
    for i, word in enumerate(words):
        x, y = two_d_embeddings[i, :]
        # One scatter call per word, as before, so every point gets its own colour
        plt.scatter(x, y)
        plt.annotate(word, (x, y), xytext=(5, 2), textcoords="offset points", ha="right", va="bottom")
    plt.show()

# For GloVe: keep the preprocessed tokens that exist in the GloVe vocabulary.
# dict.fromkeys drops repeated tokens while preserving first-seen order, so a
# word that occurs several times in the text is plotted (and counted towards
# the t-SNE perplexity) only once.
glove_words = list(dict.fromkeys(word for word in preprocessed_tokens if word in glove_embeddings))
visualize_embeddings(glove_embeddings, glove_words)

# For Word2Vec: plot every word in the trained model's vocabulary
word2vec_words = model.wv.index_to_key
visualize_embeddings(model.wv, word2vec_words)