# Text Representation using Neural Models or Text Embeddings

## Common Imports

In [None]:
import pandas as pd
import re
import numpy as np

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

import nltk
from nltk import sent_tokenize, word_tokenize
STOPWORDS = nltk.corpus.stopwords.words('english')

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

## Word2Vec Model using Gensim

In [None]:
import nltk
from gensim.models import word2vec

In [None]:
sample = '''This will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine 
everywhere on Tuesday and temperatures of between 22 and 27 degrees. It will warmest in the midlands. Temperatures 
could reach a September record for the century in Ireland, but are unlikely to surpass the 29.1 degrees recorded 
at Kildare’s Clongowes Wood College on September 1st, 1906. Tuesday, however, will be the last day of the sunshine 
with rain arriving across the country on Wednesday morning. Temperatures will remain as high as 24 degrees with the 
warmth punctuated by heavy showers.'''

# Collapse every run of whitespace (including the line breaks of the literal)
# into one space, so words on either side of a line break can never be fused
# together even if the trailing spaces of the literal get stripped.
normalized_sample = " ".join(sample.split()).lower()

# Keep only letters, digits, whitespace and full stops. The pattern is a raw
# string so that `\s` reaches the regex engine instead of being treated as an
# (invalid) string escape.
cleaned_sample = re.sub(r"[^A-Za-z0-9\s.]", "", normalized_sample)

# One token list per sentence: this is the corpus format used for Word2Vec below.
tokens_docs = [word_tokenize(doc) for doc in sent_tokenize(cleaned_sample)]
print(tokens_docs)  # FORMAT - [[tokens], [tokens], ...]

In [None]:
# Hyper-parameters of the toy model, collected in one place.
w2v_params = {
    "vector_size": 15,  # dimensionality of the word vectors
    "window": 20,       # window size handed to Word2Vec
    "min_count": 1,     # minimum frequency a word needs to enter the vocab
    "sg": 1,            # 1 for skip-gram; otherwise CBOW
}
w2v_model = word2vec.Word2Vec(tokens_docs, **w2v_params)
w2v_model

### Vocab Length

In [None]:
len(w2v_model.wv)

Compare this with the vocabulary length obtained from the traditional text representation techniques.

In [None]:
# Train a second model that only keeps words with a frequency of at least 2,
# then report how many words made it into its vocabulary.
w2v_model_min_2 = word2vec.Word2Vec(
    sentences=tokens_docs,
    vector_size=15,  # dimensionality of the word vectors
    window=20,
    min_count=2,     # min_count -> min frequency of the words
    sg=1,            # 1 for skip-gram; otherwise CBOW
)
len(w2v_model_min_2.wv)

Only the words occurring at least twice are included in the vocabulary.

**Compare the vocabulary list below:**

In [None]:
w2v_model.wv.key_to_index.keys()

In [None]:
w2v_model_min_2.wv.key_to_index.keys()

### Access individual word vectors

In [None]:
w2v_model.wv.get_vecattr('temperatures', 'count')

In [None]:
w2v_model.wv.get_vector('temperatures')

In [None]:
w2v_model.wv.get_vector('temperatures').shape

Recall that we set the dimensionality of the word vectors to 15.

In [None]:
w2v_model.wv.get_vecattr('the', 'count')

In [None]:
w2v_model.wv.get_vector('the')

### Access the "word-vectored" data

In [None]:
vocab = w2v_model.wv.key_to_index.keys()
w2v_model.wv[vocab]

In [None]:
w2v_model.wv[vocab].shape # vocab x dimension of wv

### Most Similar Terms

In [None]:
w2v_model.wv.most_similar('day', topn=3)

In [None]:
w2v_model.wv.most_similar('country', topn=3)

In [None]:
# Given a list of keys (a subset of the vocab), find which of those keys is most similar to the given key.
# Syntax: w2v_model.wv.most_similar_to_given(given_key, list_of_candidate_keys)

w2v_model.wv.most_similar_to_given('country', ['september', 'ireland', 'day'])

In [None]:
w2v_model.wv.most_similar_to_given('country', ['september', 'ireland', 'of'])

In [None]:
w2v_model.wv.most_similar_to_given('country', ['midlands', 'college', 'unlikely'])

### Visualizing a Word2Vec model

**Using PCA**

In [None]:
from sklearn.decomposition import PCA

In [None]:
def plot_similarity_PCA(model, word_vector):
    """Project word vectors onto two PCA components and plot them with labels.

    Parameters
    ----------
    model
        The embedding model the vectors come from: either an object exposing
        its vectors under ``.wv`` (a trained ``Word2Vec``) or a
        ``KeyedVectors`` instance itself. Its ``index_to_key`` list supplies
        the point labels.
    word_vector
        2-D array with one row per word, rows in the model's vocabulary
        order (e.g. ``model.wv[model.wv.key_to_index.keys()]``).

    The shape of the projected data is printed and the figure is shown.
    """
    # Take the labels from the model that was passed in rather than from a
    # notebook-level `vocab` variable, which may have been rebound to another
    # model's vocabulary by the time this function is called.
    keyed_vectors = getattr(model, "wv", model)
    labels = keyed_vectors.index_to_key

    pca = PCA(n_components=2)
    result = pca.fit_transform(word_vector)
    print(result.shape)
    plt.scatter(
        result[:, 0],  # column (dimension) 1
        result[:, 1],  # column (dimension) 2
        color='b'
    )
    # Annotate each point with its word; zip stops at the shorter sequence,
    # so a label/row count mismatch cannot index past the projected data.
    for (x_coord, y_coord), word in zip(result, labels):
        plt.annotate(word, xy=(x_coord, y_coord))
    plt.show()

In [None]:
plt.figure(figsize=[10, 10])
plot_similarity_PCA(w2v_model, w2v_model.wv[vocab])

**Using T-SNE**

In [None]:
# visualize embeddings
from sklearn.manifold import TSNE

In [None]:
def plot_similarity_TSNE(model, word_vector):
    """Embed word vectors in 2-D with t-SNE and plot them with labels.

    Parameters
    ----------
    model
        The embedding model the vectors come from: either an object exposing
        its vectors under ``.wv`` (a trained ``Word2Vec``) or a
        ``KeyedVectors`` instance itself. Its ``index_to_key`` list supplies
        the point labels.
    word_vector
        2-D array with one row per word, rows in the model's vocabulary
        order (e.g. ``model.wv[model.wv.key_to_index.keys()]``).

    The shape of the embedded data is printed and the figure is shown.
    """
    # Take the labels from the model that was passed in rather than from a
    # notebook-level `vocab` variable, which may have been rebound to another
    # model's vocabulary by the time this function is called.
    keyed_vectors = getattr(model, "wv", model)
    labels = keyed_vectors.index_to_key

    # scikit-learn requires perplexity < n_samples; keep its default of 30
    # whenever there are enough words and shrink it only for tiny vocabularies.
    n_samples = len(word_vector)
    perplexity = min(30.0, max(n_samples - 1, 1))

    tsne = TSNE(n_components=2, perplexity=perplexity)
    result = tsne.fit_transform(word_vector)
    print(result.shape)
    plt.scatter(
        result[:, 0],
        result[:, 1],
        color='r'
    )
    # Annotate each point with its word; zip stops at the shorter sequence,
    # so a label/row count mismatch cannot index past the embedded data.
    for (x_coord, y_coord), word in zip(result, labels):
        plt.annotate(word, xy=(x_coord, y_coord))
    plt.show()

In [None]:
plt.figure(figsize=[10, 10])
plot_similarity_TSNE(w2v_model, w2v_model.wv[vocab])

## GloVe 

Stanford Project Link - https://nlp.stanford.edu/projects/glove/

### GloVe file vs Word2vec file

In [None]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Convert the small GloVe-format test file into a word2vec-format text file.
glove_file, w2v_file = 'test_glove.txt', "test_word2vec.txt"
glove2word2vec(glove_input_file=glove_file, word2vec_output_file=w2v_file)

In [None]:
model = KeyedVectors.load_word2vec_format(w2v_file)

### Loading GloVe

In [None]:
from gensim.test.utils import get_tmpfile

# Source: the pre-trained 300-dimensional GloVe vectors on the local disk.
glove_file = r'C:\Users\nroy0\Documents\Resources\glove.6B\glove.6B.300d.txt'
# Target: a temporary file that receives the word2vec-format copy.
w2v_file = get_tmpfile("glove_w2v.txt")
glove2word2vec(glove_input_file=glove_file, word2vec_output_file=w2v_file)

In [None]:
w2v_model = KeyedVectors.load_word2vec_format(w2v_file)

In [None]:
w2v_model.most_similar('day', topn=10)

In [None]:
vocab = w2v_model.key_to_index.keys()

In [None]:
glove_embedding_matrix = w2v_model[vocab]
glove_embedding_matrix

In [None]:
glove_embedding_matrix.shape # recall we used 300d

In [None]:
w2v_model.get_vector('day')

In [None]:
# Flatten the tokenised sentences into one list, dropping English stop words.
# A set makes each membership test constant-time instead of scanning the list.
stopword_set = set(STOPWORDS)
list_sample_tokens = [
    token
    for sentence in tokens_docs
    for token in sentence
    if token not in stopword_set
]

# De-duplicate while keeping first-occurrence order. `list(set(...))` would
# order the tokens differently from run to run, so a given row of the
# embedding matrix built from this list would not refer to a stable word.
unique_tokens = list(dict.fromkeys(list_sample_tokens))
unique_tokens

In [None]:
# Create a numpy array of zeroes with one row per unique token and one column
# per embedding dimension. The width is read from the loaded vectors (300 for
# the glove.6B.300d file) so it stays correct if another GloVe file is loaded.
embedding_dim = w2v_model.vector_size
sample_glove_embedding_matrix = np.zeros((len(unique_tokens), embedding_dim))
sample_glove_embedding_matrix.shape

In [None]:
# Copy the pre-trained vector of every sample token into its matrix row.
# Tokens the loaded vectors do not contain raise KeyError; their rows keep
# the initial zeros and the position is reported.
for row, token in enumerate(unique_tokens):
    try:
        token_vector = w2v_model.get_vector(token)
    except KeyError as missing_key:
        print(missing_key)
        print("In the matrix, the vector at position {} is zero.".format(row))
    else:
        sample_glove_embedding_matrix[row] = token_vector
sample_glove_embedding_matrix

In [None]:
sample_glove_embedding_matrix.shape

In [None]:
sample_glove_embedding_matrix[17]