# Working with embeddings using Gensim
- Train word2vec embeddings from a corpus
- Load pretrained embeddings
- Use embeddings to classify text
- Access the embeddings of spaCy


In [0]:
# Header
import os


# solve gensim warning
! pip install paramiko

import gensim, logging
print('Gensim Version: ', gensim.__version__)


# Data path
data_path = '.'



In [0]:
# Minimal example: build word2vec embeddings from a tiny in-memory corpus

# Route gensim's internal training messages to the cell output
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# A two-sentence corpus; every sentence is a list of tokens
sentences = [
    ['first', 'sentence'],
    ['second', 'sentence'],
]

# min_count=1 keeps every word, even those seen only once
model = gensim.models.Word2Vec(sentences, min_count=1)


In [0]:
# Load a bigger corpus: the Brown corpus distributed through NLTK
import nltk
nltk.download('brown')

from nltk.corpus import brown

# Report the corpus size, first in sentences and then in words
for label, reader in (('Corpus sentences len:', brown.sents), ('Corpus words len:', brown.words)):
    print(label, len(reader()))

In [0]:
# Train a word2vec model over the Brown corpus

from gensim.models import Word2Vec

model = Word2Vec(
    brown.sents(),   # training sentences, each one a list of tokens
    size=100,
    window=5,
    min_count=5,
    workers=4,
)


In [0]:
# Persist the trained model to a file in the current working directory

model.save('brown_word2vect_model.bin')


In [0]:
# Load the model back from the file written by the save cell

model = Word2Vec.load('brown_word2vect_model.bin')  # training can be continued on the loaded model


In [0]:
# Access the embeddings through the model's `wv` attribute

model.wv['the']  # vector embedding of a single word, returned as a numpy array


In [0]:
# Similarity functions offered by the trained vectors

# Nearest neighbours of a single word
similar_to_woman = model.wv.most_similar_cosmul(positive=['woman'])
print('Similars to woman:', similar_to_woman, '\n')

# Word of the list that fits worst with the others
odd_one_out = model.wv.doesnt_match("breakfast cereal dinner lunch".split())
print("Indetify the word that doesn't match in a list:", odd_one_out, '\n')

# Pairwise similarity between two words
woman_man_similarity = model.wv.similarity('woman', 'man')
print('Words similarity (woman - man):', woman_man_similarity)


In [0]:
# Check the accuracy of the trained embeddings against a standard list of analogy questions

from gensim.test.utils import datapath

# Evaluation file bundled with gensim's test data
questions_file = datapath('questions-words.txt')
analogy_scores = model.wv.evaluate_word_analogies(questions_file)

# Second element of the result, first entry: the section printed below
first_section = analogy_scores[1][0]
print('country - capital corrects', first_section['correct'])


In [0]:
# Show the 'correct' entry of the first analogy section as the cell's output value
analogy_scores[1][0]['correct']


In [0]:
# Analogy query: Germany + Paris - Berlin
capital_analogy = model.wv.most_similar(positive=['Germany', 'Paris'], negative=['Berlin'])
print('BERLIN GERMANY PARIS FRANCE = ', capital_analogy, '\n')


In [0]:
# Once training is finished, keep only the embeddings and discard the model.
from gensim.models import KeyedVectors

fname = 'word_vectors.gz'

# The word vectors live in the model's `wv` attribute
word_vectors = model.wv
word_vectors.save(fname)

# The model itself is no longer needed
del model

# To load the vectors back:
# word_vectors = KeyedVectors.load(fname)


In [0]:
# Explore the embeddings through their vocabulary mapping

brown_vocab = word_vectors.vocab

print('Num of embedings:', len(brown_vocab), '\n')

print('Sample of words available (20 first):', list(brown_vocab)[:20], '\n')

print('Vocab word attributes for "Oregon" word:', brown_vocab['Oregon'], '\n')

print('Word embedings for "Oregon" word:', word_vectors['Oregon'])

In [0]:
# Vocabulary frequency: list every word whose count is above 1000

for word, entry in word_vectors.vocab.items():
    if entry.count > 1000:
        print(word, entry.count)


## Load pretrained vectors and use them

Download the pretrained LexVec vectors (300 MB) from
 http://nlpserver2.inf.ufrgs.br/alexandres/vectors/lexvec.enwiki%2bnewscrawl.300d.W.pos.vectors.gz 


In [0]:
# Download vectors into colab
! wget https://www.dropbox.com/s/kguufyc2xcdi8yk/lexvec.enwiki%2Bnewscrawl.300d.W.pos.vectors.gz

! gunzip lexvec.enwiki+newscrawl.300d.W.pos.vectors.gz

In [0]:
# Load the pretrained embeddings from the word2vec-format text file

from gensim.models import KeyedVectors

vectors_path = os.path.join(data_path, 'lexvec.enwiki+newscrawl.300d.W.pos.vectors')
model = KeyedVectors.load_word2vec_format(vectors_path, unicode_errors='ignore')

# Look at a single embedding
print('Sample of one embeding')
dog = model['dog']
print('Shape of one embeding:', dog.shape)
print('First 10 embedings of "dog":', dog[:10], '\n')


# Built-in helpers that relate words to each other through their vectors
royal_analogy = model.most_similar(positive=['woman', 'king'], negative=['man'])
print('woman + king - man = ', royal_analogy, '\n')

odd_meal_word = model.doesnt_match("breakfast cereal dinner lunch".split())
print("Doesn't match:", odd_meal_word, '\n')

woman_man_similarity = model.similarity('woman', 'man')
print('Similarity woman - man:', woman_man_similarity)
    

In [0]:
# Test the accuracy of this model
# `model` holds KeyedVectors at this point (loaded with load_word2vec_format), so the
# evaluation method is called on it directly instead of going through `.wv`.
from gensim.test.utils import datapath

analogy_scores = model.evaluate_word_analogies(datapath('questions-words.txt'))

print('country - capital corrects', analogy_scores[1][0]['correct'])


In [0]:
# use in a model
import numpy as np

# Load data. Sentiment model in movies reviews.
# Reference: http://www.aclweb.org/anthology/P11-1015 

! wget https://s3-eu-west-1.amazonaws.com/text-mining-course/sentiment_corpus.zip
! unzip sentiment_corpus.zip




In [0]:
# Load the train/test inputs and their labels (labels: 1 = pos, 0 = neg)
X_trn, X_tst, y_trn, y_tst = (
    np.load(os.path.join(data_path, array_file))
    for array_file in (
        'sentiment_X_trn.npy',
        'sentiment_X_tst.npy',
        'sentiment_y_trn.npy',
        'sentiment_y_tst.npy',
    )
)


In [0]:
# Quick look at the training data: its shape, then the first two inputs and labels
for preview in (X_trn.shape, X_trn[:2], y_trn[:2]):
    print(preview)

In [0]:
# Show only the first vocabulary entries: displaying the complete key view would
# write every word of the pretrained model into the cell output
list(model.vocab.keys())[:20]

In [0]:
# Represent each sentence by the average embedding of its words found in the embeddings vocabulary

def encode_text(corpus, model, normalize=str.upper):
    '''
    Encode every sentence of a corpus as the mean of its known word embeddings.

        input:
            corpus: iterable of sentences, each sentence being an iterable of string tokens.
            model: embeddings lookup that supports `token in model` and `model[token]`
                and exposes `vector_size` (for instance a gensim KeyedVectors).
            normalize: callable applied to each token before the lookup. The default,
                `str.upper`, is the casing this function has always applied.
                NOTE(review): if the vectors in use are keyed by lowercase words,
                pass `str.lower` here - confirm against the loaded vocabulary.
        output:
            numpy array with one row per sentence: the average of the embeddings of the
            sentence tokens present in the model. A sentence without any known token
            (or with no tokens at all) is encoded as a vector of zeros, so the result
            never contains NaN rows nor rows of a different length.
    '''
    # Fallback row for sentences with no known token; float32 is the dtype gensim
    # uses to store its vectors.
    empty_embedding = np.zeros(model.vector_size, dtype=np.float32)

    features_list = []
    for s in corpus:
        # Normalise each token once and keep only those the model knows
        keys = (normalize(t) for t in s)
        features = [model[k] for k in keys if k in model]
        if features:
            features_list.append(np.mean(features, axis=0))
        else:
            features_list.append(empty_embedding)
    return np.array(features_list)


In [0]:
# Encode both splits, then check the shape of the resulting embedding matrices
embeds_trn = encode_text(X_trn, model)
embeds_tst = encode_text(X_tst, model)

for label, embeds in (('Embeds trn shape:', embeds_trn), ('Embeds tst shape:', embeds_tst)):
    print(label, embeds.shape)

In [0]:
# Fit a linear SVM on the sentence embeddings and measure its test accuracy
from sklearn.svm import LinearSVC

# Train (fit returns the fitted estimator itself)
text_clf_svm = LinearSVC().fit(embeds_trn, y_trn)

# Evaluate on the test split: fraction of predictions equal to the labels
predicted = text_clf_svm.predict(embeds_tst)
test_accuracy = (predicted == y_tst).mean()
print('Test accuracy:', test_accuracy)

# Word embeddings in spaCy

In [0]:
# Install the spaCy model that ships with embeddings
! python -m spacy download en_core_web_md


In [0]:
import spacy

# Load a model that includes embeddings; the small models don't have them.
# The model package is imported as a module and loaded through its own load().
import en_core_web_md
nlp = en_core_web_md.load()
print('Model loaded!')


In [0]:
# Check whether each token has a vector.
tokens = nlp(u'dog cat banana bibliopole')

# One line per token: text, has-vector flag, vector norm, out-of-vocabulary flag
for tok in tokens:
    print(tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)
    

In [0]:
# Within one document, compare 'apples' with 'oranges' and 'boots' with 'hippos'
doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")

# Pick the four tokens of interest by their position in the document
apples, oranges, boots, hippos = (doc[position] for position in (0, 2, 6, 8))

print('Similarity between words:')
print('apples vs oranges: ', apples.similarity(oranges))
print('boots vs hippos:', boots.similarity(hippos))

print()
print('Similarity between a word and a sentence:')
# Compare each of the document's two sentences with the vocabulary entry for 'fruit'
apples_sent, boots_sent = doc.sents
fruit = doc.vocab[u'fruit']
print('apples sentence vs fruit word: ', apples_sent.similarity(fruit))
print('boots sentence vs fruit word:', boots_sent.similarity(fruit))


In [0]:
# Show the vector of the `apples` token as the cell's output value
apples.vector
