In [65]:
import numpy as np
import pandas as pd
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

Reference: https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db


Reading the 50 dimension embeddings from file and creating a dictionary

In [5]:
dimension = 50
glove_input_file = "glove\glove.6B." + str(dimension) + "d.txt"

In [6]:
embeddings_dict = {}
with open(glove_input_file, 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

After reading the embeddings we can define few functions to do interesting operations.

In [7]:
def find_closest_embeddings(embedding):
    return sorted(embeddings_dict.keys(), key=lambda word: spatial.distance.euclidean(embeddings_dict[word], embedding))

In [69]:
test_input = "This is a sample input. We will try to have many sentences in this particular paragraph. The more sentences we have the better for our testing. The algorithm will try to rank all sentences. The top ranked sentences will be picked up for the summarization. The picked sentences will be arranged in the order in which they appaear in the input paragraph."

In [73]:
sentences_raw = test_input.split(".")
sentences = []
print(type(sentences), type(sentences_raw))
for i, sentence in enumerate(sentences_raw):
    sentences_raw[i] = sentence.strip()
    sentences.append(sentence.strip().lower())
    print (sentence.strip())

<class 'list'> <class 'list'>
This is a sample input
We will try to have many sentences in this particular paragraph
The more sentences we have the better for our testing
The algorithm will try to rank all sentences
The top ranked sentences will be picked up for the summarization
The picked sentences will be arranged in the order in which they appaear in the input paragraph



In [74]:
# overall glove score
document_embedding = np.zeros(dimension);
found_words = 0
for sentence in sentences:
    words = sentence.split()
    for word in words:
        if word in embeddings_dict:
            document_embedding += embeddings_dict[word]
            found_words += 1

document_embedding /= found_words
print(document_embedding)

[ 2.96651608e-01  4.44978697e-02 -1.18722788e-03 -9.47662751e-02
  4.21513134e-01  6.90408916e-02 -1.43196983e-01 -5.32599188e-02
 -3.43356028e-02 -5.73238628e-02  3.54063658e-02  3.71503158e-03
 -2.55592198e-01 -1.66077892e-02  4.35564798e-01  8.87271821e-02
  6.66614688e-02 -2.23741890e-01 -1.84954056e-01 -5.00160492e-01
  4.37303704e-02 -1.10463572e-01  3.00721319e-01  4.22934765e-02
 -1.26539030e-02 -1.50436912e+00 -2.55004292e-01 -1.05511885e-01
  4.77467435e-03 -2.55383083e-01  3.31346872e+00  1.03256810e-01
 -4.04420235e-01 -3.40043820e-01  2.84908123e-01 -3.51489322e-02
  2.25082982e-01  1.73655832e-01 -1.23936253e-01 -1.17615754e-01
 -2.08834695e-01  8.00735630e-02  9.27520020e-02  2.00819599e-01
 -1.75540773e-01  6.74853095e-02  4.51184723e-02  1.34941179e-01
 -7.22062021e-02 -1.12992558e-01]


In [75]:
sentence_glove_score = np.zeros(len(sentences))

print(len(sentences))
for idx, sentence in enumerate(sentences):
    words = sentence.split(" ")
    n_words = 0
    score = 0
    for word in words:
        if word in embeddings_dict:
            score += document_embedding.dot(embeddings_dict[word]) 
            n_words+=1;
        else:
            print(word)
    
    if n_words==0:
        sentence_glove_score[idx] = 0
        continue
    score /= n_words
    sentence_glove_score[idx] = score
    

7
appaear



In [76]:
print(sentence_glove_score)

[14.55531131 15.8435043  16.80767034 14.25190613 13.51178734 15.04553253
  0.        ]


In [64]:
sentences_to_pick = 3

In [99]:
sentence_dict = {'index': list(range(len(sentences))),
                'score': sentence_glove_score,
                'raw_text': sentences_raw,
                'processed_text': sentences_raw}

sentences_df = pd.DataFrame(data=sentence_dict)

In [101]:
sentences_df

Unnamed: 0,index,score,raw_text,processed_text
0,0,14.555311,This is a sample input,This is a sample input
1,1,15.843504,We will try to have many sentences in this par...,We will try to have many sentences in this par...
2,2,16.80767,The more sentences we have the better for our ...,The more sentences we have the better for our ...
3,3,14.251906,The algorithm will try to rank all sentences,The algorithm will try to rank all sentences
4,4,13.511787,The top ranked sentences will be picked up for...,The top ranked sentences will be picked up for...
5,5,15.045533,The picked sentences will be arranged in the o...,The picked sentences will be arranged in the o...
6,6,0.0,,


In [113]:
picked_sentences_df = sentences_df.nlargest(sentences_to_pick, 'score').sort_values(by=['index'])

#print(picked_sentences_df)
output_text=""
for i, picked_sentence in enumerate(picked_sentences_df['raw_text']):
    output_text += picked_sentence + ". "

print(output_text.strip())


We will try to have many sentences in this particular paragraph. The more sentences we have the better for our testing. The picked sentences will be arranged in the order in which they appaear in the input paragraph.
