In [1]:
import re
import numpy
import gensim
from gensim.models import word2vec
from nltk.corpus import stopwords
from numpy import *

In [2]:
with open("artificial_intelligence.txt") as file:
    text_review = file.read()

In [3]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [4]:
def text_to_wordlist(text, remove_stopwords=True):
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", text)
    # 3. Convert words to lower case and split them, clean stopwords from model' vocabulary
    words = review_text.lower().split()
    stops = set(stopwords.words('english'))
    meaningful_words = [w for w in words if not w in stops]
    return (meaningful_words)

In [5]:
def get_feature_vec(words, model):
    # Index2word is a list that contains the names of the words in
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.index2word)
    clean_text = []
    # vocabulary, add its feature vector to the total
    for word in words:
        if word in index2word_set:
            clean_text.append(model[word])

    return clean_text

In [6]:
clean_train_text = (text_to_wordlist(text_review, remove_stopwords=True))

In [7]:
clean_train = []
for words in clean_train_text:
    if words in clean_train:
        words = +1
    else:
        clean_train.append(words)

In [8]:
trainDataVecs = get_feature_vec(clean_train, model)
trainData = numpy.asarray(trainDataVecs)

In [9]:
similarity = numpy.dot(trainData, trainData.T)

In [10]:
square_mag = numpy.diag(similarity)

In [11]:
inv_square_mag = 1 / square_mag

In [12]:
inv_square_mag[numpy.isinf(inv_square_mag)] = 0

In [13]:
inv_mag = numpy.sqrt(inv_square_mag)

In [14]:
cosine = similarity * inv_mag
cosine = cosine.T * inv_mag

In [15]:
def powerMethod(A, x0, m, iter):
    n = A.shape[1]
    delta = m * (array([1] * n, dtype='float64') / n)
    for i in range(iter):
        x0 = dot((1 - m), dot(A, x0)) + delta
    return x0

In [16]:
n = cosine.shape[1]  # A is n x n
m = 0.15
x0 = [1] * n

In [17]:
pagerank_values = powerMethod(cosine, x0, m, 30)

srt = numpy.argsort(-pagerank_values)
a = srt[0:10]

In [18]:
keywords_list = []
for words in a:
    keywords_list.append(clean_train_text[words])
    
print(keywords_list)

['lower', 'asterisks', 'end', 'start', 'great', 'telephoto', 'camera', 'original', 'could', 'rate']
