### Stage 1:
Perform statistical parsing/tagging on a document in JSON format

INPUTS: JSON doc for the text input  
OUTPUT: JSON format `ParsedGraf(id, sha1, graf)`

In [1]:
import ast
from cmd_pytextrank import pytextrank
import sys
import spacy
from sklearn import cluster
from sklearn import metrics
import pandas as pd
from scipy.spatial import distance
import numpy as np
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
nlp = spacy.load('en_core_web_md')

In [45]:
news_topics = pd.read_csv("list_of_topics.csv")
print("No of topics considered {}".format(len(news_topics)))
news_topics_list = news_topics.topics
vectors = np.array(list(map(lambda x: nlp(x).vector, news_topics_list)))

No of topics considered 363


In [46]:
def get_key_terms(text_doc):
    path_stage1 = "o1.json"
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(text_doc)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    result = []
    count = 0;
    for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
        count += 1
#         if count>5: break
#         print(pytextrank.pretty_print(rl))
        result.append(ast.literal_eval(pytextrank.pretty_print(rl))[:2])
        
    return result

In [49]:
from urllib.request import urlopen
import json

url = "https://www.who.int/denguecontrol/disease/en/"
html = urlopen(url).read()

from readability.readability import Document
from bs4 import BeautifulSoup

readable_article = Document(html).summary()
readable_title = Document(html).title()
soup = BeautifulSoup(readable_article)
url_dict = {}
url_dict['id'] = 1
url_dict['text'] = soup.text

with open('./test_data/sample.json', 'w') as json_file:
    json.dump(url_dict, json_file)
# print(*** TITLE *** \n\"' + readable_title + '\"\n')
print('*** CONTENT *** \n\"' + soup.text[:500] + '[...]\"')

*** CONTENT *** 
"
What is dengue?
Dengue is fast emerging pandemic-prone viral disease in many parts of the world. Dengue flourishes in urban poor areas, suburbs and the countryside but also affects more affluent neighbourhoods in tropical and subtropical countries.
Dengue is a mosquito-borne viral infection causing a severe flu-like illness and, sometimes causing a potentially lethal complication called severe dengue. The incidence of dengue has increased 30-fold over the last 50 years. Up to 50-100 million inf[...]"


In [50]:
text_doc = "./test_data/sample.json"
key_terms = get_key_terms(text_doc)

word_vectors = list(map(lambda x: nlp(x[0]).vector, key_terms))
NUM_CLUSTERS=5
kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS, n_jobs=6, n_init=10, max_iter=500,
                       random_state=0)
kmeans.fit(word_vectors)
 
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

result = {}

for x in centroids:
    cosine_values = list(cosine_similarity(x.reshape(1,-1), vectors)[0])
    result[news_topics_list[cosine_values.index(max(cosine_values))]] = max(cosine_values)
#     result.append([news_topics_list[cosine_values.index(max(cosine_values))], max(cosine_values)])

sorted(result.items(), key = lambda kv:(kv[1], kv[0]))
print(result)

{'infections': 0.8303318877200434, 'world': 0.6042749440634765, 'insects': 0.8150994469990989, 'population': 0.7107153592032549}
