## Reference-based article graph

In [16]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from utils import read_graph, analyze_url
import networkx as nx

from itertools import combinations

# Root of the shared SciLens data folder. Kept as a string ending in '/',
# because every path below is built by string concatenation.
scilens_dir = str(Path.home()) + '/Dropbox/scilens/'

# Vocabulary terms, one per line. `read_text` closes the file handle, which the
# previous bare `open(...).read()` never did.
hn_vocabulary = Path(scilens_dir + 'small_files/hn_vocabulary/hn_vocabulary.txt').read_text().splitlines()

#File with institutions metadata
institutions = pd.read_csv(scilens_dir + 'small_files/institutions/metadata.tsv', sep='\t')
# Strip leading "www.", "www2." or "web." host prefixes (repeated prefixes too).
# The whole alternation is anchored at the start of the string: the previous
# pattern `^(www[0-9]?\.)|(web\.)` anchored only its first branch, so "web."
# was also removed from the middle of a host name
# (e.g. "cobweb.example.edu" became "cobexample.edu").
institutions['URL'] = institutions['URL'].apply(lambda u: re.sub(r'^(?:(?:www[0-9]?|web)\.)+', r'', u))

#File with academic repositories
repositories = pd.read_csv(scilens_dir + 'small_files/repositories/academic_repositories.csv')
# Drop the "http://" scheme and an optional "www." so the values look like bare hosts.
repositories['URL'] = repositories['URL'].apply(lambda u: re.sub(r'^http://(www\.)?', r'', u))
# Combined list of institution and repository URLs.
sources = institutions['URL'].tolist() + repositories['URL'].tolist()

# `scilens_dir` already ends with '/', so the sub-path is appended without a
# second leading slash. From here on `scilens_dir` points at the diffusion-graph cache.
scilens_dir = scilens_dir + 'cache/diffusion_graph/scilens_3M/'
articles = pd.read_csv(scilens_dir + 'article_details_v2.tsv.bz2', sep='\t')
papers = pd.read_csv(scilens_dir + 'paper_details_v1.tsv.bz2', sep='\t')
G = read_graph(scilens_dir + 'diffusion_graph_v7.tsv.bz2')


In [39]:
l = ['a', 'b', 'c', 'd']
max_combinations = 4

# One empty bucket per non-empty subset of `l`, keyed by the subset tuple and
# ordered from the smallest subsets to the largest.
clusters = {
    subset: []
    for size in range(1, max_combinations + 1)
    for subset in combinations(l, size)
}

# Overwrite a single bucket to check that tuple keys can be addressed directly.
clusters[('a',)] = 1
clusters

{('a',): 1,
 ('b',): [],
 ('c',): [],
 ('d',): [],
 ('a', 'b'): [],
 ('a', 'c'): [],
 ('a', 'd'): [],
 ('b', 'c'): [],
 ('b', 'd'): [],
 ('c', 'd'): [],
 ('a', 'b', 'c'): [],
 ('a', 'b', 'd'): [],
 ('a', 'c', 'd'): [],
 ('b', 'c', 'd'): [],
 ('a', 'b', 'c', 'd'): []}

In [12]:
# Disabled exploratory filters, kept for reference only (nothing in this cell runs).
# NOTE(review): `sources` is already a list, so `isin([sources])` would pass a
# one-element nested list -- probably meant `isin(sources)`; confirm before re-enabling.
#articles[articles.url.apply(lambda x: analyze_url(str(x))[0]).isin([sources])]
#papers[papers.url.apply(lambda x: x.endswith('html'))]

Unnamed: 0,url,title,authors,keywords,publish_date,full_text
11,https://www.cdc.gov/obesity/data/childhood.html,Childhood Obesity Facts,[],"['young', 'prevalence', 'decreased', 'facts', ...",,Prevalence of Childhood Obesity in the United ...
20,https://www.epa.gov/iaq/no2.html,Indoor Air Quality (IAQ),['Us Epa Oar Oria Ied'],"['changes', 'epa', 'epagov', 'looking', 'infor...",2014-07-03 14:42:05-04:00,We've made some changes to EPA.gov. If the inf...
47,https://wwwnc.cdc.gov/eid/not-found.html,Page Not Found,[],"['page', 'removedtry', 'requested', 'unavailab...",,The page you requested cannot be found at this...
68,https://www.nature.com/ajg/journal/v108/n4/abs...,Fecal Microbiota Transplantation for Clostridi...,"['Zain Kassam', 'Christine H Lee', 'Yuhong Yua...","['clinical', 'upr', 'fmt', 'review', 'rcts', '...",2013-03-19 00:00:00,OBJECTIVES: The clinical and economic burden o...
72,https://www.cdc.gov/policy/analysis/process/in...,CDC Policy Process,[],"['implementation', 'identify', 'law', 'impact'...",,Introduction\n\nPolicy is one potentially effe...
83,https://www.nlm.nih.gov/medlineplus/druginfo/m...,Levetiracetam: MedlinePlus Drug Information,[],"['pharmacist', 'dose', 'doctor', 'medication',...",,"Levetiracetam comes as a solution (liquid), an..."
86,https://www.nimh.nih.gov/health/statistics/pre...,Bipolar Disorder,[],"['ups', 'mood', 'shifts', 'disorder', 'topics'...",,Bipolar Disorder\n\nDefinition\n\nBipolar diso...
124,https://www.acs.org/content/acs/en/pressroom/p...,Beer compound could help fend off Alzheimer’s ...,[],"['beer', 'compound', 'parkinsons', 'alzheimers...",,"""Xanthohumol, a Polyphenol Chalcone Present in..."
143,https://www.cdc.gov/minorityhealth/strategies2...,Strategies for Reducing Health Disparities 2016,[],"['disparities', 'reducing', 'major', 'interven...",,Successful Interventions to Reduce Health Disp...
166,https://www.cdc.gov/other/emailupdates/index.html,CDC Email Updates,[],"['text', 'phone', 'subscriber', 'send', 'servi...",,Identify Your Wireless Carrier\n\nIdentify You...


In [None]:
# Attach to every article the set built from `G[url]` (presumably the nodes the
# article links to in the diffusion graph -- confirm against `read_graph`), then
# index the frame by URL so later cells can use `articles.loc[url]`.
#
# The cell used to fail when run a second time, because `url` had already been
# moved into the index. Restoring it with `reset_index()` makes the cell re-runnable.
articles_flat = articles if 'url' in articles.columns else articles.reset_index()
articles = (
    articles_flat
    .assign(refs=articles_flat['url'].apply(lambda u: set(G[u])))
    .set_index('url')
)


In [None]:
# URLs that are removed from every overlap before two articles are compared
# (presumably generic lookup/landing pages shared by many unrelated articles).
IGNORED_REFS = frozenset([
    'https://scholar.google.com/scholar_lookup',
    'https://www.ncbi.nlm.nih.gov/pubmed/',
    'https://www.ncbi.nlm.nih.gov/entrez/query.fcgi',
    'https://www.springernature.com/us',
    'https://www.mendeley.com/import/',
    'https://www.scopus.com/inward/citedby.url',
])

# Remove the ignored URLs once per article instead of once per pair, and keep
# the sets in a plain dict so the inner loop avoids a pandas `.loc` lookup for
# every pair. (A & B) - X equals (A - X) & (B - X), so the overlaps are unchanged.
article_refs = {url: refs - IGNORED_REFS for url, refs in articles['refs'].items()}

# Undirected graph linking two articles when they share at least one reference.
# The graph used to be created but never filled; each edge now stores the
# shared references and their count.
refG = nx.Graph()
for u1, refs1 in article_refs.items():
    if not refs1:
        # An article without references cannot overlap with anything.
        continue
    for u2, refs2 in article_refs.items():
        # `u1 < u2` visits every unordered pair exactly once and skips self-pairs.
        if u1 < u2:
            inter = refs1 & refs2
            if len(inter) > 0:
                refG.add_edge(u1, u2, refs=inter, weight=len(inter))
                print(inter, u1, u2)

## Topic Modeling

In [None]:
# Point at the diffusion-graph cache with a path built from the home directory.
# Appending '/cache/diffusion_graph/scilens_3M/' to the current `scilens_dir`
# is not safe here: the setup cell at the top of the notebook has already
# extended `scilens_dir` to that folder, so appending again produces a nested
# ".../scilens_3M//cache/diffusion_graph/scilens_3M/" path.
scilens_dir = str(Path.home()) + '/Dropbox/scilens/cache/diffusion_graph/scilens_3M/'
df = pd.read_csv(scilens_dir + 'article_details_v1.tsv.bz2', sep='\t')

In [None]:
# Prelude = title, a newline, then the first line of the body text.
# Missing values are treated as empty strings: the previous
# `full_text.apply(lambda w: w.split('\n')[0])` raised AttributeError on a NaN
# body, and a NaN title turned the whole prelude into NaN.
first_line = df.full_text.fillna('').astype(str).str.split('\n').str[0]
df['prelude'] = df.title.fillna('').astype(str) + '\n' + first_line

In [None]:
def _terms_in(text):
    """Return the set of `hn_vocabulary` entries that occur as substrings of `text`."""
    haystack = str(text)
    return {term for term in hn_vocabulary if term in haystack}


# One set of matched vocabulary terms per prelude.
df['keywords'] = df.prelude.map(_terms_in)

In [None]:
# Keep only the rows that have at least one keyword.
# The earlier form, `df['keywords'] = df['keywords'][~(df['keywords']=='')]`,
# removed nothing: assigning the filtered Series back to the same column
# re-aligns it on the index (excluded rows would come back as NaN), and when
# the column holds sets the comparison with '' is never true in the first place.
# `bool()` is False for an empty set as well as for an empty string.
# `.copy()` lets later cells add columns without a SettingWithCopyWarning.
df = df[df['keywords'].map(bool)].copy()

In [None]:
from gsdmm import MovieGroupProcess

# Hand the model plain lists in a stable (sorted) order rather than a pandas
# Series of sets, whose iteration order can differ between interpreter runs.
docs = [sorted(terms) for terms in df.keywords]

# Vocabulary size = number of distinct terms that actually occur in the
# documents, instead of the hard-coded 100 used before.
n_terms = len(set().union(*docs))

mgp = MovieGroupProcess(K=50, alpha=0.1, beta=0.1, n_iters=50)
# `y` holds whatever `fit` returns (presumably one cluster label per document -- confirm).
y = mgp.fit(docs, vocab_size=n_terms)

In [None]:
# Display the fitted model's `cluster_word_distribution` attribute
# (presumably per-cluster term counts -- confirm against the gsdmm documentation).
mgp.cluster_word_distribution

In [None]:
# How many titles contain 0, 1, 2, ... vocabulary terms (substring match on str(title)).
df.title.map(
    lambda title: sum(1 for term in hn_vocabulary if term in str(title))
).value_counts()

In [None]:
def _capped_term_hits(text):
    """Count vocabulary terms occurring as substrings of `text`, capped at 5."""
    body = str(text)
    hits = sum(1 for term in hn_vocabulary if term in body)
    return min(5, hits)


# Distribution of the capped per-article counts over the full text.
df.full_text.map(_capped_term_hits).value_counts()

In [None]:
def _first_line_term_hits(text):
    """Count vocabulary terms found in the first line of `text`, capped at 5.

    The value is cast to `str` *before* splitting. The previous expression,
    `str(t.split('\n')[0])`, called `.split` on the raw value, so a missing
    (NaN) body raised AttributeError.
    """
    first_line = str(text).split('\n')[0]
    return min(5, sum(1 for w in hn_vocabulary if w in first_line))


# Distribution of the capped per-article counts over the first line of the body.
df.full_text.apply(_first_line_term_hits).value_counts()

In [None]:
import spacy
# Load the spaCy pipeline named 'en_core_web_lg'; a later cell reads `.vector`
# from the documents it produces. The model package must be installed separately.
nlp = spacy.load('en_core_web_lg')

In [None]:
def _title_embedding(title):
    """Run the spaCy pipeline on `str(title)` and return the document's `.vector`."""
    return nlp(str(title)).vector


# One embedding per title, stored as an object column.
df['title_vector'] = df.title.map(_title_embedding)

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE configuration; the estimator is reused by a later cell.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version.
tsne_params = dict(
    n_components=2,
    perplexity=30,
    init='pca',
    n_iter=3500,
    random_state=32,
    verbose=1,
)
tsne_model_en_2d = TSNE(**tsne_params)

# Stack the per-title vectors into one 2-D array and project it.
title_matrix = np.stack(df['title_vector'])
tsne_result = tsne_model_en_2d.fit_transform(title_matrix)


In [None]:
# Sanity check: show the dimensions of the projected title embeddings.
tsne_result.shape

In [None]:
import seaborn as sns

# Scatter the two t-SNE components of the title embeddings on a 16x10 figure.
# No hue, palette or alpha is applied.
plt.figure(figsize=(16,10))
tsne_x, tsne_y = tsne_result[:, 0], tsne_result[:, 1]
sns.scatterplot(x=tsne_x, y=tsne_y, legend="full")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the titles, with the feature set fixed to `hn_vocabulary`.
vectorizer = TfidfVectorizer(vocabulary=hn_vocabulary)

# Cast every title to `str` first so non-string values do not break the vectorizer.
title_texts = [str(title) for title in df.title]
D = vectorizer.fit_transform(title_texts)

In [None]:
# Sanity check: show the dimensions of the TF-IDF matrix.
D.shape

In [None]:
# Re-fit the same t-SNE estimator, this time on the TF-IDF title matrix.
# `.toarray()` hands it a dense array.
# NOTE(review): this materialises the whole matrix in memory -- check `D.shape` first.
tsne_result_2 = tsne_model_en_2d.fit_transform(D.toarray())


In [None]:
import seaborn as sns

# Scatter the two t-SNE components of the TF-IDF title matrix on a 16x10 figure.
# No hue, palette or alpha is applied.
plt.figure(figsize=(16,10))
tfidf_x, tfidf_y = tsne_result_2[:, 0], tsne_result_2[:, 1]
sns.scatterplot(x=tfidf_x, y=tfidf_y, legend="full")