## Reference-based article graph

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from utils import read_graph, analyze_url
import networkx as nx

from itertools import combinations

# Root of the SciLens Dropbox folder. It is rebound a few lines below to the
# cache directory, which is the value later cells see.
scilens_dir = str(Path.home()) + '/Dropbox/scilens/'

# One vocabulary entry per line. The context manager closes the file handle,
# which a bare open(...).read() leaves to the garbage collector.
with open(scilens_dir + 'small_files/hn_vocabulary/hn_vocabulary.txt') as vocabulary_file:
    hn_vocabulary = vocabulary_file.read().splitlines()

# scilens_dir already ends with '/', so the suffix carries no leading slash
# (otherwise the result contains a '//' segment).
scilens_dir = scilens_dir + 'cache/diffusion_graph/scilens_3M/'
articles = pd.read_csv(scilens_dir + 'article_details_v2.tsv.bz2', sep='\t')
G = read_graph(scilens_dir + 'diffusion_graph_v7.tsv.bz2')

# refs = set of whatever G[u] yields for the article URL
# (presumably the nodes u points to in the diffusion graph - TODO confirm).
articles['refs'] = articles.url.apply(lambda u: set(G[u]))
articles = articles.set_index('url')
sciclops_dir = str(Path.home()) + '/Dropbox/sciclops/'

In [None]:
# Global accumulator filled by add_to_clusters(): one row per reference
# combination ('refs') with the list of articles registered under it ('articles').
clusters = pd.DataFrame(columns=['refs','articles'])


In [None]:

def add_to_clusters(article, refs, max_combinations=3):
    """Register ``article`` under every small combination of its references.

    For each combination of 1 to ``max_combinations`` distinct references,
    the global ``clusters`` frame either gains a new row (``refs`` holds the
    combination as a sorted tuple, ``articles`` holds ``[article]``) or, when
    a row for that combination already exists, ``article`` is appended to
    that row's ``articles`` list.

    Args:
        article: Identifier of the article being registered.
        refs: Iterable of hashable, mutually orderable references (e.g. URL
            strings). Duplicates are ignored and input order is irrelevant.
        max_combinations: Largest combination size to generate.
    """
    global clusters

    # Canonical order: the same group of references must always map to the
    # same tuple, whatever order the caller's list or set iterates in.
    ordered_refs = sorted(set(refs))

    for size in range(1, max_combinations + 1):
        for comb in combinations(ordered_refs, size):
            found = False
            # Plain Python comparison of the stored tuples; the lists in the
            # 'articles' column are extended in place.
            for known, members in zip(clusters['refs'], clusters['articles']):
                if known == comb:
                    members.append(article)
                    found = True

            if not found:
                # DataFrame.append was removed in pandas 2.0; concat is the
                # supported way to add a row.
                new_row = pd.DataFrame({'refs': [comb], 'articles': [[article]]})
                clusters = pd.concat([clusters, new_row], ignore_index=True)


                
# Two toy articles first, then the first five real articles.
add_to_clusters('article1', ['a'])
add_to_clusters('article2', ['a', 'b', 'c'])

# add_to_clusters is called only for its side effect, so use a plain loop:
# DataFrame.apply would build a throwaway result, and older pandas releases
# may invoke the function twice on the first row.
for url, row in articles[:5].iterrows():
    add_to_clusters(url, row.refs)
clusters

In [None]:
# NOTE(review): refG is created here but nothing in this cell adds nodes or
# edges to it; the loop only prints the article pairs that share references.
refG = nx.Graph()

# References ignored when intersecting (built once instead of once per pair).
ignored_refs = frozenset([
    'https://scholar.google.com/scholar_lookup',
    'https://www.ncbi.nlm.nih.gov/pubmed/',
    'https://www.ncbi.nlm.nih.gov/entrez/query.fcgi',
    'https://www.springernature.com/us',
    'https://www.mendeley.com/import/',
    'https://www.scopus.com/inward/citedby.url',
])

# url -> set of references, so the double loop does dictionary lookups
# instead of two DataFrame .loc row extractions per pair.
refs_by_url = dict(zip(articles.index, articles['refs']))

for u1, refs1 in refs_by_url.items():
    for u2, refs2 in refs_by_url.items():
        # Each unordered pair is reported once, with the smaller URL first.
        if u1 < u2:
            inter = refs1.intersection(refs2).difference(ignored_refs)
            if inter:
                print(inter, u1, u2)

## Topic Modeling

In [None]:
# Build the cache path from the home directory rather than extending
# scilens_dir: the first cell already points scilens_dir at this directory,
# so appending the suffix again gives a doubly nested path that also grows
# every time the cell is re-run.
scilens_dir = str(Path.home()) + '/Dropbox/scilens/cache/diffusion_graph/scilens_3M/'
df = pd.read_csv(scilens_dir + 'article_details_v1.tsv.bz2', sep='\t')

In [None]:
# prelude = title, a newline, then the first line of the article body.
opening_lines = df.full_text.apply(lambda body: body.partition('\n')[0])
df['prelude'] = df.title + '\n' + opening_lines

In [None]:
# keywords = the vocabulary entries that occur in the prelude. The check is a
# substring test on str(prelude), not a whole-word match.
df['keywords'] = df.prelude.apply(
    lambda prelude: {term for term in hn_vocabulary if term in str(prelude)}
)

In [None]:
# Keep only the entries that differ from the empty string; rows filtered out
# would come back as NaN because the assignment aligns on the index.
# NOTE(review): the cell above stores a set per row and a set never equals '',
# so this mask is presumably all True and the column stays unchanged -
# confirm whether empty keyword sets were meant to be dropped here.
not_blank = df['keywords'] != ''
df['keywords'] = df['keywords'][not_blank]

In [None]:
from gsdmm import MovieGroupProcess

# vocab_size is taken from the documents being clustered (number of distinct
# keywords across all rows) rather than a hard-coded 100, so it stays correct
# when the vocabulary file or the article set changes.
keyword_vocabulary = {term for keywords in df.keywords for term in keywords}

mgp = MovieGroupProcess(K=50, alpha=0.1, beta=0.1, n_iters=50)
y = mgp.fit(df.keywords, vocab_size=len(keyword_vocabulary))

In [None]:
# Display the fitted model's cluster_word_distribution attribute
# (presumably per-cluster word counts - verify against the gsdmm docs).
mgp.cluster_word_distribution

In [None]:
# How many titles contain 0, 1, 2, ... vocabulary entries (substring matches).
df.title.apply(
    lambda title: sum(1 for term in hn_vocabulary if term in str(title))
).value_counts()

In [None]:
# Same count over the whole article body, capped at 5 so the long tail of the
# distribution collapses into one bucket.
df.full_text.apply(
    lambda body: min(5, sum(1 for term in hn_vocabulary if term in str(body)))
).value_counts()

In [None]:
def _first_line_hits(body, cap=5):
    """Count vocabulary entries occurring in the first line of ``body``, capped at ``cap``."""
    # Extract the first line once instead of once per vocabulary entry.
    first_line = str(body.split('\n')[0])
    return min(cap, sum(1 for term in hn_vocabulary if term in first_line))


# Distribution of vocabulary hits in the first line of each article body.
df.full_text.apply(_first_line_hits).value_counts()

In [None]:
import spacy
# Load the 'en_core_web_lg' spaCy model; the next cell calls it on each title
# and reads the resulting document's .vector.
nlp = spacy.load('en_core_web_lg')

In [None]:
def _title_vector(title):
    """Return the spaCy document vector for ``title`` (stringified first)."""
    return nlp(str(title)).vector


# One vector per article title.
df['title_vector'] = df.title.apply(_title_vector)

In [None]:
import inspect

from sklearn.manifold import TSNE

# Newer scikit-learn releases renamed TSNE's ``n_iter`` argument to
# ``max_iter`` and later dropped the old name; use whichever keyword the
# installed version accepts so the cell runs on both.
tsne_iter_kw = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'
tsne_model_en_2d = TSNE(perplexity=30, n_components=2, verbose=1, init='pca',
                        random_state=32, **{tsne_iter_kw: 3500})

# Stack the per-title vectors into one 2-D array and project it to 2-D.
tsne_result = tsne_model_en_2d.fit_transform(np.stack(df['title_vector']))


In [None]:
# Sanity check: dimensions of the array returned by fit_transform above.
tsne_result.shape

In [None]:
import seaborn as sns

# Scatter plot of the two t-SNE components of the title vectors.
plt.figure(figsize=(16, 10))
sns.scatterplot(x=tsne_result[:, 0], y=tsne_result[:, 1], legend="full")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF matrix of the titles, restricted to the fixed hn_vocabulary terms.
# Titles are stringified first so non-string entries do not break the vectorizer.
title_texts = [str(title) for title in df.title]
vectorizer = TfidfVectorizer(vocabulary=hn_vocabulary)
D = vectorizer.fit_transform(title_texts)

In [None]:
# Sanity check: dimensions of the TF-IDF matrix built above.
D.shape

In [None]:
# Re-fit the same TSNE estimator, this time on the TF-IDF matrix;
# toarray() converts D to a dense array before it is passed in.
tsne_result_2 = tsne_model_en_2d.fit_transform(D.toarray())


In [None]:
import seaborn as sns

# Scatter plot of the two t-SNE components of the TF-IDF title features.
plt.figure(figsize=(16, 10))
sns.scatterplot(x=tsne_result_2[:, 0], y=tsne_result_2[:, 1], legend="full")