In [None]:
import __init__
import sys

In [None]:
%%capture
!{sys.executable} -m pip install nltk

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('word_tokenize')
nltk.download('punkt')

import pyLDAvis
pyLDAvis.enable_notebook()

In [None]:
from normalize import *

In [None]:
import json

# Read the paper records from disk. The clustering cells below iterate over
# `papers` and use each entry's 'title' and 'abstract' fields.
with open('data/papers.json', mode = 'r', encoding = 'utf-8') as papers_file:
  papers = json.load(papers_file)

## Training the Doc2Vec Models for Clustering

In [None]:
# Same JSON file as `papers`, bound to `data`; the topic-modeling helper
# further down reads its documents from this name.
with open('./data/papers.json', mode = 'r', encoding = 'utf-8') as data_file:
  data = json.load(data_file)
  
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import os

def build_model(tagged_data, name, epochs = 100):
  """Train a Doc2Vec model on `tagged_data` and save it under data/.

  Args:
    tagged_data: collection of TaggedDocument objects. It is traversed more
      than once (vocabulary scan, then training), so pass a list rather than
      a one-shot generator.
    name: base file name; the model is written to 'data/{name}.model'.
    epochs: number of training passes over the corpus (default 100).

  Returns:
    The trained Doc2Vec model.
  """
  model = Doc2Vec(
    vector_size = 20,
    alpha = 0.025,
    min_alpha = 0.00025,
    min_count = 1,
    dm = 1,
    epochs = epochs,
  )

  model.build_vocab(tagged_data)

  # Train exactly once and let gensim decay the learning rate from `alpha`
  # down to `min_alpha` over all epochs. Calling train() in a loop while
  # editing model.alpha / model.min_alpha by hand multiplies the number of
  # passes by model.epochs and makes the learning rate jump back up after the
  # first call, which gensim warns against.
  model.train(
    tagged_data,
    total_examples = model.corpus_count,
    epochs = model.epochs,
  )

  model.save(f'data/{name}.model')

  return model

# Corpus 1: every paper is represented by clean() applied to its title, and
# tagged with the raw title string.
tagged_titles = [
  TaggedDocument(
    words = clean(entry['title']),
    tags = [entry['title']],
  )
  for entry in papers
]

# Corpus 2: clean(title) followed by clean(abstract), still tagged by title.
tagged_titles_and_abstracts = [
  TaggedDocument(
    words = clean(entry['title']) + clean(entry['abstract']),
    tags = [entry['title']],
  )
  for entry in papers
]

# Train one model per corpus; build_model also saves each one as
# data/<name>.model.
title_model = build_model(tagged_titles, name = 'title')
title_and_abstract_model = build_model(
  tagged_titles_and_abstracts,
  name = 'title_and_abstract',
)

## Clustering

In [None]:
from sklearn.cluster import KMeans
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
# Set up plotly's offline notebook integration before any iplot() call below.
# NOTE(review): connected = True presumably loads plotly.js from a CDN rather
# than embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected = True)

def generate_prediction(model, n_clusters = 4, random_state = 42):
  """Fit k-means on the document vectors stored in `model`.

  Args:
    model: object exposing `docvecs.vectors_docs` (the Doc2Vec models trained
      in this notebook).
    n_clusters: number of clusters to fit.
    random_state: seed forwarded to KMeans so that repeated runs on the same
      vectors produce the same labels. Pass None to get unseeded
      initialisation.

  Returns:
    The fitted KMeans estimator.
  """
  document_vectors = model.docvecs.vectors_docs

  estimator = KMeans(
    init = 'k-means++',
    n_clusters = n_clusters,
    random_state = random_state,
  )

  return estimator.fit(document_vectors)

def cluster_table(model):
  """Build a plotly table figure pairing each document tag with its cluster.

  Runs generate_prediction(model) and lays out two columns: the tags from
  `model.docvecs.offset2doctag` and the estimator's `labels_`.

  Returns:
    A go.Figure wrapping a single go.Table trace.
  """
  clustering = generate_prediction(model)

  columns = [model.docvecs.offset2doctag, clustering.labels_]

  table = go.Table(
    header = {'values': ['Title', 'Cluster ID']},
    cells = {'values': columns},
  )

  return go.Figure(data = [table])

In [None]:
iplot(cluster_table(title_model))

In [None]:
iplot(cluster_table(title_and_abstract_model))

In [None]:
from nltk.tokenize import word_tokenize

def find_most_similar_vectors(tagged_data, similar_to, model_name, create_model=False, topn=5):
  """Return the `topn` documents closest to `similar_to` in a Doc2Vec model.

  Args:
    tagged_data: training corpus; only used when `create_model` is true.
    similar_to: an int (handed to most_similar unchanged) or a text string
      (tokenized with word_tokenize and embedded with infer_vector). Any
      other value is passed through as is.
    model_name: file name of the model under data/, e.g. 'title.model'.
    create_model: when true, train a fresh model from `tagged_data` with
      build_model instead of loading the saved one.
    topn: number of neighbours to return.

  Returns:
    A list of (tag, score) tuples as produced by docvecs.most_similar.
  """
  model_suffix = '.model'

  if create_model:
    # build_model appends '.model' to the name it is given, so strip the
    # suffix from `model_name` to make it save to the same path that the
    # load branch below reads from.
    if model_name.endswith(model_suffix):
      base_name = model_name[:-len(model_suffix)]
    else:
      base_name = model_name
    model = build_model(tagged_data, base_name)
  else:
    model = Doc2Vec.load(f'data/{model_name}')

  if isinstance(similar_to, str):
    # NOTE(review): the training corpora are preprocessed with clean(), while
    # query text is only tokenized here — confirm whether that is intended.
    similar_to = [model.infer_vector(word_tokenize(similar_to))]

  similar_vectors = model.docvecs.most_similar(similar_to, topn = topn)

  return [(tag, value) for tag, value in similar_vectors]

#### Example: Most Similar Vectors

In [None]:
# Query text for the string-based lookup.
to_check = 'A Domain Specific Language based on Monads for Distributed Transactional Memory in Java'

# Ten nearest neighbours of the query text in the saved title model.
cmp_string = find_most_similar_vectors(
  tagged_titles,
  to_check,
  'title.model',
  create_model = False,
  topn = 10,
)

# Ten nearest neighbours of the document referenced by the integer 1.
cmp_tag = find_most_similar_vectors(
  tagged_titles,
  1,
  'title.model',
  create_model = False,
  topn = 10,
)

## Topic Modeling

In [None]:
from topic_modeling import *

In [None]:
# Prepare the directory whose paths are handed to compute_lda_model and
# display_lda_model in the cells below.
# NOTE(review): create_dir presumably comes from the topic_modeling star
# import and is a no-op when the directory already exists — confirm.
create_dir('./data/topic_modeling')

In [None]:
def papers_topic_modeling(field_name):
  """Run compute_lda_model on one field of every record in `data`.

  Each record's `field_name` value is passed through clean(); the resulting
  list is handed to compute_lda_model together with the path
  './data/topic_modeling/{field_name}'.
  """
  target_path = f'./data/topic_modeling/{field_name}'

  cleaned_documents = []
  for record in data:
    cleaned_documents.append(clean(record[field_name]))

  compute_lda_model(target_path, cleaned_documents)
  
def display_topic_modeling(field_name):
  """Return display_lda_model's result for the given field's model path.

  The path passed on is './data/topic_modeling/{field_name}', the same one
  papers_topic_modeling hands to compute_lda_model.
  """
  model_path = f'./data/topic_modeling/{field_name}'
  return display_lda_model(model_path)

In [None]:
# Compute the topic model for paper titles, then render it with pyLDAvis.
papers_topic_modeling('title')
title_topics_vis = display_topic_modeling('title')
pyLDAvis.display(title_topics_vis)

In [None]:
# Compute the topic model for paper abstracts, then render it with pyLDAvis.
papers_topic_modeling('abstract')
abstract_topics_vis = display_topic_modeling('abstract')
pyLDAvis.display(abstract_topics_vis)