In [None]:
import __init__
import sys

In [None]:
%%capture
!{sys.executable} -m pip install nltk

import nltk
# Fetch the NLTK data packages used by this notebook.
# 'word_tokenize' is a function in nltk.tokenize, not a downloadable package, so
# requesting it only produced a "package not found" error; the tokenizer data it
# relies on is provided by 'punkt'.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

import pyLDAvis
# Turn on pyLDAvis' notebook integration before the pyLDAvis.display(...) calls in later cells.
pyLDAvis.enable_notebook()

In [None]:
from normalize import *

In [None]:
import json

# Parse the paper records from JSON; later cells read 'id', 'title' and 'abstract' from each entry.
with open('data/papers.json', mode = 'r', encoding = 'utf-8') as papers_file:
  papers = json.load(papers_file)

# Manual Labeling

In [None]:
import pandas as pd

# Load the manually assigned topic table and replace missing cells with empty strings.
titles_df = pd.read_csv('./data/manual_topic_modeling_titles.csv').fillna('')
titles_df.head()

We started by manually labeling the papers and came up with 12 topics, which we then also chose as the number of clusters for the automatic clustering.

# Cluster Training

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import os

def build_model(tagged_data, max_epochs = 100):
  """Train a Doc2Vec model (dm = 1, 20-dimensional vectors) on tagged documents.

  Args:
    tagged_data: Re-iterable collection of TaggedDocument objects (e.g. a list).
      It is iterated once to build the vocabulary and again during training,
      so a one-shot generator is not suitable.
    max_epochs: Number of passes over the corpus. Defaults to 100.

  Returns:
    The trained Doc2Vec model.
  """
  model = Doc2Vec(
    vector_size = 20,
    alpha = 0.025,
    min_alpha = 0.00025,
    min_count = 1,
    dm = 1,
    epochs = max_epochs,
  )

  model.build_vocab(tagged_data)

  # Train with one call so gensim itself decays the learning rate from alpha
  # down to min_alpha over all epochs. Looping over train() while adjusting
  # model.alpha by hand runs model.epochs passes per call and leaves the
  # learning-rate schedule inconsistent.
  model.train(
    tagged_data,
    total_examples = model.corpus_count,
    epochs = model.epochs,
  )

  return model

# Title-only corpus: one TaggedDocument per paper, tagged with the paper id as a string.
tagged_titles = []
for paper in papers:
  title_tokens = clean(paper['title'])
  tagged_titles.append(TaggedDocument(words = title_tokens, tags = [str(paper['id'])]))

# Title + abstract corpus: same tagging, with the cleaned abstract tokens appended to the title tokens.
tagged_titles_and_abstracts = []
for paper in papers:
  combined_tokens = clean(paper['title']) + clean(paper['abstract'])
  tagged_titles_and_abstracts.append(TaggedDocument(words = combined_tokens, tags = [str(paper['id'])]))

# One Doc2Vec model per corpus.
title_model = build_model(tagged_titles)
title_and_abstract_model = build_model(tagged_titles_and_abstracts)

# Clustering

In [None]:
from sklearn.cluster import KMeans
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True)

def generate_prediction(model, n_clusters = 12, random_state = None):
  """Cluster the document vectors of a Doc2Vec model with k-means.

  Args:
    model: Trained Doc2Vec model; its docvecs.vectors_docs array is clustered.
    n_clusters: Number of clusters to form. Defaults to 12.
    random_state: Optional seed forwarded to KMeans. Pass an int to get the
      same clustering on every run; None (the default) keeps the unseeded
      behaviour.

  Returns:
    The fitted KMeans estimator.
  """
  clusterer = KMeans(
    init = 'k-means++',
    n_clusters = n_clusters,
    random_state = random_state,
  )

  return clusterer.fit(model.docvecs.vectors_docs)

def plot_cluster_table(model, kmeans):
  """Show a two-column plotly table pairing each document tag with its cluster label.

  Args:
    model: Doc2Vec model; model.docvecs.offset2doctag supplies the first column.
    kmeans: Fitted clustering object; kmeans.labels_ supplies the second column.

  Returns:
    Whatever iplot returns for the rendered figure.
  """
  doc_vectors = model.docvecs

  table = go.Table(
    header = {'values': ['Document ID', 'Cluster ID']},
    cells = {'values': [doc_vectors.offset2doctag, kmeans.labels_]},
  )

  return iplot(go.Figure(data = [table]))

# Cluster the document vectors of each Doc2Vec model, using generate_prediction's default cluster count.
title_kmeans = generate_prediction(title_model)
title_and_abstract_kmeans = generate_prediction(title_and_abstract_model)

In [None]:
# Document-to-cluster table for the title-only model.
plot_cluster_table(title_model, title_kmeans)

In [None]:
# Document-to-cluster table for the title + abstract model.
plot_cluster_table(title_and_abstract_model, title_and_abstract_kmeans)

In [None]:
def create_mapping_dict(model, kmeans):
  """Map each document tag (converted to int64) to its k-means cluster label.

  Args:
    model: Doc2Vec model; tags are read from model.docvecs.offset2doctag.
    kmeans: Fitted clustering object; labels are read from kmeans.labels_.

  Returns:
    dict pairing tags and labels position by position.
  """
  import numpy as np

  # The tags must be int-convertible, since they are cast to int64 here.
  doc_ids = np.int64(model.docvecs.offset2doctag)

  return {doc_id: label for doc_id, label in zip(doc_ids, kmeans.labels_)}

# Document-id -> cluster-label lookups for both models.
title_cluster_mapping = create_mapping_dict(title_model, title_kmeans)
title_and_abstract_cluster_mapping = create_mapping_dict(title_and_abstract_model, title_and_abstract_kmeans)

In [None]:
from nltk.tokenize import word_tokenize

def find_most_similar_vectors(tagged_data, similar_to, model_name, create_model=False, topn=5):
  """Query a saved Doc2Vec model for the documents most similar to `similar_to`.

  Args:
    tagged_data: Tagged documents used to (re)train the model. Only read when
      create_model is True.
    similar_to: Either an int, which is passed unchanged to
      docvecs.most_similar, or a text string, which is tokenized and converted
      to an inferred vector first. Any other type is passed through unchanged.
    model_name: File name of the model; it is loaded from f'data/{model_name}'.
    create_model: When True, call create_and_train(tagged_data, model_name)
      before loading the model.
    topn: Number of neighbours requested from most_similar.

  Returns:
    List of 2-tuples as yielded by docvecs.most_similar (tag, similarity value).
  """
  if create_model:
    # Train on the caller-supplied documents.
    # NOTE(review): create_and_train is not defined in this notebook —
    # presumably it comes from one of the star imports; confirm.
    create_and_train(tagged_data, model_name)

  model = Doc2Vec.load(f'data/{model_name}')

  if isinstance(similar_to, str):
    # Free text: infer a vector for it and query with that vector.
    similar_to = [model.infer_vector(word_tokenize(similar_to))]
  # NOTE(review): documents in this notebook are tagged with str(paper['id']),
  # so an int query may be treated by gensim as a positional index rather than
  # as a paper id — confirm this is the intended lookup.

  similar_vectors = model.docvecs.most_similar(similar_to, topn = topn)

  return [(tag, value) for tag, value in similar_vectors]

#### Example: Most Similar Vectors

In [None]:
# Query the saved title model twice: once with free text and once with the int 1.
# With create_model = False the model is only loaded from 'data/title.model'; no cell in
# this notebook writes that file, so it presumably has to exist already — verify.
to_check = 'A Domain Specific Language based on Monads for Distributed Transactional Memory in Java'
cmp_string = find_most_similar_vectors(tagged_titles, to_check, 'title.model', topn = 10, create_model = False)
cmp_tag = find_most_similar_vectors(tagged_titles, 1, 'title.model', topn = 10, create_model = False)

## Topic Modeling

In [None]:
from topic_modeling import *

In [None]:
# Directory that the LDA helpers below use for their output paths.
# create_dir is not defined in this notebook — presumably provided by the star import from topic_modeling; confirm.
create_dir('./data/topic_modeling')

In [None]:
def papers_topic_modeling(tagged_data, field_name):
  """Run compute_lda_model on the word lists of the given tagged documents.

  Args:
    tagged_data: Iterable of objects exposing a `.words` attribute.
    field_name: Name appended to './data/topic_modeling/' to form the path
      handed to compute_lda_model.

  Returns:
    Whatever compute_lda_model returns.
  """
  token_lists = list(map(lambda doc: doc.words, tagged_data))
  output_path = f'./data/topic_modeling/{field_name}'

  return compute_lda_model(output_path, token_lists)
  
def display_topic_modeling(field_name):
  """Call display_lda_model for the path './data/topic_modeling/<field_name>'.

  Args:
    field_name: Path component below './data/topic_modeling/'.

  Returns:
    Whatever display_lda_model returns for that path.
  """
  model_path = f'./data/topic_modeling/{field_name}'

  return display_lda_model(model_path)

In [None]:
# Store the LDA result under its own name: rebinding `title_model` here would
# replace the Doc2Vec model that the clustering cells read, so re-running any
# of them after this cell would fail.
title_lda_model = papers_topic_modeling(tagged_titles, 'title')
pyLDAvis.display(display_topic_modeling('title'))

In [None]:
# Store the LDA result under its own name: rebinding `title_and_abstract_model`
# here would replace the Doc2Vec model that the clustering cells read, so
# re-running any of them after this cell would fail.
title_and_abstract_lda_model = papers_topic_modeling(tagged_titles_and_abstracts, 'title_and_abstract')
pyLDAvis.display(display_topic_modeling('title_and_abstract'))

## Accuracy of automated topic modeling

In [None]:
def compute_accuracy(tagged_data, field_name, manual_topics, n_runs = 10, top_n = 12):
  """Estimate how well LDA topic words match the manually chosen topic names.

  The LDA model is retrained n_runs times. In each run the top_n words of
  topics 0 and 1 are collected, and the share of distinct words that also
  occur in a manual topic name is computed. The mean share over all runs is
  returned as a percentage.

  Args:
    tagged_data: Tagged documents forwarded to papers_topic_modeling.
    field_name: Field name forwarded to papers_topic_modeling.
    manual_topics: Iterable of manual topic names (strings). Each name is
      lower-cased, stripped of '&' and split on spaces into words.
    n_runs: Number of LDA runs to average over. Defaults to 10.
    top_n: Number of words requested per topic. Defaults to 12.

  Returns:
    Mean percentage (0-100) of top topic words found among the manual topic words.

  Raises:
    ValueError: If n_runs is smaller than 1.
  """
  # numpy is not imported at notebook level, so bring it into scope here.
  import numpy as np

  if n_runs < 1:
    raise ValueError('n_runs must be at least 1')

  # Tokenize the manual topic names once instead of once per candidate word.
  manual_words = set()
  for name in manual_topics:
    manual_words.update(filter(None, name.lower().replace('&', '').split(' ')))

  run_scores = []

  for _ in range(n_runs):
    lda_model = papers_topic_modeling(tagged_data, field_name)

    # Collect distinct words only: show_topic yields (word, weight) pairs, and
    # a word shared by both topics must not be counted twice.
    words = {
      word
      for topic_id in (0, 1)
      for word, _weight in lda_model.show_topic(topic_id, top_n)
    }

    matches = sum(1 for word in words if word in manual_words)

    # Normalize by the number of candidate words so the score is a true
    # fraction and the final percentage cannot exceed 100.
    run_scores.append(matches / len(words) if words else 0.0)

  return np.mean(run_scores) * 100

In [None]:
# Score the title LDA topics against the manual topic names, taken from the column names of titles_df.
accuracy = compute_accuracy(tagged_titles, 'title', titles_df.columns)
print(f'Accuracy of automated topic modeling for titles: {round(accuracy, 4)}%')

## Topic modeling for the topics of the clusters

In [None]:
def cluster_topic_modeling(data, cluster_id, field_name):
  """Run compute_lda_model for one cluster.

  Args:
    data: Documents handed to compute_lda_model unchanged.
    cluster_id: Identifier used as the last component of the output path
      './data/topic_modeling/clusters/<cluster_id>'.
    field_name: Accepted but not used by this function.

  Returns:
    Whatever compute_lda_model returns.
  """
  cluster_path = f'./data/topic_modeling/clusters/{cluster_id}'

  return compute_lda_model(cluster_path, data)

def get_topics_of_cluster(cluster, field_name = 'title'):
  """Return the `field_name` value of every paper whose 'id' equals `cluster`.

  Args:
    cluster: Value compared against each paper's 'id'.
    field_name: Key read from each matching paper. Defaults to 'title'.

  Returns:
    List of the selected field values (empty when no paper matches).
  """
  # NOTE(review): this filters on the paper id, not on a cluster assignment —
  # confirm that comparing 'id' with `cluster` is really what is intended.
  return [paper[field_name] for paper in papers if paper['id'] == cluster]

def topic_modeling_for_cluster_topics(topic_cluster_mapping, field_name):
  """Run one LDA model per cluster over the cleaned `field_name` text of its papers.

  Args:
    topic_cluster_mapping: dict mapping a paper id to its cluster label.
    field_name: Key of the paper field whose text is cleaned and modeled
      (e.g. 'title').

  Returns:
    List with one compute_lda_model result per cluster, ordered by the first
    appearance of each cluster label in the mapping.

  Raises:
    KeyError: If the mapping contains an id that no paper has.
  """
  # Look papers up by their 'id' value; indexing the papers list with an id
  # only works when ids happen to equal list positions.
  papers_by_id = {int(paper['id']): paper for paper in papers}

  # Group the paper ids by cluster label.
  ids_by_cluster = {}
  for paper_id, cluster_label in topic_cluster_mapping.items():
    ids_by_cluster.setdefault(cluster_label, []).append(paper_id)

  res = []

  for cluster_label, paper_ids in ids_by_cluster.items():
    data = [clean(papers_by_id[int(paper_id)][field_name]) for paper_id in paper_ids]
    res.append(compute_lda_model(f'./data/topic_modeling/clusters/{cluster_label}', data))

  return res
    
# One LDA model per title cluster, built from the titles of the papers assigned to that cluster.
title_clusters_topic_modeling = topic_modeling_for_cluster_topics(title_cluster_mapping, 'title')

## Plot topics of a cluster

In [None]:
# Show the LDA visualisation for the path 'clusters/3', i.e. the output location used for cluster label 3.
pyLDAvis.display(display_topic_modeling('clusters/3'))