In [1]:
# Enable the IPython autoreload extension so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload modules automatically before running each cell.
%autoreload 2

In [4]:
from collections import Counter
import glob
import os
import pickle
import re

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim.models.ldamulticore import LdaMulticore
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

import matplotlib.pyplot as plt
# Use a larger default font for every figure drawn in this notebook.
plt.rcParams['font.size'] = 16

# Directory layout is derived from the current working directory: the
# project root is taken to be its parent, with processed data under
# <root>/data/processed.
NOTEBOOKS_DIR = os.path.abspath(os.getcwd())
ROOT_DIR = os.path.dirname(NOTEBOOKS_DIR)
PROCESSED_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

# Full paths of the two processed CSV files.
FINAL_DF_FILEPATH, MACHINE_LEARNING_ONLY_FILEPATH = (
    os.path.join(PROCESSED_DATA_DIR, csv_name)
    for csv_name in ('final.csv', 'machine_learning_only.csv')
)

# Read in data

In [7]:
# Load machine_learning_only.csv from the processed-data directory.
df = pd.read_csv(
    MACHINE_LEARNING_ONLY_FILEPATH,
    encoding='utf-8',
)

In [8]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(48564, 8)

In [28]:
# Pick one record by its identifier and use its description text as the
# worked example for the rest of the notebook. A single .loc lookup selects
# the matching rows and the column together; .iloc[0] takes the first match.
sample_description = df.loc[
    df['identifier'] == 'oai:arXiv.org:1609.09188',
    'description',
].iloc[0]
print(sample_description)

  Academic researchers often need to face with a large collection of research
papers in the literature. This problem may be even worse for postgraduate
students who are new to a field and may not know where to start. To address
this problem, we have developed an online catalog of research papers where the
papers have been automatically categorized by a topic model. The catalog
contains 7719 papers from the proceedings of two artificial intelligence
conferences from 2000 to 2015. Rather than the commonly used Latent Dirichlet
Allocation, we use a recently proposed method called hierarchical latent tree
analysis for topic modeling. The resulting topic model contains a hierarchy of
topics so that users can browse the topics from the top level to the bottom
level. The topic model contains a manageable number of general topics at the
top level and allows thousands of fine-grained topics at the bottom level. It
also can detect topics that have emerged recently.



In [46]:
# Paths of the two pickled artifacts under <ROOT_DIR>/models
# (presumably a fitted TF-IDF vectorizer and a 10-topic NMF model, going by
# the file names -- confirm against the training notebook).
vectorizer_filename = os.path.join(ROOT_DIR, 'models', 'vectorizer_tfidf.pkl')
model_filename = os.path.join(ROOT_DIR, 'models', 'nmf_10_model.pkl')

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files that this project produced itself.
with open(vectorizer_filename, mode='rb') as f:
    vectorizer = pickle.load(f)

with open(model_filename, mode='rb') as f:
    model = pickle.load(f)

In [39]:
# Vectorise the single sample document, densify the result, and keep its
# only row as a 1-D array.
sample_tfidf_vector = vectorizer.transform([sample_description]).toarray()[0]

In [43]:
# Show only the strictly positive entries of the sample vector.
print(np.extract(sample_tfidf_vector > 0, sample_tfidf_vector))

[0.10975632 0.09190316 0.1685538  0.10435482 0.05620574 0.08639332
 0.05379542 0.04263916 0.06859368 0.0658737  0.14410896 0.05462832
 0.24535656 0.11689494 0.07520069 0.07020213 0.1343602  0.24019797
 0.071818   0.06077201 0.08701759 0.08938843 0.08061338 0.0641298
 0.07478367 0.05026622 0.09161293 0.06796482 0.09019584 0.07894361
 0.09123438 0.04048717 0.11634296 0.21804503 0.06300366 0.12508145
 0.03376272 0.0891519  0.05716179 0.06000553 0.03719288 0.04164689
 0.05881559 0.375308   0.15463678 0.06822834 0.12644252 0.03500645
 0.10722751 0.11067416 0.07596778 0.06104489 0.08540671 0.10238222
 0.08680664 0.29989887 0.43815856 0.07023727 0.04040139 0.03736698
 0.06578595 0.09360687]


In [44]:
# Display the full dense vector; NumPy's repr elides the middle, so only the
# zeros at both ends are visible.
sample_tfidf_vector

array([0., 0., 0., ..., 0., 0., 0.])

In [45]:
# Illustration only: this prints a hand-written string literal, NOT a value
# computed from sample_tfidf_vector. It sketches the vector as mostly zeros
# with a few small positive weights, and will not change if the data or
# vectorizer change.
print("[0., 0., 0., ..., 0.11, 0.09, 0.17, ..., 0., 0., 0.]")

[0., 0., 0., ..., 0.11, 0.09, 0.17, ..., 0., 0., 0.]


In [56]:
# model.transform expects a 2-D input, so add a leading axis to turn the 1-D
# sample into a single-row matrix, then keep that row's output.
sample_nmf_topic_loadings = model.transform(sample_tfidf_vector[np.newaxis, :])[0]
# Display the loadings rounded to three decimals.
np.round(sample_nmf_topic_loadings, 3)

array([0.004, 0.   , 0.   , 0.001, 0.029, 0.   , 0.   , 0.015, 0.003,
       0.002])

In [57]:
# Illustration only: this prints a hand-written string literal, NOT a value
# computed from sample_nmf_topic_loadings. It will not change if the model
# or the sample document change.
print("[0.004, 0.   , 0.   , 0.001, 0.029, 0.   , 0.   , 0.015, 0.003, 0.002]")

[0.004, 0.   , 0.   , 0.001, 0.029, 0.   , 0.   , 0.015, 0.003, 0.002]
