In [1]:
%load_ext autoreload
%autoreload 2

In [40]:
from collections import Counter
import glob
import os
import pickle
import re
import string
import time

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim.models.ldamulticore import LdaMulticore
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 16})

# Project layout is resolved relative to the kernel's working directory:
# the notebook directory's parent is taken as the project root.
NOTEBOOKS_DIR = os.path.abspath(os.getcwd())
ROOT_DIR = os.path.dirname(NOTEBOOKS_DIR)
PROCESSED_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

# CSV files expected under data/processed
FINAL_DF_FILEPATH, ML_ONLY_FILEPATH = (
    os.path.join(PROCESSED_DATA_DIR, csv_name)
    for csv_name in ('final.csv', 'machine_learning_only.csv')
)

In [7]:
# Load the processed CSV at ML_ONLY_FILEPATH.
df_ml = pd.read_csv(ML_ONLY_FILEPATH, encoding='utf-8')
# read_csv already yields a 0..n-1 RangeIndex; drop=True keeps the reset from
# copying that index into a redundant 'index' column of the frame.
df_ml = df_ml.reset_index(drop=True)

In [4]:
# TF-IDF representation of every paper description (English stop words removed).
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_ml = tfidf_vectorizer.fit_transform(df_ml['description'])

# Vocabulary terms, aligned with the columns of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    features = np.array(tfidf_vectorizer.get_feature_names_out(), dtype=str)
else:
    features = np.array(tfidf_vectorizer.get_feature_names(), dtype=str)

# Factorise the TF-IDF matrix into 10 topics:
#   W holds one row of topic loadings per document,
#   H holds one row of term loadings per topic.
nmf_model = NMF(n_components=10, random_state=42)
W = nmf_model.fit_transform(tfidf_ml)
H = nmf_model.components_

In [8]:
document_idx = 0

# Row of W for the chosen document: its loading on each topic
print(W[document_idx])

# Title and description of the same document
print(df_ml.at[document_idx, 'title'])
print(df_ml.at[document_idx, 'description'])

[0.01750071 0.03036693 0.         0.         0.02932531 0.00416817
 0.         0.00764464 0.         0.00019623]
A Unified SVM Framework for Signal Estimation
  This paper presents a unified framework to tackle estimation problems in
Digital Signal Processing (DSP) using Support Vector Machines (SVMs). The use
of SVMs in estimation problems has been traditionally limited to its mere use
as a black-box model. Noting such limitations in the literature, we take
advantage of several properties of Mercer's kernels and functional analysis to
develop a family of SVM methods for estimation in DSP. Three types of signal
model equations are analyzed. First, when a specific time-signal structure is
assumed to model the underlying system that generated the data, the linear
signal model (so called Primal Signal Model formulation) is first stated and
analyzed. Then, non-linear versions of the signal structure can be readily
developed by following two different approaches. On the one hand, the signal

In [34]:
list(tfidf_vectorizer.transform([df_ml['description'][document_idx]]).toarray()[0])[:10]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [18]:
H.shape

(10, 60744)

In [17]:
features

array(['00', '000', '00000', ..., 'zyody', 'zyx', 'zzstefan'],
      dtype='<U65')

### Create a dictionary where we can look up a word and get its loadings for each topic

In [19]:
# Pair each vocabulary term with its column of H (that term's loading on every
# topic). Iterating H.T walks the columns of H in vocabulary order.
word_topic_loading_lookup = dict(zip(features, H.T))

In [21]:
word_topic_loading_lookup['data']

array([3.42728368, 0.        , 0.        , 0.        , 0.03507072,
       0.01918562, 0.04993902, 0.        , 0.29730119, 0.        ])

In [26]:
# Vectorise the single document, then map its non-zero TF-IDF entries back to
# the vocabulary terms they stand for.
doc_tfidf = tfidf_vectorizer.transform([df_ml['description'][document_idx]])
document_words = tfidf_vectorizer.inverse_transform(doc_tfidf)[0]
document_words[:10]

array(['written', 'versions', 'vector', 'using', 'usefulness', 'used',
       'use', 'unified', 'underlying', 'types'], dtype='<U65')

In [33]:
# One line per term found in the document: the term, then its loading on each topic.
for term in document_words:
    rounded_loadings = np.round(word_topic_loading_lookup[term], 3)
    print(f'{term:17}: {rounded_loadings}')

written          : [0.008 0.007 0.006 0.003 0.003 0.    0.    0.018 0.001 0.   ]
versions         : [0.01  0.027 0.006 0.003 0.006 0.    0.003 0.003 0.006 0.024]
vector           : [0.126 0.106 0.    0.    0.    0.058 0.    0.129 0.002 0.092]
using            : [0.423 0.13  0.203 0.119 0.199 0.011 0.046 0.264 0.056 0.08 ]
usefulness       : [0.034 0.008 0.    0.006 0.015 0.    0.    0.011 0.01  0.005]
used             : [0.398 0.089 0.133 0.073 0.141 0.016 0.044 0.158 0.083 0.048]
use              : [0.286 0.05  0.125 0.133 0.15  0.015 0.029 0.156 0.017 0.04 ]
unified          : [0.022 0.042 0.004 0.013 0.02  0.022 0.005 0.034 0.018 0.001]
underlying       : [0.115 0.051 0.002 0.018 0.096 0.096 0.009 0.    0.046 0.058]
types            : [0.133 0.    0.048 0.008 0.021 0.044 0.027 0.054 0.017 0.003]
traditionally    : [0.02  0.002 0.008 0.011 0.003 0.002 0.    0.011 0.001 0.   ]
time             : [0.768 0.    0.235 0.069 0.104 0.017 0.    0.    0.024 0.311]
tackle           : [0.04  0.

In [51]:
# Swap every punctuation character for a space, lowercase, and split on
# whitespace so the description is walked token by token in reading order.
translation = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
paper_text = df_ml['description'][document_idx]
paper_words = paper_text.lower().translate(translation).split()
for token in paper_words:
    token_loadings = word_topic_loading_lookup.get(token)
    if token_loadings is None:
        # Token is not in the TF-IDF vocabulary: print it on its own.
        print(token)
        continue
    print(f'{token:17}: {np.round(token_loadings, 2)}')

this
paper            : [0.34 0.21 0.14 0.13 0.08 0.06 0.06 0.21 0.11 0.12]
presents         : [0.1  0.01 0.02 0.03 0.01 0.   0.   0.04 0.02 0.01]
a
unified          : [0.02 0.04 0.   0.01 0.02 0.02 0.   0.03 0.02 0.  ]
framework        : [0.27 0.16 0.01 0.22 0.18 0.14 0.08 0.16 0.04 0.01]
to
tackle           : [0.04 0.02 0.   0.02 0.   0.01 0.   0.05 0.   0.  ]
estimation       : [0.09 0.17 0.   0.   0.3  0.   0.   0.   0.01 0.16]
problems         : [0.13 0.65 0.   0.18 0.   0.04 0.   0.04 0.03 0.12]
in
digital          : [0.03 0.   0.01 0.   0.   0.   0.   0.01 0.   0.  ]
signal           : [0.08 0.14 0.03 0.   0.   0.09 0.   0.08 0.   0.03]
processing       : [0.16 0.03 0.09 0.   0.   0.03 0.   0.12 0.   0.  ]
dsp              : [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
using            : [0.42 0.13 0.2  0.12 0.2  0.01 0.05 0.26 0.06 0.08]
support          : [0.2  0.08 0.   0.   0.   0.   0.   0.03 0.   0.07]
vector           : [0.13 0.11 0.   0.   0.   0.06 0.   0.13 0.   0.09]
machines     

### Color each topic loading by intensity

In [131]:
# Brute-force preview of ANSI escape parameter pairs, to see which
# combinations render as usable colours in the notebook output.
for first_code in range(30, 100):
    for second_code in range(100):
        swatch = f"\033[0;{first_code};{second_code}m Color test"
        caption = f" -- ({first_code}, {second_code}) \n"
        print(swatch, caption)

[0;30;0m Color test  -- (30, 0) 

[0;30;1m Color test  -- (30, 1) 

[0;30;2m Color test  -- (30, 2) 

[0;30;3m Color test  -- (30, 3) 

[0;30;4m Color test  -- (30, 4) 

[0;30;5m Color test  -- (30, 5) 

[0;30;6m Color test  -- (30, 6) 

[0;30;7m Color test  -- (30, 7) 

[0;30;8m Color test  -- (30, 8) 

[0;30;9m Color test  -- (30, 9) 

[0;30;10m Color test  -- (30, 10) 

[0;30;11m Color test  -- (30, 11) 

[0;30;12m Color test  -- (30, 12) 

[0;30;13m Color test  -- (30, 13) 

[0;30;14m Color test  -- (30, 14) 

[0;30;15m Color test  -- (30, 15) 

[0;30;16m Color test  -- (30, 16) 

[0;30;17m Color test  -- (30, 17) 

[0;30;18m Color test  -- (30, 18) 

[0;30;19m Color test  -- (30, 19) 

[0;30;20m Color test  -- (30, 20) 

[0;30;21m Color test  -- (30, 21) 

[0;30;22m Color test  -- (30, 22) 

[0;30;23m Color test  -- (30, 23) 

[0;30;24m Color test  -- (30, 24) 

[0;30;25m Color test  -- (30, 25) 

[0;30;26m Color test  -- (30, 26) 

[0;30;27m Color test

[0;44;65m Color test  -- (44, 65) 

[0;44;66m Color test  -- (44, 66) 

[0;44;67m Color test  -- (44, 67) 

[0;44;68m Color test  -- (44, 68) 

[0;44;69m Color test  -- (44, 69) 

[0;44;70m Color test  -- (44, 70) 

[0;44;71m Color test  -- (44, 71) 

[0;44;72m Color test  -- (44, 72) 

[0;44;73m Color test  -- (44, 73) 

[0;44;74m Color test  -- (44, 74) 

[0;44;75m Color test  -- (44, 75) 

[0;44;76m Color test  -- (44, 76) 

[0;44;77m Color test  -- (44, 77) 

[0;44;78m Color test  -- (44, 78) 

[0;44;79m Color test  -- (44, 79) 

[0;44;80m Color test  -- (44, 80) 

[0;44;81m Color test  -- (44, 81) 

[0;44;82m Color test  -- (44, 82) 

[0;44;83m Color test  -- (44, 83) 

[0;44;84m Color test  -- (44, 84) 

[0;44;85m Color test  -- (44, 85) 

[0;44;86m Color test  -- (44, 86) 

[0;44;87m Color test  -- (44, 87) 

[0;44;88m Color test  -- (44, 88) 

[0;44;89m Color test  -- (44, 89) 

[0;44;90m Color test  -- (44, 90) 

[0;44;91m Color test  -- (44, 91) 




[0;57;71m Color test  -- (57, 71) 

[0;57;72m Color test  -- (57, 72) 

[0;57;73m Color test  -- (57, 73) 

[0;57;74m Color test  -- (57, 74) 

[0;57;75m Color test  -- (57, 75) 

[0;57;76m Color test  -- (57, 76) 

[0;57;77m Color test  -- (57, 77) 

[0;57;78m Color test  -- (57, 78) 

[0;57;79m Color test  -- (57, 79) 

[0;57;80m Color test  -- (57, 80) 

[0;57;81m Color test  -- (57, 81) 

[0;57;82m Color test  -- (57, 82) 

[0;57;83m Color test  -- (57, 83) 

[0;57;84m Color test  -- (57, 84) 

[0;57;85m Color test  -- (57, 85) 

[0;57;86m Color test  -- (57, 86) 

[0;57;87m Color test  -- (57, 87) 

[0;57;88m Color test  -- (57, 88) 

[0;57;89m Color test  -- (57, 89) 

[0;57;90m Color test  -- (57, 90) 

[0;57;91m Color test  -- (57, 91) 

[0;57;92m Color test  -- (57, 92) 

[0;57;93m Color test  -- (57, 93) 

[0;57;94m Color test  -- (57, 94) 

[0;57;95m Color test  -- (57, 95) 

[0;57;96m Color test  -- (57, 96) 

[0;57;97m Color test  -- (57, 97) 




[0;73;14m Color test  -- (73, 14) 

[0;73;15m Color test  -- (73, 15) 

[0;73;16m Color test  -- (73, 16) 

[0;73;17m Color test  -- (73, 17) 

[0;73;18m Color test  -- (73, 18) 

[0;73;19m Color test  -- (73, 19) 

[0;73;20m Color test  -- (73, 20) 

[0;73;21m Color test  -- (73, 21) 

[0;73;22m Color test  -- (73, 22) 

[0;73;23m Color test  -- (73, 23) 

[0;73;24m Color test  -- (73, 24) 

[0;73;25m Color test  -- (73, 25) 

[0;73;26m Color test  -- (73, 26) 

[0;73;27m Color test  -- (73, 27) 

[0;73;28m Color test  -- (73, 28) 

[0;73;29m Color test  -- (73, 29) 

[0;73;30m Color test  -- (73, 30) 

[0;73;31m Color test  -- (73, 31) 

[0;73;32m Color test  -- (73, 32) 

[0;73;33m Color test  -- (73, 33) 

[0;73;34m Color test  -- (73, 34) 

[0;73;35m Color test  -- (73, 35) 

[0;73;36m Color test  -- (73, 36) 

[0;73;37m Color test  -- (73, 37) 

[0;73;38m Color test  -- (73, 38) 

[0;73;39m Color test  -- (73, 39) 

[0;73;40m Color test  -- (73, 40) 



[0;86;17m Color test  -- (86, 17) 

[0;86;18m Color test  -- (86, 18) 

[0;86;19m Color test  -- (86, 19) 

[0;86;20m Color test  -- (86, 20) 

[0;86;21m Color test  -- (86, 21) 

[0;86;22m Color test  -- (86, 22) 

[0;86;23m Color test  -- (86, 23) 

[0;86;24m Color test  -- (86, 24) 

[0;86;25m Color test  -- (86, 25) 

[0;86;26m Color test  -- (86, 26) 

[0;86;27m Color test  -- (86, 27) 

[0;86;28m Color test  -- (86, 28) 

[0;86;29m Color test  -- (86, 29) 

[0;86;30m Color test  -- (86, 30) 

[0;86;31m Color test  -- (86, 31) 

[0;86;32m Color test  -- (86, 32) 

[0;86;33m Color test  -- (86, 33) 

[0;86;34m Color test  -- (86, 34) 

[0;86;35m Color test  -- (86, 35) 

[0;86;36m Color test  -- (86, 36) 

[0;86;37m Color test  -- (86, 37) 

[0;86;38m Color test  -- (86, 38) 

[0;86;39m Color test  -- (86, 39) 

[0;86;40m Color test  -- (86, 40) 

[0;86;41m Color test  -- (86, 41) 

[0;86;42m Color test  -- (86, 42) 

[0;86;43m Color test  -- (86, 43) 



In [156]:
# ANSI-coloured templates for a loading printed with two decimals.
# Each key names a loading band; the paired number is the ANSI background
# code used for that band (foreground is always 30). Every template ends
# with an escape sequence that resets the background.
d = {
    band: "\033[0;30;" + str(background) + "m {:.2f} \033[0;30;0m"
    for band, background in ((0.1, 47), (0.5, 46), (1, 43), (2, 45), (3, 41))
}

In [157]:
# Same token walk as the plain printout, but every loading is rendered through
# the colour template of the band it falls into.
translation = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
paper_words = df_ml['description'][document_idx].lower().translate(translation).split()

# (exclusive upper bound, key into d), tested in ascending order;
# anything at or above the last bound uses key 3.
loading_bands = ((0.1, 0.1), (0.5, 0.5), (1, 1), (2, 2))

for token in paper_words:
    token_loadings = word_topic_loading_lookup.get(token)
    if token_loadings is None:
        # Token is not in the TF-IDF vocabulary: print it uncoloured.
        print(token)
        continue
    cells = []
    for value in token_loadings:
        band_key = next((key for bound, key in loading_bands if value < bound), 3)
        cells.append(d[band_key].format(round(value, 2)))
    print(f'{token:17}:', *cells)

this
paper            : [0;30;46m 0.34 [0;30;0m [0;30;46m 0.21 [0;30;0m [0;30;46m 0.14 [0;30;0m [0;30;46m 0.13 [0;30;0m [0;30;47m 0.08 [0;30;0m [0;30;47m 0.06 [0;30;0m [0;30;47m 0.06 [0;30;0m [0;30;46m 0.21 [0;30;0m [0;30;46m 0.11 [0;30;0m [0;30;46m 0.12 [0;30;0m
presents         : [0;30;46m 0.10 [0;30;0m [0;30;47m 0.01 [0;30;0m [0;30;47m 0.02 [0;30;0m [0;30;47m 0.03 [0;30;0m [0;30;47m 0.01 [0;30;0m [0;30;47m 0.00 [0;30;0m [0;30;47m 0.00 [0;30;0m [0;30;47m 0.04 [0;30;0m [0;30;47m 0.02 [0;30;0m [0;30;47m 0.01 [0;30;0m
a
unified          : [0;30;47m 0.02 [0;30;0m [0;30;47m 0.04 [0;30;0m [0;30;47m 0.00 [0;30;0m [0;30;47m 0.01 [0;30;0m [0;30;47m 0.02 [0;30;0m [0;30;47m 0.02 [0;30;0m [0;30;47m 0.00 [0;30;0m [0;30;47m 0.03 [0;30;0m [0;30;47m 0.02 [0;30;0m [0;30;47m 0.00 [0;30;0m
framework        : [0;30;46m 0.27 [0;30;0m [0;30;46m 0.16 [0;30;0m [0;30;47m 0.01 [0;30;0m [0;30;46m 0.22 [0;30;0m [0;30;46m 0.18 [0;30;0m [0;30;

## Can we color the actual _word_ in the paragraph? Perhaps one topic at a time.

In [216]:
# ANSI-coloured templates that wrap a word itself (no padding around it).
# Each key names a loading band; the paired number is the ANSI background
# code used for that band (foreground is always 30).
word_color_mappings = {
    band: "\033[0;30;" + str(background) + "m{}\033[0;30;0m"
    for band, background in ((0.1, 47), (0.5, 46), (1, 43), (2, 45), (3, 41))
}

In [211]:
def get_colorful_words(topic_idx, paper_words, word_topic_loading_lookup, word_color_mappings):
    '''
    Colour each word according to its loading on a single topic.

    topic_idx: position of the topic inside each word's loading vector.
    paper_words: iterable of word tokens, in display order.
    word_topic_loading_lookup: mapping word -> sequence of per-topic loadings.
    word_color_mappings: mapping with keys 0.1, 0.5, 1, 2 and 3 whose values
        are format templates with one `{}` placeholder for the word.

    Returns a list with one entry per input word. Words missing from the
    lookup are passed through unchanged; the others are wrapped in the
    template of their band (< 0.1, < 0.5, < 1, < 2, otherwise key 3).
    '''
    # (exclusive upper bound, template key), tested in ascending order
    bands = ((0.1, 0.1), (0.5, 0.5), (1, 1), (2, 2))

    def colorize(token):
        loadings = word_topic_loading_lookup.get(token)
        if loadings is None:
            return token
        strength = loadings[topic_idx]
        template_key = next((key for bound, key in bands if strength < bound), 3)
        return word_color_mappings[template_key].format(token)

    return [colorize(token) for token in paper_words]

def print_article_colored_by_word_loadings(document_idx, topic_labels=None):
    '''
    Print one document's description once per topic, with each word coloured
    by its loading on that topic.

    document_idx: label of the row in the module-level `df_ml` frame.
    topic_labels: optional sequence of names indexed by topic; when given (and
        non-empty) the name is appended to each "TOPIC n" header.

    Reads the module-level `df_ml`, `word_topic_loading_lookup` and
    `word_color_mappings`; prints to stdout and returns nothing.
    '''
    translation = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    description = df_ml['description'][document_idx]
    paper_words = description.lower().translate(translation).split()
    divider = '*' * 70

    # Ten passes, matching the n_components=10 the NMF model is created with.
    for topic_idx in range(10):
        colorful_words = get_colorful_words(
            topic_idx, paper_words, word_topic_loading_lookup, word_color_mappings
        )
        header = f'TOPIC {topic_idx}'
        if topic_labels:
            header = f'TOPIC {topic_idx} - {topic_labels[topic_idx]}'
        # header, blank, coloured text, blank, divider, blank
        print(header, '', ' '.join(colorful_words), '', divider, '', sep='\n')

In [245]:
# Hand-assigned names for the topics. Position in the list is the topic index
# (repeated in the trailing comment on each entry); the list is zipped against
# a document's row of W when reporting per-topic percentages.
hand_labeled_features = [
    'machine learning / time series', #0
    'gradient / optimization / convergence', #1
    'neural networks / deep learning', #2
    'reinforcement learning', #3
    'variational bayesian', #4
    'graphs / graph ML', #5
    'ML attacks / GANs', #6
    'image / text / classification', #7
    'clustering', #8
    'algorithms / regret / optimization' #9
]

def softmax(v, temperature=1.0):
    '''
    A heuristic to convert arbitrary real-valued scores into probabilities.
    See: https://en.wikipedia.org/wiki/Softmax_function

    v: array-like of scores (a list or a numpy array).
    temperature: positive scale applied as v / temperature before
        exponentiating; smaller values sharpen the distribution.

    Returns a numpy array of the same shape as `v` whose entries sum to 1
    (an empty array is returned unchanged for empty input).
    '''
    scaled = np.asarray(v) / temperature
    if scaled.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return scaled
    # Shifting by the maximum leaves the result mathematically unchanged
    # (the common factor cancels) but keeps np.exp from overflowing to inf,
    # which would otherwise turn the output into NaN for large v / temperature.
    expv = np.exp(scaled - np.max(scaled))
    return expv / np.sum(expv)

def analyze_article(paper_idx, descriptions, titles, W, hand_labels, temperature=0.01):
    '''
    Print an analysis of a single paper: its title, its description, and a
    summary of how strongly it represents each topic.

    paper_idx: index of the paper in `descriptions`, `titles` and `W`.
    descriptions: indexable collection of paper descriptions.
    titles: indexable collection of paper titles.
    W: document-by-topic loading matrix; row `paper_idx` is used.
    hand_labels: topic names in topic order, assigned by hand by the user.
    temperature: softmax temperature used to turn the paper's topic loadings
        into percentages (default 0.01, the value previously hard-coded).

    Prints to stdout and returns nothing.
    '''
    print('Title:', titles[paper_idx])
    print()
    print('Description:\n', descriptions[paper_idx])
    # Softmax over this paper's loadings gives per-topic shares that sum to 1.
    probs = softmax(W[paper_idx], temperature=temperature)
    for prob, label in zip(probs, hand_labels):
        print('--> {:.2f}% {}'.format(prob * 100, label))
    print()

def get_document_report(document_idx, df_ml, topic_labels=None):
    '''
    Print a full report for one document: title, description and per-topic
    percentages, followed by the description repeated once per topic with
    words coloured by their loading on that topic.

    document_idx: label of the document's row in `df_ml`.
    df_ml: frame with 'description' and 'title' columns.
    topic_labels: optional topic names in topic order. When given they label
        both the percentage summary and the coloured sections; when omitted
        (or empty) the summary falls back to the module-level
        `hand_labeled_features` and the coloured sections are unlabelled.

    Reads the module-level `W`; prints to stdout and returns nothing.
    '''
    # Previously the summary always used the global labels and silently
    # ignored the caller's topic_labels.
    summary_labels = topic_labels if topic_labels else hand_labeled_features
    analyze_article(document_idx, df_ml['description'], df_ml['title'], W, summary_labels)

    print()
    print('*' * 70)
    print()

    # print paragraphs, one for each topic, words colored by topic loadings
    print_article_colored_by_word_loadings(document_idx, topic_labels=topic_labels)

In [246]:
# This is one of the research papers that is most directionally aligned with only topics 3 and 5
get_document_report(23856, df_ml, topic_labels=hand_labeled_features)

Title: Graph Convolutional Reinforcement Learning

Description:
   Learning to cooperate is crucially important in multi-agent environments. The
key is to understand the mutual interplay between agents. However, multi-agent
environments are highly dynamic, which makes it hard to learn abstract
representations of their mutual interplay. To tackle these difficulties, we
propose graph convolutional reinforcement learning, where graph convolution
adapts to the dynamics of the underlying graph of the multi-agent environment,
and relation kernels capture the interplay between agents by their relation
representations. Latent features produced by convolutional layers from
gradually increased receptive fields are exploited to learn cooperation, and
cooperation is further boosted by temporal relation regularization for
consistency. Empirically, we show that our method substantially outperforms
existing methods in a variety of cooperative scenarios.

--> 0.21% machine learning / time series
--> 0

In [184]:
# This is one of the research papers that is most directionally aligned with only topic 7
get_document_report(21108, df_ml)

[0.         0.         0.         0.         0.         0.
 0.         0.07293617 0.         0.        ]
Unsupervised Domain Adaptation of Contextualized Embeddings for Sequence
  Labeling
  Contextualized word embeddings such as ELMo and BERT provide a foundation for
strong performance across a wide range of natural language processing tasks by
pretraining on large corpora of unlabeled text. However, the applicability of
this approach is unknown when the target domain varies substantially from the
pretraining corpus. We are specifically interested in the scenario in which
labeled data is available in only a canonical source domain such as newstext,
and the target domain is distinct from both the labeled and pretraining texts.
To address this scenario, we propose domain-adaptive fine-tuning, in which the
contextualized embeddings are adapted by masked language modeling on text from
the target domain. We test this approach on sequence labeling in two
challenging domains: Early Modern En

In [189]:
# This is the research paper with the highest summed topic loadings
get_document_report(25878, df_ml)

[0.         0.         0.00882468 0.0104415  0.         0.1303849
 0.00109208 0.0049476  0.10071876 0.        ]
Attributed Graph Clustering: A Deep Attentional Embedding Approach
  Graph clustering is a fundamental task which discovers communities or groups
in networks. Recent studies have mostly focused on developing deep learning
approaches to learn a compact graph embedding, upon which classic clustering
methods like k-means or spectral clustering algorithms are applied. These
two-step frameworks are difficult to manipulate and usually lead to suboptimal
performance, mainly because the graph embedding is not goal-directed, i.e.,
designed for the specific clustering task. In this paper, we propose a
goal-directed deep learning approach, Deep Attentional Embedded Graph
Clustering (DAEGC for short). Our method focuses on attributed graphs to
sufficiently explore the two sides of information in graphs. By employing an
attention network to capture the importance of the neighboring nodes 