In [1]:
%load_ext autoreload
%autoreload 2

In [16]:
from collections import Counter
import glob
import os
import pickle
import re
import time

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim.models.ldamulticore import LdaMulticore
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 16})

NOTEBOOKS_DIR = os.path.abspath(os.getcwd())
ROOT_DIR = os.path.split(NOTEBOOKS_DIR)[0]
PROCESSED_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

FINAL_DF_FILEPATH = os.path.join(PROCESSED_DATA_DIR, 'final.csv')
ML_ONLY_FILEPATH = os.path.join(PROCESSED_DATA_DIR, 'machine_learning_only.csv')

In [3]:
df_ml = pd.read_csv(ML_ONLY_FILEPATH, encoding='utf-8')

In [13]:
def print_influential_words_per_topic(H, vocabulary):
    '''
    Print the most influential words of each latent topic.
    '''
    hand_labels = []
    for i, row in enumerate(H):
        top_ten = np.argsort(row)[::-1][:10]
        print('topic', i)
        print('-->', ', '.join(vocabulary[top_ten]))
        print(H[i, top_ten])
        print()

In [6]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf_ml = tfidf_vectorizer.fit_transform(df_ml['description'])
features = np.array(tfidf_vectorizer.get_feature_names())
nmf_model = NMF(n_components=10, random_state=42)
W = nmf_model.fit_transform(tfidf_ml)
H = nmf_model.components_

In [7]:
features.shape

(1937107,)

In [8]:
W.shape

(48564, 10)

In [9]:
H.shape

(10, 1937107)

In [14]:
print_influential_words_per_topic(H, features)

topic 0
--> image, task, domain, model, features, classification, images, tasks, method, language
[0.78582865 0.66507851 0.65863228 0.62396726 0.59232523 0.55475786
 0.54615639 0.51782957 0.50370434 0.48367955]

topic 1
--> algorithm, gradient, optimization, convex, stochastic, convergence, algorithms, function, problem, problems
[0.68515299 0.65678419 0.62609002 0.58926665 0.55681618 0.53772341
 0.4500454  0.42127649 0.41929625 0.38676054]

topic 2
--> neural, networks, network, neural networks, deep, neural network, deep neural, training, convolutional, layer
[1.43846292 1.31056672 1.18750761 0.95406205 0.94484623 0.67429993
 0.4853477  0.46451645 0.41949394 0.41464689]

topic 3
--> policy, reinforcement, reinforcement learning, learning, agent, rl, reward, policies, agents, control
[0.98907337 0.79882513 0.78564878 0.78422828 0.58821074 0.52339129
 0.40045495 0.38715526 0.37969716 0.37061776]

topic 4
--> adversarial, attacks, adversarial examples, examples, attack, robustness, pert

# n-grams

In [17]:
start = time.time()
tfidf_vectorizer_2gram_1000 = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
tfidf_ml_2gram_1000 = tfidf_vectorizer_2gram_1000.fit_transform(df_ml['description'])
print('ngram_tfidf_ml:', time.time() - start)

features_2gram_1000 = np.array(tfidf_vectorizer_2gram_1000.get_feature_names())
print('features:', time.time() - start)

nmf_model_2gram_1000 = NMF(n_components=10, random_state=42)
print('nfm_model:', time.time() - start)

W_2gram_1000 = nmf_model_2gram_1000.fit_transform(tfidf_ml_2gram_1000)
print('W_ngram:', time.time() - start)

H_2gram_1000 = nmf_model_2gram_1000.components_
print('H_ngram:', time.time() - start)

ngram_tfidf_ml: 19.909547090530396
features: 19.91393804550171
nfm_model: 19.914159297943115
W_ngram: 24.656283140182495
H_ngram: 24.65682816505432


In [18]:
print_influential_words_per_topic(H_2gram_1000, features_2gram_1000)

topic 0
--> features, classification, image, feature, task, method, images, dataset, art, based
[1.48551153 1.48294038 1.34095394 1.18026789 1.14577577 1.04819745
 0.98955816 0.9442723  0.9205665  0.91896229]

topic 1
--> neural, networks, network, neural networks, deep, neural network, deep neural, training, convolutional, layer
[2.23273679 1.97466684 1.93633016 1.41595231 1.32358887 1.09753677
 0.70740265 0.63769829 0.61314055 0.5745794 ]

topic 2
--> gradient, stochastic, optimization, convergence, convex, descent, gradient descent, stochastic gradient, method, sgd
[1.96864527 1.40598325 1.38078347 1.16963396 1.10820961 1.0512812
 0.91541112 0.73387071 0.71381813 0.70826078]

topic 3
--> policy, reinforcement, reinforcement learning, learning, agent, rl, agents, reward, policies, control
[1.67296812 1.43572467 1.41010821 1.27256649 1.0849047  0.87955188
 0.71826246 0.68722938 0.68086033 0.66274452]

topic 4
--> model, models, inference, latent, bayesian, variational, variables, dist

# 3-grams

In [19]:
start = time.time()
tfidf_vectorizer_3gram_1000 = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
tfidf_ml_3gram_1000 = tfidf_vectorizer_3gram_1000.fit_transform(df_ml['description'])
print('ngram_tfidf_ml:', time.time() - start)

features_3gram_1000 = np.array(tfidf_vectorizer_3gram_1000.get_feature_names())
print('features:', time.time() - start)

nmf_model_3gram_1000 = NMF(n_components=10, random_state=42)
print('nfm_model:', time.time() - start)

W_3gram_1000 = nmf_model_3gram_1000.fit_transform(tfidf_ml_3gram_1000)
print('W_ngram:', time.time() - start)

H_3gram_1000 = nmf_model_3gram_1000.components_
print('H_ngram:', time.time() - start)

ngram_tfidf_ml: 18.723983764648438
features: 18.728272914886475
nfm_model: 18.728781938552856
W_ngram: 23.996538877487183
H_ngram: 23.996723890304565


In [20]:
print_influential_words_per_topic(H_3gram_1000, features_3gram_1000)

topic 0
--> features, classification, image, feature, task, method, images, dataset, art, based
[1.48551153 1.48294038 1.34095394 1.18026789 1.14577577 1.04819745
 0.98955816 0.9442723  0.9205665  0.91896229]

topic 1
--> neural, networks, network, neural networks, deep, neural network, deep neural, training, convolutional, layer
[2.23273679 1.97466684 1.93633016 1.41595231 1.32358887 1.09753677
 0.70740265 0.63769829 0.61314055 0.5745794 ]

topic 2
--> gradient, stochastic, optimization, convergence, convex, descent, gradient descent, stochastic gradient, method, sgd
[1.96864527 1.40598325 1.38078347 1.16963396 1.10820961 1.0512812
 0.91541112 0.73387071 0.71381813 0.70826078]

topic 3
--> policy, reinforcement, reinforcement learning, learning, agent, rl, agents, reward, policies, control
[1.67296812 1.43572467 1.41010821 1.27256649 1.0849047  0.87955188
 0.71826246 0.68722938 0.68086033 0.66274452]

topic 4
--> model, models, inference, latent, bayesian, variational, variables, dist

It looks like we're losing some information by only using 1000 features. For the full model, I'll probably want to remove that.

# only 2-grams (no single words)

In [21]:
start = time.time()
tfidf_vectorizer_2_only_gram_1000 = TfidfVectorizer(stop_words='english', ngram_range=(2, 2), max_features=1000)
tfidf_ml_2_only_gram_1000 = tfidf_vectorizer_2_only_gram_1000.fit_transform(df_ml['description'])
print('ngram_tfidf_ml:', time.time() - start)

features_2_only_gram_1000 = np.array(tfidf_vectorizer_2_only_gram_1000.get_feature_names())
print('features:', time.time() - start)

nmf_model_2_only_gram_1000 = NMF(n_components=10, random_state=42)
print('nfm_model:', time.time() - start)

W_2_only_gram_1000 = nmf_model_2_only_gram_1000.fit_transform(tfidf_ml_2_only_gram_1000)
print('W_ngram:', time.time() - start)

H_2_only_gram_1000 = nmf_model_2_only_gram_1000.components_
print('H_ngram:', time.time() - start)

ngram_tfidf_ml: 18.301547050476074
features: 18.305480003356934
nfm_model: 18.30560302734375
W_ngram: 19.193109035491943
H_ngram: 19.19334626197815


In [22]:
print_influential_words_per_topic(H_2_only_gram_1000, features_2_only_gram_1000)

topic 0
--> neural networks, deep neural, convolutional neural, recurrent neural, networks dnns, adversarial examples, networks cnns, artificial neural, cifar 10, training deep
[6.12176074 2.42153113 0.93374954 0.61003035 0.56534016 0.49000115
 0.4570256  0.30042007 0.28475252 0.28236073]

topic 1
--> machine learning, learning algorithms, learning models, learning techniques, learning ml, learning methods, learning model, learning systems, learning algorithm, using machine
[4.15484352 0.60839516 0.51489602 0.3769146  0.34104413 0.28882068
 0.16535568 0.16245515 0.15866517 0.15314164]

topic 2
--> state art, paper propose, art methods, propose novel, experimental results, proposed method, art performance, art results, outperforms state, semi supervised
[3.81160578 0.91408228 0.77320177 0.75042212 0.6823414  0.66813624
 0.66683694 0.57154162 0.54240953 0.53624966]

topic 3
--> deep learning, learning models, learning based, learning model, learning methods, learning techniques, using de

# Only 3-grams

In [25]:
start = time.time()
tfidf_vectorizer_3_only_gram_1000 = TfidfVectorizer(stop_words='english', ngram_range=(3, 3), max_features=1000)
tfidf_ml_3_only_gram_1000 = tfidf_vectorizer_3_only_gram_1000.fit_transform(df_ml['description'])
print('ngram_tfidf_ml:', time.time() - start)

features_3_only_gram_1000 = np.array(tfidf_vectorizer_3_only_gram_1000.get_feature_names())
print('features:', time.time() - start)

nmf_model_3_only_gram_1000 = NMF(n_components=10, random_state=42)
print('nfm_model:', time.time() - start)

W_3_only_gram_1000 = nmf_model_3_only_gram_1000.fit_transform(tfidf_ml_3_only_gram_1000)
print('W_ngram:', time.time() - start)

H_3_only_gram_1000 = nmf_model_3_only_gram_1000.components_
print('H_ngram:', time.time() - start)

ngram_tfidf_ml: 28.08918309211731
features: 28.097098112106323
nfm_model: 28.097248077392578
W_ngram: 28.749913215637207
H_ngram: 28.752339124679565


In [26]:
print_influential_words_per_topic(H_3_only_gram_1000, features_3_only_gram_1000)

topic 0
--> deep neural networks, neural networks dnns, training deep neural, neural networks dnn, using deep neural, state art performance, learning deep neural, based deep neural, mnist cifar 10, neural networks trained
[4.84470172 1.18548741 0.39347682 0.25046856 0.16658612 0.11803193
 0.11452266 0.09864378 0.09585715 0.09381212]

topic 1
--> convolutional neural networks, neural networks cnns, deep convolutional neural, neural networks cnn, state art performance, state art results, based convolutional neural, graph convolutional neural, achieve state art, using convolutional neural
[4.21062478 1.72590409 0.70024894 0.45661084 0.22809815 0.14496153
 0.1151631  0.10922772 0.10885449 0.09574735]

topic 2
--> convolutional neural network, neural network cnn, deep convolutional neural, neural network architecture, neural network trained, based convolutional neural, recurrent neural network, deep learning based, using convolutional neural, neural network model
[4.41966986 2.14260944 0.96