In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import Counter
import glob
import os
import pickle
import re

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim.models.ldamulticore import LdaMulticore
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 16})

NOTEBOOKS_DIR = os.path.abspath(os.getcwd())
ROOT_DIRECTORY = os.path.split(NOTEBOOKS_DIR)[0]
DATA_DIRECTORY_RAW = os.path.join(ROOT_DIRECTORY, 'data', 'raw')
DATA_DIRECTORY_PROCESSED = os.path.join(ROOT_DIRECTORY, 'data', 'processed')
DATA_DIRECTORY_PROCESSED_DFS = os.path.join(ROOT_DIRECTORY, 'data', 'processed', 'dfs')
MODELS_DIRECTORY = os.path.join(ROOT_DIRECTORY, 'models')

FINAL_DF_FILEPATH = os.path.join(DATA_DIRECTORY_PROCESSED, 'final.csv')
ML_ONLY_FILEPATH = os.path.join(DATA_DIRECTORY_PROCESSED, 'machine_learning_only.csv')

In [3]:
df = pd.read_csv(ML_ONLY_FILEPATH)

In [4]:
n_components = 10
model_filename = os.path.join(MODELS_DIRECTORY, f'nmf_{n_components}_model.pkl')
vectorizer_filename = os.path.join(MODELS_DIRECTORY, f'vectorizer_tfidf.pkl')
weights_filename = os.path.join(MODELS_DIRECTORY, f'nmf_{n_components}_weights_W.pkl')

print('loading model')
with open(model_filename, 'rb') as f:
    nmf_model = pickle.load(f)

print('loading vectorizer')
with open(vectorizer_filename, 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

print('loading weights')
with open(weights_filename, 'rb') as f:
    W = pickle.load(f)

loading model
loading vectorizer
loading weights


# Testing using gensim

In [5]:
from gensim.test.utils import common_corpus, common_dictionary, common_texts
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.nmf import Nmf
from gensim.corpora.dictionary import Dictionary

model = LdaModel(common_corpus, 5, common_dictionary)

cm = CoherenceModel(model=model, corpus=common_corpus, coherence='u_mass')
coherence = cm.get_coherence()  # get coherence value

In [6]:
coherence

-14.678142504651342

In [7]:
common_corpus

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]

In [8]:
model

<gensim.models.ldamodel.LdaModel at 0x102f36c10>

In [9]:
model.print_topics()

[(0,
  '0.150*"user" + 0.103*"system" + 0.103*"computer" + 0.103*"response" + 0.103*"interface" + 0.103*"time" + 0.057*"trees" + 0.056*"graph" + 0.056*"survey" + 0.056*"minors"'),
 (1,
  '0.346*"trees" + 0.060*"graph" + 0.060*"minors" + 0.060*"interface" + 0.060*"time" + 0.059*"human" + 0.059*"system" + 0.059*"survey" + 0.059*"eps" + 0.059*"user"'),
 (2,
  '0.341*"system" + 0.186*"eps" + 0.186*"human" + 0.033*"trees" + 0.032*"graph" + 0.032*"survey" + 0.032*"minors" + 0.032*"interface" + 0.032*"user" + 0.032*"time"'),
 (3,
  '0.296*"graph" + 0.162*"survey" + 0.161*"minors" + 0.161*"trees" + 0.028*"human" + 0.028*"system" + 0.028*"user" + 0.028*"time" + 0.028*"response" + 0.028*"eps"'),
 (4,
  '0.087*"trees" + 0.084*"graph" + 0.083*"system" + 0.083*"minors" + 0.083*"time" + 0.083*"eps" + 0.083*"human" + 0.083*"user" + 0.083*"response" + 0.083*"interface"')]

In [10]:
list(common_dictionary.items())

[(0, 'computer'),
 (1, 'human'),
 (2, 'interface'),
 (3, 'response'),
 (4, 'survey'),
 (5, 'system'),
 (6, 'time'),
 (7, 'user'),
 (8, 'eps'),
 (9, 'trees'),
 (10, 'graph'),
 (11, 'minors')]

In [11]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [12]:
sorted(simple_preprocess(df.iloc[0]['description']))[:10]

['advantage',
 'algorithms',
 'algorithms',
 'all',
 'alternative',
 'an',
 'analysis',
 'analysis',
 'analyzed',
 'analyzed']

# Tokenize text

In [13]:
processed_text = df['description'].map(simple_preprocess)

In [14]:
processed_text

0        [this, paper, presents, unified, framework, to...
1        [multitask, learning, can, be, effective, when...
2        [we, investigate, generic, problem, of, learni...
3        [novel, unified, bayesian, framework, for, net...
4        [this, work, considers, an, estimation, task, ...
                               ...                        
48559    [maximum, posteriori, map, inference, over, di...
48560    [this, paper, is, survey, of, dictionary, scre...
48561    [the, problem, of, secure, friend, discovery, ...
48562    [two, complementary, approaches, have, been, e...
48563    [this, monograph, presents, the, main, complex...
Name: description, Length: 48564, dtype: object

# Create dictionary and corpus, then train NMF model

In [15]:
# Create a corpus from a list of texts
ml_dictionary = Dictionary(processed_text)
ml_corpus = [ml_dictionary.doc2bow(text) for text in processed_text]

In [16]:
# Train the model on the corpus.
nmf = Nmf(ml_corpus, 10, ml_dictionary)

In [17]:
Nmf?

In [18]:
ml_corpus[0]

[(0, 1),
 (1, 2),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 2),
 (6, 2),
 (7, 8),
 (8, 2),
 (9, 2),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 2),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 4),
 (22, 1),
 (23, 2),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 2),
 (33, 2),
 (34, 1),
 (35, 4),
 (36, 1),
 (37, 1),
 (38, 2),
 (39, 1),
 (40, 4),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 2),
 (45, 1),
 (46, 4),
 (47, 3),
 (48, 2),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 2),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 2),
 (59, 1),
 (60, 10),
 (61, 1),
 (62, 1),
 (63, 5),
 (64, 1),
 (65, 1),
 (66, 2),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 6),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 2),
 (75, 1),
 (76, 1),
 (77, 3),
 (78, 9),
 (79, 1),
 (80, 4),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 1),
 (85, 1),
 (86, 12),
 (87, 2),
 (88, 1),
 (89, 1),
 (90, 1),
 (91, 1),
 (92, 1),
 (93, 1),
 (94, 1),
 (95, 3),
 (96, 2),
 (97, 1),
 (98, 1),
 (99, 2),
 (100, 1

# Trying LDA

In [19]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import preprocess_documents, preprocess_string

In [20]:
preprocess_documents([df.iloc[0]['description']])[0][:10]

['paper',
 'present',
 'unifi',
 'framework',
 'tackl',
 'estim',
 'problem',
 'digit',
 'signal',
 'process']

In [21]:
remove_stopwords(df.iloc[0]['description'])

"This paper presents unified framework tackle estimation problems Digital Signal Processing (DSP) Support Vector Machines (SVMs). The use SVMs estimation problems traditionally limited mere use black-box model. Noting limitations literature, advantage properties Mercer's kernels functional analysis develop family SVM methods estimation DSP. Three types signal model equations analyzed. First, specific time-signal structure assumed model underlying generated data, linear signal model (so called Primal Signal Model formulation) stated analyzed. Then, non-linear versions signal structure readily developed following different approaches. On hand, signal model equation written reproducing kernel Hilbert spaces (RKHS) well-known RKHS Signal Model formulation, Mercer's kernels readily SVM non-linear algorithms. On hand, alternative common Dual Signal Model formulation, signal expansion auxiliary signal model equation given non-linear regression time instant observed time series. These building

In [22]:
remove_stopwords("A a The the. The big".lower())

'the. big'

In [23]:
df.iloc[0]['description']

"  This paper presents a unified framework to tackle estimation problems in\nDigital Signal Processing (DSP) using Support Vector Machines (SVMs). The use\nof SVMs in estimation problems has been traditionally limited to its mere use\nas a black-box model. Noting such limitations in the literature, we take\nadvantage of several properties of Mercer's kernels and functional analysis to\ndevelop a family of SVM methods for estimation in DSP. Three types of signal\nmodel equations are analyzed. First, when a specific time-signal structure is\nassumed to model the underlying system that generated the data, the linear\nsignal model (so called Primal Signal Model formulation) is first stated and\nanalyzed. Then, non-linear versions of the signal structure can be readily\ndeveloped by following two different approaches. On the one hand, the signal\nmodel equation is written in reproducing kernel Hilbert spaces (RKHS) using the\nwell-known RKHS Signal Model formulation, and Mercer's kernels ar

In [24]:
processed_text = preprocess_documents(df['description'])

In [25]:
processed_text[0][:10]

['paper',
 'present',
 'unifi',
 'framework',
 'tackl',
 'estim',
 'problem',
 'digit',
 'signal',
 'process']

In [26]:
# Create a corpus from a list of texts
ml_dictionary = Dictionary(processed_text)
ml_corpus = [ml_dictionary.doc2bow(text) for text in processed_text]

# Looking at different number of topics

In [27]:
model = LdaModel(ml_corpus, 4, ml_dictionary)
for i in range(4):
    print(f"topic {i}")
    print(sorted([x[0] for x in model.show_topic(i)]))
    print()

topic 0
['algorithm', 'deep', 'learn', 'model', 'network', 'neural', 'perform', 'propos', 'task', 'train']

topic 1
['data', 'featur', 'gener', 'imag', 'learn', 'method', 'model', 'network', 'propos', 'train']

topic 2
['algorithm', 'data', 'distribut', 'estim', 'function', 'method', 'model', 'optim', 'problem', 'propos']

topic 3
['approach', 'base', 'data', 'learn', 'machin', 'model', 'predict', 'propos', 'time', 'user']



In [87]:
model = LdaModel(ml_corpus, 10, ml_dictionary)

In [88]:
cm = CoherenceModel(model=model, corpus=ml_corpus, coherence='u_mass')
coherence = cm.get_coherence()  # get coherence value

In [89]:
coherence

-1.680781506020875

In [97]:
for i in range(10):
    print(f"topic {i}")
    print(sorted([x[0] for x in model.show_topic(i)]))
    print()

topic 0
['adversari', 'data', 'gener', 'imag', 'kernel', 'learn', 'model', 'network', 'propos', 'train']

topic 1
['approach', 'base', 'classif', 'data', 'dataset', 'featur', 'imag', 'method', 'model', 'propos']

topic 2
['approach', 'bayesian', 'data', 'infer', 'learn', 'method', 'model', 'predict', 'propos', 'variabl']

topic 3
['approach', 'data', 'domain', 'learn', 'method', 'model', 'perform', 'propos', 'task', 'train']

topic 4
['algorithm', 'cluster', 'data', 'distribut', 'estim', 'measur', 'probabl', 'problem', 'sampl', 'set']

topic 5
['algorithm', 'data', 'dimension', 'function', 'graph', 'matrix', 'method', 'optim', 'problem', 'propos']

topic 6
['algorithm', 'bound', 'converg', 'function', 'gradient', 'learn', 'method', 'optim', 'problem', 'stochast']

topic 7
['attack', 'commun', 'data', 'detect', 'learn', 'model', 'network', 'node', 'system', 'time']

topic 8
['agent', 'algorithm', 'learn', 'onlin', 'optim', 'problem', 'recommend', 'regret', 'set', 'user']

topic 9
['arch

In [98]:
for i in (3, 4, 5, 8, 10, 12, 15, 20):
    print(i)
    m = LdaModel(ml_corpus, i, ml_dictionary)
    cm = CoherenceModel(model=m, corpus=ml_corpus, coherence='u_mass')
    print(cm.get_coherence())  # get coherence value

3
-1.382871453529041
4
-1.331313371232071
5
-1.3563345571951215
8
-1.5502184691140024
10
-1.602341419768642
12
-1.6915873802794137
15
-1.7205349996045516
20
-2.1730298366466374


In [101]:
for i in range(20):
    print(f"topic {i}")
    print(sorted([x[0] for x in m.show_topic(i)]))
    print()

topic 0
['algorithm', 'converg', 'convex', 'function', 'gradient', 'method', 'optim', 'problem', 'propos', 'stochast']

topic 1
['applic', 'base', 'data', 'deep', 'learn', 'machin', 'network', 'research', 'system', 'time']

topic 2
['action', 'agent', 'control', 'environ', 'learn', 'polici', 'reinforc', 'robot', 'state', 'task']

topic 3
['activ', 'convolut', 'deep', 'function', 'input', 'layer', 'network', 'neural', 'train', 'weight']

topic 4
['data', 'dimension', 'distanc', 'kernel', 'low', 'matrix', 'method', 'propos', 'space', 'vector']

topic 5
['algorithm', 'dictionari', 'learn', 'mathcal', 'matrix', 'problem', 'quantum', 'signal', 'spars', 'time']

topic 6
['data', 'label', 'learn', 'method', 'model', 'network', 'perform', 'propos', 'task', 'train']

topic 7
['base', 'chang', 'data', 'detect', 'filter', 'model', 'predict', 'price', 'propos', 'time']

topic 8
['base', 'factor', 'item', 'prefer', 'propos', 'rank', 'rate', 'recommend', 'sourc', 'user']

topic 9
['bound', 'class', 

In [110]:
for i in range(10):
    print(i, "--", sorted(m.get_document_topics(ml_corpus[i]), key=lambda x: x[1], reverse=True))
    print()

0 -- [(19, 0.42074344), (4, 0.32120946), (1, 0.16010426), (9, 0.065056354), (17, 0.019072272)]

1 -- [(18, 0.25287732), (6, 0.21733043), (15, 0.19612978), (0, 0.14315172), (9, 0.13410266), (3, 0.04862747)]

2 -- [(19, 0.47108638), (14, 0.14372198), (4, 0.14245492), (10, 0.13533966), (9, 0.09765017)]

3 -- [(14, 0.6709431), (19, 0.13877225), (0, 0.09675104), (17, 0.083773136)]

4 -- [(10, 0.59172595), (18, 0.20567958), (6, 0.1410145), (2, 0.05182047)]

5 -- [(18, 0.30910546), (4, 0.24588783), (6, 0.12937838), (19, 0.09191819), (0, 0.086828396), (12, 0.080965504), (3, 0.047124192)]

6 -- [(0, 0.63222873), (5, 0.13759778), (10, 0.124387175), (19, 0.052485265), (6, 0.043021303)]

7 -- [(0, 0.6585103), (5, 0.24158467), (8, 0.09140255)]

8 -- [(0, 0.5277565), (6, 0.30365962), (9, 0.11588411), (16, 0.026188208), (4, 0.019257301)]

9 -- [(14, 0.47788218), (9, 0.15134759), (19, 0.14869864), (17, 0.12767899), (15, 0.06180227), (4, 0.024634993)]



In [112]:
df.iloc[3]['title']

'Bayesian Discovery of Threat Networks'

In [120]:
print(df.iloc[3]['description'])

  A novel unified Bayesian framework for network detection is developed, under
which a detection algorithm is derived based on random walks on graphs. The
algorithm detects threat networks using partial observations of their activity,
and is proved to be optimum in the Neyman-Pearson sense. The algorithm is
defined by a graph, at least one observation, and a diffusion model for threat.
A link to well-known spectral detection methods is provided, and the
equivalence of the random walk and harmonic solutions to the Bayesian
formulation is proven. A general diffusion model is introduced that utilizes
spatio-temporal relationships between vertices, and is used for a specific
space-time formulation that leads to significant performance improvements on
coordinated covert networks. This performance is demonstrated using a new
hybrid mixed-membership blockmodel introduced to simulate random covert
networks with realistic properties.



In [121]:
print(df.iloc[5]['title'])
print(df.iloc[5]['description'])

Learning Non-Linear Feature Maps
  Feature selection plays a pivotal role in learning, particularly in areas
were parsimonious features can provide insight into the underlying process,
such as biology. Recent approaches for non-linear feature selection employing
greedy optimisation of Centred Kernel Target Alignment(KTA), while exhibiting
strong results in terms of generalisation accuracy and sparsity, can become
computationally prohibitive for high-dimensional datasets. We propose randSel,
a randomised feature selection algorithm, with attractive scaling properties.
Our theoretical analysis of randSel provides strong probabilistic guarantees
for the correct identification of relevant features. Experimental results on
real and artificial data, show that the method successfully identifies
effective features, performing better than a number of competitive approaches.

