In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
!pip install tensorflow-hub
import tensorflow_hub as hub

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 
embed = hub.load(module_url)



In [None]:
# Load extracted n26 data and build the train / novel-test / non-novel-test
# splits.
#
# NOTE: `test_novel` and `test_not_novel` are also assigned by the
# 20newsgroups embedding cell; whichever cell ran last determines their value.

# Metadata columns that must not be fed to the outlier detector as features.
META_COLS = ['FAQ_id', 'locale', 'market', 'question']

# Fields are split on either a tab or a comma; a regex separator requires the
# python parsing engine.
vectors = pd.read_csv("data/extracted_n26_tsv_vecs.tsv", delimiter='\t|,', header=None, engine='python')
vectors = vectors.drop(vectors.columns[0], axis=1)  # discard the first parsed column
metadata = pd.read_csv("data/extracted_n26_tsv_metadata.tsv", delimiter='\t')

# `join` aligns the two frames on their row index.
vec_meta = vectors.join(metadata)

# Only train on the first 130 classes.
train_pool = vec_meta[vec_meta['FAQ_id'] <= 130].drop(columns=META_COLS)

# Classes 131-139 are held out entirely and act as the "novel" test set.
is_novel_class = (vec_meta['FAQ_id'] > 130) & (vec_meta['FAQ_id'] < 140)
test_novel = vec_meta[is_novel_class].drop(columns=META_COLS)

# Random sample of rows from the known classes, used as the non-novel test set.
test_not_novel = train_pool.sample(n=30, random_state=1)

# Train data cannot contain test data. Remove exactly the sampled rows by
# index label; concat + drop_duplicates(keep=False) would also discard any
# training rows that merely happen to have identical values.
train_data = train_pool.drop(index=test_not_novel.index)

In [3]:
# Embed three 20newsgroups splits with the sentence encoder loaded as `embed`:
#   train           - 'sci.*' training posts (the known classes)
#   test_novel      - 'rec.*' test posts (classes never seen in training)
#   test_not_novel  - 'sci.*' test posts (unseen posts from the known classes)
#
# NOTE: `test_novel` and `test_not_novel` are also assigned by the n26 loading
# cell; whichever cell ran last determines their value.
SCI_CATEGORIES = ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space']
REC_CATEGORIES = ['rec.autos', 'rec.motorcycles']

print("Loading 20newsgroups data...")
newsgroups_train = fetch_20newsgroups(subset='train', categories=SCI_CATEGORIES)
newsgroups_test_novel = fetch_20newsgroups(subset='test', categories=REC_CATEGORIES)
newsgroups_test_not_novel = fetch_20newsgroups(subset='test', categories=SCI_CATEGORIES)


def _embed_newsgroups(to_numpy):
    """Embed the three splits and return them as (train, novel, not_novel).

    `to_numpy` converts the value returned by `embed(...)` into a NumPy array:
    `session.run` in graph mode, or the tensor's `.numpy()` in eager mode.
    """
    print("Embedding training text...")
    train_vecs = to_numpy(embed(newsgroups_train.data))
    print("Embedding novel test text...")
    novel_vecs = to_numpy(embed(newsgroups_test_novel.data))
    print("Embedding non-novel test text...")
    not_novel_vecs = to_numpy(embed(newsgroups_test_not_novel.data))
    return train_vecs, novel_vecs, not_novel_vecs


if tf.executing_eagerly():
    # TF 2.x default: `embed(...)` returns concrete tensors, and the top-level
    # `tf.Session` API no longer exists, so no session is needed.
    train, test_novel, test_not_novel = _embed_newsgroups(lambda tensor: tensor.numpy())
else:
    # Graph mode (TF 1.x behaviour): variables and lookup tables must be
    # initialised before the embedding ops are evaluated in a session.
    print("Loading tensorflow session...")
    with tf.compat.v1.Session() as session:
        session.run([tf.compat.v1.global_variables_initializer(), tf.compat.v1.tables_initializer()])
        train, test_novel, test_not_novel = _embed_newsgroups(session.run)

Loading 20newsgroups data...
Loading tensorflow session...
Embedding training text...
Embedding novel test text...
Embedding non-novel test text...
[[-0.02493628 -0.00875452 -0.03682045 ...  0.06833718  0.01936085
  -0.04084104]
 [-0.05356552 -0.00455837 -0.06219256 ...  0.05972339 -0.01134234
  -0.07541798]
 [ 0.02706743 -0.05515036 -0.05686761 ...  0.01804754  0.0138009
  -0.03079167]
 ...
 [ 0.00095232 -0.0658168  -0.06965939 ...  0.03900306 -0.02887024
  -0.03771075]
 [-0.03471673 -0.01773412 -0.02311416 ...  0.05316818 -0.04348731
  -0.03696968]
 [ 0.04498163  0.05106161 -0.0536316  ... -0.01554263 -0.07532571
  -0.05781196]]
[[-3.8983472e-02  4.6628855e-02 -4.2750228e-02 ...  6.3185364e-02
   4.8586439e-02 -5.0488707e-02]
 [-3.0889332e-05 -5.7217997e-02  2.6093222e-02 ...  3.1696301e-02
  -6.0166493e-02 -5.1477849e-02]
 [ 3.7204765e-02  3.2482296e-02 -5.1061399e-02 ...  6.4168021e-02
  -2.8178463e-02 -3.0378869e-02]
 ...
 [-1.2353746e-02 -5.8908980e-02 -5.1708084e-02 ...  5.47422

NameError: name 'test_not_' is not defined

In [4]:
# Sanity check: show each embedded split, then the dimensions of each one.
embedded_splits = (train, test_novel, test_not_novel)

for split in embedded_splits:
    print(split)

for split in embedded_splits:
    print(split.shape)

[[-0.02493628 -0.00875452 -0.03682045 ...  0.06833718  0.01936085
  -0.04084104]
 [-0.05356552 -0.00455837 -0.06219256 ...  0.05972339 -0.01134234
  -0.07541798]
 [ 0.02706743 -0.05515036 -0.05686761 ...  0.01804754  0.0138009
  -0.03079167]
 ...
 [ 0.00095232 -0.0658168  -0.06965939 ...  0.03900306 -0.02887024
  -0.03771075]
 [-0.03471673 -0.01773412 -0.02311416 ...  0.05316818 -0.04348731
  -0.03696968]
 [ 0.04498163  0.05106161 -0.0536316  ... -0.01554263 -0.07532571
  -0.05781196]]
[[-3.8983472e-02  4.6628855e-02 -4.2750228e-02 ...  6.3185364e-02
   4.8586439e-02 -5.0488707e-02]
 [-3.0889332e-05 -5.7217997e-02  2.6093222e-02 ...  3.1696301e-02
  -6.0166493e-02 -5.1477849e-02]
 [ 3.7204765e-02  3.2482296e-02 -5.1061399e-02 ...  6.4168021e-02
  -2.8178463e-02 -3.0378869e-02]
 ...
 [-1.2353746e-02 -5.8908980e-02 -5.1708084e-02 ...  5.4742273e-02
  -1.7764550e-02 -4.3340690e-02]
 [-7.1316823e-02 -1.7166175e-02 -1.5004803e-02 ... -4.1110326e-02
  -4.6790406e-02 -7.3208995e-02]
 [-6.0558

In [None]:
# Local outlier detection on the n26 training vectors.
#
# The LOF results are written back onto `train_data` as two extra columns.
# Those columns are stripped from the feature matrix before fitting so that
# re-running this cell does not fit the detector on its own previous output.
LOF_RESULT_COLS = ['lof', 'negative_outlier_factor']
lof_features = train_data.drop(columns=LOF_RESULT_COLS, errors='ignore')

lof = LocalOutlierFactor(n_neighbors=10, contamination='auto')
# fit_predict labels each training row: -1 for outliers, 1 for inliers.
train_data['lof'] = lof.fit_predict(lof_features)

# Getting the negative LOF score
train_data['negative_outlier_factor'] = lof.negative_outlier_factor_

outliers = train_data[train_data['lof'] == -1]
# Call head(); printing the bare attribute shows the bound method's repr
# instead of the first rows.
print(outliers.head())

In [16]:
# Novelty detection: fit LOF on `train` only, then classify unseen samples.
lof = LocalOutlierFactor(n_neighbors=100, contamination=0.25, novelty=True).fit(train)


def _report_matches(predictions, expected_label, total):
    """Print how many predictions equal `expected_label` and that count's share of `total`.

    Returns the number of matching predictions as a plain int.
    """
    matches = int((predictions == expected_label).sum())
    print(f"Size: {matches}")
    print(f"Accuracy: {matches / total}")
    return matches


# Every element of test_novel should be predicted as -1 (novel).
y_pred_test_novel = lof.predict(test_novel)
num_test_novel = _report_matches(y_pred_test_novel, -1, test_novel.shape[0])

# Every element of test_not_novel should be predicted as 1 (not novel).
y_pred_test_not_novel = lof.predict(test_not_novel)
num_test_not_novel = _report_matches(y_pred_test_not_novel, 1, test_not_novel.shape[0])

Size: 524
Accuracy: 0.6599496221662469
Size: 1055
Accuracy: 0.6681443951868271
