In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import StandardScaler

!pip install umap-learn
import umap

!pip install tensorflow-hub
import tensorflow as tf
import tensorflow_hub as hub

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 
embed = hub.load(module_url)



In [159]:
# Load extracted n26 data
vectors = pd.read_csv("data/extracted_n26_tsv_vecs.tsv", delimiter='\t|,', header=None, engine='python')
vectors = vectors.drop(vectors.columns[0], axis=1)
metadata = pd.read_csv("data/extracted_n26_tsv_metadata.tsv", delimiter='\t')

vec_meta = vectors.join(metadata)
train = vec_meta[vec_meta['FAQ_id'] <= 130] # only train on first 130 classes
train = train.drop(['FAQ_id', 'locale', 'market', 'question'], axis=1)

test_novel = vec_meta[(vec_meta['FAQ_id'] > 130) & (vec_meta['FAQ_id'] < 140)]
test_novel = test_novel.drop(['FAQ_id', 'locale', 'market', 'question'], axis=1)

test_not_novel = train.sample(n=30, random_state=1) # random sample of non-novel clusters
train = pd.concat([train,test_not_novel]).drop_duplicates(keep=False) # train data cannot contain test data

In [3]:
print("Loading 20newsgroups data...")
newsgroups_train = fetch_20newsgroups(subset='train', categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'])
newsgroups_test_novel = fetch_20newsgroups(subset='test', categories=['rec.autos', 'rec.motorcycles'])
newsgroups_test_not_novel = fetch_20newsgroups(subset='test', categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'])

print("Loading tensorflow session...")
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    print("Embedding training text...")
    train = session.run(embed(newsgroups_train.data))
    print("Embedding novel test text...")
    test_novel = session.run(embed(newsgroups_test_novel.data))
    print("Embedding non-novel test text...")
    test_not_novel = session.run(embed(newsgroups_test_not_novel.data))

print(train.shape)
print(test_novel.shape)
print(test_not_novel.shape)

Loading 20newsgroups data...
Loading tensorflow session...
Embedding training text...
Embedding novel test text...
Embedding non-novel test text...
[[-0.02493628 -0.00875452 -0.03682045 ...  0.06833718  0.01936085
  -0.04084104]
 [-0.05356552 -0.00455837 -0.06219256 ...  0.05972339 -0.01134234
  -0.07541798]
 [ 0.02706743 -0.05515036 -0.05686761 ...  0.01804754  0.0138009
  -0.03079167]
 ...
 [ 0.00095232 -0.0658168  -0.06965939 ...  0.03900306 -0.02887024
  -0.03771075]
 [-0.03471673 -0.01773412 -0.02311416 ...  0.05316818 -0.04348731
  -0.03696968]
 [ 0.04498163  0.05106161 -0.0536316  ... -0.01554263 -0.07532571
  -0.05781196]]
[[-3.8983472e-02  4.6628855e-02 -4.2750228e-02 ...  6.3185364e-02
   4.8586439e-02 -5.0488707e-02]
 [-3.0889332e-05 -5.7217997e-02  2.6093222e-02 ...  3.1696301e-02
  -6.0166493e-02 -5.1477849e-02]
 [ 3.7204765e-02  3.2482296e-02 -5.1061399e-02 ...  6.4168021e-02
  -2.8178463e-02 -3.0378869e-02]
 ...
 [-1.2353746e-02 -5.8908980e-02 -5.1708084e-02 ...  5.47422

NameError: name 'test_not_' is not defined

In [161]:
# Dimensionality reduction
reducer = umap.UMAP(n_components=3)
def reduce(raw_data):
    labels = pd.DataFrame(raw_data).join(metadata)['FAQ_id']
#     print(labels)
    scaled_data = StandardScaler().fit_transform(raw_data)
    embedding = reducer.fit_transform(scaled_data, y=labels)
    return embedding

reduced_train = reduce(train)
reduced_test_novel = reduce(test_novel)
reduced_test_not_novel = reduce(test_not_novel)

In [23]:
# Local outlier detection
lof = LocalOutlierFactor(n_neighbors=100, contamination=0.2)
train_df = pd.DataFrame(train)
train_df['lof'] = lof.fit_predict(train)

# Getting the negative LOF score
train_df['negative_outlier_factor'] = lof.negative_outlier_factor_
# print(train_data.head)

outliers = train_df[train_df['lof'] == -1]
print(outliers.head)

<bound method NDFrame.head of              1         2         3         4         5         6         7  \
45   -0.025578 -0.041987 -0.063839 -0.051133  0.028106 -0.061051  0.022543   
46   -0.032095 -0.055612 -0.071654 -0.034519  0.032207 -0.062536  0.028935   
47    0.027240 -0.026190 -0.032985  0.016827  0.060872  0.038157 -0.056079   
51    0.053881 -0.033968 -0.043604  0.034592  0.088728  0.020174 -0.065760   
55    0.026008 -0.047539 -0.044618  0.050966  0.082764 -0.025954 -0.043094   
...        ...       ...       ...       ...       ...       ...       ...   
1287  0.007229 -0.029586 -0.066311 -0.093518 -0.002644 -0.063794 -0.050951   
1288  0.013362  0.052013 -0.037614 -0.092381  0.033477  0.001189 -0.022374   
1292 -0.040373 -0.015938 -0.099218  0.022792  0.018013 -0.103935 -0.011122   
1293  0.011657 -0.013658 -0.094562  0.031199  0.038568 -0.110401 -0.017671   
1299  0.000121 -0.000738 -0.082709 -0.008067  0.024078 -0.115960  0.006990   

             8         9        1

In [163]:
def tune_hyperparam(nn, contam):
    # Novelty detection
    lof = LocalOutlierFactor(n_neighbors=nn, contamination=contam, novelty=True)
    lof.fit(reduced_train)

    # Should predict -1 for all elements in test_novel
    # All examples in test_novel are new categories
    y_pred_test_novel = lof.predict(reduced_test_novel)
    num_test_novel = y_pred_test_novel[y_pred_test_novel == -1].size

    # Should predict 1 for all elements in test_not_novel
    y_pred_test_not_novel = lof.predict(reduced_test_not_novel)
    num_test_not_novel = y_pred_test_not_novel[y_pred_test_not_novel == 1].size
    return num_test_novel, y_pred_test_novel.size, num_test_not_novel, y_pred_test_not_novel.size

for n_neighbors in range(5,15):
    for contamination in [x/100 for x in range(1, 50)]:
        num_novel, novel_size, num_nonnovel, nonnovel_size = tune_hyperparam(n_neighbors, contamination)
        novel_acc = num_novel/novel_size
        nonnovel_acc = num_nonnovel/nonnovel_size
        if novel_acc + nonnovel_acc > 1.9:
            print("Neighbors:" + str(n_neighbors))
            print("Contamination: " + str(contamination))
            print("Novel test acc: " + str(num_novel) + '/' + str(novel_size) + ' = ' + str(novel_acc))
            print("Non-novel test acc: " + str(num_nonnovel) + '/' + str(nonnovel_size) + ' = ' + str(nonnovel_acc))
            print()
        

Neighbors:7
Contamination: 0.23
Novel test acc: 58/63 = 0.9206349206349206
Non-novel test acc: 30/30 = 1.0

Neighbors:7
Contamination: 0.24
Novel test acc: 58/63 = 0.9206349206349206
Non-novel test acc: 30/30 = 1.0

Neighbors:7
Contamination: 0.3
Novel test acc: 61/63 = 0.9682539682539683
Non-novel test acc: 28/30 = 0.9333333333333333

Neighbors:12
Contamination: 0.31
Novel test acc: 59/63 = 0.9365079365079365
Non-novel test acc: 30/30 = 1.0

Neighbors:12
Contamination: 0.32
Novel test acc: 60/63 = 0.9523809523809523
Non-novel test acc: 30/30 = 1.0

Neighbors:12
Contamination: 0.33
Novel test acc: 61/63 = 0.9682539682539683
Non-novel test acc: 30/30 = 1.0

Neighbors:12
Contamination: 0.34
Novel test acc: 61/63 = 0.9682539682539683
Non-novel test acc: 30/30 = 1.0

Neighbors:12
Contamination: 0.35
Novel test acc: 61/63 = 0.9682539682539683
Non-novel test acc: 30/30 = 1.0

Neighbors:12
Contamination: 0.36
Novel test acc: 61/63 = 0.9682539682539683
Non-novel test acc: 30/30 = 1.0

Neighbor