In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

!pip install umap-learn
import umap

!pip install tensorflow-hub
import tensorflow as tf
import tensorflow_hub as hub

# TF-Hub handle of the Universal Sentence Encoder (version 4) used to embed text below.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 
# `embed` is the loaded module; later cells call it on a list of strings.
# NOTE(review): hub.load presumably downloads the module on first use — confirm
# network access / cache location before running on a fresh machine.
embed = hub.load(module_url)



In [32]:
# Load extracted n26 data
# Classes with FAQ_id <= KNOWN_FAQ_MAX_ID are treated as "known" (training) classes;
# everything above is held out as novel.
# NOTE(review): the original comment said "first 130 classes" while the code cut at 135 —
# the code's value is kept here; confirm the intended cutoff.
KNOWN_FAQ_MAX_ID = 135
METADATA_COLUMNS = ['FAQ_id', 'locale', 'market', 'question']

# The regex delimiter splits on tabs OR commas, which requires the python parsing engine.
vectors = pd.read_csv("data/extracted_n26_tsv_vecs.tsv", delimiter='\t|,', header=None, engine='python')
# Drop the first parsed column (presumably a non-embedding id/label column — verify against the file).
vectors = vectors.drop(vectors.columns[0], axis=1)
metadata = pd.read_csv("data/extracted_n26_tsv_metadata.tsv", delimiter='\t')

# Index-aligned join: row i of the vectors is described by row i of the metadata.
vec_meta = vectors.join(metadata)
is_known = vec_meta['FAQ_id'] <= KNOWN_FAQ_MAX_ID

train = vec_meta[is_known].drop(METADATA_COLUMNS, axis=1)
test_novel = vec_meta[~is_known].drop(METADATA_COLUMNS, axis=1)

# Hold out a random sample of known-class rows as the non-novel test set.
test_not_novel = train.sample(n=30, random_state=56)
# Remove exactly the sampled rows from the training data, by index.
# (The previous concat + drop_duplicates(keep=False) compared rows by value, so it also
# discarded every training row that happened to have an identical twin.)
train = train.drop(index=test_not_novel.index)

In [27]:
# Load and embed 20newsgroups data
# Categories seen during training vs. categories that only appear at test time.
KNOWN_CATEGORIES = ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space']
NOVEL_CATEGORIES = ['rec.autos', 'rec.motorcycles']


def _embed_and_save(session, texts, labels, csv_path):
    """Embed `texts`, attach `labels` as a leading 'category' column and cache to CSV.

    Parameters
    ----------
    session : the active TensorFlow session used to evaluate the embedding op.
    texts : list of str passed to the module-level `embed` callable.
    labels : one label per text; stored in column 'category' (position 0).
    csv_path : destination of the cached frame (written without the index).

    Returns
    -------
    (vectors, vec_df) : the raw embedding array and the labelled DataFrame.
    """
    vectors = session.run(embed(texts))
    vec_df = pd.DataFrame(vectors)
    vec_df.insert(loc=0, column='category', value=labels)
    vec_df.to_csv(csv_path, index=False)
    return vectors, vec_df


print("Loading 20newsgroups data...")
newsgroups_train = fetch_20newsgroups(subset='train', categories=KNOWN_CATEGORIES)
newsgroups_test_novel = fetch_20newsgroups(subset='test', categories=NOVEL_CATEGORIES)
newsgroups_test_not_novel = fetch_20newsgroups(subset='test', categories=KNOWN_CATEGORIES)

print("Loading tensorflow session...")
# NOTE(review): tf.Session is the TensorFlow 1.x API; under TensorFlow 2.x this cell
# needs porting (e.g. eager execution) — confirm the pinned TF version.
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])

    print("Embedding training text...")
    train, train_vec_df = _embed_and_save(
        session, newsgroups_train.data, newsgroups_train.target, r'data/train_vec.csv')

    print("Embedding novel test text...")
    # Offset the novel labels by the number of training classes so the two label
    # ranges do not overlap.
    labels = newsgroups_test_novel.target + len(newsgroups_train.target_names)
    test_novel, test_novel_vec_df = _embed_and_save(
        session, newsgroups_test_novel.data, labels, r'data/test_novel_vec.csv')

    print("Embedding non-novel test text...")
    test_not_novel, test_not_novel_vec_df = _embed_and_save(
        session, newsgroups_test_not_novel.data, newsgroups_test_not_novel.target,
        r'data/test_not_novel_vec.csv')

# Sanity check: (n_documents, embedding_dim) for each split.
print(train.shape)
print(test_novel.shape)
print(test_not_novel.shape)

Loading 20newsgroups data...
Loading tensorflow session...
Embedding training text...
<bound method NDFrame.head of       category         0         1         2         3         4         5  \
0            0 -0.024936 -0.008755 -0.036820 -0.017022  0.014010  0.001497   
1            1 -0.053566 -0.004558 -0.062193 -0.036859  0.028082  0.048462   
2            3  0.027067 -0.055150 -0.056868 -0.021518  0.029498 -0.064579   
3            1 -0.058897 -0.060390  0.079883  0.024593  0.058477  0.051801   
4            3  0.007847 -0.019826 -0.056083  0.004792  0.057461 -0.020218   
...        ...       ...       ...       ...       ...       ...       ...   
2368         0  0.025209 -0.011895 -0.039954 -0.041639 -0.027124 -0.013211   
2369         2 -0.012131  0.036753  0.060257 -0.055220 -0.062304  0.028782   
2370         3  0.000952 -0.065817 -0.069659 -0.032320  0.048500  0.039660   
2371         3 -0.034717 -0.017734 -0.023114 -0.048325 -0.002122 -0.067101   
2372         1  0.044982  

(2373, 512)
(794, 512)
(1579, 512)


In [28]:
# Load embedded data
# Each CSV holds a 'category' label column followed by the embedding dimensions,
# as written by the embedding cell.
train, test_novel, test_not_novel = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_vec.csv',
        'data/test_novel_vec.csv',
        'data/test_not_novel_vec.csv',
    )
)

In [102]:
# Dimensionality reduction
# reducer = umap.UMAP(n_components=50)
reducer = PCA(n_components=40)
# Scaler shared by `reduce` so that test data can be standardised with training statistics.
_feature_scaler = StandardScaler()


def reduce(dataframe, fit=True):
    """Standardise the embedding columns of `dataframe` and project them with `reducer`.

    Parameters
    ----------
    dataframe : DataFrame with a 'category' label column; every other column is
        treated as a feature.
    fit : when True (default, the original behaviour) the scaler and the module-level
        `reducer` are (re)fitted on this frame before transforming it. Pass False for
        test data so it is mapped into the space learned from the training frame.

    Returns
    -------
    The reduced feature array, one row per row of `dataframe`.

    Notes
    -----
    Calling with fit=False before any fit=True call fails with scikit-learn's
    not-fitted error.
    """
    labels = dataframe['category']
    # Select features by dropping the label column. The previous `.loc[:, '1':]`
    # slice also discarded embedding dimension '0'.
    features = dataframe.drop(columns='category')
    if fit:
        scaled_data = _feature_scaler.fit_transform(features)
        # `y` is ignored by PCA but lets a supervised reducer (e.g. UMAP) use the labels.
        return reducer.fit_transform(scaled_data, y=labels)
    return reducer.transform(_feature_scaler.transform(features))


# Fit on the training data only; the test sets must be projected with the SAME
# scaler/reducer, otherwise their coordinates are not comparable with the training set.
reduced_train = reduce(train)
reduced_test_novel = reduce(test_novel, fit=False)
reduced_test_not_novel = reduce(test_not_novel, fit=False)

# To skip dimensionality reduction, use the raw frames instead:
# reduced_train = train
# reduced_test_novel = test_novel
# reduced_test_not_novel = test_not_novel

In [8]:
# Local outlier detection
# Flags the ~1% of training rows whose local density is lowest relative to their
# 10 nearest neighbours.
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.01)

# Work on an explicit copy so the extra columns added below never leak back into `train`.
train_df = pd.DataFrame(train).copy()
# Never feed the class label to the detector (no-op when there is no 'category' column).
lof_features = train_df.drop(columns=['category'], errors='ignore')

# fit_predict returns -1 for outliers and 1 for inliers.
train_df['lof'] = lof.fit_predict(lof_features)
# Getting the negative LOF score (lower means more abnormal)
train_df['negative_outlier_factor'] = lof.negative_outlier_factor_

# NOTE(review): the join is index-aligned with the n26 `metadata` frame, so this cell
# assumes `train` is the n26 training split and that the n26 loading cell ran first
# (the recorded NameError came from running it before `metadata` existed).
outliers = train_df[train_df['lof'] == -1].join(metadata)
# Call head() — the previous `print(outliers.head)` printed the bound method itself.
outliers.head()

NameError: name 'metadata' is not defined

In [137]:
def tune_hyperparam(nn, contam):
    """Fit a novelty-mode LOF on `reduced_train` and score both held-out sets.

    Reads the module-level arrays `reduced_train`, `reduced_test_novel` and
    `reduced_test_not_novel`.

    Parameters
    ----------
    nn : value passed as `n_neighbors` to LocalOutlierFactor.
    contam : value passed as `contamination` to LocalOutlierFactor.

    Returns
    -------
    (novel_hits, novel_total, known_hits, known_total) where `novel_hits` counts
    novel examples predicted -1 and `known_hits` counts non-novel examples
    predicted 1.
    """
    detector = LocalOutlierFactor(n_neighbors=nn, contamination=contam, novelty=True)
    detector.fit(reduced_train)

    # Every row of the novel set belongs to an unseen category, so the ideal
    # prediction is -1 throughout; for the non-novel set it is 1 throughout.
    novel_pred = detector.predict(reduced_test_novel)
    known_pred = detector.predict(reduced_test_not_novel)

    novel_hits = np.count_nonzero(novel_pred == -1)
    known_hits = np.count_nonzero(known_pred == 1)
    return novel_hits, novel_pred.size, known_hits, known_pred.size


# Grid search over PCA size, LOF neighbourhood size and contamination.
# Only configurations whose (novel accuracy + non-novel accuracy) exceeds this
# threshold are reported.
ACC_SUM_THRESHOLD = 1.235

# Standardise with statistics learned from the TRAINING features only, then apply the
# same scaling to both test sets. Re-fitting the scaler/PCA on each test set (as the
# earlier per-frame reduction did) puts train and test in different coordinate systems,
# which makes the LOF distances meaningless.
train_features = train.drop(columns='category')
grid_scaler = StandardScaler().fit(train_features)
scaled_train = grid_scaler.transform(train_features)
scaled_test_novel = grid_scaler.transform(test_novel.drop(columns='category'))
scaled_test_not_novel = grid_scaler.transform(test_not_novel.drop(columns='category'))

for n_components in range(57, 58):
    # Dimensionality reduction
    # reducer = umap.UMAP(n_components=50)
    print("n_components: " + str(n_components))
    reducer = PCA(n_components=n_components)
    # Learn the projection on train, then reuse it for the test sets.
    # tune_hyperparam reads these three module-level arrays.
    reduced_train = reducer.fit_transform(scaled_train)
    reduced_test_novel = reducer.transform(scaled_test_novel)
    reduced_test_not_novel = reducer.transform(scaled_test_not_novel)
    for n_neighbors in range(75, 100, 2):
        print("--n_neighbors: " + str(n_neighbors))
        # contamination in {0.01, 0.03, 0.05, 0.07, 0.09}
        for contamination in [x / 100 for x in range(1, 11, 2)]:
            num_novel, novel_size, num_nonnovel, nonnovel_size = tune_hyperparam(n_neighbors, contamination)
            novel_acc = num_novel / novel_size
            nonnovel_acc = num_nonnovel / nonnovel_size
            if novel_acc + nonnovel_acc > ACC_SUM_THRESHOLD:
                print("----Neighbors:" + str(n_neighbors))
                print("----Contamination: " + str(contamination))
                print("----Novel test acc: " + str(num_novel) + '/' + str(novel_size) + ' = ' + str(novel_acc))
                print("----Non-novel test acc: " + str(num_nonnovel) + '/' + str(nonnovel_size) + ' = ' + str(nonnovel_acc))
                print()
        

n_components: 57
--n_neighbors: 75
--n_neighbors: 77
--n_neighbors: 79
--n_neighbors: 81
--n_neighbors: 83
--n_neighbors: 85
--n_neighbors: 87
--n_neighbors: 89
--n_neighbors: 91


KeyboardInterrupt: 