In [2]:
# Preprocessing
import sys
import random
import time
import json
import os
import argparse
from utils import *
import numpy as np
import pandas as pd
from tqdm import tqdm_gui
from tqdm import tqdm
from nltk.tokenize import word_tokenize

from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics
from thop import profile
# Modeling
import torch
# Switch off PyTorch's cuDNN backend for the whole session.
# NOTE(review): the reason is not recorded in this notebook — confirm it is still needed.
torch.backends.cudnn.enabled = False
from model import BiLSTM
from model import PGD_contrastive
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Model, load_model
from keras import backend as K

# Evaluation
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import LocalOutlierFactor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# ---- Experiment configuration ----
# Dataset name: handed to load_data() and used in the data/<dataset>/ path.
dataset = 'atis'

# Text vectorisation limits.
MAX_SEQ_LEN = 100        # maxlen that every sequence is padded / truncated to
MAX_NUM_WORDS = 10000    # num_words cap given to the Keras Tokenizer

# Sampling knobs.
proportion = 50          # picks known_class_<proportion>.txt AND the % of each seen class's training rows kept
unseen_proportion = 100  # % of unseen-class rows kept for the train / valid OOD index sets
cont_proportion = 1.0    # fraction scaling the 0.8 / 0.2 slices taken from the 'src' and 'bt' rows
mask_proportion = 0      # % of leading training one-hot rows that get zeroed out

# Load the corpus together with a mapping whose 'train' / 'valid' / 'test' entries
# are used further down as row counts.
# NOTE(review): load_data is presumably supplied by `from utils import *` — confirm.
df, partition_to_n_row = load_data(dataset)

# Tokenise with NLTK, then re-join the tokens with single spaces so the Keras
# tokenizer below sees punctuation as separate, space-delimited tokens.
df['content_words'] = df['text'].apply(word_tokenize)
texts = df['content_words'].apply(" ".join)

# Do not filter out "," and "."
# The filter set is a raw string so the backslash is kept literally; the previous
# non-raw '\]' was an invalid escape sequence (SyntaxWarning on Python 3.12+).
# NOTE(review): '?' is also absent from this filter set, so it is kept as a token
# too — confirm that is intended.
tokenizer = Tokenizer(
    num_words=MAX_NUM_WORDS,
    oov_token="<UNK>",
    filters=r'!"#$%&()*+-/:;<=>@[\]^_`{|}~',
)

tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)
# Pad and truncate at the end of each sequence so all rows have MAX_SEQ_LEN ids.
sequences_pad = pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')

# Train / valid / test / cont split.
# Each partition is a (start, stop) pair built from running totals of the
# partition sizes; `None` means "open ended" on that side of the slice.
_train_end = partition_to_n_row['train']
_valid_end = _train_end + partition_to_n_row['valid']
_test_end = _valid_end + partition_to_n_row['test']

idx_train = (None, _train_end)
idx_valid = (_train_end, _valid_end)
idx_test = (_valid_end, _test_end)
idx_cont = (_test_end, None)  # every row after the test partition

# Padded id sequences per partition.
X_train, X_valid, X_test, X_cont = (
    sequences_pad[slice(*bounds)]
    for bounds in (idx_train, idx_valid, idx_test, idx_cont)
)

# Matching slices of the source frame.
df_train, df_valid, df_test, df_cont = (
    df[slice(*bounds)]
    for bounds in (idx_train, idx_valid, idx_test, idx_cont)
)

# Labels, re-indexed from 0 so that index labels line up with row positions
# in the corresponding X_* array.
y_train = df_train['label'].reset_index(drop=True)
y_valid = df_valid['label'].reset_index(drop=True)
y_test = df_test['label'].reset_index(drop=True)
y_cont = df_cont['label'].reset_index(drop=True)

# Raw texts, re-indexed the same way.
train_text = df_train['text'].reset_index(drop=True)
valid_text = df_valid['text'].reset_index(drop=True)
test_text = df_test['text'].reset_index(drop=True)
cont_text = df_cont['text'].reset_index(drop=True)

print("cont: %d" % X_cont.shape[0])

# Number of distinct labels present in the training partition.
n_class = y_train.unique().shape[0]


## Fixed set of known (seen) classes that I configured myself.
y_cols_seen = pd.read_csv(f'data/{dataset}/known_class_{proportion}.txt', header=None)[0].tolist()
# Every training label that is not in the known list counts as unseen.
y_cols_unseen = list(set(y_train.value_counts().index) - set(y_cols_seen))
n_class_seen = len(y_cols_seen)
##

y_cols_unseen_b = []

# For each seen class keep only the leading `proportion` percent of its training
# rows. The per-class indices are collected first and concatenated once, so the
# result is always an ndarray (previously a pandas Index when there was exactly
# one seen class) and is defined even when the class list is empty.
_per_class_idx = []
for seen_label in y_cols_seen:
    class_rows = y_train[y_train.isin([seen_label])]
    n_keep = int(proportion / 100 * len(class_rows))
    _per_class_idx.append(class_rows[:n_keep].index)

if _per_class_idx:
    part_train_seen_idx = np.concatenate(_per_class_idx, axis=0)
else:
    # No seen classes configured: fall back to an empty integer index.
    part_train_seen_idx = np.array([], dtype=int)

# --- Training partition: all seen-class rows, plus a capped share of unseen rows ---
_train_is_seen = y_train.isin(y_cols_seen)
train_seen_idx = y_train[_train_is_seen].index
_train_ood_rows = y_train[y_train.isin(y_cols_unseen)]
_n_train_ood = int(unseen_proportion / 100 * len(_train_ood_rows))
train_ood_idx = _train_ood_rows[:_n_train_ood].index

# --- Validation partition: same scheme as training ---
_valid_is_seen = y_valid.isin(y_cols_seen)
valid_seen_idx = y_valid[_valid_is_seen].index
_valid_ood_rows = y_valid[y_valid.isin(y_cols_unseen)]
_n_valid_ood = int(unseen_proportion / 100 * len(_valid_ood_rows))
valid_ood_idx = _valid_ood_rows[:_n_valid_ood].index

# --- Test partition: every seen / unseen row, no cap ---
test_seen_idx = y_test[y_test.isin(y_cols_seen)].index
test_ood_idx = y_test[y_test.isin(y_cols_unseen)].index

# --- "cont" partition: rows labelled 'src' and 'bt' ---
# For each label the first 80% of rows form the ind pool and the last 20% the
# ood pool; cont_proportion scales how much of each pool is actually taken.
src_cols, bt_cols = ['src'], ['bt']

src_idx = y_cont[y_cont.isin(src_cols)]
_n_src = len(src_idx)
ind_src_idx = src_idx[:int(cont_proportion * 0.8 * _n_src)].index
_src_ood_start = int(0.8 * _n_src)
_src_ood_stop = int(0.8 * _n_src + cont_proportion * 0.2 * _n_src)
ood_src_idx = src_idx[_src_ood_start:_src_ood_stop].index

bt_idx = y_cont[y_cont.isin(bt_cols)]
_n_bt = len(bt_idx)
ind_bt_idx = bt_idx[:int(cont_proportion * 0.8 * _n_bt)].index
_bt_ood_start = int(0.8 * _n_bt)
_bt_ood_stop = int(0.8 * _n_bt + cont_proportion * 0.2 * _n_bt)
ood_bt_idx = bt_idx[_bt_ood_start:_bt_ood_stop].index

# Gather sequences, labels and raw texts for the selected rows of each partition.
# Training uses the sub-sampled seen index (part_train_seen_idx), not train_seen_idx.

# -- train --
X_train_seen, X_train_ood = X_train[part_train_seen_idx], X_train[train_ood_idx]
y_train_seen = y_train[part_train_seen_idx]
train_seen_text, train_unseen_text = (
    list(train_text[part_train_seen_idx]),
    list(train_text[train_ood_idx]),
)

# -- valid --
X_valid_seen, X_valid_ood = X_valid[valid_seen_idx], X_valid[valid_ood_idx]
y_valid_seen = y_valid[valid_seen_idx]
valid_seen_text, valid_unseen_text = (
    list(valid_text[valid_seen_idx]),
    list(valid_text[valid_ood_idx]),
)

# -- test --
X_test_seen, X_test_ood = X_test[test_seen_idx], X_test[test_ood_idx]
y_test_seen = y_test[test_seen_idx]
test_seen_text, test_unseen_text = (
    list(test_text[test_seen_idx]),
    list(test_text[test_ood_idx]),
)

# Report how many seen-class rows ended up in each partition.
_seen_sizes = (X_train_seen.shape[0], X_valid_seen.shape[0], X_test_seen.shape[0])
print("train : valid : test = %d : %d : %d" % _seen_sizes)


cont: 0
train : valid : test = 2172 : 433 : 777


In [None]:

# Sequences and labels for the 'src' / 'bt' rows of the cont partition,
# split into their ind and ood pools.
src_ind_x, src_ind_y = X_cont[ind_src_idx], y_cont[ind_src_idx]
bt_ind_x, bt_ind_y = X_cont[ind_bt_idx], y_cont[ind_bt_idx]
src_ood_x, src_ood_y = X_cont[ood_src_idx], y_cont[ood_src_idx]
bt_ood_x, bt_ood_y = X_cont[ood_bt_idx], y_cont[ood_bt_idx]

# Optional second OOD pool; only built when y_cols_unseen_b lists any classes.
if y_cols_unseen_b:
    _train_is_unseen_b = y_train.isin(y_cols_unseen_b)
    train_ood_idx_b = y_train[_train_is_unseen_b].index
    X_train_ood_b = X_train[train_ood_idx_b]

# Map the seen-class label strings to integer ids, fitted on the training labels.
le = LabelEncoder()
le.fit(y_train_seen)
y_train_idx = le.transform(y_train_seen)
y_valid_idx = le.transform(y_valid_seen)
y_test_idx = le.transform(y_test_seen)
# NOTE(review): this is simply the encoded label of the first seen test row —
# confirm that is the intended meaning of "ood_index".
ood_index = y_test_idx[0]

# Pin the one-hot width to n_class_seen. Without num_classes, to_categorical
# infers the width from the largest id present, so a split that happens to lack
# the highest-index class would get a narrower matrix than the training targets.
y_train_onehot = to_categorical(y_train_idx, num_classes=n_class_seen)
y_valid_onehot = to_categorical(y_valid_idx, num_classes=n_class_seen)
y_test_onehot = to_categorical(y_test_idx, num_classes=n_class_seen)

# Blank out the labels of the leading mask_proportion percent of training rows.
n_masked = int(mask_proportion / 100 * len(y_train_onehot))
y_train_onehot[:n_masked] = 0.0

# All-zero targets for the unseen-class training rows; the shape stays
# (n_rows, n_class_seen) even when there are no such rows.
y_train_ood = np.zeros((len(train_ood_idx), n_class_seen))

In [21]:
# Inspect the seen-class training labels selected via part_train_seen_idx.
y_train_seen

47      atis_abbreviation
55      atis_abbreviation
70      atis_abbreviation
142     atis_abbreviation
223     atis_abbreviation
              ...        
1436        atis_quantity
1611        atis_quantity
1612        atis_quantity
1619        atis_quantity
1730        atis_quantity
Name: label, Length: 2172, dtype: object

In [20]:
# Number of entries in the known-class list (len(y_cols_seen)).
n_class_seen

9

In [24]:
# Display the raw known-class file for inspection.
# The frame is bound to its own name so that `y_cols_seen` stays the plain list
# built earlier; rebinding it to a DataFrame here would silently break the
# `isin(y_cols_seen)` selections if earlier cells were re-run afterwards.
known_classes_df = pd.read_csv(f'data/{dataset}/known_class_{proportion}.txt', header=None)
known_classes_df

Unnamed: 0,0
0,atis_abbreviation
1,atis_airline
2,atis_cheapest
3,atis_city
4,atis_flight
5,atis_flight_no
6,atis_ground_fare
7,atis_ground_service
8,atis_quantity
