# Imports

In [1]:
import gc
import os
import pickle

import numpy as np

In [2]:
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.models import TextClassifier

from flair.data import TaggedCorpus
from flair.data_fetcher import  NLPTaskDataFetcher, NLPTask

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
import torch

# Initialize Mongo database

In [4]:
from pymongo import MongoClient

# Connect to MongoDB; no host/port is passed, so pymongo's default connection settings apply.
client = MongoClient()

# Database that holds every collection written by this notebook.
db = client['glvis_db']

# Extract hidden representations from flair's pretrained NER model

In [5]:
# Load the pretrained flair sequence tagger registered under the name 'ner'.
ner = SequenceTagger.load('ner')

2019-03-18 16:23:11,508 loading file /home/snie/.flair/models/en-ner-conll03-v0.4.pt


In [6]:
# Load the CoNLL-03 corpus; the data files are looked up under data/conll/.
corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='data/conll/')

2019-03-18 16:23:14,525 Reading data from data/conll/conll_03
2019-03-18 16:23:14,526 Train: data/conll/conll_03/eng.train
2019-03-18 16:23:14,526 Dev: data/conll/conll_03/eng.testa
2019-03-18 16:23:14,527 Test: data/conll/conll_03/eng.testb


In [7]:
# Number of sentences returned by corpus.get_all_sentences(), i.e. how many the extraction loop will process.
len(corpus.get_all_sentences())

22137

In [8]:
# Destructive: drops the 'flair_ner' collection so the extraction loop below
# starts from an empty collection. Re-running this cell discards earlier results.
db['flair_ner'].drop()

In [None]:
# For every sentence: run the NER tagger, capture the input of its final linear
# layer through a forward pre-hook, and store one document per predicted entity.
for i, sentence in enumerate(corpus.get_all_sentences()):
    print(f'Start sentence {i}')

    # Buffer the hook copies the linear layer's input into.
    # Assumes that input has shape (len(sentence), 1, 512); copy_ raises if the
    # captured tensor cannot be broadcast to this shape -- TODO confirm the 512.
    hidden_states = torch.zeros(len(sentence), 1, 512)

    def hook(module, inputs):
        """Forward pre-hook: copy the layer's first positional input into `hidden_states`."""
        hidden_states.copy_(inputs[0].data)

    h = ner.linear.register_forward_pre_hook(hook)
    try:
        ner.predict(sentence)
    finally:
        # Always detach the hook, even if predict() raises; otherwise hooks from
        # failed iterations would pile up on the model.
        h.remove()

    spans = sentence.get_spans('ner')

    # Information to store: the named entities, their predicted labels, probabilities and hidden states.
    # If there are multiple words for one entity, take the average value of hidden states
    # and record the number of words in the entity.
    entries = []
    for span in spans:
        # Rows of `hidden_states` are 0-based; presumably token.idx is 1-based, hence the -1.
        idx = [token.idx - 1 for token in span.tokens]
        entries.append({
            'text': span.text,
            'tag': span.tag,
            'score': span.score,
            'token_num': len(span.tokens),
            'linear_layer_state': hidden_states[idx, :, :].mean(dim=0).squeeze().tolist(),
        })

    # One round trip per sentence instead of one per entity.
    # insert_many rejects an empty list, so skip sentences without entities.
    if entries:
        db['flair_ner'].insert_many(entries)

    print(f'Finish sentence {i}')

# Extract hidden states from pretrained en-sentiment model

In [3]:
# File names found in each IMDB split/polarity directory (order is whatever
# os.listdir returns).
train_pos, train_neg, test_pos, test_neg = (
    os.listdir(directory)
    for directory in (
        'data/aclImdb_v1/aclImdb/train/pos/',
        'data/aclImdb_v1/aclImdb/train/neg/',
        'data/aclImdb_v1/aclImdb/test/pos/',
        'data/aclImdb_v1/aclImdb/test/neg/',
    )
)

In [5]:
# Collect the positive reviews from the train and test splits.
# Only the first line of each file is read (readline), and any trailing
# newline is kept.
pos_data = []
for directory, names in (
    ('data/aclImdb_v1/aclImdb/train/pos/', train_pos),
    ('data/aclImdb_v1/aclImdb/test/pos/', test_pos),
):
    for name in names:
        # Explicit encoding: without it open() uses the platform's locale
        # encoding, which can fail or mis-decode on non-UTF-8 systems.
        # Assumes the review files are UTF-8 -- TODO confirm.
        with open(directory + name, 'r', encoding='utf-8') as f:
            pos_data.append(f.readline())

In [6]:
# Collect the negative reviews from the train and test splits.
# Only the first line of each file is read (readline), and any trailing
# newline is kept.
neg_data = []
for directory, names in (
    ('data/aclImdb_v1/aclImdb/train/neg/', train_neg),
    ('data/aclImdb_v1/aclImdb/test/neg/', test_neg),
):
    for name in names:
        # Explicit encoding: without it open() uses the platform's locale
        # encoding, which can fail or mis-decode on non-UTF-8 systems.
        # Assumes the review files are UTF-8 -- TODO confirm.
        with open(directory + name, 'r', encoding='utf-8') as f:
            neg_data.append(f.readline())

### Load model

In [None]:
import torch

In [4]:
# Load the pretrained flair text classifier registered under the name 'en-sentiment'.
sent_model = TextClassifier.load('en-sentiment')

In [11]:
# Number of reviews per predict() call. It is also the row count of the
# `records` buffer the forward hook copies into, so the two must stay in sync.
batch_size = 64

In [12]:
# Buffer that receives the decoder's input for the most recent batch.
# Assumes that input has shape (batch_size, 2048); copy_ raises if the
# captured tensor cannot be broadcast to this shape -- TODO confirm the 2048.
records = torch.zeros(batch_size, 2048)

def hook(module, inputs, output):
    """Forward hook: copy the layer's first positional input into `records`.

    Args:
        module: the layer the hook is attached to (unused).
        inputs: tuple of positional inputs passed to the layer's forward().
        output: the layer's output (unused).
    """
    # The per-call debug print of the tensor shape was removed: it fired on
    # every forward pass and flooded the cell output.
    records.copy_(inputs[0].data)

In [None]:
# Attach `hook` to the classifier's decoder layer as a forward hook.
# The returned handle is kept in `h` so the hook can be detached later with h.remove().
h = sent_model.decoder.register_forward_hook(hook)

In [9]:
# Number of batches in the positive set; a fractional result means the last batch is not full.
# NOTE(review): this inspects pos_data, while the loop below iterates neg_data.
len(pos_data) / batch_size

390.625

In [None]:
# Classify the negative reviews one full batch at a time and store, per review,
# the text, the vector captured in `records` and the predicted label.
# NOTE(review): `val_collection` is not defined in any visible cell -- confirm its origin.
# NOTE(review): integer division means a trailing partial batch is never processed.
num_full_batches = len(neg_data) // batch_size
for i in range(num_full_batches):
    start = batch_size * i
    texts = neg_data[start: start + batch_size]

    sentences = [Sentence(text) for text in texts]
    sent_model.predict(sentences, mini_batch_size=batch_size)
    labels = [sentence.labels[0].to_dict() for sentence in sentences]

    # Snapshot of the buffer filled by the decoder hook during predict().
    val_list = records.tolist()

    db_entries = []
    for ix in range(len(sentences)):
        db_entries.append({
            'sentence': neg_data[start + ix],
            'reduce_mean': val_list[ix],
            'label': labels[ix],
        })

    val_collection.insert_many(db_entries)

# Database util

### Flattened val_collection

In [20]:
# Fetch a single document to inspect its structure.
# find_one() returns one document, or None when the collection is empty.
# The previous for/break loop left `rec` unbound in that case and kept an
# un-exhausted cursor around.
rec = val_collection.find_one()

In [10]:
# Inspect the nested `label` sub-document of the sampled record.
rec['label']

{'value': 'POSITIVE', 'confidence': 1.0}

In [22]:
# Aggregation pipeline that flattens the nested label:
#   1. $addFields copies label.value -> sentiment and label.confidence -> confidence.
#   2. $out writes the resulting documents to the 'flattened' collection.
# NOTE(review): no visible cell runs this pipeline (e.g. through aggregate) -- confirm it is executed.
add_fields = [
    {'$addFields': {'sentiment': '$label.value', 'confidence': '$label.confidence'}},
    {'$out': 'flattened'}
]

In [24]:
# Handle to the 'flattened' collection, the $out target named in the add_fields pipeline.
flattened = db['flattened']

In [27]:
# Remove the nested `label` field from every document in `flattened`
# (the empty filter {} matches all documents).
flattened.update_many({}, {'$unset': {'label': ''}})

<pymongo.results.UpdateResult at 0x7f4fd14ab9c8>

### Add indexes to val_collection

In [12]:
import pymongo

In [21]:
# Text index on `sentence`; the $text queries below search this field.
val_collection.create_index([('sentence', pymongo.TEXT)])

'sentence_text'

In [40]:
# Ascending index on the top-level `confidence` field.
# NOTE(review): the insert loop stores confidence inside `label`; a top-level
# `confidence` is only produced by the add_fields pipeline -- confirm that
# val_collection refers to the flattened documents here.
val_collection.create_index([('confidence', 1)])

'confidence_1'

In [46]:
# Index on the top-level `sentiment` field (a plain string key uses pymongo's default direction).
val_collection.create_index('sentiment')

'sentiment_1'

In [61]:
# Test search on the indexes: documents whose text matches 'happy', that are
# labelled NEGATIVE, and whose confidence equals 1.0.
search_conditions = [
    {'$text': {'$search': 'happy'}},
    {'sentiment': 'NEGATIVE'},
    {'confidence': {'$eq': 1.0}},
]
cur = val_collection.find({'$and': search_conditions})

In [60]:
# Materialise the cursor into a list and count the matches; this consumes `cur`.
len(list(cur))

404

### Play with database

In [3]:
def query(term):
    """Run a text search on `val_collection` and return all matches.

    Args:
        term: string passed as the `$search` value of a `$text` filter.

    Returns:
        list: every document returned by the search, fully materialised.
    """
    text_filter = {'$text': {'$search': term}}
    matches = val_collection.find(text_filter)
    return list(matches)

In [8]:
# Search for the term wrapped in double quotes. The previous literal ended in a
# stray extra double quote ("movie""), which left the quoting unbalanced.
res = query('"movie"')

In [84]:
# Stack the stored vectors of the matched documents into one array (one row per document).
# The insert loop in this notebook stores each vector under 'reduce_mean', so
# read that key; the previous 'val' key is not written by any visible cell.
# NOTE(review): if the existing collection was built with a 'val' field, revert this key.
vectors = np.array([elm['reduce_mean'] for elm in res])

In [98]:
# Per-dimension statistics over the matched vectors (axis 0 runs over documents).
mean = np.mean(vectors, axis=0)
# Fixed: this previously called np.mean, so 'std' just duplicated 'mean'.
std = np.std(vectors, axis=0)

# One record per vector dimension: its index, mean and standard deviation.
stats = [
    {
        'dim': i,
        'mean': dim_mean,
        'std': dim_std
    } for i, (dim_mean, dim_std) in enumerate(zip(mean, std))
]

### Utilities

In [127]:
# Aggregation pipeline that finds duplicated sentences:
#   1. $group buckets documents by their `sentence` text and counts each bucket.
#   2. $match keeps only buckets that contain more than one document.
pipeline = [
    {"$group": {"_id": "$sentence", "count": {"$sum": 1}}},
    {"$match": {"count": {"$gt": 1 }}}
]

In [128]:
# Run the duplicate-sentence pipeline with allowDiskUse=True.
# Note: this rebinds `res`, which earlier held the list returned by query().
res = val_collection.aggregate(pipeline, allowDiskUse=True)