# Imports

In [1]:
import gc
import os
import pickle

import numpy as np

In [2]:
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.models import TextClassifier

from flair.data import TaggedCorpus
from flair.data_fetcher import  NLPTaskDataFetcher, NLPTask

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
import torch

# Initialize Mongo database

In [4]:
from pymongo import MongoClient

# Connect to MongoDB; MongoClient() is called with no arguments, so pymongo's
# default connection settings apply.
client = MongoClient()

# Handle to the 'glvis_db' database.
# NOTE(review): later cells use `val_collection`, but no cell in this notebook
# assigns it — presumably it was bound to a collection of `db` in a cell that
# has since been removed. Confirm the collection name and define it here.
db = client['glvis_db']

# Extract hidden representations from flair's pretrained NER model

In [5]:
# Load the pretrained sequence tagger that flair registers under the name 'ner'.
ner = SequenceTagger.load('ner')

2019-03-18 15:05:43,372 loading file /home/snie/.flair/models/en-ner-conll03-v0.4.pt


In [6]:
# Load the CoNLL-03 corpus from data/conll/; the log output lists the
# train/dev/test files that are read.
corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='data/conll/')

2019-03-18 15:05:47,491 Reading data from data/conll/conll_03
2019-03-18 15:05:47,492 Train: data/conll/conll_03/eng.train
2019-03-18 15:05:47,492 Dev: data/conll/conll_03/eng.testa
2019-03-18 15:05:47,492 Test: data/conll/conll_03/eng.testb


In [5]:
# Display the tagger's module tree; the hook defined further down is attached
# to its `linear` sub-module.
ner

SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings()
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout()
  (locked_dropout): LockedDropout()
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 256, bidirectional=True)
  (linear): Linear(in_features=512, out_features=20, bias=True)
)

In [13]:
# Batch size setting.
# NOTE(review): none of the NER cells in this section reference this value;
# the same assignment is repeated in the sentiment section.
batch_size = 64

In [7]:
# Buffer holding the most recent input seen by the hooked layer. It starts
# out shaped like the activations of the 3-sentence demo batch, (9, 3, 512).
records = torch.zeros(9, 3, 512)

def hook(m, i):
    """Forward pre-hook: copy the hooked module's first input into `records`.

    Args:
        m: the module the hook is registered on (unused).
        i: tuple of positional inputs to the module; only ``i[0]`` is read.

    `records` is resized in place to the incoming shape before the copy, so a
    batch of any shape can be captured while existing references to
    `records` keep pointing at the same tensor object.
    """
    captured = i[0].data
    print(captured.shape)
    # Resize first: a plain copy_ raises when the incoming shape differs from
    # the buffer's current one (different batch size or sentence length).
    records.resize_(captured.shape).copy_(captured)

In [8]:
# Add the hook to model
# Register `hook` as a forward pre-hook on the tagger's `linear` layer, so the
# layer's input is handed to `hook` on every forward pass. `h` is the handle
# returned by the registration; no visible cell removes it.
h = ner.linear.register_forward_pre_hook(hook)

In [9]:
# Peek at the first three training sentences (used as the demo batch).
corpus.train[0:3]

[Sentence: "-DOCSTART-" - 1 Tokens,
 Sentence: "EU rejects German call to boycott British lamb ." - 9 Tokens,
 Sentence: "Peter Blackburn" - 2 Tokens]

In [10]:
# Run the tagger on the first three training sentences; the registered hook
# prints the shape of the captured activations and stores them in `records`.
ner.predict(corpus.train[0:3])

torch.Size([9, 3, 512])


[Sentence: "-DOCSTART-" - 1 Tokens,
 Sentence: "EU rejects German call to boycott British lamb ." - 9 Tokens,
 Sentence: "Peter Blackburn" - 2 Tokens]

In [36]:
# Keep a handle on the first training sentence for inspection.
t = corpus.train[0]

In [41]:
# Show the second training sentence with its tags rendered inline.
corpus.train[1].to_tagged_string()

'EU <NNP/I-NP/S-ORG> rejects <VBZ/I-VP> German <JJ/I-NP/S-MISC> call <NN/I-NP> to <TO/I-VP> boycott <VB/I-VP> British <JJ/I-NP/S-MISC> lamb <NN/I-NP> . <.>'

In [42]:
# NER spans of the sentence stored in `t`.
t.get_spans('ner')

[]

In [43]:
# NER spans of the second training sentence.
corpus.train[1].get_spans('ner')

[<ORG-span (1): "EU">, <MISC-span (3): "German">, <MISC-span (7): "British">]

In [34]:
# Slice out position 1 of the middle axis while keeping that axis.
# NOTE(review): in the displayed output the first two rows are non-zero and the
# other shown rows are zero, which would fit a 2-token sentence — presumably
# flair orders the batch by sentence length rather than input order; confirm.
records[:, 1:2, :]

tensor([[[ 0.0611, -0.0194,  0.5108,  ...,  0.7188, -0.0425, -0.0070]],

        [[ 0.1761,  0.3049, -0.1496,  ...,  0.6126, -0.0083, -0.5562]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        ...,

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]])

# Extract hidden states from pretrained en-sentiment model

In [3]:
# List the review files of each split/class directory. os.listdir() returns
# names in arbitrary order, so they are sorted to keep the order of the loaded
# reviews (and everything derived from it) reproducible across runs.
train_pos = sorted(os.listdir('data/aclImdb_v1/aclImdb/train/pos/'))
train_neg = sorted(os.listdir('data/aclImdb_v1/aclImdb/train/neg/'))
test_pos = sorted(os.listdir('data/aclImdb_v1/aclImdb/test/pos/'))
test_neg = sorted(os.listdir('data/aclImdb_v1/aclImdb/test/neg/'))

In [5]:
# Collect the first line of every positive review file (train split first,
# then test split).
# The files are opened with an explicit UTF-8 encoding so that decoding does
# not depend on the platform's default encoding.
# NOTE(review): only the first line of each file is read — presumably every
# review is stored on a single line; verify.
pos_data = []
for directory, names in (
    ('data/aclImdb_v1/aclImdb/train/pos/', train_pos),
    ('data/aclImdb_v1/aclImdb/test/pos/', test_pos),
):
    for name in names:
        with open(directory + name, 'r', encoding='utf-8') as f:
            pos_data.append(f.readline())

In [6]:
# Collect the first line of every negative review file (train split first,
# then test split).
# The files are opened with an explicit UTF-8 encoding so that decoding does
# not depend on the platform's default encoding.
# NOTE(review): only the first line of each file is read — presumably every
# review is stored on a single line; verify.
neg_data = []
for directory, names in (
    ('data/aclImdb_v1/aclImdb/train/neg/', train_neg),
    ('data/aclImdb_v1/aclImdb/test/neg/', test_neg),
):
    for name in names:
        with open(directory + name, 'r', encoding='utf-8') as f:
            neg_data.append(f.readline())

### Load model

In [None]:
import torch

In [4]:
# Load the pretrained text classifier that flair registers as 'en-sentiment'.
sent_model = TextClassifier.load('en-sentiment')

In [11]:
# Number of reviews passed to the classifier per predict() call; also used as
# the row count of the `records` capture buffer.
batch_size = 64

In [12]:
# Buffer holding the most recent input seen by the hooked layer, initially
# sized for one full batch of 2048-dimensional rows.
records = torch.zeros(batch_size, 2048)

def hook(m, i, o):
    """Forward hook: copy the hooked module's first input into `records`.

    Args:
        m: the module the hook is registered on (unused).
        i: tuple of positional inputs to the module; only ``i[0]`` is read.
        o: the module's output (unused).

    `records` is resized in place to the incoming shape before the copy, so a
    batch smaller than `batch_size` can be captured as well, while existing
    references to `records` keep pointing at the same tensor object.
    """
    captured = i[0].data
    print(captured.shape)
    # Resize first: a plain copy_ raises when the incoming shape differs from
    # the buffer's current one (e.g. a final, smaller batch).
    records.resize_(captured.shape).copy_(captured)

In [None]:
# Add the hook to model
# Register `hook` as a forward hook on the classifier's `decoder` module, so
# the decoder's input is captured on every forward pass. `h` is the handle
# returned by the registration; no visible cell removes it.
h = sent_model.decoder.register_forward_hook(hook)

In [9]:
# Number of batches the positive reviews would fill. The fractional result
# shows that the data does not divide evenly into batches of `batch_size`.
len(pos_data) / batch_size

390.625

In [None]:
# Push the negative reviews through the classifier one full batch at a time and
# store, per review, its text, the captured decoder input and the predicted
# label in `val_collection`.
# NOTE(review): the floor division means a trailing partial batch is never
# processed; the capture buffer must accept a smaller batch before that can be
# changed safely.
# NOTE(review): this pairs row k of `records` with the k-th text of the batch —
# confirm that flair does not reorder sentences inside predict().
n_full_batches = len(neg_data) // batch_size
for i in range(n_full_batches):
    start = batch_size * i
    sentences = [Sentence(text) for text in neg_data[start: start + batch_size]]
    sent_model.predict(sentences, mini_batch_size=batch_size)
    labels = [sentence.labels[0].to_dict() for sentence in sentences]

    # Snapshot of what the forward hook captured for this batch.
    val_list = records.tolist()

    db_entries = []
    for ix in range(len(sentences)):
        db_entries.append({
            'sentence': neg_data[start + ix],
            'reduce_mean': val_list[ix],
            'label': labels[ix]
        })

    val_collection.insert_many(db_entries)

# Database utilities

### Flatten val_collection

In [20]:
# Fetch a single document to inspect the stored schema. find_one() returns
# None for an empty collection, whereas the `for ...: break` idiom left `rec`
# unbound in that case and abandoned a partially consumed cursor.
rec = val_collection.find_one()

In [10]:
# Inspect the nested 'label' sub-document of the sampled record.
rec['label']

{'value': 'POSITIVE', 'confidence': 1.0}

In [22]:
# Aggregation pipeline: copy the nested label fields to top-level 'sentiment'
# and 'confidence' fields, then write the result to the 'flattened' collection
# through the $out stage.
# NOTE(review): no visible cell passes this pipeline to aggregate(); it has to
# be executed for the 'flattened' collection to be (re)created — confirm.
add_fields = [
    {'$addFields': {'sentiment': '$label.value', 'confidence': '$label.confidence'}},
    {'$out': 'flattened'}
]

In [24]:
# Handle to the 'flattened' collection, the $out target of `add_fields`.
flattened = db['flattened']

In [27]:
# Remove the nested 'label' field from every document in `flattened`
# (empty filter {} matches all documents).
flattened.update_many({}, {'$unset': {'label': ''}})

<pymongo.results.UpdateResult at 0x7f4fd14ab9c8>

### Add indexes to val_collection

In [12]:
import pymongo

In [21]:
# Text index on 'sentence'; required for the $text searches used below.
val_collection.create_index([('sentence', pymongo.TEXT)])

'sentence_text'

In [40]:
# Ascending index on 'confidence'.
# NOTE(review): 'confidence' is a top-level field only after the $addFields
# flattening — presumably `val_collection` refers to the flattened collection
# at this point; confirm.
val_collection.create_index([('confidence', 1)])

'confidence_1'

In [46]:
# Index on 'sentiment' (the returned index name is 'sentiment_1').
# NOTE(review): 'sentiment' is a top-level field only after the $addFields
# flattening — presumably `val_collection` refers to the flattened collection
# at this point; confirm.
val_collection.create_index('sentiment')

'sentiment_1'

In [61]:
# Test search on the indexes: documents whose text matches 'happy', whose
# sentiment is 'NEGATIVE' and whose confidence equals 1.0 (all three clauses
# must hold). `cur` is a cursor; nothing is fetched until it is iterated.
cur = val_collection.find({
    '$and': [
        {'$text': {'$search': 'happy'}}, 
        {'sentiment': 'NEGATIVE'},
        {'confidence': {'$eq': 1.0}}
    ]
})

In [60]:
# Count the matches by materialising the cursor into a list; this consumes
# `cur`.
len(list(cur))

404

### Play with database

In [3]:
def query(term):
    """Run a MongoDB text search against ``val_collection``.

    Args:
        term: search string handed to the ``$search`` operator.

    Returns:
        A list containing every matching document.
    """
    text_filter = {'$text': {'$search': term}}
    matches = val_collection.find(text_filter)
    return list(matches)

In [8]:
# Phrase search for "movie": the term is wrapped in exactly one pair of double
# quotes. The previous literal carried a stray third quote, leaving the phrase
# delimiters unbalanced.
res = query('"movie"')

In [84]:
# Stack the stored hidden vector of every matching document into one array.
# The insertion loop in this notebook writes the vector under 'reduce_mean',
# while this cell originally read 'val'; accept either key so that documents
# written by the current loop do not raise KeyError.
# NOTE(review): presumably 'val' is the key used by an earlier version of the
# insertion code — confirm which key the live collection uses and drop the other.
vectors = np.array([
    elm['reduce_mean'] if 'reduce_mean' in elm else elm['val']
    for elm in res
])

In [98]:
# Per-dimension statistics of the hidden vectors across all matched documents
# (axis 0 runs over documents).
mean = np.mean(vectors, axis=0)
# Fixed: this used np.mean, which made 'std' an exact copy of 'mean'.
std = np.std(vectors, axis=0)
stats = [
    {
        'dim': i,
        'mean': dim_mean,
        'std': dim_std
    } for i, (dim_mean, dim_std) in enumerate(zip(mean, std))
]

### Utilities

In [127]:
# Aggregation pipeline that finds duplicated sentences: group documents by
# their 'sentence' text, count each group, and keep groups with count > 1.
pipeline = [
    {"$group": {"_id": "$sentence", "count": {"$sum": 1}}},
    {"$match": {"count": {"$gt": 1 }}}
]

In [128]:
# Run the duplicate-sentence pipeline with allowDiskUse enabled. Note that this
# rebinds `res`, which earlier held the list returned by query().
res = val_collection.aggregate(pipeline, allowDiskUse=True)