In [2]:
import numpy as np

frequent_words = {}
with open('freq17k.txt', 'r') as inf:
    for line in inf:
        frequent_words[line.strip().split('\t')[0]] = 'not populated'
        
stopwords = []
with open('stopwords.txt', 'r') as inf:
    for line in inf:
        stopwords.append(line.strip())

with open('glove.6B.300d.txt', 'r') as inf:
    for line in inf:
        parts = line.strip().split(' ')
        if parts[0] in frequent_words:
            frequent_words[parts[0]] = np.array([float(x) for x in parts[1:]])

In [3]:
unp_keys = []
for word in frequent_words:
    if frequent_words[word] == 'not populated':
        unp_keys.append(word)
    
for word in unp_keys:
    del frequent_words[word]

for word in stopwords:
    if word in frequent_words:
        del frequent_words[word]

  This is separate from the ipykernel package so we can avoid doing imports until


In [33]:
word = 'education'
science_dims = np.abs(frequent_words[word])
print('Word:', word)

max_idx = np.argmax(science_dims)
print('Dimension:', max_idx)

closest_words = []
for word in frequent_words:
    closest_words.append((word, frequent_words[word][max_idx]))

print('Closest words:')

rev_flag = False
if frequent_words[word][max_idx] > 0:
    rev_flag = True
closest_list = sorted(closest_words, key=lambda x: x[1], reverse=rev_flag)[:10]
for i in closest_list:
    print(i)

Word: education
Dimension: 276
Closest words:
('thousands', -3.0639)
('residents', -3.0343)
('palestinian', -3.0306)
('hundreds', -3.0065)
('police', -3.0031)
('palestinians', -2.9903)
('weapons', -2.9228)
('students', -2.9186)
('people', -2.9016)
('use', -2.9015)


In [34]:
import random

all_words = [(key, frequent_words[key]) for key in frequent_words.keys()]
random.shuffle(all_words)
train_set = all_words[:15000]
test_set = all_words[15000:]

In [35]:
train_words = [item[0] for item in train_set]
train_vectors = [item[1] for item in train_set]

test_words = [item[0] for item in test_set]
test_vectors = [item[1] for item in test_set]

In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DenoisingAutoencoder(nn.Module):

    def __init__(self, input_dim, sparse_dim=1000):
        super(DenoisingAutoencoder, self).__init__()
        self.hidden = nn.Linear(input_dim, sparse_dim)
        self.out = nn.Linear(sparse_dim, input_dim)

    def forward(self, x):
        x = x + torch.normal(0, 0.4, size=x.size()).to(x.device)
        h = torch.clamp(self.hidden(x), min=0, max=1)
        o = self.out(h)
        return o, h

In [192]:
import torch.optim as optim

net = DenoisingAutoencoder(300).double()

mse_criterion = nn.MSELoss()
epochs = 5
optimizer = optim.Adam(net.parameters())
sparsity_frac = 0.15

for epoch in range(epochs):
    steps = 0
    random.shuffle(train_vectors)
    net.train()
    
    for i in range(0, len(train_vectors), 32):
        optimizer.zero_grad()
        batch = np.array(train_vectors[i:i+32])
        
        inp = torch.from_numpy(batch).double()
        out, hidden = net(inp)
        target_sf = hidden.new_full(hidden[0].size(), fill_value=sparsity_frac)
        
        loss = mse_criterion(out, inp) + torch.sum(torch.clamp((torch.mean(hidden, axis=0) - target_sf), min=0) ** 2) + torch.mean(hidden * (1 - hidden))
        loss.backward()
        if steps % 100 == 0:
            print('Epoch: {} \t Step: {} \t Training Loss: {}'.format(epoch, steps, loss.detach().numpy()))
        steps += 1
        optimizer.step()
        
    net.eval()
    batch = np.array(test_vectors)
    noise = np.random.normal(0, 0.4, batch.shape)
    inp = torch.from_numpy(batch).double()
    noisy_inp = torch.from_numpy(batch + noise).double()
    out, hidden = net(noisy_inp)
    loss = mse_criterion(out, inp)
    print('Epoch: {} \t Testing Reconstruction Loss: {}'.format(epoch, loss.detach().numpy()))


Epoch: 0 	 Step: 0 	 Training Loss: 0.5781896421873733
Epoch: 0 	 Step: 100 	 Training Loss: 0.15287216749330187
Epoch: 0 	 Step: 200 	 Training Loss: 0.14335025968848025
Epoch: 0 	 Step: 300 	 Training Loss: 0.12329336006627403
Epoch: 0 	 Step: 400 	 Training Loss: 0.11667178655735777
Epoch: 0 	 Testing Reconstruction Loss: 0.09492387589848965
Epoch: 1 	 Step: 0 	 Training Loss: 0.11350265327514808
Epoch: 1 	 Step: 100 	 Training Loss: 0.10428688968389258
Epoch: 1 	 Step: 200 	 Training Loss: 0.10275318759821532
Epoch: 1 	 Step: 300 	 Training Loss: 0.09937993696813349
Epoch: 1 	 Step: 400 	 Training Loss: 0.10286867977054837
Epoch: 1 	 Testing Reconstruction Loss: 0.09238702697351153
Epoch: 2 	 Step: 0 	 Training Loss: 0.09470932429806041
Epoch: 2 	 Step: 100 	 Training Loss: 0.09417839725157937
Epoch: 2 	 Step: 200 	 Training Loss: 0.09033812155398155
Epoch: 2 	 Step: 300 	 Training Loss: 0.08759788667927318
Epoch: 2 	 Step: 400 	 Training Loss: 0.08837546771283786
Epoch: 2 	 Testin

In [193]:
net.eval()
sparse_vectors = {}

for word in frequent_words:
    o, h = net(torch.from_numpy(np.array([frequent_words[word]])))
    sparse_vectors[word] = h.detach().numpy().squeeze()

In [196]:
word = 'education'
science_dims = np.abs(sparse_vectors[word])
print('Word:', word)

max_idx = np.argmax(science_dims)
print('Dimension:', max_idx)

closest_words = []
for word in sparse_vectors:
    closest_words.append((word, abs(sparse_vectors[word][max_idx])))

print('Closest words:')    
closest_list = sorted(closest_words, key=lambda x: x[1], reverse=True)[:10]
for i in closest_list:
    print(i)

Word: education
Dimension: 237
Closest words:
('school', 1.0)
('university', 1.0)
('education', 1.0)
('students', 1.0)
('college', 1.0)
('science', 1.0)
('student', 1.0)
('schools', 1.0)
('studies', 1.0)
('engineering', 1.0)


In [37]:
import torch
from transformers import BertModel, BertTokenizer

pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights)  

In [38]:
from tqdm import tqdm

bert_vectors = {}

with torch.no_grad():
    for word in tqdm(frequent_words):
        input_ids = torch.tensor([tokenizer.encode(word)])
        bert_vectors[word] = model(input_ids)[0][:,1,:].detach().numpy().squeeze()

100%|██████████| 16690/16690 [15:55<00:00, 17.46it/s]


In [34]:
# Normalization

all_vectors = []
for word in bert_vectors:
    all_vectors.append(bert_vectors[word])
all_vectors = np.array(all_vectors)

min_av = np.min(all_vectors, axis=0)
max_av = np.max(all_vectors, axis=0)

norm_bert_vectors = {}
for word in bert_vectors:
    norm_bert_vectors[word] = ((bert_vectors[word] - min_av) / (max_av - min_av))

In [39]:
word = 'science'
science_dims = np.abs(bert_vectors[word])
print('Word:', word)

max_idx = np.argmax(science_dims)
print('Dimension:', max_idx)

closest_words = []
for word in bert_vectors:
    closest_words.append((word, bert_vectors[word][max_idx]))

print('Closest words:') 
rev_flag = False
if bert_vectors[word][max_idx] > 0:
    rev_flag = True
closest_list = sorted(closest_words, key=lambda x: x[1], reverse=rev_flag)[:10]
for i in closest_list:
    print(i)

Word: science
Dimension: 308
Closest words:
('lifted', -6.2681575)
('servings', -6.258731)
('intervals', -6.249471)
('decks', -6.235012)
('advisors', -6.227306)
('beverage', -6.1765027)
('lendingtree', -6.172181)
('empires', -6.172099)
('funeral', -6.159995)
('malls', -6.1489334)


In [40]:
import random
import torch.optim as optim

all_words = [(key, bert_vectors[key]) for key in bert_vectors.keys()]
random.shuffle(all_words)
train_set = all_words[:15000]
test_set = all_words[15000:]

train_words = [item[0] for item in train_set]
train_vectors = [item[1] for item in train_set]

test_words = [item[0] for item in test_set]
test_vectors = [item[1] for item in test_set]

bert_net = DenoisingAutoencoder(768, 1500).double()

mse_criterion = nn.MSELoss()
epochs = 10
optimizer = optim.Adam(bert_net.parameters())
sparsity_frac = 0.05

for epoch in range(epochs):
    steps = 0
    random.shuffle(train_vectors)
    bert_net.train()
    
    for i in range(0, len(train_vectors), 32):
        optimizer.zero_grad()
        batch = np.array(train_vectors[i:i+32])
        
        inp = torch.from_numpy(batch).double()
        out, hidden = bert_net(inp)
        target_sf = hidden.new_full(hidden[0].size(), fill_value=sparsity_frac)
        
        loss = mse_criterion(out, inp) + torch.sum(torch.clamp((torch.mean(hidden, axis=0) - target_sf), min=0) ** 2) + torch.mean(hidden * (1 - hidden))
        loss.backward()
        if steps % 100 == 0:
            print('Epoch: {} \t Step: {} \t Training Loss: {}'.format(epoch, steps, loss.detach().numpy()))
        steps += 1
        optimizer.step()
        
    bert_net.eval()
    batch = np.array(test_vectors)
    inp = torch.from_numpy(batch).double()
    out, hidden = bert_net(inp)
    loss = mse_criterion(out, inp)
    print('Epoch: {} \t Testing Reconstruction Loss: {}'.format(epoch, loss.detach().numpy()))

Epoch: 0 	 Step: 0 	 Training Loss: 34.82261159201152
Epoch: 0 	 Step: 100 	 Training Loss: 0.22098053312640317
Epoch: 0 	 Step: 200 	 Training Loss: 0.1980836235463653
Epoch: 0 	 Step: 300 	 Training Loss: 0.1732385120508834
Epoch: 0 	 Step: 400 	 Training Loss: 0.15348737330110887
Epoch: 0 	 Testing Reconstruction Loss: 0.15339409885312652
Epoch: 1 	 Step: 0 	 Training Loss: 0.1515732235327169
Epoch: 1 	 Step: 100 	 Training Loss: 0.17068721398301068
Epoch: 1 	 Step: 200 	 Training Loss: 0.14867206741991323
Epoch: 1 	 Step: 300 	 Training Loss: 0.12990709795930677
Epoch: 1 	 Step: 400 	 Training Loss: 0.14331825219043892
Epoch: 1 	 Testing Reconstruction Loss: 0.12684075475239034
Epoch: 2 	 Step: 0 	 Training Loss: 0.12936777289950485
Epoch: 2 	 Step: 100 	 Training Loss: 0.13666537633099768
Epoch: 2 	 Step: 200 	 Training Loss: 0.12641225766798894
Epoch: 2 	 Step: 300 	 Training Loss: 0.11535100102314302
Epoch: 2 	 Step: 400 	 Training Loss: 0.13335425129276854
Epoch: 2 	 Testing Re

In [41]:
bert_net.eval()
sparse_bert_vectors = {}

for word in frequent_words:
    o, h = bert_net(torch.from_numpy(np.array([bert_vectors[word]])).double())
    sparse_bert_vectors[word] = h.detach().numpy().squeeze()

In [42]:
word = 'science'
science_dims = np.abs(sparse_bert_vectors[word])
print('Word:', word)

max_idx = np.argmax(science_dims)
print('Dimension:', max_idx)

closest_words = []
for word in bert_vectors:
    closest_words.append((word, abs(sparse_bert_vectors[word][max_idx])))

print('Closest words:')    
closest_list = sorted(closest_words, key=lambda x: x[1], reverse=True)[:10]
for i in closest_list:
    print(i)

Word: science
Dimension: 1227
Closest words:
('business', 1.0)
('health', 1.0)
('policy', 1.0)
('university', 1.0)
('management', 1.0)
('education', 1.0)
('technology', 1.0)
('sports', 1.0)
('systems', 1.0)
('media', 1.0)


In [43]:
torch.save(bert_net.state_dict(), 'sparse_net_bert_1500.pth')

In [28]:
import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from transformers import BertModel, BertPreTrainedModel, BertConfig 

class BertForSequenceClassificationWithSparsity(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.hidden_size = config.hidden_size
        self.sparse_size = 1000
        
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
        
        self.sparse_net = DenoisingAutoencoder(config.hidden_size, self.sparse_size)
        self.sparse_net.load_state_dict(torch.load('sparse_net_bert.pth'))
        self.sparsity_frac = 0.05
        self.sparsity_imp = 0.1
        
        self.sparse_dense = nn.Linear(self.sparse_size, self.sparse_size)
        self.sparse_activation = nn.Tanh()
        self.sparse_classifier = nn.Linear(self.sparse_size, self.config.num_labels)
        
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        osize = outputs[0].size()
        all_outputs = outputs[0].reshape(osize[0]*osize[1], self.hidden_size)
        rec_outputs, sparse_outputs = self.sparse_net(all_outputs)
        
#         pooled_output = outputs[1]
#         pooled_output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)

        loss_recon = MSELoss()
        target_sf = sparse_outputs.new_full(sparse_outputs[0].size(), fill_value=self.sparsity_frac)
        loss = self.sparsity_imp * (loss_recon(rec_outputs, all_outputs) + torch.sum(torch.clamp((sparse_outputs.mean(axis=0) - target_sf), min=0) ** 2) + torch.mean(sparse_outputs * (1 - sparse_outputs)))
        
        sparse_outputs = sparse_outputs.reshape(osize[0], osize[1], -1)
        sp_first_token_tensor = sparse_outputs[:, 0]
        sp_pooled_output = self.sparse_dense(sp_first_token_tensor)
        sp_pooled_output = self.sparse_activation(sp_pooled_output)
        sp_pooled_output = self.dropout(sp_pooled_output)
        sp_logits = self.sparse_classifier(sp_pooled_output)
        
        outputs = (sp_logits,) + outputs[2:]  # add hidden states and attention if they are here
        
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss += loss_fct(sp_logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss += loss_fct(sp_logits.view(-1, self.num_labels), labels.view(-1))
        
        outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [29]:
config = BertConfig.from_pretrained('bert-base-uncased',
                                      num_labels=3,
                                      finetuning_task='xnli',
                                      cache_dir=None)

model = BertForSequenceClassificationWithSparsity.from_pretrained('bert-base-uncased',
                                        from_tf=False,
                                        config=config,
                                        cache_dir=None)

In [30]:
input_ids = torch.tensor([tokenizer.encode('this is it'), tokenizer.encode('this is not')])
labels = torch.tensor([0, 1])
a, b = model(input_ids=input_ids, labels=labels)

In [1]:
from transformers import BertConfig

In [2]:
class SparseBertConfig(BertConfig):

    def __init__(self, **args):
        super().__init__(**args)

        self.sparsity_frac = None
        self.sparsity_imp = None
        self.sparse_size = None
        self.sparse_noise_stf = None
        self.sparse_net_params = None
        self.pred_input_size = None

In [6]:
config = SparseBertConfig.from_pretrained('bert-base-uncased')
config.sparsity_frac = .05
config.sparsity_imp = .1
config.sparse_size = 1500
config.sparse_noise_stf = .4
config.sparse_net_params = ""

In [14]:
config

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pred_input_size": null,
  "pruned_heads": {},
  "sparse_net_params": "",
  "sparse_noise_stf": 0.4,
  "sparse_size": 1500,
  "sparsity_frac": 0.05,
  "sparsity_imp": 0.1,
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}