In [1]:
import os
import random
import time
import torch
import torch.nn as nn
import numpy as np

from transformers import AdamW
from colbert.utils.runs import Run
from colbert.utils.amp import MixedPrecisionManager

from colbert.training.lazy_batcher import LazyBatcher
from colbert.training.eager_batcher import EagerBatcher
from colbert.parameters import DEVICE

from colbert.modeling.colbert import ColBERT
from colbert.utils.utils import print_message
from colbert.training.utils import print_progress, manage_checkpoints

# Hyper-parameters handed to ColBERT.from_pretrained below.
# (The original cell assigned query_maxlen twice; the redundant line is removed.)
query_maxlen = 512
doc_maxlen = 512
dim = 128
similarity = 'cosine'

# Build the ColBERT model on top of the pretrained bert-base-uncased weights.
colbert = ColBERT.from_pretrained(
    'bert-base-uncased',
    query_maxlen=query_maxlen,
    doc_maxlen=doc_maxlen,
    dim=dim,
    similarity_metric=similarity,
    mask_punctuation=False,
)

2022-11-08 20:48:40.893162: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Some weights of the model checkpoint at bert-base-uncased were not used when initializing ColBERT: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing ColBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ColBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

In [2]:
from colbert.utils.easytt import TTLayer

In [3]:
# Total number of scalar parameters in the freshly loaded model (baseline before compression).
print(sum(p.numel() for p in colbert.parameters()))

109580544


## SVD

In [40]:
# Display the device the model currently lives on.
colbert.device

device(type='cpu')

In [41]:
# Parameter count right before the SVD experiments (same value as the baseline).
print(sum(p.numel() for p in colbert.parameters()))

109580544


In [42]:
# Shape of the intermediate dense weight of encoder layer 5, laid out as (out_features, in_features).
colbert.bert.encoder.layer[5].intermediate.dense.weight.data.cpu().data.numpy().shape

(3072, 768)

In [46]:
from transformers.modeling_utils import Conv1D

def factorize_to_svd(fc_w, fc_b, rank, device = 'cpu'):
    """Build a low-rank replacement for a dense layer via truncated SVD.

    The weight ``W`` (``out_features x in_features``) is decomposed as
    ``U @ diag(S) @ Vt``. Only the ``rank`` largest singular values are kept
    and ``sqrt(S)`` is folded into both factors, giving ``W ~= w2 @ w1`` with
    ``w1 = diag(sqrt(S)) @ Vt`` and ``w2 = U @ diag(sqrt(S))``.

    Args:
        fc_w: 2-D array-like weight of shape ``(out_features, in_features)``.
        fc_b: 1-D array-like bias of length ``out_features``, or ``None`` to
            build the second layer without a bias.
        rank: number of singular values to keep. Values larger than
            ``min(fc_w.shape)`` are clamped to that maximum.
        device: torch device the returned layers are moved to.

    Returns:
        ``nn.Sequential(linear1, linear2)`` where ``linear1`` maps
        ``in_features -> rank`` (no bias) and ``linear2`` maps
        ``rank -> out_features`` (carrying ``fc_b``).

    Raises:
        ValueError: if ``fc_w`` is not 2-D or ``rank`` is smaller than 1.
    """
    fc_w = np.asarray(fc_w)
    if fc_w.ndim != 2:
        raise ValueError("fc_w must be a 2-D matrix, got %d dimension(s)" % fc_w.ndim)
    if rank < 1:
        raise ValueError("rank must be >= 1, got %r" % (rank,))

    out_features, in_features = fc_w.shape
    # A reduced SVD yields at most min(out, in) singular values; clamp so the
    # declared layer sizes always agree with the weights actually stored.
    rank = min(int(rank), out_features, in_features)

    U, S, Vt = np.linalg.svd(fc_w, full_matrices=False)

    # truncate SVD and fuse Sigma matrix (row/column scaling instead of
    # materialising dense diagonal matrices)
    sqrt_s = np.sqrt(S[:rank])
    w1 = sqrt_s[:, None] * Vt[:rank, :]
    w2 = U[:, :rank] * sqrt_s[None, :]

    # create new layers and insert weights
    linear1 = nn.Linear(in_features = in_features,
                        out_features = rank,
                        bias = False)
    linear1.weight = nn.Parameter(torch.tensor(w1, dtype=torch.float32))

    linear2 = nn.Linear(in_features = rank,
                        out_features = out_features,
                        bias = fc_b is not None)
    linear2.weight = nn.Parameter(torch.tensor(w2, dtype=torch.float32))
    if fc_b is not None:
        linear2.bias = nn.Parameter(torch.tensor(np.asarray(fc_b), dtype=torch.float32))

    # create factorized layer and honour the requested device
    factorized_layer = nn.Sequential(linear1, linear2).to(device)

    print (linear1.weight.shape, linear2.weight.shape)

    return factorized_layer

In [53]:
# Sanity check of the SVD factorisation on a single feed-forward layer.
in_ = 768
out_ = 3072
batch_size = 128
rank = 16       # not used by this check; the TTM cell further down reads it
svd_rank = 70   # rank actually used for the SVD check below
fc_w = colbert.bert.encoder.layer[2].intermediate.dense.weight.data.cpu().data.numpy()
fc_b = colbert.bert.encoder.layer[2].intermediate.dense.bias.data.cpu().data.numpy()

# Reference layer: give it the real weights, otherwise it is randomly
# initialised and can only be compared with the factorised layer by shape.
m = nn.Linear(in_, out_)
with torch.no_grad():
    m.weight.copy_(torch.from_numpy(fc_w))
    m.bias.copy_(torch.from_numpy(fc_b))

input_ = torch.randn(batch_size, in_)
m_ttm = factorize_to_svd(fc_w, fc_b, rank=svd_rank)  # an SVD layer despite the historical name

with torch.no_grad():
    out1 = m(input_)
    out2 = m_ttm(input_)
assert out1.shape == torch.squeeze(out2).shape
assert torch.squeeze(out2).shape == (batch_size, out_)

# Report how much of the layer output the truncation loses.
rel_err = (torch.norm(out1 - out2) / torch.norm(out1)).item()
print(f"relative output error at rank {svd_rank}: {rel_err:.4f}")
    

torch.Size([70, 768]) torch.Size([3072, 70])


In [54]:
import torch
from torch import nn
import numpy as np

# Replace the intermediate dense layer of every second encoder block by its
# rank-40 SVD factorisation.
for i in [0, 2, 4, 6, 8, 10]:
    intermediate = colbert.bert.encoder.layer[i].intermediate
    dense = intermediate.dense
    # Keep the cell idempotent: once a layer has been replaced it is an
    # nn.Sequential with no `.weight`, so re-running must skip it.
    if not isinstance(dense, nn.Linear):
        continue
    # fc part
    fc_w = dense.weight.data.cpu().data.numpy()
    fc_b = dense.bias.data.cpu().data.numpy()
    factorized_layer = factorize_to_svd(fc_w, fc_b, rank = 40)
    intermediate.dense = factorized_layer


torch.Size([40, 768]) torch.Size([3072, 40])
torch.Size([40, 768]) torch.Size([3072, 40])
torch.Size([40, 768]) torch.Size([3072, 40])
torch.Size([40, 768]) torch.Size([3072, 40])
torch.Size([40, 768]) torch.Size([3072, 40])
torch.Size([40, 768]) torch.Size([3072, 40])


In [35]:
# Parameter count after the replacement loop above; the recorded value matches
# rank-40 factors (40*768 + 3072*40 + 3072 parameters) in six layers.
print(sum(p.numel() for p in colbert.parameters()))

96346368


In [70]:
# Persist the compressed model: once through save_pretrained and once as a
# raw state_dict.
# NOTE(review): the saved config presumably still describes the unfactorised
# architecture, so reloading may require rebuilding the factorised layers and
# loading the state_dict — confirm.
output_dir = "/notebook/ColBERT/compressed_checkpoint"
os.makedirs(output_dir, exist_ok=True)
model_to_save = colbert.module if hasattr(colbert, 'module') else colbert  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
# Use the unwrapped model here too, so both artefacts describe the same module.
torch.save(model_to_save.state_dict(), os.path.join(output_dir, 'model_tt.pth'))
# print() does not apply logging-style "%s" arguments; format explicitly.
print(f"Saving model checkpoint to {output_dir}")


Saving model checkpoint to %s /notebook/ColBERT/compressed_checkpoint


In [80]:
# Parameter count once more.
# NOTE(review): the recorded value 97498368 is consistent with rank-90 factors
# in six layers, not the rank 40 used above — presumably output from a
# different run; confirm.
print(sum(p.numel() for p in colbert.parameters()))

97498368


In [None]:
# Parameter count as reported by the model's own num_parameters() method (no output recorded).
colbert.num_parameters()

## TTM

In [10]:
# Make the TTM-linear sources importable from this kernel.
# NOTE(review): the import in the following cell failed with "attempted
# relative import with no known parent package"; presumably the parent
# directory should be added instead so that `ttm_linear` is imported as a
# package — confirm against the repository layout.
import sys
sys.path.append("/notebook/greenAI/src/ttm_linear/ttm_linear")

In [11]:
# NOTE(review): in the recorded run this import raised "attempted relative
# import with no known parent package" — presumably a sys.path / package
# layout problem; confirm before relying on this cell.
from ttm_linear import FactorizationTTMLinear
for i in [0, 2, 4, 6, 8, 10]:
    # fc part
    fc_w = colbert.bert.encoder.layer[i].intermediate.dense.weight.data.cpu().data.numpy()
    fc_b = colbert.bert.encoder.layer[i].intermediate.dense.bias.data.cpu().data.numpy()
    # The weight is stored as (out_features, in_features) — e.g. (3072, 768) —
    # so the output dimension comes first when unpacking the shape.
    (out_, in_) = fc_w.shape
    # NOTE(review): fc_w / fc_b are only used for their shape here; unless
    # FactorizationTTMLinear is initialised from the pretrained weights
    # elsewhere, this swaps in freshly initialised parameters — confirm.
    factorized_layer = FactorizationTTMLinear(in_, out_, rank=rank, max_core_dim_product = rank)
    colbert.bert.encoder.layer[i].intermediate.dense = factorized_layer

ImportError: attempted relative import with no known parent package

## Greedy TN 

In [25]:
# Make the GreedyTN sources importable from this kernel (absolute, machine-specific path).
import sys
sys.path.append("/notebook/GreedyTN")

In [26]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import argparse
import pickle
from argparse import RawTextHelpFormatter
from ast import literal_eval

from tqdm.auto import tqdm

import discrete_optim_tensor_decomposition
from random_tensors import *
from tensor_decomposition_models import incremental_tensor_decomposition
from utils import seed_everything
from utils import tic, toc

In [28]:
# Lookup table: tensor-network family name -> generator function.
gen_dic = dict(
    tucker=generate_tucker,
    tt=generate_tensor_train,
    tr=generate_tensor_ring,
    triangle=generate_tensor_tri,
)

In [29]:
# NOTE(review): `matrix` is not defined in any cell visible in this notebook;
# presumably a transposed intermediate dense weight (768 x 3072) left over in
# the kernel — define it explicitly so the notebook survives Restart & Run All.
matrix.shape

torch.Size([768, 3072])

In [None]:
#goal_tn = gen_dic[target_type](target_dims, target_rank)

# Fold the matrix into a 3-way tensor and scale it to unit norm.
target_full = matrix.reshape((128, 144, 128))
print (target_full.shape)
target_full = target_full / torch.norm(target_full)
result = dict(target_tn=[], target_full=target_full)

# Run each requested decomposition and record the result and its wall time.
for decomp in ["TT"]:
    print(decomp + "...")
    tic()
    result[decomp] = incremental_tensor_decomposition(
        target_full,
        decomp,
        verbose=False,
        max_num_params=2000000,
        rank_increment_factor=(1.5 if decomp == 'CP' else 1),
    )
    result[decomp + "-time"] = toc()

In [None]:
# Same experiment on a finer, 5-way folding of the matrix, scaled to unit norm.
target_full = matrix.reshape((16, 32, 16, 16, 18))
print (target_full.shape)
target_full = target_full / torch.norm(target_full)
result = dict(target_tn=[], target_full=target_full)

# Run each requested decomposition and record the result and its wall time.
for decomp in ["TT"]:
    print(decomp + "...")
    tic()
    result[decomp] = incremental_tensor_decomposition(
        target_full,
        decomp,
        verbose=False,
        max_num_params=2000000,
        rank_increment_factor=(1.5 if decomp == 'CP' else 1),
    )
    result[decomp + "-time"] = toc()

In [None]:
# decomposed params: 9216 + 2359296 + ...  (TODO: tally left unfinished)

In [None]:
# all params: 2359296  (= 768 * 3072, the size of the full matrix being decomposed)