In [2]:
from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel
from transformers.configuration_utils import PretrainedConfig


2023-01-19 18:24:40.154802: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [3]:
import pathlib
# BASE_DIR will be like '/home/jovyan/DemoExample/'
BASE_DIR = pathlib.Path().absolute().parents[0]
print(f"Working dir: {BASE_DIR}")

Working dir: /notebook/GreenAl


In [13]:
#BASE_DIR = "/notebook/greenAl/"

In [14]:
def cuda_memory(device, offset: int = 0):
    return (torch.cuda.memory_allocated(device) / 2**20)

def cuda_peak_memory(device, offset: int = 0):
    return (torch.cuda.max_memory_reserved(device) / 2**20)

In [15]:
from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel
import torch
from transformers import GPT2Tokenizer
from transformers import DataCollatorForLanguageModeling

from datasets import load_dataset
from transformers import TextDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [16]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    #dataset_train = load_dataset('wikitext', 'wikitext-103-v1', split='train')
    #dataset_valid = load_dataset('wikitext', 'wikitext-103-v1', split='validation')
    
dataset_train = TextDataset(tokenizer=tokenizer, 
                                file_path=str(BASE_DIR) + "/wikitext-103/wiki.train.tokens", 
                                block_size=512)


dataset_valid = TextDataset(tokenizer=tokenizer, 
                                file_path=str(BASE_DIR) + "/wikitext-103/wiki.valid.tokens", 
                                block_size=512)
    
dataset_test = TextDataset(tokenizer=tokenizer, 
                                file_path=str(BASE_DIR) + "/wikitext-103/wiki.test.tokens", block_size=512)
print (len(dataset_train), len(dataset_valid), len(dataset_test))



232564 487 557


In [17]:
from transformers import Trainer, TrainingArguments
from transformers import default_data_collator
import torch

class Object(object):
    pass

args = Object()
args.local_rank = -1
args.max_steps = 2
args.per_gpu_train_batch_size = 1
args.per_gpu_eval_batch_size = 1
args.n_gpu = 1
args.gradient_accumulation_steps = 1
args.num_train_epochs = 100
args.weight_decay = 0.0
args.learning_rate = 6.25e-3
args.adam_epsilon = 1e-8
args.warmup_steps = 0
args.seed = 42
args.mlm = False
args.device = torch.device('cuda:1')
args.fp16 = False
args.max_grad_norm = 1.0
args.logging_steps = 500.0
args.save_steps = 50
args.evaluate_during_training = True
args.output_dir = '/notebook/greenAI/out_simple_transformer'
args.eval_batch_size = 32
args.save_total_limit = 2

In [18]:
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)

In [19]:
from tqdm import tqdm, trange

def train1(args, train_dataset, model, tokenizer, parameters, parameters_peak):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps = t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            
            print ("memory before training", parameters, parameters_peak)
            outputs = model(inputs)
            
            print(f'total memory after forward: {cuda_memory(args.device) - parameters}')
            print(f'peak memory after forward: {cuda_peak_memory(args.device)}')
            print ("\n")
            
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            print (loss.mean())
            loss.mean().backward()
            print(f'total memory after backward: {cuda_memory(args.device) - parameters}')
            print(f'preak memory after backward: {cuda_peak_memory(args.device)}')
            print ("\n\n\n")
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.max_steps > 0 and global_step > args.max_steps:
                    break

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step



## Regular GPT med

In [42]:

parameters = cuda_memory(args.device)
parameters_peak = cuda_peak_memory(args.device)
model1 = GPT2LMHeadModel.from_pretrained("gpt2")

In [43]:
train1(args, dataset_train, model1.to(args.device), tokenizer, parameters, parameters_peak)

Epoch:   0% 0/1 [00:00<?, ?it/s]
Iteration:   0% 0/232564 [00:00<?, ?it/s][A
Iteration:   0% 2/232564 [00:00<6:09:48, 10.48it/s][A
Epoch:   0% 0/1 [00:00<?, ?it/s]

memory before training 536.345703125 3528.0
total memory after forward: 900.1279296875
peak memory after forward: 3528.0


tensor(-106.5253, device='cuda:1', grad_fn=<MeanBackward0>)
total memory after backward: 586.3583984375
preak memory after backward: 3528.0




memory before training 536.345703125 3528.0
total memory after forward: 2441.5166015625
peak memory after forward: 3528.0


tensor(-199.6567, device='cuda:1', grad_fn=<MeanBackward0>)
total memory after backward: 1541.2841796875
preak memory after backward: 3528.0




memory before training 536.345703125 3528.0
total memory after forward: 2440.0166015625
peak memory after forward: 3528.0


tensor(-242.5420, device='cuda:1', grad_fn=<MeanBackward0>)
total memory after backward: 1539.7841796875
preak memory after backward: 3528.0









(3, 0.0)

In [28]:
del model1
torch.cuda.empty_cache()

## GPT-TTM 16 old version 

In [29]:
parameters = cuda_memory(args.device)
parameters_peak = cuda_peak_memory(args.device)
model1 = GPT2LMHeadModel.from_pretrained("gpt2")

In [23]:
!pwd

/notebook/GreenAl/notebooks


In [30]:
import sys
sys.path.append("/notebook/GreenAl/") 
from old.linear import TTMLinear

for i in range(len(model1.transformer.h)):
    # fc part
    old_layer = model1.transformer.h[i].mlp.c_fc
    (in_, out_) = old_layer.weight.shape
    layer = TTMLinear(d_in = old_layer.weight.shape[0], d_out = old_layer.weight.shape[1], rank = 16).to(args.device)
    model1.transformer.h[i].mlp.c_fc = layer

    # projection
    old_layer = model1.transformer.h[i].mlp.c_proj
    (in_, out_) = old_layer.weight.shape
    layer = TTMLinear(d_in = old_layer.weight.shape[0], d_out = old_layer.weight.shape[1], rank = 16).to(args.device)
    #layer = drop_layer
    model1.transformer.h[i].mlp.c_proj = layer

[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
[(4, 4), (4, 4), (4, 4), (4, 4), (12, 3)]
[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
[(4, 4), (4, 4), (4, 4), (4, 4), (12, 3)]
[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
[(4, 4), (4, 4), (4, 4), (4, 4), (12, 3)]
[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
[(4, 4), (4, 4), (4, 4), (4, 4), (12, 3)]
[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
[(4, 4), (4, 4), (4, 4), (4, 4), (12, 3)]
[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
[(4, 4), (4, 4), (4, 4), (4, 4), (12, 3)]
[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
[(4, 4), (4, 4), (4, 4), (4, 4), (12, 3)]
[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
[(4, 4), (4, 4), (4, 4), (4, 4), (12, 3)]
[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
[(4, 4), (4, 4), (4, 4), (4, 4), (12, 3)]
[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
[(4, 4), (4, 4), (4, 4), (4, 4), (12, 3)]
[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
[(4, 4), (4, 4), (4, 4), (4, 4), (12, 3)]
[(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
[(4, 4), (4, 4), (4, 4), (4, 4), (

In [31]:
train1(args, dataset_train, model1.to(args.device), tokenizer, parameters, parameters_peak)

Epoch:   0% 0/1 [00:00<?, ?it/s]
Iteration:   0% 0/232564 [00:00<?, ?it/s][A
Iteration:   0% 1/232564 [00:00<9:41:02,  6.67it/s][A

memory before training 0.05029296875 3528.0
total memory after forward: 1632.81689453125
peak memory after forward: 3528.0


tensor(4.5948, device='cuda:1', grad_fn=<MeanBackward0>)
total memory after backward: 688.85986328125
preak memory after backward: 3528.0




memory before training 0.05029296875 3528.0
total memory after forward: 2523.79931640625
peak memory after forward: 3528.0


tensor(92.6889, device='cuda:1', grad_fn=<MeanBackward0>)



Iteration:   0% 2/232564 [00:00<12:14:51,  5.27it/s][A
Epoch:   0% 0/1 [00:00<?, ?it/s]

total memory after backward: 1215.09814453125
preak memory after backward: 3528.0




memory before training 0.05029296875 3528.0
total memory after forward: 2525.29931640625
peak memory after forward: 3528.0


tensor(-81.6188, device='cuda:1', grad_fn=<MeanBackward0>)
total memory after backward: 1214.59814453125
preak memory after backward: 3528.0









(3, 0.0)

In [32]:
del model1
torch.cuda.empty_cache()

In [33]:
model1 = GPT2LMHeadModel.from_pretrained("gpt2")

In [34]:
parameters = cuda_memory(args.device)
parameters_peak = cuda_peak_memory(args.device)

In [35]:
import sys
sys.path.append(str(BASE_DIR)) 
from old.linear import TTMLinear

for i in range(len(model1.transformer.h)):
    # fc part
    old_layer = model1.transformer.h[i].mlp.c_fc
    (in_, out_) = old_layer.weight.shape
    layer = TTMLinear(d_in = old_layer.weight.shape[0], d_out = old_layer.weight.shape[1], rank = 32).to(args.device)
    model1.transformer.h[i].mlp.c_fc = layer

    # projection
    old_layer = model1.transformer.h[i].mlp.c_proj
    (in_, out_) = old_layer.weight.shape
    layer = TTMLinear(d_in = old_layer.weight.shape[0], d_out = old_layer.weight.shape[1], rank = 32).to(args.device)
    #layer = drop_layer
    model1.transformer.h[i].mlp.c_proj = layer

[(8, 8), (8, 8), (12, 8), (1, 6)]
[(8, 8), (8, 8), (8, 12), (6, 1)]
[(8, 8), (8, 8), (12, 8), (1, 6)]
[(8, 8), (8, 8), (8, 12), (6, 1)]
[(8, 8), (8, 8), (12, 8), (1, 6)]
[(8, 8), (8, 8), (8, 12), (6, 1)]
[(8, 8), (8, 8), (12, 8), (1, 6)]
[(8, 8), (8, 8), (8, 12), (6, 1)]
[(8, 8), (8, 8), (12, 8), (1, 6)]
[(8, 8), (8, 8), (8, 12), (6, 1)]
[(8, 8), (8, 8), (12, 8), (1, 6)]
[(8, 8), (8, 8), (8, 12), (6, 1)]
[(8, 8), (8, 8), (12, 8), (1, 6)]
[(8, 8), (8, 8), (8, 12), (6, 1)]
[(8, 8), (8, 8), (12, 8), (1, 6)]
[(8, 8), (8, 8), (8, 12), (6, 1)]
[(8, 8), (8, 8), (12, 8), (1, 6)]
[(8, 8), (8, 8), (8, 12), (6, 1)]
[(8, 8), (8, 8), (12, 8), (1, 6)]
[(8, 8), (8, 8), (8, 12), (6, 1)]
[(8, 8), (8, 8), (12, 8), (1, 6)]
[(8, 8), (8, 8), (8, 12), (6, 1)]
[(8, 8), (8, 8), (12, 8), (1, 6)]
[(8, 8), (8, 8), (8, 12), (6, 1)]


In [41]:
train1(args, dataset_train, model1.to(args.device), tokenizer, parameters, parameters_peak)

Epoch:   0% 0/1 [00:00<?, ?it/s]
Iteration:   0% 0/232564 [00:00<?, ?it/s][A

memory before training 0.6337890625 3528.0
total memory after forward: 1219.6904296875
peak memory after forward: 3528.0


tensor(-36.5176, device='cuda:1', grad_fn=<MeanBackward0>)
total memory after backward: 687.8779296875
preak memory after backward: 3528.0







Iteration:   0% 1/232564 [00:00<14:13:52,  4.54it/s][A
Iteration:   0% 2/232564 [00:00<11:22:15,  5.68it/s][A

memory before training 0.6337890625 3528.0
total memory after forward: 2107.7001953125
peak memory after forward: 3528.0


tensor(-73.2697, device='cuda:1', grad_fn=<MeanBackward0>)
total memory after backward: 1210.7177734375
preak memory after backward: 3528.0




memory before training 0.6337890625 3528.0
total memory after forward: 2106.7001953125
peak memory after forward: 3528.0


tensor(-165.5367, device='cuda:1', grad_fn=<MeanBackward0>)


Iteration:   0% 2/232564 [00:00<16:50:22,  3.84it/s]
Epoch:   0% 0/1 [00:00<?, ?it/s]

total memory after backward: 1210.7177734375
preak memory after backward: 3528.0









(3, 0.0)

In [None]:
## New version of layers

In [36]:
del model1
torch.cuda.empty_cache()

In [37]:
model1 = GPT2LMHeadModel.from_pretrained("gpt2")

In [38]:
parameters = cuda_memory(args.device)
parameters_peak = cuda_peak_memory(args.device)

In [39]:
from src.ttm_linear.ttm_linear import FactorizationTTMLinear

In [40]:
for i in range(len(model1.transformer.h)):
    # fc part
    old_layer = model1.transformer.h[i].mlp.c_fc
    (in_, out_) = old_layer.weight.shape
    layer = FactorizationTTMLinear(in_, out_, rank=16, max_core_dim_product = 16).to(args.device)
    model1.transformer.h[i].mlp.c_fc = layer

    # projection
    old_layer = model1.transformer.h[i].mlp.c_proj
    (in_, out_) = old_layer.weight.shape
    layer = FactorizationTTMLinear(in_, out_, rank=16, max_core_dim_product = 16).to(args.device)
    #layer = drop_layer
    model1.transformer.h[i].mlp.c_proj = layer

-------------------------------------
TTM-Linear required dimensions: dim_in=768, dim_out=3072, rank=16, max_dim=16
    after best_approx: dim_in=768, dim_out=3072
    dim_in factorization:  (2, 2, 2, 2, 2, 2, 2, 2, 3)
    dim_out factorization: (2, 2, 2, 2, 2, 2, 2, 2, 3)
    dims before shrink:  [(2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 2), (1, 2), (1, 3)]
    final TTM dims:  [(4, 4), (4, 4), (4, 4), (4, 4), (3, 12)]
    Original linear params: 2359296, ttm params: 13120 (x0.006)
-------------------------------------
-------------------------------------
TTM-Linear required dimensions: dim_in=3072, dim_out=768, rank=16, max_dim=16
    after best_approx: dim_in=3072, dim_out=768
    dim_in factorization:  (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3)
    dim_out factorization: (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3)
    dims before shrink:  [(2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 3), (2, 1), (3, 1)]
    final TTM dims:  [(4, 4), (4, 4), (4, 4), (

In [64]:
train1(args, dataset_train, model1.to(args.device), tokenizer)

Epoch:   0% 0/1 [00:00<?, ?it/s]
Iteration:   0% 0/232564 [00:00<?, ?it/s][A


memory before training 2357.9599609375 5818.0
total memory after forward: 969.232421875
peak memory after forward: 5818.0


tensor(-64.4477, device='cuda:1', grad_fn=<MeanBackward0>)
total memory after backward: 420.828125
preak memory after backward: 5818.0






Iteration:   0% 1/232564 [00:00<14:00:30,  4.61it/s][A
Iteration:   0% 2/232564 [00:00<11:47:07,  5.48it/s][A

memory before training 3303.1279296875 5818.0
total memory after forward: 910.57421875
peak memory after forward: 5818.0


tensor(-122.9731, device='cuda:1', grad_fn=<MeanBackward0>)
total memory after backward: -2.0
preak memory after backward: 5818.0




memory before training 3301.1279296875 5818.0
total memory after forward: 915.82421875
peak memory after forward: 5818.0


tensor(-169.7060, device='cuda:1', grad_fn=<MeanBackward0>)


Iteration:   0% 2/232564 [00:00<17:22:04,  3.72it/s]
Epoch:   0% 0/1 [00:00<?, ?it/s]

total memory after backward: 3.25
preak memory after backward: 5818.0









(3, 0.0)