In [6]:
from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel

# Initializing a GPT2 configuration
configuration = GPT2Config()

# Initializing a model from the configuration
model = GPT2LMHeadModel(configuration)

# Accessing the model configuration
configuration = model.config

In [7]:
from tltorch.factorized_layers.factorized_linear import FactorizedLinear



In [18]:
from tltorch.factorized_tensors.core import FactorizedTensor
from tltorch.tensor_hooks._tensor_dropout import tensor_dropout, remove_tensor_dropout, CPDropout, TTDropout

from src.layers2.linear import TTMLinear

shape = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3]
rank = [1, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,128, 128, 128, 1]
tensor = FactorizedTensor.new(shape, rank, factorization='TT')

tensor.factors

FactorList(
    (factor_0): Parameter containing: [torch.FloatTensor of size 1x2x128]
    (factor_1): Parameter containing: [torch.FloatTensor of size 128x2x128]
    (factor_2): Parameter containing: [torch.FloatTensor of size 128x2x128]
    (factor_3): Parameter containing: [torch.FloatTensor of size 128x2x128]
    (factor_4): Parameter containing: [torch.FloatTensor of size 128x2x128]
    (factor_5): Parameter containing: [torch.FloatTensor of size 128x2x128]
    (factor_6): Parameter containing: [torch.FloatTensor of size 128x2x128]
    (factor_7): Parameter containing: [torch.FloatTensor of size 128x2x128]
    (factor_8): Parameter containing: [torch.FloatTensor of size 128x2x128]
    (factor_9): Parameter containing: [torch.FloatTensor of size 128x2x128]
    (factor_10): Parameter containing: [torch.FloatTensor of size 128x2x128]
    (factor_11): Parameter containing: [torch.FloatTensor of size 128x2x128]
    (factor_12): Parameter containing: [torch.FloatTensor of size 128x3x128]

In [19]:
shapes = [elem.shape for elem in layer.ttm.tt.cores]

In [20]:
layer.ttm.tt.ranks

[128, 128]

In [21]:
layer.ttm.tt.cores

ParameterList(
    (0): Parameter containing: [torch.FloatTensor of size 1x16x16x128]
    (1): Parameter containing: [torch.FloatTensor of size 128x16x16x128]
    (2): Parameter containing: [torch.FloatTensor of size 128x3x12x1]
)

In [33]:
shapes

[torch.Size([1, 16, 16, 128]),
 torch.Size([128, 16, 16, 128]),
 torch.Size([128, 3, 12, 1])]

In [34]:
import tensorly as tl
tl.set_backend('pytorch')
import torch
from torch import nn

In [35]:
len(layer.ttm.tt.cores[0].shape) - 1

3

In [36]:
class TTDropout(nn.Module):
    def __init__(self, old_layer, proba, min_dim, rank):
        super().__init__()
        self.proba = proba
        self.min_dim = min_dim
        self.layer = old_layer
        self.rank = rank
        
        self.new_layer = self.layer #TTMLinear(self.layer.d_in, self.layer.d_out, self.rank)
        for i in range(len(self.new_layer.ttm.tt.cores) - 1):
            self.new_layer.ttm.tt.cores[i] = torch.clone(self.layer.ttm.tt.cores[i])
        
    #def create_zero_mask(self):
    #def forward(self, inpt):
               
    def apply_tensor_dropout1(self, tt_tensor, training=True):
        if (not self.proba) or ((not training)):
            return tt_tensor

        device = tt_tensor.ttm.tt.cores[0].device

        sampled_indices = []
        for i, rank in enumerate(tt_tensor.ttm.tt.ranks):
            if rank > self.min_dim:
                idx = tl.arange(rank, device=device, dtype=torch.int64)
                idx = idx[torch.bernoulli(torch.ones(rank, device=device)*(1 - self.proba),
                                          out=torch.empty(rank, device=device, dtype=torch.bool))]
                if len(idx) == 0:
                    idx = torch.randint(0, rank, size=(min_values, ), device=device, dtype=torch.int64)
            else:
                idx = tl.arange(rank, device=device, dtype=torch.int64).tolist()

            sampled_indices.append(idx)

        sampled_factors = []
        if training:
            scaling = 1/(1 - self.proba)
        else:
            scaling = 1
        for i, f in enumerate(tt_tensor.ttm.tt.cores):
            if i == 0:
                ax = len(tt_tensor.ttm.tt.cores[0].shape) - 1
                sampled_factors.append(torch.clone(torch.index_select(f, ax, sampled_indices[i])*scaling))
            elif i == (len(tt_tensor.ttm.tt.cores) - 1):
                ax = 0
                sampled_factors.append(torch.clone(torch.index_select(f, ax, sampled_indices[i - 1])*scaling))
            else:
                ax_0 = 0
                ax_end = len(tt_tensor.ttm.tt.cores[0].shape) - 1
                new_tensor = torch.index_select(f, ax_0, sampled_indices[i - 1])
                new_tensor = torch.index_select(new_tensor, ax_end, sampled_indices[i])*scaling
                sampled_factors.append(torch.clone(new_tensor))

        return nn.ParameterList(sampled_factors)
    
    def forward(self, inpt):
        if self.training:
            #print ("self training")
            self.new_layer.ttm.tt.cores = self.apply_tensor_dropout1(self.new_layer, training=True)
            return self.new_layer(inpt)
        else:
            #print ("else")
            return self.layer(inpt)

In [2]:
from src.layers2.linear import TTMLinear

In [8]:
old_layer = model.transformer.h[4].mlp.c_fc
(in_, out_) = old_layer.weight.shape
print (old_layer.weight.shape)
layer = TTMLinear(d_in=in_, d_out=out_, rank=128)
#drop_layer = TTDropout(layer, proba = 0.7, min_dim = 2, rank = 128)

torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]


In [4]:
import torch
inpt = torch.zeros(size = (3072, 768))

In [5]:
layer(inpt)

gabh,hcdi,iefj,kace->kbdf


tensor([[[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]]], grad_fn=<SliceBackward0>)

In [42]:
layer.in_

AttributeError: 'TTMLinear' object has no attribute 'in_'

In [9]:
layer.ttm.tt.cores

ParameterList(
    (0): Parameter containing: [torch.FloatTensor of size 1x16x16x128]
    (1): Parameter containing: [torch.FloatTensor of size 128x16x16x128]
    (2): Parameter containing: [torch.FloatTensor of size 128x3x12x1]
)

In [13]:
layer_with_drop

NameError: name 'layer_with_drop' is not defined

In [14]:
layer_drop = TTDropout

In [15]:
type(layer.ttm.tt.cores[0])

torch.nn.parameter.Parameter

In [None]:
for i in range(len(model.transformer.h)):
    # fc part
    old_layer = model.transformer.h[i].mlp.c_fc
    (in_, out_) = old_layer.weight.shape
    print (old_layer.weight.shape)
    layer = TTMLinear(d_in=in_, d_out=out_, rank=128)
    drop_layer = TTDropout(layer, proba = 0.7, min_dim = 2, rank=128)
    layer = drop_layer
    model.transformer.h[i].mlp.c_fc = layer

    # projection
    old_layer = model.transformer.h[i].mlp.c_proj
    (in_, out_) = old_layer.weight.shape
    #print (old_layer.weight.shape)
    layer = TTMLinear(d_in=in_, d_out=out_, rank=128)
    drop_layer = TTDropout(layer, proba = 0.8, min_dim = 2, rank=128)
    layer = drop_layer
    model.transformer.h[i].mlp.c_proj = layer

In [17]:
from torch.linalg import qr

In [20]:
import torch
CUDA_LAUNCH_BLOCKING=1
device = torch.device("cpu")
a = model.to(device)

In [21]:
from datasets import load_dataset
dataset_train = load_dataset('wikitext', 'wikitext-103-v1', split='train')
dataset_valid = load_dataset('wikitext', 'wikitext-103-v1', split='validation')

Reusing dataset wikitext (/root/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
Reusing dataset wikitext (/root/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [22]:
from transformers import GPT2Tokenizer
from transformers import DataCollatorForLanguageModeling
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
dataset_train = load_dataset('wikitext', 'wikitext-103-v1', split='train')
dataset_valid = load_dataset('wikitext', 'wikitext-103-v1', split='validation')
#data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

Reusing dataset wikitext (/root/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
Reusing dataset wikitext (/root/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [23]:
from transformers import TextDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

dataset_train = TextDataset(tokenizer=tokenizer, 
                                file_path="/notebook/greenAI/wikitext-2/wiki.train.tokens", 
                                block_size=128)

dataset_valid = TextDataset(tokenizer=tokenizer, 
                                file_path="/notebook/greenAI/wikitext-2/wiki.valid.tokens", 
                                block_size=128)

sampler = RandomSampler(dataset_train)
tdata_loader = DataLoader(dataset_train, 
                         batch_size=32, 
                         sampler=sampler)
vdata_loader = DataLoader(dataset_valid, 
                         batch_size=32, 
                         sampler=sampler)



In [24]:
len(dataset_train)

19127

In [25]:
from transformers import Trainer, TrainingArguments
from transformers import default_data_collator

class Object(object):
    pass

args = Object()
args.local_rank = -1
args.max_steps = -1
args.per_gpu_train_batch_size = 16
args.per_gpu_eval_batch_size = 16
args.n_gpu = 1
args.gradient_accumulation_steps = 1
args.num_train_epochs = 100
args.weight_decay = 0.0
args.learning_rate = 6.25e-3
args.adam_epsilon = 1e-8
args.warmup_steps = 0
args.seed = 42
args.mlm = False
args.device = torch.device('cpu')
args.fp16 = False
args.max_grad_norm = 1.0
args.logging_steps = 500.0
args.save_steps = 50
args.evaluate_during_training = True
args.output_dir = '/notebook/greenAI/out_simple_transformer'
args.eval_batch_size = 32
args.save_total_limit = 2

In [26]:
%load_ext autoreload
%autoreload 2

from help_trainer import train
model.to(device)
model.train()
train(args, dataset_train, dataset_valid, model, tokenizer)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

num_epochs 100



Iteration:   0%|          | 0/1196 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/1196 [01:08<22:47:48, 68.68s/it][A
Epoch:   0%|          | 0/100 [01:08<?, ?it/s]


IndexError: index out of range in self

In [27]:
import torch
from torch import nn

device = 'cuda:3'
x = torch.LongTensor([0, 1, 2])
x = x.to(device)
emb = nn.Embedding(3, 5)
emb.weight = nn.Parameter(emb.weight.to(device)) # Moving the weights of the embedding layer to the GPU
x = emb(x)

In [28]:
import logging
import glob
from tqdm import tqdm, trange
import numpy as np
import torch
import os

import logging
import pickle
import random
import re
import shutil

logger = logging.getLogger("transformer.log")
try:
    from torch.utils.tensorboard import SummaryWriter
except:
    from tensorboardX import SummaryWriter

%load_ext autoreload
%autoreload 2
from run_lm_finetuning import mask_tokens1

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
        
def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix)))
    if len(glob_checkpoints) <= args.save_total_limit:
        return

    ordering_and_checkpoint_path = []
    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)

        
def load_and_cache_examples(args, tokenizer, evaluate=False):
    dataset = TextDataset(tokenizer, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
    return dataset


from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)

def train1(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps = t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = mask_tokens1(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                print ("backward")
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        print ("evaluation ", results, flush = True)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = 'checkpoint'
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step

def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)
    eval_dataset = dataset_valid
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    losses = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
            losses.append(eval_loss)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    print ("nb_eval_steps", nb_eval_steps)
    perplexity1 = torch.exp(torch.tensor(eval_loss))
    
    perplexity2 = np.exp(np.array(losses).mean())/len(losses)
    
    print ("perplexity1, perplexity2", perplexity1, perplexity2)

    result = {
        "perplexity": perplexity,
        "loss":eval_loss
    }

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
