In [1]:
from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel
from transformers.configuration_utils import PretrainedConfig


2023-01-19 17:07:32.470541: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [10]:
import pathlib
# Project root = parent of the notebook's current working directory,
# e.g. '/home/jovyan/DemoExample'.
BASE_DIR = pathlib.Path.cwd().parents[0]
print("Working dir: {}".format(BASE_DIR))

Working dir: /notebook/GreenAl


In [11]:
# Replace the pathlib-based BASE_DIR with a plain string: the dataset cell
# builds its file paths with `BASE_DIR + "/wikitext-103/..."`, i.e. by string
# concatenation.
# NOTE(review): other cells hard-code '/notebook/greenAI/' (capital "I") while
# this path ends in 'greenAl' (lowercase "l") - confirm which directory is meant.
BASE_DIR = "/notebook/greenAl/"

In [17]:
def cuda_memory(device, offset: int = 0):
    """Return the CUDA memory currently allocated on ``device``, in MiB.

    Args:
        device: Device forwarded unchanged to ``torch.cuda.memory_allocated``.
        offset: Accepted for interface compatibility; not used in the result.

    Returns:
        The allocated byte count divided by ``2**20``.
    """
    return torch.cuda.memory_allocated(device) / 2**20

def cuda_peak_memory(device, offset: int = 0):
    """Return the peak CUDA memory allocated on ``device``, in MiB.

    Args:
        device: Device forwarded unchanged to
            ``torch.cuda.max_memory_allocated``.
        offset: Accepted for interface compatibility; not used in the result.

    Returns:
        The peak allocated byte count divided by ``2**20``.
    """
    return torch.cuda.max_memory_allocated(device) / 2**20

In [13]:
from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel
import torch
from transformers import GPT2Tokenizer
from transformers import DataCollatorForLanguageModeling

from datasets import load_dataset
from transformers import TextDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [15]:
# Tokenizer shared by the three dataset splits below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Alternative source (kept for reference, not used):
#   dataset_train = load_dataset('wikitext', 'wikitext-103-v1', split='train')
#   dataset_valid = load_dataset('wikitext', 'wikitext-103-v1', split='validation')

# Build train / valid / test TextDatasets from the local WikiText-103 token
# files, all with block_size=512.
dataset_train, dataset_valid, dataset_test = [
    TextDataset(tokenizer=tokenizer, file_path=BASE_DIR + relative_path, block_size=512)
    for relative_path in (
        "/wikitext-103/wiki.train.tokens",
        "/wikitext-103/wiki.valid.tokens",
        "/wikitext-103/wiki.test.tokens",
    )
]
print(len(dataset_train), len(dataset_valid), len(dataset_test))

232564 487 557


In [20]:
from transformers import Trainer, TrainingArguments
from transformers import default_data_collator
import torch

class Object(object):
    """Empty attribute container standing in for a parsed-arguments namespace."""


# Run configuration consumed by the training / evaluation helpers.
args = Object()
vars(args).update(
    local_rank=-1,  # -1 selects the non-distributed code paths in train1
    max_steps=2,
    per_gpu_train_batch_size=1,
    per_gpu_eval_batch_size=1,
    n_gpu=1,
    gradient_accumulation_steps=1,
    num_train_epochs=100,
    weight_decay=0.0,
    learning_rate=6.25e-3,
    adam_epsilon=1e-8,
    warmup_steps=0,
    seed=42,
    mlm=False,
    device=torch.device('cuda:1'),
    fp16=False,
    max_grad_norm=1.0,
    logging_steps=500.0,
    save_steps=50,
    evaluate_during_training=True,
    output_dir='/notebook/greenAI/out_simple_transformer',
    eval_batch_size=32,
    save_total_limit=2,
)

In [26]:
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)

In [28]:
from tqdm import tqdm, trange

def train1(args, train_dataset, model, tokenizer):
    """Train ``model`` while printing CUDA memory usage around every step.

    For each batch the function prints the allocated and peak CUDA memory
    (in MiB, via ``cuda_memory`` / ``cuda_peak_memory``) before the forward
    pass, and the growth relative to those values after the forward and
    after the backward pass.

    Note: a later cell of this notebook defines another ``train1``; whichever
    cell was executed last is the one bound to the name.

    Args:
        args: Attribute container holding the run configuration. Read here:
            ``per_gpu_train_batch_size``, ``n_gpu``, ``local_rank``,
            ``max_steps``, ``gradient_accumulation_steps``,
            ``num_train_epochs``, ``weight_decay``, ``learning_rate``,
            ``adam_epsilon``, ``warmup_steps``, ``fp16`` (and
            ``fp16_opt_level`` when ``fp16`` is true), ``max_grad_norm``,
            ``device``. ``train_batch_size`` is written, and
            ``num_train_epochs`` is overwritten when ``max_steps > 0``.
        train_dataset: Dataset yielding tensors; every batch is used both as
            the model input and as the labels.
        model: Model called as ``model(inputs, labels=labels)`` whose first
            output is the loss.
        tokenizer: Unused; kept so the call signature matches the other
            training helper.

    Returns:
        tuple: ``(global_step, tr_loss / global_step)``; the mean is ``0.0``
        when no optimizer step was taken.

    Raises:
        ImportError: If ``args.fp16`` is set and apex is not installed.
    """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_dataset)
    else:
        # Only needed for distributed runs, so imported where it is used.
        from torch.utils.data.distributed import DistributedSampler
        train_sampler = DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        # Clamp to 1 so a loader shorter than the accumulation window does
        # not cause a division by zero.
        updates_per_epoch = max(1, len(train_dataloader) // args.gradient_accumulation_steps)
        args.num_train_epochs = args.max_steps // updates_per_epoch + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps = t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()

            # Baselines that the later readings are reported relative to.
            parameters = cuda_memory(args.device)
            parameters_peak = cuda_peak_memory(args.device)
            print ("memory before training", parameters, parameters_peak)
            # Labels are required: without them the first output is the
            # logits tensor, not a scalar loss that can be back-propagated.
            outputs = model(inputs, labels=labels)

            print(f'total memory after forward: {cuda_memory(args.device) - parameters}')
            print(f'allocated memory after forward: {cuda_peak_memory(args.device) - parameters_peak}')

            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            loss.backward()
            print(f'total memory after backward: {cuda_memory(args.device) - parameters}')
            print(f'allocated memory after backward: {cuda_peak_memory(args.device) - parameters_peak}')
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

            # Stop once exactly max_steps optimizer updates have been made.
            if args.max_steps > 0 and global_step >= args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step >= args.max_steps:
            train_iterator.close()
            break

    # No TensorBoard writer is created in this variant, so there is nothing
    # to close here.
    return global_step, tr_loss / max(global_step, 1)



## Regular GPT med

In [16]:
model1 = GPT2LMHeadModel.from_pretrained("gpt2")

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [29]:
train1(args, dataset_train, model1, tokenizer)

NameError: name 'trange' is not defined

In [8]:
from src.layers2.linear import TTMLinear

In [9]:
# Build a TTMLinear stand-in, wrapped in TTDropout, with the same in/out sizes
# as the c_fc layer of transformer block 4. The new layer is only created
# here; it is not attached to the model in this cell.
# NOTE(review): `model` and `TTDropout` are not defined or imported in any
# cell visible here - confirm where they come from before a fresh run.
old_layer = model.transformer.h[4].mlp.c_fc
(in_, out_) = old_layer.weight.shape
print (old_layer.weight.shape)
layer = TTMLinear(d_in=in_, d_out=out_, rank=128)
drop_layer = TTDropout(layer, proba = 0.7, min_dim = 2, rank = 128)

torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]


In [18]:
#drop_layer(inpt)

In [10]:
import torch
inpt = torch.zeros(size = (3072, 768))

In [17]:
from transformers import GPT2Tokenizer
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [12]:
layer_drop = TTDropout

In [13]:
type(layer.ttm.tt.cores[0])

torch.nn.parameter.Parameter

In [23]:
model.__class__.__name__

'GPT2LMHeadModel'

In [34]:
from transformers import AutoConfig, AutoModel


In [24]:
torch.save(model.state_dict(), '/notebook/greenAI/out_tt_transformer/checkpoint-145300/model_tt.pth')

In [36]:
model.config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.17.0.dev0",
  "use_cache": true,
  "vocab_size": 50257
}

In [25]:
new_model = GPT2LMHeadModel(configuration)

In [26]:
# Copy the saved tensors into the model's parameters. Assigning the loaded
# dict to `new_model.state_dict` would only shadow the method and leave the
# weights untouched.
# NOTE(review): load_state_dict is strict by default. If this checkpoint was
# saved after the MLP layers were swapped for TT layers, apply the same swap
# to `new_model` first or the parameter names will not match.
new_model.load_state_dict(torch.load('/notebook/greenAI/out_tt_transformer/checkpoint-145300/model_tt.pth'))

In [1]:
model.transformer

NameError: name 'model' is not defined

In [32]:
new_model.transformer.h[0].mlp

GPT2MLP(
  (c_fc): Conv1D()
  (c_proj): Conv1D()
  (act): NewGELUActivation()
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
# List every entry of the model's state_dict with its tensor size.
print("Model's state_dict:")
for param_tensor, tensor_value in model.state_dict().items():
    print(param_tensor, "\t", tensor_value.size())

In [None]:
model.transformer

In [14]:
# Replace both MLP sub-layers of every transformer block with a TTMLinear of
# the same in/out sizes, wrapped in TTDropout.
for i in range(len(model.transformer.h)):
    # (attribute on the MLP, TTDropout probability, print the old weight shape?)
    for sublayer_name, drop_proba, show_shape in (("c_fc", 0.7, True), ("c_proj", 0.8, False)):
        old_layer = getattr(model.transformer.h[i].mlp, sublayer_name)
        (in_, out_) = old_layer.weight.shape
        if show_shape:
            print (old_layer.weight.shape)
        layer = TTMLinear(d_in=in_, d_out=out_, rank=128)
        drop_layer = TTDropout(layer, proba = drop_proba, min_dim = 2, rank=128)
        layer = drop_layer
        setattr(model.transformer.h[i].mlp, sublayer_name, layer)

torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]
[(16, 16), (16, 16), (12, 3)]
torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]
[(16, 16), (16, 16), (12, 3)]
torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]
[(16, 16), (16, 16), (12, 3)]
torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]
[(16, 16), (16, 16), (12, 3)]
torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]
[(16, 16), (16, 16), (12, 3)]
torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]
[(16, 16), (16, 16), (12, 3)]
torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]
[(16, 16), (16, 16), (12, 3)]
torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]
[(16, 16), (16, 16), (12, 3)]
torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]
[(16, 16), (16, 16), (12, 3)]
torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]
[(16, 16), (16, 16), (12, 3)]
torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]
[(16, 16), (16, 16), (12, 3)]
torch.Size([768, 3072])
[(16, 16), (16, 16), (3, 12)]
[(16, 16), (16, 16), (

In [None]:
model.transformer

In [4]:
from torch.linalg import qr

In [18]:
from transformers import TextDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Build train / valid / test TextDatasets from the local WikiText-103 token
# files, all with block_size=512.
dataset_train, dataset_valid, dataset_test = [
    TextDataset(tokenizer=tokenizer, file_path=tokens_path, block_size=512)
    for tokens_path in (
        "/notebook/greenAI/wikitext-103/wiki.train.tokens",
        "/notebook/greenAI/wikitext-103/wiki.valid.tokens",
        "/notebook/greenAI/wikitext-103/wiki.test.tokens",
    )
]
print(len(dataset_train))



232564


In [19]:
len(dataset_train)

232564

In [23]:
from transformers import Trainer, TrainingArguments
from transformers import default_data_collator
import torch

class Object(object):
    """Empty attribute container standing in for a parsed-arguments namespace."""


# Run configuration consumed by the training / evaluation helpers.
args = Object()
vars(args).update(
    local_rank=-1,  # -1 selects the non-distributed code paths in train1
    max_steps=-1,  # non-positive: the step budget comes from num_train_epochs
    per_gpu_train_batch_size=1,
    per_gpu_eval_batch_size=1,
    n_gpu=1,
    gradient_accumulation_steps=16,
    num_train_epochs=100,
    weight_decay=0.0,
    learning_rate=6.25e-3,
    adam_epsilon=1e-8,
    warmup_steps=0,
    seed=42,
    mlm=False,
    device=torch.device('cuda:3'),
    fp16=False,
    max_grad_norm=1.0,
    logging_steps=500.0,
    save_steps=50,
    evaluate_during_training=True,
    output_dir='/notebook/greenAI/out_simple_transformer',
    eval_batch_size=32,
    save_total_limit=2,
)

In [None]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# such as help_trainer_last are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): this cell moves the model to 'cuda:0', while the config cell
# sets args.device to 'cuda:3' - confirm which GPU is intended.
device = 'cuda:0'
from help_trainer_last import train
model.to(device)


In [14]:
import torch
from torch import nn

# Quick check: run an embedding lookup with both weights and indices on the GPU.
device = 'cuda:3'
emb = nn.Embedding(3, 5)
# Re-wrap the weight matrix as a Parameter that lives on the target device.
emb.weight = nn.Parameter(emb.weight.to(device))
x = emb(torch.LongTensor([0, 1, 2]).to(device))

In [None]:
import logging
import glob
from tqdm import tqdm, trange
import numpy as np
import torch
import os

import logging
import pickle
import random
import re
import shutil

logger = logging.getLogger("transformer.log")
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    # Fall back to the standalone tensorboardX package when the torch-bundled
    # writer cannot be imported. Only ImportError is caught so that unrelated
    # failures are not silently hidden.
    from tensorboardX import SummaryWriter

%load_ext autoreload
%autoreload 2
from run_lm_finetuning import mask_tokens1

def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch from ``args.seed``.

    When ``args.n_gpu`` is positive, the CUDA generators are seeded as well.
    """
    seed_value = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)
    if args.n_gpu <= 0:
        return
    torch.cuda.manual_seed_all(seed_value)
        
def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix)))
    if len(glob_checkpoints) <= args.save_total_limit:
        return

    ordering_and_checkpoint_path = []
    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)

        
def load_and_cache_examples(args, tokenizer, evaluate=False):
    """Build a ``TextDataset`` from the evaluation or the training file.

    Reads ``args.eval_data_file`` when ``evaluate`` is true, otherwise
    ``args.train_data_file``, and uses ``args.block_size`` as the block size.
    """
    if evaluate:
        source_path = args.eval_data_file
    else:
        source_path = args.train_data_file
    return TextDataset(tokenizer, file_path=source_path, block_size=args.block_size)


from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)

def train1(args, train_dataset, model, tokenizer):
    """Train ``model`` on ``train_dataset`` with logging, evaluation and checkpoints.

    Uses AdamW with a linear warmup/decay schedule, optional apex fp16,
    gradient accumulation and gradient clipping. On the main process
    (``local_rank`` in ``[-1, 0]``) it writes TensorBoard scalars every
    ``args.logging_steps`` updates and saves a checkpoint under
    ``args.output_dir`` every ``args.save_steps`` updates, rotating old
    checkpoints with ``_rotate_checkpoints``.

    Args:
        args: Attribute container holding the run configuration. Besides the
            optimizer / schedule settings it reads ``mlm``, ``device``,
            ``logging_steps``, ``save_steps``, ``evaluate_during_training``
            and ``output_dir``. ``train_batch_size`` is written, and
            ``num_train_epochs`` is overwritten when ``max_steps > 0``.
        train_dataset: Dataset yielding tensors. When ``args.mlm`` is false
            each batch is used both as input and as labels.
        model: Model called as ``model(inputs, labels=labels)`` whose first
            output is the loss.
        tokenizer: Passed on to ``mask_tokens1`` and ``evaluate``.

    Returns:
        tuple: ``(global_step, tr_loss / global_step)``; the mean is ``0.0``
        when no optimizer step was taken.

    Raises:
        ImportError: If ``args.fp16`` is set and apex is not installed.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_dataset)
    else:
        # Only needed for distributed runs, so imported where it is used.
        from torch.utils.data.distributed import DistributedSampler
        train_sampler = DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        # Clamp to 1 so a loader shorter than the accumulation window does
        # not cause a division by zero.
        updates_per_epoch = max(1, len(train_dataloader) // args.gradient_accumulation_steps)
        args.num_train_epochs = args.max_steps // updates_per_epoch + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps = t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = mask_tokens1(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            # Labels are passed in both modes: without them the model returns
            # no loss and outputs[0] would be the logits.
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        print ("evaluation ", results, flush = True)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    # get_last_lr() reports the rate set by the latest
                    # scheduler.step(); get_lr() is not meant to be called here.
                    tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = 'checkpoint'
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

            # Stop once exactly max_steps optimizer updates have been made.
            if args.max_steps > 0 and global_step >= args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step >= args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / max(global_step, 1)

def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` on the notebook-level ``dataset_valid``.

    Computes the mean per-batch loss and its exponential (perplexity), prints
    them, and writes them to ``<args.output_dir>/<prefix>/eval_results.txt``.

    Args:
        args: Attribute container; reads ``output_dir``, ``local_rank``,
            ``per_gpu_eval_batch_size``, ``n_gpu``, ``mlm`` and ``device``.
            ``eval_batch_size`` is overwritten.
        model: Model called as ``model(inputs, labels=labels)`` whose first
            output is the loss. It is switched to eval mode.
        tokenizer: Passed to ``mask_tokens1`` when ``args.mlm`` is true.
        prefix: Optional sub-directory of ``args.output_dir`` for the results
            file; also included in the log messages.

    Returns:
        dict: ``{"perplexity": 0-dim torch tensor, "loss": float}``.
    """
    eval_output_dir = args.output_dir

    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)
    eval_dataset = dataset_valid
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_dataset)
    else:
        # Only needed for distributed runs, so imported where it is used.
        from torch.utils.data.distributed import DistributedSampler
        eval_sampler = DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # mask_tokens1 is the masking helper this notebook imports and that
        # the training loop uses.
        inputs, labels = mask_tokens1(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            # transformers 4.x models take `labels` for masked LM as well.
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    # Mean loss per batch; the guard keeps an empty loader from dividing by zero.
    eval_loss = eval_loss / max(nb_eval_steps, 1)
    print ("nb_eval_steps", nb_eval_steps)
    perplexity = torch.exp(torch.tensor(eval_loss))
    print ("perplexity", perplexity)

    result = {
        "perplexity": perplexity,
        "loss":eval_loss
    }

    # Make sure the (possibly prefixed) results directory exists before writing.
    results_dir = os.path.join(eval_output_dir, prefix)
    os.makedirs(results_dir, exist_ok=True)
    output_eval_file = os.path.join(results_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result