In [1]:
import clip
import os
from torch import nn
import numpy as np
import torch
import torch.nn.functional as nnf
import sys
from typing import Tuple, List, Union, Optional
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import skimage.io as io
import PIL.Image
from IPython.display import Image 
from enum import Enum
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_metric import PyRouge
from pycocoevalcap.cider.cider import Cider
import string
translator = str.maketrans('', '', string.punctuation)

N = type(None)
V = np.array
ARRAY = np.ndarray
ARRAYS = Union[Tuple[ARRAY, ...], List[ARRAY]]
VS = Union[Tuple[V, ...], List[V]]
VN = Union[V, N]
VNS = Union[VS, N]
T = torch.Tensor
TS = Union[Tuple[T, ...], List[T]]
TN = Optional[T]
TNS = Union[Tuple[TN, ...], List[TN]]
TSN = Optional[TS]
TA = Union[T, ARRAY]


D = torch.device
CPU = torch.device('cpu')


def get_device(device_id: int) -> D:
    if not torch.cuda.is_available():
        return CPU
    device_id = min(torch.cuda.device_count() - 1, device_id)
    return torch.device(f'cuda:{device_id}')


CUDA = get_device

In [2]:
class MappingType(Enum):
    MLP = 'mlp'
    Transformer = 'transformer'


class MlpTransformer(nn.Module):
     def __init__(self, in_dim, h_dim, out_d: Optional[int] = None, act=nnf.relu, dropout=0.):
         super().__init__()
         out_d = out_d if out_d is not None else in_dim
         self.fc1 = nn.Linear(in_dim, h_dim)
         self.act = act
         self.fc2 = nn.Linear(h_dim, out_d)
         self.dropout = nn.Dropout(dropout)

     def forward(self, x):
         x = self.fc1(x)
         x = self.act(x)
         x = self.dropout(x)
         x = self.fc2(x)
         x = self.dropout(x)
         return x

class MLP(nn.Module):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)

    def __init__(self, sizes: Tuple[int, ...], bias=True, act=nn.Tanh):
        super(MLP, self).__init__()
        layers = []
        for i in range(len(sizes) - 1):
            layers.append(nn.Linear(sizes[i], sizes[i + 1], bias=bias))
            if i < len(sizes) - 2:
                layers.append(act())
        self.model = nn.Sequential(*layers)


class MultiHeadAttention(nn.Module):

    def __init__(self, dim_self, dim_ref, num_heads, bias=True, dropout=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim_self // num_heads
        self.scale = head_dim ** -0.5
        self.to_queries = nn.Linear(dim_self, dim_self, bias=bias)
        self.to_keys_values = nn.Linear(dim_ref, dim_self * 2, bias=bias)
        self.project = nn.Linear(dim_self, dim_self)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, y=None, mask=None):
        y = y if y is not None else x
        b, n, c = x.shape
        _, m, d = y.shape
        # b n h dh
        queries = self.to_queries(x).reshape(b, n, self.num_heads, c // self.num_heads)
        # b m 2 h dh
        keys_values = self.to_keys_values(y).reshape(b, m, 2, self.num_heads, c // self.num_heads)
        keys, values = keys_values[:, :, 0], keys_values[:, :, 1]
        attention = torch.einsum('bnhd,bmhd->bnmh', queries, keys) * self.scale
        if mask is not None:
            if mask.dim() == 2:
                mask = mask.unsqueeze(1)
            attention = attention.masked_fill(mask.unsqueeze(3), float("-inf"))
        attention = attention.softmax(dim=2)
        out = torch.einsum('bnmh,bmhd->bnhd', attention, values).reshape(b, n, c)
        out = self.project(out)
        return out, attention


class TransformerLayer(nn.Module):

    def forward_with_attention(self, x, y=None, mask=None):
        x_, attention = self.attn(self.norm1(x), y, mask)
        x = x + x_
        x = x + self.mlp(self.norm2(x))
        return x, attention

    def forward(self, x, y=None, mask=None):
        x = x + self.attn(self.norm1(x), y, mask)[0]
        x = x + self.mlp(self.norm2(x))
        return x

    def __init__(self, dim_self, dim_ref, num_heads, mlp_ratio=4., bias=False, dropout=0., act=nnf.relu,
                 norm_layer: nn.Module = nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim_self)
        self.attn = MultiHeadAttention(dim_self, dim_ref, num_heads, bias=bias, dropout=dropout)
        self.norm2 = norm_layer(dim_self)
        self.mlp = MlpTransformer(dim_self, int(dim_self * mlp_ratio), act=act, dropout=dropout)


class Transformer(nn.Module):

    def forward_with_attention(self, x, y=None, mask=None):
        attentions = []
        for layer in self.layers:
            x, att = layer.forward_with_attention(x, y, mask)
            attentions.append(att)
        return x, attentions

    def forward(self, x, y=None, mask=None):
        for i, layer in enumerate(self.layers):
            if i % 2 == 0 and self.enc_dec: # cross
                x = layer(x, y)
            elif self.enc_dec:  # self
                x = layer(x, x, mask)
            else:  # self or cross
                x = layer(x, y, mask)
        return x

    def __init__(self, dim_self: int, num_heads: int, num_layers: int, dim_ref: Optional[int] = None,
                 mlp_ratio: float = 2., act=nnf.relu, norm_layer: nn.Module = nn.LayerNorm, enc_dec: bool = False):
        super(Transformer, self).__init__()
        dim_ref = dim_ref if dim_ref is not None else dim_self
        self.enc_dec = enc_dec
        if enc_dec:
            num_layers = num_layers * 2
        layers = []
        for i in range(num_layers):
            if i % 2 == 0 and enc_dec:  # cross
                layers.append(TransformerLayer(dim_self, dim_ref, num_heads, mlp_ratio, act=act, norm_layer=norm_layer))
            elif enc_dec:  # self
                layers.append(TransformerLayer(dim_self, dim_self, num_heads, mlp_ratio, act=act, norm_layer=norm_layer))
            else:  # self or cross
                layers.append(TransformerLayer(dim_self, dim_ref, num_heads, mlp_ratio, act=act, norm_layer=norm_layer))
        self.layers = nn.ModuleList(layers)


class TransformerMapper(nn.Module):

    def forward(self, x):
        x = self.linear(x).view(x.shape[0], self.clip_length, -1)
        prefix = self.prefix_const.unsqueeze(0).expand(x.shape[0], *self.prefix_const.shape)
        prefix = torch.cat((x, prefix), dim=1)
        out = self.transformer(prefix)[:, self.clip_length:]
        return out

    def __init__(self, dim_clip: int, dim_embedding: int, prefix_length: int, clip_length: int, num_layers: int = 8):
        super(TransformerMapper, self).__init__()
        self.clip_length = clip_length
        self.transformer = Transformer(dim_embedding, 8, num_layers)
        self.linear = nn.Linear(dim_clip, clip_length * dim_embedding)
        self.prefix_const = nn.Parameter(torch.randn(prefix_length, dim_embedding), requires_grad=True)


class ClipCaptionModel(nn.Module):

    def get_dummy_token(self, batch_size: int, device: torch.device) -> torch.Tensor:
        return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device)

    def forward(self, tokens: torch.Tensor, prefix: torch.Tensor, mask: Optional[torch.Tensor] = None,
                labels: Optional[torch.Tensor] = None):
        embedding_text = self.gpt.transformer.wte(tokens)
        prefix_projections = self.clip_project(prefix).view(-1, self.prefix_length, self.gpt_embedding_size)
        embedding_cat = torch.cat((prefix_projections, embedding_text), dim=1)
        if labels is not None:
            dummy_token = self.get_dummy_token(tokens.shape[0], tokens.device)
            labels = torch.cat((dummy_token, tokens), dim=1)
        out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=mask)
        return out

    def __init__(self, prefix_length: int, clip_length: Optional[int] = None, prefix_size: int = 512,
                 num_layers: int = 8, mapping_type: MappingType = MappingType.MLP):
        super(ClipCaptionModel, self).__init__()
        self.prefix_length = prefix_length
        self.gpt = GPT2LMHeadModel.from_pretrained('gpt2')
        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
        if mapping_type == 'mlp':
            print('mlp was created')
            self.clip_project = MLP((prefix_size, (self.gpt_embedding_size * prefix_length) // 2,
                                     self.gpt_embedding_size * prefix_length))
        else:
            print('transformer was created')
            self.clip_project = TransformerMapper(prefix_size, self.gpt_embedding_size, prefix_length,
                                                                     clip_length, num_layers)


class ClipCaptionPrefix(ClipCaptionModel):

    def parameters(self, recurse: bool = True):
        return self.clip_project.parameters()

    def train(self, mode: bool = True):
        super(ClipCaptionPrefix, self).train(mode)
        self.gpt.eval()
        return self
    
def generate_beam(model, tokenizer, beam_size: int = 5, prompt=None, embed=None,
                  entry_length=67, temperature=1., stop_token: str = '. '):

    model.eval()
    stop_token_index = tokenizer.encode(stop_token)[0]
    tokens = None
    scores = None
    device = next(model.parameters()).device
    seq_lengths = torch.ones(beam_size, device=device)
    is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)
    with torch.no_grad():
        if embed is not None:
            generated = embed
        else:
            if tokens is None:
                tokens = torch.tensor(tokenizer.encode(prompt))
                tokens = tokens.unsqueeze(0).to(device)
                generated = model.gpt.transformer.wte(tokens)
        for i in range(entry_length):
            outputs = model.gpt(inputs_embeds=generated)
            logits = outputs.logits
            logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
            logits = logits.softmax(-1).log()
            if scores is None:
                scores, next_tokens = logits.topk(beam_size, -1)
                generated = generated.expand(beam_size, *generated.shape[1:])
                next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
                if tokens is None:
                    tokens = next_tokens
                else:
                    tokens = tokens.expand(beam_size, *tokens.shape[1:])
                    tokens = torch.cat((tokens, next_tokens), dim=1)
            else:
                logits[is_stopped] = -float(np.inf)
                logits[is_stopped, 0] = 0
                scores_sum = scores[:, None] + logits
                seq_lengths[~is_stopped] += 1
                scores_sum_average = scores_sum / seq_lengths[:, None]
                scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1)
                next_tokens_source = next_tokens // scores_sum.shape[1]
                seq_lengths = seq_lengths[next_tokens_source]
                next_tokens = next_tokens % scores_sum.shape[1]
                next_tokens = next_tokens.unsqueeze(1)
                tokens = tokens[next_tokens_source]
                tokens = torch.cat((tokens, next_tokens), dim=1)
                generated = generated[next_tokens_source]
                scores = scores_sum_average * seq_lengths
                is_stopped = is_stopped[next_tokens_source]
            next_token_embed = model.gpt.transformer.wte(next_tokens.squeeze()).view(generated.shape[0], 1, -1)
            generated = torch.cat((generated, next_token_embed), dim=1)
            is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
            if is_stopped.all():
                break
    scores = scores / seq_lengths
    output_list = tokens.cpu().numpy()
    output_texts = [tokenizer.decode(output[:int(length)]) for output, length in zip(output_list, seq_lengths)]
    order = scores.argsort(descending=True)
    output_texts = [output_texts[i] for i in order]
    return output_texts


def generate2(
        model,
        tokenizer,
        tokens=None,
        prompt=None,
        embed=None,
        entry_count=1,
        entry_length=40,  # maximum number of words
        top_p=0.8,
        temperature=1.,
        stop_token: str = '. ',
):
    model.eval()
    generated_num = 0
    generated_list = []
    stop_token_index = tokenizer.encode(stop_token)[0]
    filter_value = -float("Inf")
    device = next(model.parameters()).device
    with torch.no_grad():

        for entry_idx in trange(entry_count):
            if embed is not None:
                generated = embed
            else:
                if tokens is None:
                    tokens = torch.tensor(tokenizer.encode(prompt))
                    tokens = tokens.unsqueeze(0).to(device)

                generated = model.gpt.transformer.wte(tokens)

            for i in range(entry_length):

                outputs = model.gpt(inputs_embeds=generated)
                logits = outputs.logits
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(nnf.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                                                    ..., :-1
                                                    ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value
                next_token = torch.argmax(logits, -1).unsqueeze(0)
                next_token_embed = model.gpt.transformer.wte(next_token)
                if tokens is None:
                    tokens = next_token
                else:
                    tokens = torch.cat((tokens, next_token), dim=1)
                generated = torch.cat((generated, next_token_embed), dim=1)
                if stop_token_index == next_token.item():
                    break

            output_list = list(tokens.squeeze().cpu().numpy())
            output_text = tokenizer.decode(output_list)
            generated_list.append(output_text)

    return generated_list[0]

In [3]:
smooth = SmoothingFunction()
def BLEU_score(references, hypothesis):
    # Calculating the BLEU score by comparing the predicted caption with five actual captions.
    # references eg:   [[["this", "is", "an", "apple"]]]
    # hypothesis eg:  [["an", "apple", "on", "this", "tree"]]
    corpus_score_1 = sentence_bleu(references, hypothesis, weights=(1, 0, 0, 0), smoothing_function=smooth.method1)
    corpus_score_2 = sentence_bleu(references, hypothesis, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth.method1)
    corpus_score_3 = sentence_bleu(references, hypothesis, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smooth.method1)
    corpus_score_4 = sentence_bleu(references, hypothesis, smoothing_function=smooth.method1)
    return [
        round(corpus_score_1, 5), round(corpus_score_2, 5), round(corpus_score_3, 5), round(corpus_score_4, 5)
    ]

def METEOR_score(references, hypothesis):
    # references eg:   [["this", "is", "an", "apple"]]
    # hypothesis eg:  ["an", "apple", "on", "this", "tree"]
    return meteor_score(references, hypothesis)

def rouge_score(references, hypotheses):
    # hypotheses=[["this", "is", "an", "apple"]]
    # references=[[["an", "apple", "on", "this", "tree"],['it', 'is', 'a', 'good', 'apple']]]
    rouge = PyRouge(rouge_n=(1), rouge_l=True, rouge_w=False, rouge_s=False, rouge_su=False)
    score = rouge.evaluate_tokenized(hypotheses, references)
    return score


def show_image(model, clip_model, preprocess, tokenizer, file_path, use_beam_search = False):
    prefix_length = 40
    filename,captions=[],[]
    with open(os.path.join(file_path,'captions.txt'), 'r') as file:
        lines = file.readlines()
        for i in range(4):
            line = lines[1+5*i]
            temp = line.split('.jpg,')
            filename.append(temp[0] + '.jpg')
            captions.append(temp[1][:-1])
            file = os.path.join(file_path,'images',filename[i])
            image = io.imread(file)
            pil_image = PIL.Image.fromarray(image)
            display(pil_image)
            image = preprocess(pil_image).unsqueeze(0).to(device)
            with torch.no_grad():
                prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
                # prefix = prefix / prefix.norm(2, -1).item()
                prefix_embed = model.clip_project(prefix).reshape(1, prefix_length, -1)
            if use_beam_search:
                generated_text_prefix = generate_beam(model, tokenizer, embed=prefix_embed)[0]
            else:
                generated_text_prefix = generate2(model, tokenizer, embed=prefix_embed)
            stop_token = '.'
            if stop_token in generated_text_prefix:
                generated_text_prefix = generated_text_prefix.split(stop_token)[0] + stop_token
            print('\n')
            print(generated_text_prefix)
            print(captions[i])
    # return model
    
def clean_sentence(ans):
    return [s.translate(translator).split(' ') for s in ans]

def evaluate_model(model, clip_model, preprocess, tokenizer, file_path, use_beam_search = False):    
    bleu = {1:[], 2:[], 3:[], 4:[]}
    meteor = []
    rouge_lf = []
    cider_scorer = Cider()
    ref_sentences,pred_sentence = {},{}
    filename,captions=[],[]
    with open(os.path.join(file_path,'captions.txt'), 'r') as file:
        lines = file.readlines()
        temp_file,caption = None,[]
        total_size = len(lines)-1
        train_size = int(total_size*0.9)
        for i in range(train_size, total_size):
            line = lines[1+i]
            temp = line.split('.jpg,')
            if temp_file is None:
                caption.append(temp[1][:-1])
                temp_file = temp[0] + '.jpg'
                continue
            elif temp_file == temp[0] + '.jpg':
                caption.append(temp[1][:-1])
                continue
            captions.append(caption)
            filename.append(temp_file)
            temp_file,caption = temp[0] + '.jpg',[temp[1][:-1]]
            
            file = os.path.join(file_path,'images',filename[-1])
            image = io.imread(file)
            pil_image = PIL.Image.fromarray(image)       
            image = preprocess(pil_image).unsqueeze(0).to(device)
            with torch.no_grad():
                prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
                # prefix = prefix / prefix.norm(2, -1).item()
                prefix_embed = model.clip_project(prefix).reshape(1, prefix_length, -1)
            if use_beam_search:
                generated_text_prefix = generate_beam(model, tokenizer, embed=prefix_embed)[0]
            else:
                generated_text_prefix = generate2(model, tokenizer, embed=prefix_embed)
            stop_token = '.'
            if stop_token in generated_text_prefix:
                generated_text_prefix = generated_text_prefix.split(stop_token)[0] + stop_token
                
            ground_truth = clean_sentence(captions[-1])
            predicted = generated_text_prefix.translate(translator).split(' ')
            ref_sentences[i] = captions[-1]
            pred_sentence[i] = [generated_text_prefix]
            
            score = BLEU_score(ground_truth, predicted)
            bleu[1].append(score[0])
            bleu[2].append(score[1])
            bleu[3].append(score[2])
            bleu[4].append(score[3])
            score = METEOR_score(ground_truth, predicted)
            meteor.append(score)
            rouge_lf.append(rouge_score([ground_truth], [predicted])['rouge-l']['f'])
    bleu_mean = [np.mean(bleu[i]) for i in range(1,5)]
    meteor_mean = np.mean(meteor)
    rouge_mean = np.mean(rouge_lf)
    cider_score,_ = cider_scorer.compute_score(ref_sentences, pred_sentence)
    return bleu_mean,meteor_mean,rouge_mean,cider_score

In [5]:
# clip_type,mapping_type='ViT-B/32','transformer'  #train without gpt
# clip_typem,mapping_type='ViT-B/32','mlp'          #train with gpt
# clip_typem,mapping_type='RN50x4','transformer'    #train resnet without gpt
# clip_typem,mapping_type='RN50x4','mlp'            #train resnet with gpt
# model_path = '/root/autodl-tmp/flicker8k_train/flicker8k-015.pt'
a = [('ViT-B/32','transformer'),('ViT-B/32','mlp'),('RN50x4','transformer'),('RN50x4','mlp')]
b = ['/root/autodl-tmp/flicker30k_train/flicker8k-010.pt','/root/autodl-tmp/flicker30k_train_gpt/flicker8k-019.pt',
    '/root/autodl-tmp/flicker30k_train_resnet/flicker30k-012.pt','/root/autodl-tmp/flicker30k_train_resnet_gpt/flicker30k-019.pt']
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
bleu_set,meteor_set,rouge_set,cider_set = [],[],[],[]
for i in range(4):
    device = CUDA(0)
    clip_type, mapping_type = a[i]
    model_path = b[i]
    clip_model, preprocess = clip.load(clip_type, device=device, jit=False)
    prefix_size = 640 if i >= 2 else 512 # i>=2代表resnet
    
    if mapping_type == 'transformer':
        prefix_length = 40
        model = ClipCaptionPrefix(prefix_length, clip_length=40, prefix_size=prefix_size,
                                          num_layers=8, mapping_type=mapping_type)
    else:
        prefix_length = 10
        model = ClipCaptionModel(prefix_length, clip_length=10, prefix_size=prefix_size,
                                          num_layers=8, mapping_type=mapping_type)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = model.eval() 
    model = model.to(device)
    bleu,meteor,rouge,cider_score = evaluate_model(model=model,clip_model=clip_model,preprocess=preprocess,
               tokenizer=tokenizer, file_path='/root/image caption/flickr8k')
    bleu_set.append(bleu)
    meteor_set.append(meteor)
    rouge_set.append(rouge)
    cider_set.append(cider_score)

transformer was created


100%|██████████| 1/1 [00:00<00:00,  3.15it/s]
100%|██████████| 1/1 [00:00<00:00, 22.31it/s]
100%|██████████| 1/1 [00:00<00:00, 15.88it/s]
100%|██████████| 1/1 [00:00<00:00, 11.25it/s]
100%|██████████| 1/1 [00:00<00:00, 13.30it/s]
100%|██████████| 1/1 [00:00<00:00, 17.39it/s]
100%|██████████| 1/1 [00:00<00:00, 19.66it/s]
100%|██████████| 1/1 [00:00<00:00, 22.32it/s]
100%|██████████| 1/1 [00:00<00:00, 12.19it/s]
100%|██████████| 1/1 [00:00<00:00, 10.55it/s]
100%|██████████| 1/1 [00:00<00:00, 17.38it/s]
100%|██████████| 1/1 [00:00<00:00, 11.26it/s]
100%|██████████| 1/1 [00:00<00:00,  9.99it/s]
100%|██████████| 1/1 [00:00<00:00,  6.56it/s]
100%|██████████| 1/1 [00:00<00:00, 26.33it/s]
100%|██████████| 1/1 [00:00<00:00, 13.32it/s]
100%|██████████| 1/1 [00:00<00:00, 26.05it/s]
100%|██████████| 1/1 [00:00<00:00, 13.26it/s]
100%|██████████| 1/1 [00:00<00:00, 11.32it/s]
100%|██████████| 1/1 [00:00<00:00,  9.20it/s]
100%|██████████| 1/1 [00:00<00:00, 13.00it/s]
100%|██████████| 1/1 [00:00<00:00,

mlp was created


100%|██████████| 1/1 [00:00<00:00, 11.31it/s]
100%|██████████| 1/1 [00:00<00:00, 10.72it/s]
100%|██████████| 1/1 [00:00<00:00,  4.98it/s]
100%|██████████| 1/1 [00:00<00:00, 10.42it/s]
100%|██████████| 1/1 [00:00<00:00,  6.88it/s]
100%|██████████| 1/1 [00:00<00:00,  7.52it/s]
100%|██████████| 1/1 [00:00<00:00,  5.97it/s]
100%|██████████| 1/1 [00:00<00:00, 10.41it/s]
100%|██████████| 1/1 [00:00<00:00,  7.38it/s]
100%|██████████| 1/1 [00:00<00:00,  6.81it/s]
100%|██████████| 1/1 [00:00<00:00, 10.31it/s]
100%|██████████| 1/1 [00:00<00:00,  6.01it/s]
100%|██████████| 1/1 [00:00<00:00,  7.41it/s]
100%|██████████| 1/1 [00:00<00:00,  6.29it/s]
100%|██████████| 1/1 [00:00<00:00,  7.65it/s]
100%|██████████| 1/1 [00:00<00:00,  6.22it/s]
100%|██████████| 1/1 [00:00<00:00, 11.96it/s]
100%|██████████| 1/1 [00:00<00:00,  7.80it/s]
100%|██████████| 1/1 [00:00<00:00,  6.93it/s]
100%|██████████| 1/1 [00:00<00:00,  6.17it/s]
100%|██████████| 1/1 [00:00<00:00,  6.10it/s]
100%|██████████| 1/1 [00:00<00:00,

transformer was created


100%|██████████| 1/1 [00:00<00:00, 10.52it/s]
100%|██████████| 1/1 [00:00<00:00, 10.88it/s]
100%|██████████| 1/1 [00:00<00:00,  6.95it/s]
100%|██████████| 1/1 [00:00<00:00, 10.59it/s]
100%|██████████| 1/1 [00:00<00:00,  7.66it/s]
100%|██████████| 1/1 [00:00<00:00,  7.05it/s]
100%|██████████| 1/1 [00:00<00:00,  6.47it/s]
100%|██████████| 1/1 [00:00<00:00,  9.14it/s]
100%|██████████| 1/1 [00:00<00:00,  5.66it/s]
100%|██████████| 1/1 [00:00<00:00,  5.45it/s]
100%|██████████| 1/1 [00:00<00:00,  8.26it/s]
100%|██████████| 1/1 [00:00<00:00,  6.25it/s]
100%|██████████| 1/1 [00:00<00:00,  7.00it/s]
100%|██████████| 1/1 [00:00<00:00,  7.00it/s]
100%|██████████| 1/1 [00:00<00:00, 11.93it/s]
100%|██████████| 1/1 [00:00<00:00, 10.35it/s]
100%|██████████| 1/1 [00:00<00:00, 11.97it/s]
100%|██████████| 1/1 [00:00<00:00, 10.47it/s]
100%|██████████| 1/1 [00:00<00:00, 10.65it/s]
100%|██████████| 1/1 [00:00<00:00,  5.63it/s]
100%|██████████| 1/1 [00:00<00:00,  8.45it/s]
100%|██████████| 1/1 [00:00<00:00,

mlp was created


100%|██████████| 1/1 [00:00<00:00,  6.65it/s]
100%|██████████| 1/1 [00:00<00:00, 13.37it/s]
100%|██████████| 1/1 [00:00<00:00,  5.57it/s]
100%|██████████| 1/1 [00:00<00:00, 11.97it/s]
100%|██████████| 1/1 [00:00<00:00, 11.16it/s]
100%|██████████| 1/1 [00:00<00:00,  8.87it/s]
100%|██████████| 1/1 [00:00<00:00,  7.09it/s]
100%|██████████| 1/1 [00:00<00:00, 17.93it/s]
100%|██████████| 1/1 [00:00<00:00, 15.54it/s]
100%|██████████| 1/1 [00:00<00:00,  9.03it/s]
100%|██████████| 1/1 [00:00<00:00, 10.66it/s]
100%|██████████| 1/1 [00:00<00:00,  6.67it/s]
100%|██████████| 1/1 [00:00<00:00,  7.79it/s]
100%|██████████| 1/1 [00:00<00:00,  8.43it/s]
100%|██████████| 1/1 [00:00<00:00, 14.17it/s]
100%|██████████| 1/1 [00:00<00:00,  7.80it/s]
100%|██████████| 1/1 [00:00<00:00, 10.24it/s]
100%|██████████| 1/1 [00:00<00:00,  9.70it/s]
100%|██████████| 1/1 [00:00<00:00, 12.28it/s]
100%|██████████| 1/1 [00:00<00:00,  5.73it/s]
100%|██████████| 1/1 [00:00<00:00,  9.02it/s]
100%|██████████| 1/1 [00:00<00:00,

In [6]:
bleu_set

[[0.6098144004944375,
  0.41786634116192833,
  0.2716275401730532,
  0.17373831891223734],
 [0.6391612484548826,
  0.454927107540173,
  0.3092534857849196,
  0.20509682323856612],
 [0.6130999011124846,
  0.4264382694684796,
  0.2782275401730532,
  0.1796872311495674],
 [0.6649132509270704,
  0.4833706922126082,
  0.3322372435105068,
  0.22577663782447466]]

In [7]:
meteor_set

[0.4277412680836095, 0.4677643732276355, 0.435468054539084, 0.4846141338990003]

In [8]:
rouge_set

[0.6377766583149633,
 0.6479664309143827,
 0.6405221969455864,
 0.6553122592432998]

In [10]:
cider_set

[0.4498824055328265, 0.558344886419358, 0.4548387732371944, 0.608970027973429]

In [6]:
import pandas as pd
data = {"bleu1":[bleu_set[i][0] for i in range(4)],"bleu2":[bleu_set[i][1] for i in range(4)],
        "bleu3":[bleu_set[i][2] for i in range(4)],"bleu4":[bleu_set[i][3] for i in range(4)],
    "meteor":meteor_set,"rouge":rouge_set,"cider":cider_set}
df = pd.DataFrame(data,index=['transformer','mlp_with_gpt','resnet_transformer','resnet_with_gpt'])
df.to_csv('result2.csv')
df

Unnamed: 0,bleu1,bleu2,bleu3,bleu4,meteor,rouge,cider
transformer,0.759164,0.609084,0.466379,0.347313,0.5696,0.672368,0.979384
mlp_with_gpt,0.769449,0.626528,0.485299,0.362099,0.586212,0.681516,1.034346
resnet_transformer,0.77955,0.637728,0.496967,0.371384,0.586139,0.681235,1.056675
resnet_with_gpt,0.762372,0.61605,0.473515,0.350097,0.581276,0.68011,1.012818


In [17]:
df

Unnamed: 0,bleu1,bleu2,bleu3,bleu4,meteor,rouge,cider
transformer,0.609814,0.417866,0.271628,0.173738,0.427741,0.637777,0.449882
mlp_with_gpt,0.639161,0.454927,0.309253,0.205097,0.467764,0.647966,0.558345
resnet_transformer,0.6131,0.426438,0.278228,0.179687,0.435468,0.640522,0.454839
resnet_with_gpt,0.664913,0.483371,0.332237,0.225777,0.484614,0.655312,0.60897
