In [1]:
import os
import math
import gc
import json
import torch
import torch.nn.functional as F
import tiktoken
import bitsandbytes as bnb
import textwrap

from collections import defaultdict
from pathlib import Path
from torch import nn
from tiktoken.load import load_tiktoken_bpe
from transformers import PretrainedConfig
from accelerate import init_empty_weights
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model
from safetensors.torch import load_file
from dataclasses import dataclass, asdict

In [2]:
def clean_gpu():
    gc.collect()
    torch.cuda.empty_cache()

clean_gpu()

In [3]:
if torch.cuda.is_bf16_supported():
    torch.set_default_dtype(torch.bfloat16)

torch.get_default_dtype()

torch.bfloat16

### Tokenizer

In [4]:
class Tokenizer:
    def __init__(self, path):
        self.special_tokens = defaultdict(int)
        self.num_reserved_special_tokens = 256
        self.pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"

        self.vocab = load_tiktoken_bpe(path)
        self.num_base_tokens = len(self.vocab)

        special_tokens = [
            '<|begin_of_text|>',
            '<|end_of_text|>',
            '<|reserved_special_token_0|>',
            '<|reserved_special_token_1|>',
            '<|reserved_special_token_2|>',
            '<|reserved_special_token_3|>',
            '<|start_header_id|>',
            '<|end_header_id|>',
            '<|reserved_special_token_4|>',
            '<|eot_id|>',
        ]

        special_tokens += [
            f'<|reserved_special_token_{i}|>'
            for i in range(5, self.num_reserved_special_tokens - 5)
        ]

        self.special_tokens = {
            token: self.num_base_tokens + i for i, token in enumerate(special_tokens)
        }

        self.model = tiktoken.Encoding(
            name=Path(path).name,
            pat_str=self.pat_str,
            mergeable_ranks=self.vocab,
            special_tokens=self.special_tokens
        )

        self.number_of_words = self.model.n_vocab

        self.bos_id = self.special_tokens['<|begin_of_text|>']
        self.eos_id = self.special_tokens['<|end_of_text|>']
        self.pad_id = -1
        self.stop_tokens = {
            self.special_tokens['<|end_of_text|>'],
            self.special_tokens['<|eot_id|>'],
        }
        
    def encode(
        self,
        s,
        *,
        bos,
        eos,
        allowed_special=set(),
        disallowed_special=()
    ):
        tokens = self.model.encode(s, allowed_special=allowed_special, disallowed_special=disallowed_special)
        if bos:
            tokens.insert(0, self.bos_id)
        if eos:
            tokens.append(self.eos_id)
        return tokens

    def decode(self, t):
        return self.model.decode(t)

    def encode_with_prompt(self, text):
        prompt = f'''<|begin_of_text|>
<|start_header_id|>user<|end_header_id|>
{text}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
'''
        return self.encode(prompt, bos=True, eos=False)

In [5]:
tokenizer = Tokenizer('./tokenizer.model')

### Hyperparameters

In [6]:
with open('./hyperparameters.json', 'r') as f:
    hyperparameters = json.loads(f.read())

hyperparameters

{'dim': 4096,
 'n_layers': 32,
 'n_heads': 32,
 'n_kv_heads': 8,
 'vocab_size': 128256,
 'multiple_of': 1024,
 'ffn_dim_multiplier': 1.3,
 'norm_eps': 1e-05,
 'rope_theta': 500000.0,
 'max_batch_size': 6,
 'max_seq_len': 512}

Defining a data class so it is easier to pass the properties within the model by accessing a global hparams.

In [7]:
@dataclass
class ModelConfig:
    dim: int
    n_layers: int
    n_heads: int
    n_kv_heads: int
    vocab_size: int
    multiple_of: int
    ffn_dim_multiplier: float
    norm_eps: float
    rope_theta: float
    max_batch_size: int
    max_seq_len: int

    def __repr__(self):
        return json.dumps(asdict(self), indent=4)

hparams = ModelConfig(
    dim=4096,
    n_layers=32,
    n_heads=32,
    n_kv_heads=8,
    vocab_size=128256,
    multiple_of=1024,
    ffn_dim_multiplier=1.3,
    norm_eps=1e-05,
    rope_theta=500000.0,
    max_batch_size=6,
    max_seq_len=512
)

hparams, hyperparameters

({
     "dim": 4096,
     "n_layers": 32,
     "n_heads": 32,
     "n_kv_heads": 8,
     "vocab_size": 128256,
     "multiple_of": 1024,
     "ffn_dim_multiplier": 1.3,
     "norm_eps": 1e-05,
     "rope_theta": 500000.0,
     "max_batch_size": 6,
     "max_seq_len": 512
 },
 {'dim': 4096,
  'n_layers': 32,
  'n_heads': 32,
  'n_kv_heads': 8,
  'vocab_size': 128256,
  'multiple_of': 1024,
  'ffn_dim_multiplier': 1.3,
  'norm_eps': 1e-05,
  'rope_theta': 500000.0,
  'max_batch_size': 6,
  'max_seq_len': 512})

### Transformer Model

In [8]:
def precompute_freqs_complex_exponential(dim, sequence_length, theta=10000.0):
    ''' Computes the frequencies that will be used for positional encoding and also for rotary embeddings in
    the attention mechanism.
    '''
    # Get the even indices within the embedding dimension and normalises them.
    even_indices = torch.arange(0, dim, 2)[: (dim // 2)].float()
    normalised_even_indices = even_indices / dim

    # Formula for the frequencies.
    freqs = 1.0 / (theta ** normalised_even_indices)

    # Gets an increasing sequence to the size of the input sequence (time steps).
    timesteps = torch.arange(sequence_length, device=freqs.device, dtype=torch.float32)

    # Multiplies each timestep for all values in frequencies to form the frequencies matrix.
    # These will be the angles for the polar function.
    freqs = torch.outer(timesteps, freqs)

    # Creates a mask filled with ones.
    ones = torch.ones_like(freqs)

    # Computes the complex tensor representing the cartesian coordinates that correspond to the polar coordinates (abs "ones" and angles "freqs").
    freqs_complex_exponential = torch.polar(ones, freqs)

    return freqs_complex_exponential

In [9]:
def reshape_for_broadcast(freqs_cis, ndim, shape):
    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(shape)]
    return freqs_cis.view(*shape)

def apply_rotary_emb(xq, xk, freqs_cis):
    ''' Apply the rotary embeddings.
    '''
    # We start by reshaping the inputs. Their last dimension is the head_dim, so we need to make sure we split the head dim into 2 parts
    # to account for the complex part.
    xq_complex = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_complex = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))

    # Ensure freqs_cis has the correct dimensions compatible with broadcasting. E.g (a, 1, b, c, 1)
    # Note that xq has shape (batch_size, sequence_length, n_heads, head_dim), meaning with the hyperparamers as they are
    # it will be (6, 512, 32, 4096 / 32) -> (6, 512, 32, 128). xq_complex will be (6, 512, 32, 64, 2).
    freqs_cis = reshape_for_broadcast(freqs_cis, xq_complex.ndim, xq_complex.shape)

    # Now we can apply the rotary embeddings and flatten from dimension 3 (so we get the 128 back with 4 dimensions instead of 5.
    xq_out = torch.view_as_real(xq_complex * freqs_cis).flatten(3)
    xk_out = torch.view_as_real(xk_complex * freqs_cis).flatten(3)

    # Retain the datatypes
    return xq_out.type_as(xq), xk_out.type_as(xk)

In [10]:
def repeat_kv(x, n_rep):
    ''' Repeat x n_rep times. The idea is to spread these tensors through more attention heads. 
    '''
    bs, slen, n_kv_heads, head_dim = x.shape

    if n_rep == 1:
        return x
        
    return (
        x[:, :, :, None, :]
        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
    )

In [11]:
class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()
        self.n_heads = hparams.n_heads
        self.n_kv_heads = hparams.n_heads if hparams.n_kv_heads is None else hparams.n_kv_heads
        self.head_dim = hparams.dim // hparams.n_heads

        self.wq = nn.Linear(hparams.dim, hparams.n_heads * self.head_dim, bias=False)
        self.wk = nn.Linear(hparams.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(hparams.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(hparams.n_heads * self.head_dim, hparams.dim, bias=False)

        self.register_buffer('cache_k', torch.zeros((hparams.max_batch_size, hparams.max_seq_len, self.n_kv_heads, self.head_dim)))
        self.register_buffer('cache_v', torch.zeros((hparams.max_batch_size, hparams.max_seq_len, self.n_kv_heads, self.head_dim)))

    def forward(self, x, start_pos, freqs_cis, mask=None):
        batch_size, sequence_length, _ = x.shape
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

        xq = xq.view(batch_size, sequence_length, self.n_heads, self.head_dim)
        xk = xk.view(batch_size, sequence_length, self.n_kv_heads, self.head_dim)
        xv = xv.view(batch_size, sequence_length, self.n_kv_heads, self.head_dim)

        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

        self.cache_k = self.cache_k.to(xq)
        self.cache_v = self.cache_v.to(xq)

        self.cache_k[:batch_size, start_pos : start_pos + sequence_length] = xk.clone().detach()
        self.cache_v[:batch_size, start_pos : start_pos + sequence_length] = xv.clone().detach()

        keys = self.cache_k[:batch_size, : start_pos + sequence_length]
        values = self.cache_v[:batch_size, : start_pos + sequence_length]

        keys = repeat_kv(keys, 4)
        values = repeat_kv(values, 4)

        xq = xq.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)

        # scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
        # if mask is not None:
        #     scores = scores + mask
        # scores = F.softmax(scores.float(), dim=-1).type_as(xq)
        # output = torch.matmul(scores, values)

        output = F.scaled_dot_product_attention(xq, keys, values, scale=(1 / math.sqrt(self.head_dim)), attn_mask=mask)

        output = output.transpose(1, 2).contiguous().view(batch_size, sequence_length, -1)

        return self.wo(output)

In [12]:
class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, multiple_of, ffn_dim_multiplier):
        super(FeedForward, self).__init__()
        hidden_dim = int(2 * hidden_dim / 3)
        if ffn_dim_multiplier is not None:
            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)

    def forward(self, x):
        return self.w2(F.silu(self.w1(x)) * self.w3(x))

In [13]:
class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-6):
        super(RMSNorm, self).__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight

In [14]:
class TransformerBlock(nn.Module):
    def __init__(self, layer_id):
        super(TransformerBlock, self).__init__()
        self.n_heads = hparams.n_heads
        self.dim = hparams.dim
        self.attention = Attention()
        self.feed_forward = FeedForward(
            dim=hparams.dim,
            hidden_dim=4 * hparams.dim,
            multiple_of=hparams.multiple_of,
            ffn_dim_multiplier=hparams.ffn_dim_multiplier
        )
        self.layer_id = layer_id
        self.attention_norm = RMSNorm(hparams.dim, eps=hparams.norm_eps)
        self.ffn_norm = RMSNorm(hparams.dim, eps=hparams.norm_eps)

    def forward(self, x, start_position, freqs_cis, mask=None):
        hidden_state = x + self.attention(self.attention_norm(x), start_position, freqs_cis, mask)
        output = hidden_state + self.feed_forward(self.ffn_norm(hidden_state))
        return output

In [15]:
# Required so it is compatible with the Hugging Face trainer.
class CustomConfig(PretrainedConfig):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.model_type = 'custom'

In [16]:
class Transformer(nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()

        # required for Hugging Face
        self.config = CustomConfig()

        self.vocab_size = hparams.vocab_size
        self.n_layers = hparams.n_layers

        self.tok_embeddings = nn.Embedding(
            hparams.vocab_size,
            hparams.dim
        )

        self.layers = nn.ModuleList()
        for layer_id in range(self.n_layers):
            self.layers.append(TransformerBlock(layer_id))

        self.norm = RMSNorm(hparams.dim, eps=hparams.norm_eps)
        self.output = nn.Linear(hparams.dim, hparams.vocab_size, bias=False)

        self.freqs_cis = precompute_freqs_complex_exponential(
            hparams.dim // hparams.n_heads,
            hparams.max_seq_len * 2,
            hparams.rope_theta
        )

    def forward(
        self,
        input_ids,
        attention_mask=None,
        start_position=0,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None
    ):
        ''' Some arguments in the method signature are not used but are required to make this compatible with the trainer from Hugging Face.
        '''
        batch_size, sequence_length = input_ids.shape
        hidden_state = self.tok_embeddings(input_ids)
        self.freqs_cis = self.freqs_cis.to(hidden_state.device)

        freqs_cis = self.freqs_cis[start_position : start_position + sequence_length]

        mask = None

        if sequence_length > 1:
            mask = torch.full((sequence_length, sequence_length), float('-inf'), device=input_ids.device)
            mask = torch.triu(mask, diagonal=1)

            cached_shift = torch.zeros((sequence_length, start_position), device = input_ids.device)
            mask = torch.hstack([cached_shift, mask]).type_as(hidden_state) 

        for layer in self.layers:
            hidden_state = layer(hidden_state, start_position, freqs_cis, mask)

        hidden_state = self.norm(hidden_state)

        logits = self.output(hidden_state).float()

        loss = None
        if labels is not None:
            logits = logits.view(-1, logits.size(-1))  # (batch_size * sequence_length, num_classes)
            labels = labels.view(-1)  # (batch_size * sequence_length)
            loss = nn.CrossEntropyLoss(ignore_index=-100)(logits, labels)

        if loss is not None:
            return {'logits': logits, 'loss': loss}
        else:
            return logits

### Testing

In [17]:
def sample_top_p(probs, p):
    ''' Top P - Sorts the tokens from highest probabilities to lowest and calculates cumulative probabilities up to the cumulative >= p.
    '''
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort[mask] = 0.0
    
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))

    next_token = torch.multinomial(probs_sort, num_samples=1)

    next_token = torch.gather(probs_idx, -1, next_token)
    return next_token

In [18]:
def temperature_and_top_p_sampling(logits, temperature, top_p):
    ''' Applies temperature and calculates top P. If temperature is 0 we just get the token with highest logit.
    '''
    if temperature > 0:
        probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
        next_token = sample_top_p(probs, top_p)
    else:
        next_token = torch.argmax(logits[:, -1], dim=-1)
    return next_token

In [19]:
def generate(prompt_tokens, max_gen_len, temperature, top_p, full_seq, token_window):
    ''' Note: It is better to use full_seq = True for improved quality in the output.
    This implementation uses a KV cache to speed up the generation. By default, after generating the first token, it needs only the latest token in order to take advantage of the cache.
    It works well in 16/8-bit quantization, but I had a few issues when I quantized it further to 4-bit.
    To address that, I created a 'token window' that works as a sliding window and provides enough context for the model.
    '''
    batch_size = len(prompt_tokens)

    # Finding the boundaries / limits.
    min_prompt_len = min(len(t) for t in prompt_tokens)
    max_prompt_len = max(len(t) for t in prompt_tokens)
    total_len = min(hparams.max_seq_len, max_gen_len + max_prompt_len)

    # Here we assume we receive a batch of multiple tokenized sequences.
    pad_id = tokenizer.pad_id
    tokens = torch.full((batch_size, total_len), pad_id, dtype=torch.long, device='cuda')
    
    for batch, tokens_list in enumerate(prompt_tokens):
        tokens[batch, : len(tokens_list)] = torch.tensor(tokens_list, dtype=torch.long, device='cuda')

    # Define stop conditions, input mask and the stop tokens (extracted from the tokenizer)
    previous_position = 0
    eos_reached = torch.tensor([False] * batch_size, device='cuda')
    input_text_mask = tokens != pad_id
    stop_tokens = torch.tensor(list(tokenizer.stop_tokens), device='cuda')

    with torch.no_grad():
        for current_position in range(min_prompt_len, total_len):
            if full_seq:
                # no cache
                logits = model.forward(tokens[:, :current_position], start_position=0)
            else:
                # uses cache with a sliding window of size token_window.
                logits = model.forward(tokens[:, max(0, previous_position - token_window):current_position], start_position=previous_position)
                
            # Temperature and sampling.
            next_token = temperature_and_top_p_sampling(logits, temperature, top_p)
            next_token = next_token.reshape(-1)

            # Gets the next token depending on the condition (mask) and appends to tokens.
            next_token = torch.where(
                input_text_mask[:, current_position], tokens[:, current_position], next_token
            )
            tokens[:, current_position] = next_token

            # Checks if we reached the eos on all sequences in the batch and updates the current position.
            eos_reached |= (~input_text_mask[:, current_position]) & (torch.isin(next_token, stop_tokens))
            
            previous_position = current_position
            if all(eos_reached):
                break

        # For all the sequences, we extract all tokens up to a stop_token if it exists.
        out_tokens = []
        for i, toks in enumerate(tokens.tolist()):
            start = len(prompt_tokens[i])
            toks = toks[start : len(prompt_tokens[i]) + max_gen_len]

            for stop_token in tokenizer.stop_tokens:
                try:
                    eos_idx = toks.index(stop_token)
                    toks = toks[:eos_idx]
                except ValueError:
                    pass
            out_tokens.append(toks)

    torch.cuda.empty_cache()

    return out_tokens

In [20]:
def test_dialogue_custom(
    texts,
    *,
    max_gen_len=256,
    temperature=0.6,
    top_p=0.9,
    full_seq=False,
    token_window=4,
    text_width=200
):
    if not isinstance(texts, list):
        texts = [texts]

    prompt_tokens = [tokenizer.encode_with_prompt(text) for text in texts]
    
    generation_tokens = generate(
        prompt_tokens=prompt_tokens,
        max_gen_len=max_gen_len,
        temperature=temperature,
        top_p=top_p,
        full_seq=full_seq,
        token_window=token_window
    )

    results = [tokenizer.decode(t) for t in generation_tokens]

    for result in results:
        _result = result.split('<|eot_id|>')[0]

        paragraphs = _result.split('\n')
        _result = '\n'.join(textwrap.fill(paragraph, width=text_width) for paragraph in paragraphs)

        print(_result)

        if len(results) > 1:
            print('\n-------------------------------------------\n\n')

## Initialize and quantize the model with the pretrained weights

In [21]:
clean_gpu()

with init_empty_weights():
    model = Transformer()

bnb_quantization_config = BnbQuantizationConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4'
)

model = load_and_quantize_model(
    model,
    bnb_quantization_config=bnb_quantization_config,
    device_map = 'auto',
    weights_location = './pretrained.pth'
)

### Pretrained weights tests

In [22]:
test_dialogue_custom('Where is the city of new york?')

<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<|end_header_id|>
_header_id|>
<|start_header_id|>assistant<


In [23]:
test_dialogue_custom('What is healthier? Fish or Meat?')

<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<|start_header_id|>assistant<|end_header_id|>
<


In [24]:
test_dialogue_custom('Using python, how do I write a function to add 2 numbers and print the result?')

You can add two numbers using the + operator in Python. For example, if you want to add 2 and 3, you can use the following code:
result = 2 + 3
print(result)
This will print 5.
Alternatively, you can use the sum() function to add a list of numbers. For example, if you have a list of numbers called numbers, you can use the following code to add all the numbers in the list:
sum(numbers)
This will return the sum of all the numbers in the list.
You can also use the built-in function sum() to add a list of numbers. For example, if you have a list of numbers called numbers, you can use the following code to add all the numbers in the list:
sum(numbers)
This will return the sum of all the numbers in the list.
Alternatively, you can use the built-in function sum() to add a list of numbers. For example, if you have a list of numbers called numbers, you can use the following code to add all the numbers in the
list:
sum(numbers)
This will return the sum of all the numbers in the list.
Alternative

In [25]:
test_dialogue_custom(['What is the capital of Portugal?', 'What is the capital of the UK?'])

Portugal's capital is Lisbon.

-------------------------------------------


What is the capital of the UK?

-------------------------------------------




### Loading finetuned weights

In [26]:
model.load_state_dict(load_file('./finetuned.safetensors'))

<All keys matched successfully>

### Finetuned weights tests

In [27]:
test_dialogue_custom('Where is the city of new york?')

The city of New York is located in the state of New York in the United States of America. It is located in the northeastern part of the country, on the island of Manhattan. The city is situated on the
Hudson River and is surrounded by the Atlantic Ocean on the east and the Long Island Sound on the north.
The city is known for its iconic landmarks, including the Statue of Liberty, the Empire State Building, and the Brooklyn Bridge. It is also known for its diverse culture, with a large population of
immigrants from around the world. The city is also home to a wide range of industries, including finance, media, and technology, and is a major hub for international trade and commerce.


In [28]:
test_dialogue_custom('What is healthier? Fish or Meat?')

It is difficult to say which is healthier, fish or meat. Both have their own advantages and disadvantages. Fish is a good source of protein and omega-3 fatty acids, which are beneficial for heart
health. However, it is important to note that some types of fish may contain high levels of mercury, which can be harmful to health. Meat is also a good source of protein and nutrients, but it is
important to choose lean cuts of meat and to limit consumption of processed meats, which are often high in saturated fat and sodium. In general, a balanced diet that includes both fish and meat is
considered to be the most healthy option.


In [29]:
test_dialogue_custom('Using python, how do I write a program to add 2 numbers and print the result?')

You can use the `sum` function in Python to add two numbers and print the result. The `sum` function takes two arguments, the first being the list of numbers you want to add, and the second being the
number you want to add to each number in the list. Here is an example of how to use the `sum` function:
```
numbers = [1, 2, 3, 4, 5]
result = sum(numbers, 10)
print(result)
```
In this example, the `sum` function takes the list `numbers` and adds 10 to each number in the list, resulting in the number 15. The `print` function is then used to print the result to the console.


In [30]:
test_dialogue_custom(['What is the capital of Portugal?', 'What is the capital of the UK?'])

The capital of Portugal is Lisbon.

-------------------------------------------


The capital of the UK is London.

-------------------------------------------


