In [17]:
print("Hello")

Hello


In [2]:
import torch

if torch.cuda.is_available():
    print("CUDA is available! You have GPU access.")
else:
    print("CUDA is not available. You do not have GPU access.")

CUDA is available! You have GPU access.


In [3]:
from transformers import GPT2Model, GPT2Config
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.models.gpt2.modeling_gpt2 import GPT2Block
from typing import Optional, Tuple, Union # Import Optional, Tuple, and Union

  from .autonotebook import tqdm as notebook_tqdm


ALU implementation

In [4]:
class ALU(torch.nn.Module):
    def __init__(self, model_dim=768, hidden_dim=512, internal_dim=10, use_output_projection=False):
        super(ALU, self).__init__()

        # input mlp does model_dim -> hidden_dim -> hidden_dim -> (internal_dim * 2 + 4)
        self.input_mlp = nn.Sequential(
            nn.Linear(model_dim, hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(hidden_dim, internal_dim * 2 + 4),
            nn.LeakyReLU()
        )

        if use_output_projection:
            # output projection does 1 -> internal_dim -> hidden_dim -> model_dim
            self.output_projection = nn.Sequential(
                nn.Linear(1, internal_dim),
                nn.ReLU(),
                nn.Linear(internal_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, model_dim)
            )

        self.eps = 1e-8
        self.base = torch.tensor([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    def forward(self, x):
        # print("X-before: ", x.shape)
        x = self.input_mlp(x)
        a = x[:, :10]
        b = x[:, 10:20]
        op = x[:, 20:24]
        # print("X-after: ", x.shape)
        # print("A: ", a.shape)
        # print("B: ", b.shape)
        # print("OP: ", op.shape)
        base = torch.tensor([1, 2, 4, 8, 16, 32, 64, 128, 256, 512], device=x.device, dtype=x.dtype)
        a = torch.matmul(a, base)
        b = torch.matmul(b, base)

        op_weights = F.softmax(op, dim=1)  # Shape: (batch_size, 4)

        add = a + b
        sub = a - b
        mul = a * b
        div = a / (b + self.eps)

        op_outs = torch.stack([add, sub, mul, div], dim=1)  # Shape: (batch_size, 4)
        result = torch.sum(op_outs * op_weights, dim=1, keepdim=True)  # Shape: (batch_size, 1)

        if hasattr(self, 'output_projection'):
            result = self.output_projection(result)

        return result

Standard GPT-2

In [5]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, GPT2Config
configuration = GPT2Config()
model = GPT2LMHeadModel(configuration)
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


Modified GPT-2

In [6]:
class CustomGPT2Block(GPT2Block):
    def __init__(self, config):
        super().__init__(config)
        self.alu = ALU(model_dim=config.n_embd, use_output_projection=True)
        self.linear = nn.Linear(config.n_embd, config.n_embd)  # Linear
        self.final_projection = nn.Linear(config.n_embd * 2, config.n_embd)

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]
        # residual connection
        hidden_states = attn_output + residual

        if encoder_hidden_states is not None:
            # add one self-attention block for cross-attention
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
                    "cross-attention layers by setting `config.add_cross_attention=True`"
                )
            residual = hidden_states
            hidden_states = self.ln_cross_attn(hidden_states)
            cross_attn_outputs = self.crossattention(
                hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )
            attn_output = cross_attn_outputs[0]
            # residual connection
            hidden_states = residual + attn_output
            outputs = outputs + cross_attn_outputs[2:]  # add cross attentions if we output attention weights

        alu_hidden_states = self.linear(hidden_states) # NEW CODE: using a linear layer to transform the current hidden_state for alu computation
        summed_alu_hidden_states = alu_hidden_states.sum(dim=1)  # NEW CODE: summing across dimension 1 (sequence length) Shape: [batch_size, embedding_dim]
        alu_output = self.alu(summed_alu_hidden_states)     # NEW CODE: calling the ALU using the hidden_states
        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        feed_forward_hidden_states = self.mlp(hidden_states)
        # residual connection
        hidden_states = residual + feed_forward_hidden_states
        hidden_states = torch.cat([hidden_states, alu_output.unsqueeze(1).expand(-1, hidden_states.size(1), -1)], dim=-1) # NEW CODE: concatenating the ALU output to the hidden states
        hidden_states = self.final_projection(hidden_states)  # NEW CODE: projecting the hidden_state to the required dimension
        outputs = (hidden_states,) + outputs

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions, cross_attentions)

In [7]:
class CustomGPT2Model(GPT2LMHeadModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPT2Model(config)
        num_layers = len(self.transformer.h)
        for i in range(num_layers - 3, num_layers):
            self.transformer.h[i] = CustomGPT2Block(config)
        
        #Add LM head
        #self.lm_head = torch.nn.Linear(config.n_embd, config.vocab_size, bias=False)

    def forward(self, input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None):
        
        return super().forward(
            input_ids,
            past_key_values, 
            attention_mask, 
            token_type_ids, 
            position_ids, 
            head_mask, 
            inputs_embeds, 
            encoder_hidden_states, 
            encoder_attention_mask, 
            labels,
            use_cache, 
            output_attentions, 
            output_hidden_states, 
            return_dict)
    
        # hidden_states = outputs.last_hidden_state
        # logits = self.lm_head(hidden_states)

        # return logits


In [8]:
model2 = CustomGPT2Model(configuration)
print(model2)

CustomGPT2Model(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-8): 9 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (9-11): 3 x CustomGPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_p

Later to load the weights (Need to verify this once)

In [9]:
config = GPT2Config.from_pretrained('gpt2')
customModel = CustomGPT2Model(config)

# If you want to load pre-trained weights:
state_dict = GPT2Model.from_pretrained('gpt2').state_dict()
customModel.load_state_dict(state_dict, strict=False)

_IncompatibleKeys(missing_keys=['transformer.wte.weight', 'transformer.wpe.weight', 'transformer.h.0.ln_1.weight', 'transformer.h.0.ln_1.bias', 'transformer.h.0.attn.c_attn.weight', 'transformer.h.0.attn.c_attn.bias', 'transformer.h.0.attn.c_proj.weight', 'transformer.h.0.attn.c_proj.bias', 'transformer.h.0.ln_2.weight', 'transformer.h.0.ln_2.bias', 'transformer.h.0.mlp.c_fc.weight', 'transformer.h.0.mlp.c_fc.bias', 'transformer.h.0.mlp.c_proj.weight', 'transformer.h.0.mlp.c_proj.bias', 'transformer.h.1.ln_1.weight', 'transformer.h.1.ln_1.bias', 'transformer.h.1.attn.c_attn.weight', 'transformer.h.1.attn.c_attn.bias', 'transformer.h.1.attn.c_proj.weight', 'transformer.h.1.attn.c_proj.bias', 'transformer.h.1.ln_2.weight', 'transformer.h.1.ln_2.bias', 'transformer.h.1.mlp.c_fc.weight', 'transformer.h.1.mlp.c_fc.bias', 'transformer.h.1.mlp.c_proj.weight', 'transformer.h.1.mlp.c_proj.bias', 'transformer.h.2.ln_1.weight', 'transformer.h.2.ln_1.bias', 'transformer.h.2.attn.c_attn.weight', 't

In [10]:
customModel

CustomGPT2Model(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-8): 9 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (9-11): 3 x CustomGPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_p

In [11]:
from transformers import GPT2Tokenizer, GPT2Config, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
customModel.eval()
input_text = "Once upon a time,"

input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].to(customModel.device)
#print("Device of input ids is", input_ids.device)
# with torch.no_grad():
#     logits = customModel(input_ids=input_ids)

output_ids = customModel.generate(
    input_ids=input_ids,
    max_length=6,  # Maximum length of generated text
    num_return_sequences=1,  # Number of sequences to generate
    do_sample=True,  # Enable sampling
    top_k=50,  # Use top-k sampling
    temperature=0.7,  # Sampling temperature
)

predicted_ids = output_ids

generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)

print("Generated Text:", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Text: Once upon a time, spawn


In [12]:
#Create a dataset to finetune
import torch.nn.functional as F
from torch.utils.data import Dataset, IterableDataset

class ArithmeticDataset(IterableDataset):
    def __init__(self, min_val=0, max_val=256):
        self.min_val = min_val
        self.max_val = max_val
        
        self.operations = {
            0: lambda x, y: x + y,    # addition
            1: lambda x, y: x - y,    # subtraction
            2: lambda x, y: x * y,    # multiplication
            3: lambda x, y: x / (y + 1e-8)  # division
        }
    
    def __iter__(self):
        while True:
            # Generate random numbers
            num1 = torch.rand(1) * (self.max_val - self.min_val) + self.min_val
            num2 = torch.rand(1) * (self.max_val - self.min_val) + self.min_val
            
            # Generate random operations
            op_idx = torch.tensor([0]) # torch.randint(0, 4, (1,))
            operation = F.one_hot(op_idx, num_classes=4).float()
            
            # Calculate targets
            target = self.operations[op_idx.item()](num1, num2)            
            
            yield num1, num2, operation.squeeze(0), target

In [13]:
ad = ArithmeticDataset()
print(next(iter(ad)))
dataloader = torch.utils.data.DataLoader(ad, batch_size=2)
print(next(iter(dataloader)))

(tensor([241.6788]), tensor([249.2799]), tensor([1., 0., 0., 0.]), tensor([490.9587]))
[tensor([[133.8290],
        [ 29.2392]]), tensor([[ 87.0907],
        [103.3537]]), tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.]]), tensor([[220.9198],
        [132.5928]])]


In [14]:
# import sys
# !{sys.executable} -m pip install wandb
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mrichidubey[0m ([33mrichidubey-georgia-institute-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from tqdm import tqdm
import numpy as np
import torch
import pandas as pd
import wandb
from datetime import datetime
from torch.utils.data import Dataset, IterableDataset
from torch.utils.data import DataLoader
wandb.require("service")

from transformers import GPT2Tokenizer, GPT2Config, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
def arithmetic_loss(predictions, targets, scale_factor=10000.0):
    abs_error = (predictions - targets)**2
    # rel_error = torch.abs((predictions - targets) / (targets + 1e-8)) * scale_factor
    loss = abs_error # + rel_error
    return torch.sum(loss)

def train_model(
    model,
    num_epochs=6000,
    batch_size=1024,
    initial_lr=1e-3,
    device='cuda',
    # eval_every=500,
    use_wandb=False,
    project_name="arithmetic_training"
):
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=initial_lr)
    
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.7)
    
    dataset = ArithmeticDataset()
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=1, pin_memory=True, persistent_workers=True)
    
    steps_per_epoch = 1000
    best_loss = float('inf')
    
    # Initialize logging
    if use_wandb:
        wandb.init(project=project_name)
        wandb.config.update({
            "learning_rate": initial_lr,
            "batch_size": batch_size,
            "num_epochs": num_epochs,
            "scheduler_step_size": 200,
            "scheduler_gamma": 0.7
        })
    else:
        # Create CSV log file with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_file = f'training_log_{timestamp}.csv'
        log_data = []
    
    for epoch in range(num_epochs):
        model.train()
        epoch_losses = []
        epoch_diffs = []
        
        data_iter = iter(dataloader)
        pbar = tqdm(range(steps_per_epoch), desc=f'Epoch {epoch+1}/{num_epochs}')
        for step in pbar:
            try:
                batch = next(data_iter)
            except StopIteration:
                data_iter = iter(dataloader)
                batch = next(data_iter)
            
            num1, num2, operation, targets = [item.to(device) for item in batch]
            
            # num1 = num1.unsqueeze(1)
            # num2 = num2.unsqueeze(1)
            ###############################
            optimizer.zero_grad()
            
            # inp_txt = str(num1) + str(operation) + str(num2)

            

            operation_mapping = {0: "+", 1: "-", 2: "*", 3: "/"}

            # Decode the one-hot tensor into operation symbols
            decoded_operations = [operation_mapping[torch.argmax(op).item()] for op in operation]

            inp_txt = [
                f"{num1.item()} {op} {num2.item()}" for num1, op, num2 in zip(num1, decoded_operations, num2)
            ]

            print("INp text is ", inp_txt)

            input_ids = tokenizer(inp_txt, return_tensors="pt", padding=True, truncation = True)["input_ids"].to(model.device)

            # print("num1 is", num1)
            # print("num2 is", num2 )
            # print("op is", operation)

            # print("inp_txt is", inp_txt)

            # print("input_ids has shape", input_ids.shape)
            print("Input ids has shape", input_ids)

            output_ids = model.generate(
                input_ids=input_ids,
                max_length=4,  # Maximum length of generated text
                num_return_sequences=1,  # Number of sequences to generate
                do_sample=True,  # Enable sampling
                top_k=50,  # Use top-k sampling
                temperature=0.7,  # Sampling temperature
            )

            predicted_ids = output_ids

            # print("Shape of logits is", logits.shape)
            # print("Shape of predicted ids is", predicted_ids.shape)

            predictions = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)[-1]


            # logits = customModel(input_ids=input_ids)

            # predicted_ids = torch.argmax(logits, dim=-1)
            # predictions = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)

            ################################

            try: 
                numeric_prediction = float(predictions)  # Only if it should be a number
                predictions_tensor = torch.tensor([numeric_prediction]).to(device)  # Convert to tensor
            except ValueError:
                # print(f"Decoded output is not numeric: {predictions}")
                predictions_tensor = torch.tensor([0.0], device=device, requires_grad=True)
                
            predictions =  predictions_tensor
            loss = arithmetic_loss(predictions, targets)
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            
            epoch_losses.append(loss.item())
            
            with torch.no_grad():
                diffs = torch.abs(predictions - targets)
                epoch_diffs.extend(diffs.cpu().numpy())
            
            pbar.set_postfix({'Loss': loss.item()})
        
        with torch.no_grad():
            model.eval()

            test_num1, test_num2, test_op, test_targets = [item.to(device) for item in next(iter(dataloader))]

            ########################################

            operation_mapping = {0: "+", 1: "-", 2: "*", 3: "/"}

            # Decode the one-hot tensor into operation symbols
            decoded_operations = [operation_mapping[torch.argmax(op).item()] for op in test_op]


            inp_txt = [
                f"{num1.item()} {op.item()} {num2.item()}" for num1, op, num2 in zip(test_num1, decoded_operations, test_num2)
            ]
        
            input_ids = tokenizer(inp_txt, return_tensors="pt")["input_ids"].to(model.device)
            
            output_ids = model.generate(
                input_ids=input_ids,
                max_length=4,  # Maximum length of generated text
                num_return_sequences=1,  # Number of sequences to generate
                do_sample=True,  # Enable sampling
                top_k=50,  # Use top-k sampling
                temperature=0.7,  # Sampling temperature
            )

            predicted_ids = output_ids

            # print("Shape of logits is", logits.shape)
            # print("Shape of predicted ids is", predicted_ids.shape)

            predictions = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)[-1]

            try: 
                numeric_prediction = float(predictions)  # Only if it should be a number
                predictions_tensor = torch.tensor([numeric_prediction]).to(device)  # Convert to tensor
            except ValueError:
                # print(f"Decoded output is not numeric: {predictions}")
                predictions_tensor = torch.tensor([0.0], device=device, requires_grad=True)
                
            test_pred =  predictions_tensor
            ####################################################
            
            test_loss = arithmetic_loss(test_pred, test_targets)
           
            first_pred = test_pred[0].item()
            first_target = test_targets[0].item()
            
            # Format to 5 decimal places
            first_pred_formatted = f"{first_pred:.5f}"
            first_target_formatted = f"{first_target:.5f}"
            
            current_lr = optimizer.param_groups[0]['lr']
            train_loss = np.mean(epoch_losses)
            val_loss = test_loss.item()
            avg_diff = np.mean(epoch_diffs)
            median_diff = np.median(epoch_diffs)
            
            if use_wandb:
                wandb.log({
                    'learning_rate': current_lr,
                    'train_loss': train_loss,
                    'val_loss': val_loss,
                    'avg_prediction_diff': avg_diff,
                    'median_prediction_diff': median_diff,
                    'epoch': epoch + 1
                })
            else:
                log_data.append({
                    'epoch': epoch + 1,
                    'learning_rate': current_lr,
                    'train_loss': train_loss,
                    'val_loss': val_loss,
                    'avg_prediction_diff': avg_diff,
                    'median_prediction_diff': median_diff
                })
            
            print(
                f'Epoch {epoch+1}/{num_epochs} | '
                f'LR: {current_lr:.2e} | '
                f'Train Loss: {train_loss:.4f} | '
                f'Val Loss: {val_loss:.4f} | '
                f'Avg Diff: {avg_diff:.4f} | '
                f'First Pred: {first_pred_formatted} | '
                f'First Target: {first_target_formatted}'
            )
        
        model.train()
        
        # Save the best model
        if train_loss < best_loss:
            best_loss = train_loss
            torch.save(model.state_dict(), 'best_arithmetic_model.pt')
        
        scheduler.step()
        print(f'Epoch {epoch+1} completed. Average loss: {train_loss:.4f}\n')
    
    if not use_wandb:
        pd.DataFrame(log_data).to_csv(log_file, index=False)
        print(f"Training log saved to {log_file}")
    
    if use_wandb:
        wandb.finish()

In [21]:
train_model(customModel, num_epochs=20, batch_size=1024, initial_lr=1e-4, device='cuda', use_wandb=False)

Epoch 1/20:   0%|                                                                                                                      | 0/1000 [00:00<?, ?it/s]

INp text is  ['39.44532775878906 + 56.15846252441406', '182.74403381347656 + 161.368408203125', '87.34925842285156 + 171.78358459472656', '92.96932983398438 + 241.03765869140625', '156.76275634765625 + 187.2506866455078', '128.2958526611328 + 64.32244873046875', '110.33406066894531 + 227.0379638671875', '175.83277893066406 + 211.7412567138672', '213.18832397460938 + 219.33740234375', '94.76513671875 + 49.940093994140625', '119.56062316894531 + 52.02989196777344', '194.41751098632812 + 232.977783203125', '190.35797119140625 + 80.49978637695312', '122.58100891113281 + 25.713546752929688', '19.692047119140625 + 138.43807983398438', '142.10028076171875 + 44.06886291503906', '218.4680633544922 + 69.15025329589844', '63.6024169921875 + 69.93028259277344', '55.15858459472656 + 129.18702697753906', '66.25399780273438 + 136.5983123779297', '244.0214385986328 + 152.60360717773438', '205.21202087402344 + 28.924270629882812', '17.752227783203125 + 60.75213623046875', '15.243072509765625 + 46.99902

Epoch 1/20:   0%|                                                                                                                      | 0/1000 [00:00<?, ?it/s]


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).