In [None]:



class Time_Series_GPT(nn.Module):
    """GPT-style model over windows of continuous feature vectors.

    Inputs are 3-D tensors ``(B, T, num_features)``. A three-layer MLP lifts
    every timestep to ``n_embd`` dimensions, a learned positional embedding is
    added, and the result goes through ``n_layer`` ``Block`` modules, a final
    LayerNorm and a linear head that projects back to ``num_features``.

    ``params_obj`` must expose ``output_size``, ``seq_length``, ``input_size``,
    ``num_features``, ``device``, ``block_size``, ``n_embd``, ``n_head`` and
    ``n_layer``.
    """

    def __init__(self, params_obj):
        super().__init__()
        self.output_size  = params_obj.output_size
        self.seq_length   = params_obj.seq_length
        self.input_size   = params_obj.input_size
        self.num_features = params_obj.num_features
        self.device       = params_obj.device
        self.block_size   = params_obj.block_size
        self.MyName       = "The GPT model"

        # Every layer that feeds the transformer must agree on this width, so
        # it is read from the config instead of being hard-coded to 512.
        n_embd = params_obj.n_embd

        # One learned vector per position; only positions < block_size exist.
        self.pos_emb_table = nn.Embedding(params_obj.block_size, n_embd)

        self.blocks = nn.Sequential(
            *[Block(n_embd, params_obj.n_head, params_obj.block_size)
              for _ in range(params_obj.n_layer)]
        )

        self.ln_f        = nn.LayerNorm(n_embd)
        self.lm_ffw_head = nn.Linear(n_embd, self.num_features)   ## n_embd -> num_features

        #######################################################################
        # Single-layer projection. Not used by forward(); kept so the module's
        # parameter set (and therefore its state_dict keys) stays unchanged.
        self.map_24_512 = nn.Linear(self.num_features, n_embd)
        self.map_act    = nn.ReLU()

        #######################################################################
        # Projection MLP actually used by forward():
        # num_features -> 100 -> 200 -> n_embd
        self.map_24_512_1 = nn.Linear(self.num_features, 100)
        self.map_act1     = nn.ReLU()
        self.LayerNorm1   = nn.LayerNorm(100)

        self.map_24_512_2 = nn.Linear(100, 200)
        self.map_act2     = nn.ReLU()
        self.LayerNorm2   = nn.LayerNorm(200)

        self.map_24_512_3 = nn.Linear(200, n_embd)

        # The same dropout module is applied after both hidden activations.
        self.dropout_24_512 = nn.Dropout(0.2)

    def forward(self, idx, targets=None):
        """Return per-timestep predictions for a window.

        Args:
            idx: tensor of shape ``(B, T, num_features)`` with ``T <= block_size``
                (the positional table has only ``block_size`` rows).
            targets: accepted for call compatibility; never read.

        Returns:
            Tensor of shape ``(B, T, num_features)``.
        """
        T = idx.shape[1]

        ############################################################
        # Projection MLP: (B, T, num_features) -> (B, T, n_embd)
        h = self.map_24_512_1(idx)
        h = self.map_act1(h)
        h = self.dropout_24_512(h)
        h = self.LayerNorm1(h)

        h = self.map_24_512_2(h)
        h = self.map_act2(h)
        h = self.dropout_24_512(h)
        h = self.LayerNorm2(h)

        tok_emb = self.map_24_512_3(h)

        ############################################################
        # Build positions on the input's own device so the model keeps working
        # after being moved with .to(...) independently of params_obj.device.
        pos_emb = self.pos_emb_table(torch.arange(T, device=tok_emb.device))   ## (T, n_embd)

        x = tok_emb + pos_emb              ## broadcast over the batch dimension

        ############################################################
        x = self.blocks(x)                 ## (B, T, n_embd)
        x = self.ln_f(x)                   ## (B, T, n_embd)
        logits = self.lm_ffw_head(x)       ## (B, T, num_features)
        return logits

    def generate(self, idx, max_new_tokens):
        """Autoregressively extend ``idx`` by ``max_new_tokens`` timesteps.

        Each iteration feeds the last ``block_size`` timesteps through the
        model and appends the prediction for the final position. The outputs
        are continuous values, so nothing is sampled.

        Args:
            idx: tensor of shape ``(B, T, num_features)``.
            max_new_tokens: number of timesteps to append.

        Returns:
            Tensor of shape ``(B, T + max_new_tokens, num_features)``.
        """
        print("max tokens ", max_new_tokens)
        print(idx.shape)
        for _ in range(max_new_tokens):
            ## crop idx to the last block_size timesteps
            idx_cond = idx[:, -self.block_size:, :]
            logits = self(idx_cond)
            # Slice with -1: (not -1) to keep the time axis, giving (B, 1, F).
            # The previous `[:, -1, :]` + `unsqueeze(0)` produced (1, B, F),
            # which only concatenates correctly when B == 1.
            next_step = logits[:, -1:, :]
            idx = torch.cat((idx, next_step), dim=1)   ## (B, T+1, F)

        return idx



In [None]:

def forward(self, idx, targets=None, reasoning_steps=1):
    """Forward pass supporting single-shot and iterative ("reasoning") modes.

    Written as a free function that takes ``self``: it reads the projection
    layers, ``pos_emb_table``, ``blocks``, ``ln_f``, ``lm_ffw_head`` and
    ``block_size`` from the object it is given.

    Args:
        idx (Tensor): input of shape ``(B, T, F)``.
        targets (Tensor or None): if given, an MSE loss against the final
            prediction is returned as well.
        reasoning_steps (int): number of forward passes, must be >= 1. After
            every pass except the last, the newest prediction is appended to
            the context (detached, so gradients only flow through the final
            pass).

    Returns:
        ``final_pred`` of shape ``(B, F)``, or ``(final_pred, loss)`` when
        ``targets`` is not None.

    Raises:
        ValueError: if ``reasoning_steps`` is smaller than 1.
    """
    if reasoning_steps < 1:
        # Without at least one pass there is no prediction to return; fail
        # loudly instead of hitting an unbound local further down.
        raise ValueError(f"reasoning_steps must be >= 1, got {reasoning_steps}")

    # No copy is needed: slicing and torch.cat never write into `idx`.
    working_idx = idx
    last_pred = None

    for step in range(reasoning_steps):
        # Keep at most the last block_size timesteps as context.
        idx_cond = working_idx[:, -self.block_size:, :]

        # === Begin Core Forward ===
        # Projection MLP
        x = self.map_24_512_1(idx_cond)
        x = self.map_act1(x)
        x = self.dropout_24_512(x)
        x = self.LayerNorm1(x)

        x = self.map_24_512_2(x)
        x = self.map_act2(x)
        x = self.dropout_24_512(x)
        x = self.LayerNorm2(x)

        x = self.map_24_512_3(x)

        # Positions follow the *current* context length and live on the same
        # device as the activations.
        positions = torch.arange(x.shape[1], device=x.device)
        x = x + self.pos_emb_table(positions)

        # Transformer blocks
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_ffw_head(x)
        # === End Core Forward ===

        # Prediction at the newest position, time axis kept: (B, 1, F)
        last_pred = logits[:, -1:, :]

        # Feed the prediction back in for every pass except the final one.
        if step < reasoning_steps - 1:
            working_idx = torch.cat((working_idx, last_pred.detach()), dim=1)

    final_pred = last_pred.squeeze(1)  # (B, F)

    if targets is not None:
        loss = F.mse_loss(final_pred, targets)
        return final_pred, loss

    return final_pred




## Merge the iterative `forward` above into the `Time_Series_GPT` class (merged version below)


In [None]:



class Time_Series_GPT(nn.Module):
    """GPT-style model over windows of continuous feature vectors, with an
    optional iterative ("reasoning") forward mode.

    Inputs are 3-D tensors ``(B, T, num_features)``. A three-layer MLP lifts
    every timestep to ``n_embd`` dimensions, a learned positional embedding is
    added, and the result goes through ``n_layer`` ``Block`` modules, a final
    LayerNorm and a linear head that projects back to ``num_features``.

    ``params_obj`` must expose ``output_size``, ``seq_length``, ``input_size``,
    ``num_features``, ``device``, ``block_size``, ``n_embd``, ``n_head`` and
    ``n_layer``.
    """

    def __init__(self, params_obj):
        super().__init__()
        self.output_size  = params_obj.output_size
        self.seq_length   = params_obj.seq_length
        self.input_size   = params_obj.input_size
        self.num_features = params_obj.num_features
        self.device       = params_obj.device
        self.block_size   = params_obj.block_size
        self.MyName       = "The GPT model"

        # Every layer that feeds the transformer must agree on this width, so
        # it is read from the config instead of being hard-coded to 512.
        n_embd = params_obj.n_embd

        # One learned vector per position; only positions < block_size exist.
        self.pos_emb_table = nn.Embedding(params_obj.block_size, n_embd)

        self.blocks = nn.Sequential(
            *[Block(n_embd, params_obj.n_head, params_obj.block_size)
              for _ in range(params_obj.n_layer)]
        )

        self.ln_f        = nn.LayerNorm(n_embd)
        self.lm_ffw_head = nn.Linear(n_embd, self.num_features)   ## n_embd -> num_features

        #######################################################################
        # Single-layer projection. Not used by forward(); kept so the module's
        # parameter set (and therefore its state_dict keys) stays unchanged.
        self.map_24_512 = nn.Linear(self.num_features, n_embd)
        self.map_act    = nn.ReLU()

        #######################################################################
        # Projection MLP actually used by forward():
        # num_features -> 100 -> 200 -> n_embd
        self.map_24_512_1 = nn.Linear(self.num_features, 100)
        self.map_act1     = nn.ReLU()
        self.LayerNorm1   = nn.LayerNorm(100)

        self.map_24_512_2 = nn.Linear(100, 200)
        self.map_act2     = nn.ReLU()
        self.LayerNorm2   = nn.LayerNorm(200)

        self.map_24_512_3 = nn.Linear(200, n_embd)

        # The same dropout module is applied after both hidden activations.
        self.dropout_24_512 = nn.Dropout(0.2)

    def _core_forward(self, idx_cond):
        """Run one pass: ``(B, t, num_features)`` -> ``(B, t, num_features)``, ``t <= block_size``."""
        x = self.map_24_512_1(idx_cond)
        x = self.map_act1(x)
        x = self.dropout_24_512(x)
        x = self.LayerNorm1(x)

        x = self.map_24_512_2(x)
        x = self.map_act2(x)
        x = self.dropout_24_512(x)
        x = self.LayerNorm2(x)

        x = self.map_24_512_3(x)                     ## (B, t, n_embd)

        # Positions must match the length of *this* context, which differs
        # from the caller's original T once the window is cropped or grown.
        positions = torch.arange(x.shape[1], device=x.device)
        x = x + self.pos_emb_table(positions)        ## broadcast over batch

        x = self.blocks(x)                           ## (B, t, n_embd)
        x = self.ln_f(x)                             ## (B, t, n_embd)
        return self.lm_ffw_head(x)                   ## (B, t, num_features)

    def forward(self, idx, targets=None, reasoning_steps=1, backprop_through_steps=False):
        """Predict the next timestep, optionally after several self-fed passes.

        Args:
            idx: tensor of shape ``(B, T, num_features)``; only the last
                ``block_size`` timesteps are used as context.
            targets: accepted for call compatibility; never read. No loss is
                computed here, callers compute it from the returned tensor.
            reasoning_steps: number of passes, must be >= 1. After every pass
                except the last, the newest prediction is appended to the
                context.
            backprop_through_steps: if True the appended predictions stay in
                the autograd graph; otherwise they are detached, so gradients
                only flow through the final pass.

        Returns:
            Tensor of shape ``(B, num_features)``: the prediction at the last
            position of the final pass.

        Raises:
            ValueError: if ``reasoning_steps`` is smaller than 1.
        """
        if reasoning_steps < 1:
            # Without at least one pass there is no prediction to return.
            raise ValueError(f"reasoning_steps must be >= 1, got {reasoning_steps}")

        # No copy is needed: slicing and torch.cat never write into `idx`.
        working_idx = idx
        last_pred = None

        for step in range(reasoning_steps):
            # Keep at most the last block_size timesteps as context.
            idx_cond = working_idx[:, -self.block_size:, :]

            logits = self._core_forward(idx_cond)

            # Prediction at the newest position, time axis kept: (B, 1, F)
            last_pred = logits[:, -1:, :]

            # Feed the prediction back in for every pass except the final one.
            if step < reasoning_steps - 1:
                fed_back = last_pred if backprop_through_steps else last_pred.detach()
                working_idx = torch.cat((working_idx, fed_back), dim=1)

        return last_pred.squeeze(1)   # (B, F)

    def generate(self, idx, max_new_tokens):
        """Autoregressively extend ``idx`` by ``max_new_tokens`` timesteps.

        Args:
            idx: tensor of shape ``(B, T, num_features)``.
            max_new_tokens: number of timesteps to append.

        Returns:
            Tensor of shape ``(B, T + max_new_tokens, num_features)``.
        """
        print("max tokens ", max_new_tokens)
        print(idx.shape)
        for _ in range(max_new_tokens):
            ## crop idx to the last block_size timesteps
            idx_cond = idx[:, -self.block_size:, :]
            # forward() already returns only the newest step as (B, F);
            # restore the time axis so it can be appended along dim=1.
            next_step = self(idx_cond).unsqueeze(1)    ## (B, 1, F)
            idx = torch.cat((idx, next_step), dim=1)   ## (B, T+1, F)

        return idx




In [None]:

# Usage sketch: relies on `model`, `idx` and `targets` already existing in the
# kernel. Both variants assign the same names, so only the second pair of
# values survives when the cell is run as a whole.

# Variant 1: fed-back predictions are detached, gradients reach the last pass only
output = model(idx, backprop_through_steps=False, reasoning_steps=10)
loss = F.mse_loss(input=output, target=targets)

# Variant 2: gradients flow through the whole reasoning chain (heavier)
output = model(idx, backprop_through_steps=True, reasoning_steps=10)
loss = F.mse_loss(input=output, target=targets)




## Training loop: computing the loss with the reasoning-mode forward


In [None]:


def function_train_rc(train_CIVS_2, reasoning_steps=1, backprop_through_steps=False):
    """Train a fresh model with the reasoning-mode forward pass.

    Reads configuration and helpers from the module-level ``tsGPT_obj``:
    ``device``, ``learning_rate``, ``max_iters``, ``standardize_x_scales``,
    ``get_batch`` and ``plot_losses_training``.

    Args:
        train_CIVS_2: raw training data handed to
            ``tsGPT_obj.standardize_x_scales``.
        reasoning_steps: forwarded to the model on every iteration.
        backprop_through_steps: forwarded to the model on every iteration.

    Returns:
        ``(model, history, x_means, x_standard_devs, train_CIVS_tr_scaled)``.
    """
    # === Model Setup ===
    # NOTE(review): `Time_Series_GPT.Time_Series_GPT` only resolves if
    # `Time_Series_GPT` is a module (or namespace) holding the class. If the
    # class is defined directly in this notebook this must be
    # `Time_Series_GPT(tsGPT_obj)` - confirm.
    model = Time_Series_GPT.Time_Series_GPT(tsGPT_obj).to(tsGPT_obj.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=tsGPT_obj.learning_rate)
    criterion = nn.MSELoss()

    # === History ===
    # Only 'loss', 'loss_A', 'loss_B', 'loss_C', 'loss_SIM0' and 'loss_SIM1'
    # are appended to below; the remaining lists stay empty.
    history = {
        'loss': [], 'test_loss': [],
        'loss_A': [], 'loss_B': [], 'loss_C': [],
        'loss_SIM0': [], 'loss_SIM1': [], 'loss_SIM2': [],
        'loss_SIM3': [], 'loss_SIM4': [], 'loss_SIM5': []
    }

    # === Data Scaling ===
    train_CIVS_tr, x_means, x_standard_devs = tsGPT_obj.standardize_x_scales(train_CIVS_2)
    train_CIVS_tr_scaled = (train_CIVS_tr - x_means) / x_standard_devs

    model.train()

    for batch_i in range(tsGPT_obj.max_iters):
        xb, yb = tsGPT_obj.get_batch(train_CIVS_tr_scaled)  # 3-D tensors, unpacked as (B, T, E) below

        # === Forward Pass with Reasoning Mode ===
        # A stale extra call `logits = model(xb, yb)` used to sit here. It was
        # mis-indented (the cell did not parse) and unpacked a 3-D shape from
        # what is now a 2-D output, so it has been removed.
        pred = model(
            xb,
            targets=None,
            reasoning_steps=reasoning_steps,
            backprop_through_steps=backprop_through_steps
        )  # pred: [B, F] (only the last timestep)

        # === Match dimensions for loss ===
        B, T, E = xb.shape
        y_train = yb[:, -1, :]  # last timestep only
        pred = pred.view(B, E)

        # NOTE(review): the prediction is always compared with the last `yb`
        # timestep, whatever `reasoning_steps` is - confirm this is the
        # intended target when reasoning_steps > 1.
        loss_SI  = criterion(pred[:, 2], y_train[:, 2])       # loss on feature index 2 only
        loss_all = criterion(pred, y_train)                   # loss over all features (logged only)

        # Only the feature-2 loss drives the optimiser.
        loss_total = loss_SI

        # === Backprop ===
        optimizer.zero_grad(set_to_none=True)
        loss_total.backward()
        optimizer.step()

        # === Logging ===
        history['loss'].append(loss_total.item())
        history['loss_A'].append(loss_SI.item())
        history['loss_B'].append(loss_all.item())
        history['loss_C'].append(0)

        # Placeholders
        history['loss_SIM0'].append(0)
        history['loss_SIM1'].append(0)

        # === Debug Print ===
        if batch_i % 500 == 0:
            print(f"[{batch_i}] Train Loss: {loss_total.item():.6f}")
            print("Loss_SI:", loss_SI.item(), "Loss_all:", loss_all.item())
            print("Prediction shape:", pred.shape)
            print("===")

    tsGPT_obj.plot_losses_training(history)
    return model, history, x_means, x_standard_devs, train_CIVS_tr_scaled




In [None]:

# Train with a 10-pass reasoning chain and gradients flowing through every
# pass; `train_CIVS_2` must already exist in the kernel.
model, history, x_means, x_std, scaled_data = function_train_rc(
    train_CIVS_2, backprop_through_steps=True, reasoning_steps=10
)

