# MLP Metric Learning
This notebook trains a multi-layer perceptron (MLP) with a triplet margin loss to learn which commits are similar, using commit embeddings as both the training data and the model input.

# Load the data

In [23]:
import numpy as np 

# Location of the saved triplet training embeddings.
train_embeddings_path = 'embedding/triplet_train_embeddings.npy'

# NOTE(review): allow_pickle=True lets NumPy unpickle arbitrary Python objects,
# which is only safe for files from a trusted source.
train_embeddings = np.load(train_embeddings_path, allow_pickle=True)

# Carve the leading rows into three consecutive groups of three:
# rows 0-2 -> anchors, rows 3-5 -> positive, rows 6-8 -> negative.
# Presumably the file stores the triplets in exactly this order; the layout is
# not visible from this notebook -- TODO confirm.
anchors, positive, negative = (
    train_embeddings[start:start + 3] for start in (0, 3, 6)
)


## Create training dataset

In [24]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn


class TripleCommitDataset(Dataset):
    """Dataset of (anchor, positive, negative) triplets.

    The three inputs are indexed in parallel: item ``i`` is the tuple
    ``(anchors[i], positive[i], negative[i])``.

    Args:
        anchors: Indexable, sized collection of anchor samples.
        positive: Samples paired with each anchor as the "similar" example.
        negative: Samples paired with each anchor as the "dissimilar" example.

    Raises:
        ValueError: If the three collections do not have the same length.
    """

    def __init__(self, anchors, positive, negative):
        # __len__ only looks at `anchors`, so a length mismatch would otherwise
        # be silently truncated or surface as an IndexError in the middle of
        # an epoch. Fail early with a clear message instead.
        if not (len(anchors) == len(positive) == len(negative)):
            raise ValueError(
                "anchors, positive and negative must have the same length, got "
                f"{len(anchors)}, {len(positive)} and {len(negative)}"
            )
        self.anchors = anchors
        self.positive = positive
        self.negative = negative

    def __len__(self):
        """Return the number of triplets."""
        return len(self.anchors)

    def __getitem__(self, idx):
        """Return the ``(anchor, positive, negative)`` triplet at ``idx``."""
        return self.anchors[idx], self.positive[idx], self.negative[idx]
    
# Wrap the three parallel arrays so each index yields one triplet.
dataset = TripleCommitDataset(anchors=anchors, positive=positive, negative=negative)
# Serve the triplets in shuffled mini-batches of three.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=3)

## Define MLP

In [25]:
class MLPEmbedding(nn.Module):
    """Three-layer MLP that projects an input vector to a smaller embedding.

    Architecture: Linear(input_dim, 512) -> BatchNorm1d -> ReLU ->
    Linear(512, 256) -> BatchNorm1d -> ReLU -> Linear(256, output_dim).

    Args:
        input_dim: Size of each input vector (default 768).
        output_dim: Size of the produced embedding (default 64).
    """

    def __init__(self, input_dim=768, output_dim=64):
        super().__init__()
        # The first and last layers follow the constructor arguments, so the
        # network can be built for other embedding sizes. The defaults
        # reproduce the original 768 -> 64 network.
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, output_dim),
        )

    def forward(self, x):
        """Run ``x`` through the network and return the resulting embedding."""
        return self.net(x)
    
# Seed PyTorch's global RNG before building the model so the random weight
# initialisation (and anything drawn from the global RNG afterwards) is
# repeatable after a kernel restart.
torch.manual_seed(42)
model = MLPEmbedding(768, 64)

## Setup Optimiser and Loss

In [26]:
# Triplet margin loss with a margin of 1.0 between the anchor-positive and
# anchor-negative distances.
triplet_loss = nn.TripletMarginLoss(margin=1.0)
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

## Train the MLP

In [27]:
# Number of full passes over the dataloader.
epochs = 10
for epoch in range(epochs):
    model.train()
    # Sum of the per-batch losses for this epoch (not an average).
    total_loss = 0.0
    # The batch variables deliberately do NOT reuse the notebook-level names
    # `positive` / `negative`: rebinding those would replace the source arrays
    # with the last batch, so re-running the dataset cell afterwards would
    # silently wrap the wrong data.
    for anchor_batch, positive_batch, negative_batch in dataloader:
        optimizer.zero_grad()

        # Embed all three members of the triplet with the same network.
        anchor_out = model(anchor_batch)
        positive_out = model(positive_batch)
        negative_out = model(negative_batch)

        loss_value = triplet_loss(anchor_out, positive_out, negative_out)
        loss_value.backward()
        optimizer.step()

        total_loss += loss_value.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

Epoch 1/10, Loss: 1.9232
Epoch 2/10, Loss: 1.2317
Epoch 3/10, Loss: 0.3430
Epoch 4/10, Loss: 0.0424
Epoch 5/10, Loss: 0.0000
Epoch 6/10, Loss: 0.0000
Epoch 7/10, Loss: 0.0781
Epoch 8/10, Loss: 0.0443
Epoch 9/10, Loss: 0.0000
Epoch 10/10, Loss: 0.0000


## Load Test Data
# Location of the saved test embeddings.
test_embeddings_path = 'embedding/embeddings.npy'
# Load from the test path. The original passed `train_embeddings_path` here,
# so the "test" data was actually the training triplets.
# NOTE(review): allow_pickle=True lets NumPy unpickle arbitrary Python objects,
# which is only safe for files from a trusted source.
test_embeddings = np.load(test_embeddings_path, allow_pickle=True)