In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torchvision.transforms as transforms
from tqdm.notebook import tqdm
from IPython.display import clear_output
from matplotlib.pyplot import imshow,show

In [None]:
def find_smallest_indices(arr, k=100):#chatgpt
  """Return, for every row of ``arr``, the column indices of its ``k`` smallest values.

  Args:
    arr: 2-D array; each row is ranked independently.
    k: number of indices to keep per row. It is clamped to the number of
      columns, so a matrix narrower than ``k`` yields fewer indices per row
      instead of raising a broadcasting ``ValueError``.

  Returns:
    Integer array of shape ``(num_rows, min(k, num_cols))``. Row ``i`` lists
    column indices of ``arr[i]`` ordered from smallest to largest value.
  """
  arr = np.asarray(arr)
  num_rows = arr.shape[0]
  k = min(k, arr.shape[1])
  smallest_indices = np.zeros((num_rows, k), dtype=int)

  # Rows are sorted one at a time so only a single row of sort indices is
  # alive at once (the full argsort of a large square matrix would be huge).
  for i in tqdm(range(num_rows)):
    smallest_indices[i] = np.argsort(arr[i])[:k]

  return smallest_indices

In [3]:
NUM_EPOCHS = 40
BATCH_SIZE = 128
SEED = 42

# Seed the global NumPy and torch generators so that a
# Restart-and-Run-All starts from the same random state each time.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device preference: CUDA, then Apple MPS, then CPU. The MPS backend is
# looked up defensively because older torch builds do not expose
# `torch.backends.mps` at all.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    DEVICE = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
DEVICE

'cuda'

In [None]:
train_data = np.load("train_data.npy")
train_starts = pd.read_csv("train_starts.csv")
val_data = np.load("val_data.npy")
val_starts = pd.read_csv("val_starts.csv")
test_data = np.load("test_data.npy")
test_ids = np.load("test_ids.npy")

In [None]:
train_starts = np.array(train_starts["0"]).astype(int)
train_starts

In [None]:
val_starts = np.array(val_starts["0"]).astype(int)
val_starts

In [6]:
# Normalisation statistics, hard-coded as literals. The notebook previously
# carried (commented out) `np.mean(train_data)` / `np.std(train_data)` here;
# presumably these literals are the cached result of that computation —
# re-derive them if the training data changes.
transform_mean, transform_std = -20.218616, 14.000551

In [8]:
class known_dataset(Dataset):
    """Triplet dataset over samples that are grouped into contiguous cliques.

    ``starts`` must be sorted ascending and hold the first row index of every
    clique followed by one closing boundary, so clique ``c`` covers rows
    ``starts[c] .. starts[c + 1] - 1`` of ``data``.

    Indexing returns the transformed anchor, a positive (another row of the
    anchor's clique) and a negative (a row of any other clique) concatenated
    along dim 0.
    """

    def __init__(self, data, starts):
        self.data = data
        self.starts = starts
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([transform_mean],[transform_std])
        ])

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Build the (anchor, positive, negative) triplet for row ``idx``.

        Both partners are drawn uniformly without rejection loops, so the
        call always terminates.

        Raises:
            IndexError: if ``idx`` lies outside the range covered by ``starts``.
            ValueError: if the anchor's clique spans the whole dataset, i.e.
                no negative sample exists.
        """
        clique_idx = self.getcliqueidx(idx)
        lo = int(self.starts[clique_idx])
        hi = int(self.starts[clique_idx + 1])
        clique_size = hi - lo

        if clique_size > 1:
            # Draw from the clique with one slot removed, then step over the
            # anchor: uniform over every clique member except ``idx``.
            positive_idx = np.random.randint(lo, hi - 1)
            if positive_idx >= idx:
                positive_idx += 1
        else:
            # Single-member clique: there is no other positive (the former
            # retry loop never terminated here), so fall back to the anchor.
            positive_idx = idx

        outside = len(self) - clique_size
        if outside <= 0:
            raise ValueError("no negative sample available: the clique covers the whole dataset")
        # Draw from the rows outside the clique, then step over the clique.
        negative_idx = np.random.randint(outside)
        if negative_idx >= lo:
            negative_idx += clique_size

        anchor = self.transform(self.data[idx])
        positive = self.transform(self.data[positive_idx])
        negative = self.transform(self.data[negative_idx])
        # torch.cat rather than its alias torch.concatenate, which only
        # exists on newer torch releases.
        return torch.cat((anchor, positive, negative), dim=0)

    def getcliqueidx(self, idx):
        """Return the index of the clique that contains row ``idx``.

        Uses a binary search over the sorted ``starts`` array.

        Raises:
            IndexError: if ``idx`` is not inside ``[starts[0], starts[-1])``.
        """
        starts = np.asarray(self.starts)
        if not (starts[0] <= idx < starts[-1]):
            raise IndexError(f"index {idx} is outside the range covered by the clique boundaries")
        # side="right" gives the first boundary strictly greater than idx;
        # the clique that owns idx is the one just before it.
        return int(np.searchsorted(starts, idx, side="right")) - 1

In [9]:
# Pair each data array with its clique boundaries.
train_dataset = known_dataset(data=train_data, starts=train_starts)
val_dataset = known_dataset(data=val_data, starts=val_starts)

In [10]:
# Shuffled mini-batches of training triplets.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [7]:
class NN(nn.Module):
    """Convolutional encoder: five strided conv stages and a two-layer head.

    The per-stage sizes noted in ``__init__`` are for a 1x84x50 input; the
    head maps the flattened 4096 features to a 256-dimensional output.
    """

    def __init__(self):
        super().__init__()
        # Sizes after each stage, starting from a 1x84x50 input.
        self.conv1 = self.ConvBlock(in_features=1, out_features=16, kernel_size=4, stride=2, padding=(1, 2))        # 16x42x26
        self.conv2 = self.ConvBlock(in_features=16, out_features=64, kernel_size=3, stride=(3, 2), padding=(1, 2))  # 64x14x14
        self.conv3 = self.ConvBlock(in_features=64, out_features=256, kernel_size=4, stride=2, padding=1)           # 256x7x7
        self.conv4 = self.ConvBlock(in_features=256, out_features=1024, kernel_size=4, stride=2, padding=1)         # 1024x3x3
        self.conv5 = self.ConvBlock(in_features=1024, out_features=4096, kernel_size=3, stride=2, padding=0)        # 4096x1x1
        self.flat = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=4096, out_features=1024),
            nn.LeakyReLU(negative_slope=0.1),
            nn.Linear(in_features=1024, out_features=256),
        )

    def ConvBlock(self, in_features, out_features, kernel_size, stride, padding):
        """Return a Conv2d -> BatchNorm2d -> LeakyReLU(0.1) stage."""
        layers = [
            nn.Conv2d(in_features, out_features, kernel_size=kernel_size, stride=stride, padding=padding),
            nn.BatchNorm2d(num_features=out_features),
            nn.LeakyReLU(negative_slope=0.1),
        ]
        return nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the five conv stages in order, then the head."""
        for stage in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            x = stage(x)
        return self.flat(x)

In [12]:
# Network on the selected device, optimised with Adam.
model = NN()
model = model.to(DEVICE)
opt = optim.Adam(params=model.parameters(), lr=3e-5)

In [8]:
# Stand-alone preprocessing for samples fed to the model outside the dataset
# class: tensor conversion, then normalisation with the notebook-level
# mean/std constants.
transform = transforms.Compose(
    transforms=[
        transforms.ToTensor(),
        transforms.Normalize(mean=[transform_mean], std=[transform_std]),
    ]
)

In [14]:
# StepLR: scale the learning rate by sqrt(0.1) after every 5 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(
    optimizer=opt,
    step_size=5,
    gamma=0.1 ** 0.5,
)

In [15]:
def _cosine_distance(x, y):
    """Cosine distance: one minus the cosine similarity of ``x`` and ``y``."""
    return 1 - F.cosine_similarity(x, y)


# Two triplet objectives: the default-distance margin loss, and a margin loss
# over cosine distance.
tml = nn.TripletMarginLoss(margin=3)
tmwdl = nn.TripletMarginWithDistanceLoss(
    distance_function=_cosine_distance,
    margin=0.1,
)

In [16]:
# First epoch index used by the training loop's range(start_epoch, NUM_EPOCHS);
# presumably raised by hand when resuming a partially trained model (TODO confirm).
start_epoch = 0

In [None]:
for epoch in range(start_epoch, NUM_EPOCHS):
    # ---- training pass ----
    model.train()
    with tqdm(train_loader) as TQDM:
        for batch in TQDM:
            batch = batch.to(DEVICE)
            # Each item stacks anchor / positive / negative along dim 1;
            # slicing with i:i+1 keeps the channel axis the network expects.
            anchor = batch[:, 0:1]
            positive = batch[:, 1:2]
            negative = batch[:, 2:3]

            # Embed every branch once and reuse the result for both losses.
            # Calling the model separately per loss doubled the forward cost
            # and updated the BatchNorm running statistics twice per step.
            emb_anchor = model(anchor)
            emb_positive = model(positive)
            emb_negative = model(negative)
            loss = tml(emb_anchor, emb_positive, emb_negative) + tmwdl(emb_anchor, emb_positive, emb_negative)

            opt.zero_grad()
            loss.backward()
            opt.step()

            TQDM.set_postfix({"epoch": epoch,"loss": loss.item()})

    # The StepLR schedule only takes effect if it is advanced; do so once
    # per epoch, after that epoch's optimizer steps.
    scheduler.step()

    # ---- validation pass: clique retrieval among the nearest neighbours ----
    model.eval()
    embs = torch.zeros((val_data.shape[0], 256), device=DEVICE)
    with torch.no_grad():
        for i in tqdm(range(val_data.shape[0])):
            embs[i] = model(transform(val_data[i]).unsqueeze(0).to(DEVICE))
        val_dist = torch.cdist(embs, embs, p=2).cpu().numpy()

    # Mask self-distances so a query cannot retrieve itself; the query is
    # likewise removed from its own clique below. This mirrors the diagonal
    # masking applied before building the test submission.
    diag = np.arange(val_dist.shape[0])
    val_dist[diag, diag] = np.inf

    nearest = find_smallest_indices(val_dist)
    top_k = nearest.shape[1]
    scores = []
    for i in range(nearest.shape[0]):
        clique_idx = val_dataset.getcliqueidx(i)
        clique = set(range(val_starts[clique_idx], val_starts[clique_idx + 1])) - {i}
        if not clique:
            # A single-member clique has nothing to retrieve; skip it rather
            # than divide by zero.
            continue
        retrieved = set(nearest[i].tolist())
        scores.append(len(clique & retrieved) / min(len(clique), top_k))
    accuracy = sum(scores) / len(scores) if scores else float("nan")
    print(accuracy)

In [None]:
# Embed the test samples one at a time in eval mode, then compute the full
# pairwise L2 distance matrix between the embeddings.
model.eval()
embs = torch.zeros((test_data.shape[0], 256), device=DEVICE)
with torch.no_grad():
    for i, sample in enumerate(tqdm(test_data)):
        embs[i] = model(transform(sample).unsqueeze(0).to(DEVICE))
    map = torch.cdist(embs, embs, p=2)

In [None]:
# Move the distance matrix to host memory as a NumPy array. The guard makes
# the cell re-runnable: once `map` is already an ndarray the conversion is
# skipped instead of failing on the missing `.cpu()` attribute.
if isinstance(map, torch.Tensor):
    map = map.cpu().numpy()
map.shape

In [26]:
# Set every self-distance to +inf so a sample is never its own neighbour.
diag = np.arange(map.shape[0])
map[diag, diag] = np.inf

In [None]:
# For each test sample, the 100 columns with the smallest distance.
indices_of_100_smallest = find_smallest_indices(arr=map, k=100)

In [None]:
# One text line per test sample: its own id followed by the ids of its 100
# nearest neighbours, space-separated and newline-terminated.
result = []
for i in tqdm(range(map.shape[0])):
    neighbour_ids = [
        str(test_ids[indices_of_100_smallest[i, j]].item())
        for j in range(100)
    ]
    fields = [str(test_ids[i])] + neighbour_ids
    result.append(' '.join(fields) + '\n')

In [None]:
# Write the prepared lines. The context manager closes the file even if
# writing raises, which the bare open()/close() pair did not guarantee.
with open("submission.txt",'w') as f:
    f.writelines(result)