In [None]:
# Mount Google Drive at /content/drive so the project files read below are reachable.
# This only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Setup

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import seaborn as sns
import os
import os.path as op
from pathlib   import Path
from glob      import glob
from tqdm      import tqdm
from datetime  import datetime
import random

import csv
import json
import numpy as np
import matplotlib.pyplot as plt

import torchvision.transforms as transforms

In [None]:
###################################################
# !!! Remember to look at data_dictionary.txt !!! #
###################################################


# Set your target file here #######################

# track_data maps each track UID to a dict of per-track fields.
with open("/content/drive/MyDrive/CS155Proj1/train.json", 'r') as f:
    track_data = json.load(f)

###################################################


# How many tracks are there?
print(f"n_tracks = {len(track_data.keys())}")

# What do the track Unique IDs (UIDs) look like?
track_uids = list(track_data.keys())
print(f"5 Example Track IDs = {track_uids[:5]}")

# What fields are available for each track?
example_uid = track_uids[0]
print(f"Per-track keys = {track_data[example_uid].keys()}")

# What do the (t, x, y) track coordinates look like?
example_coords = track_data[track_uids[0]]['txy']
example_coords = np.array(example_coords)
# NOTE: this changes NumPy's print options for the rest of the session
# (arrays with more than 10 elements are summarised with "...").
np.set_printoptions(threshold=10)
print(f"Coordinate array = \n{example_coords}")

# What does the label look like?
example_label = track_data[track_uids[0]]['label']
print(f"Label = {example_label}")

n_tracks = 16080
5 Example Track IDs = ['lab_0_0', 'lab_0_1', 'lab_0_2', 'lab_0_3', 'lab_0_4']
Per-track keys = dict_keys(['txy', 'label'])
Coordinate array = 
[[  1.    184.166 463.817]
 [  2.    183.941 463.692]
 [  3.    183.716 463.567]
 ...
 [299.    146.777 448.634]
 [300.    146.795 448.518]
 [301.    146.813 448.402]]
Label = 0


In [None]:
# Label of the first track in track_data (materialises the full item list just to read one entry).
list(track_data.items())[0][1]['label']

0

# Dataset

In [None]:
# data augmentation

def rotate(coords, angle):
    """Rotate 2-D points about the origin by `angle` radians.

    Args:
        coords: array of shape (n, 2) holding one (x, y) point per row.
        angle: rotation angle in radians (counter-clockwise in an
            x-right / y-up frame).

    Returns:
        Array of shape (n, 2) with the rotated points.
    """
    cos_a = np.cos(angle)
    sin_a = np.sin(angle)
    rot = np.array([[cos_a, -sin_a], [sin_a, cos_a]])
    # Points are stored as row vectors, so multiply by the transpose.
    return np.matmul(coords, np.transpose(rot))

def stride_time(coords, factor):
    """Subsample a track in time, keeping every `factor`-th row from row 0.

    Args:
        coords: 2-D array with one time step per row.
        factor: integer-valued step (e.g. 2 or 2.0); a non-integer value
            trips the assertion.

    Returns:
        The rows ``0, factor, 2*factor, ...`` of `coords`.
    """
    step = int(factor)
    assert step == factor
    return coords[::step, :]

def stretch_space(coords, factor):
    """Scale every coordinate of a track by `factor` (about the origin).

    Args:
        coords: array of point coordinates.
        factor: multiplicative scale applied element-wise.

    Returns:
        The scaled coordinates.
    """
    scaled = coords * factor
    return scaled

def flip_time(coords):
    """Reverse a track in time while keeping its starting point.

    The rows are reversed, then the whole track is translated so that the
    new first row equals the original first row.

    Args:
        coords: 2-D array with one time step per row.

    Returns:
        Array of the same shape holding the time-reversed, re-anchored track.
    """
    backwards = np.flip(coords, axis=0)
    # After the flip the track starts at the old end point; move it to the origin...
    at_origin = backwards - coords[-1, :]
    # ...and then back to the original start.  # start at same point
    return at_origin + coords[0, :]

def flip_space(coords):
    """Mirror a track over the y axis by negating its x coordinate.

    Only columns 0 (x) and 1 (y) are used; the result always has two columns.

    Args:
        coords: 2-D array whose first two columns are x and y.

    Returns:
        Array of shape (n, 2) with x negated and y unchanged.
    """
    x_col = coords[:, 0:1]
    y_col = coords[:, 1:2]
    return np.concatenate((-x_col, y_col), axis=1)

def augment(coords, rotate_angles=(), stride_time_factors=(), stretch_space_factors=(), flip_t=False, flip_s=False):
    """Expand a list of tracks with rotations, time strides, stretches and flips.

    The stages run in a fixed order: rotate, stride_time, stretch_space,
    flip_time, flip_space. Each stage is applied to everything accumulated so
    far and its results are appended, so the originals are always kept and the
    output size multiplies: n * (1 + len(rotate_angles))
    * (1 + len(stride_time_factors)) * (1 + len(stretch_space_factors))
    * (2 if flip_t else 1) * (2 if flip_s else 1).

    Args:
        coords: iterable of (n_steps, 2) arrays, or a single (n_steps, 2)
            array, which is treated as a one-track list.
        rotate_angles: angles in radians passed to `rotate`.
        stride_time_factors: integer steps passed to `stride_time`; strided
            copies are shorter than their source tracks.
        stretch_space_factors: scale factors passed to `stretch_space`.
        flip_t: if True, add a time-reversed copy of every track.
        flip_s: if True, add an x-mirrored copy of every track.

    Returns:
        A new list of float32 arrays, with the input tracks first in their
        original order. The caller's `coords` container is left unmodified.
    """
    if isinstance(coords, np.ndarray) and coords.ndim == 2:
        coords = [coords]

    # Work on a copy: extending `coords` itself would silently grow the
    # caller's list with every augmented sample.
    aug_list = list(coords)

    # Each comprehension is fully built before `extend` runs, so a stage only
    # sees the samples that existed when the stage started.
    aug_list.extend([rotate(c, angle) for angle in rotate_angles for c in aug_list])
    aug_list.extend([stride_time(c, factor) for factor in stride_time_factors for c in aug_list])
    aug_list.extend([stretch_space(c, factor) for factor in stretch_space_factors for c in aug_list])

    if flip_t:
        aug_list.extend([flip_time(c) for c in aug_list])
    if flip_s:
        aug_list.extend([flip_space(c) for c in aug_list])

    return [x.astype(np.float32) for x in aug_list]


In [None]:
class CnnDataset(torch.utils.data.Dataset):
    """Fixed-length crops of (x, y) tracks, optionally augmented.

    Every input track is turned into one or more windows of `crop_size`
    steps, each translated so its first point is the origin. Tracks shorter
    than `crop_size` are extended by chaining translated copies of
    themselves; longer tracks are cut into evenly spaced windows (target
    spacing 10 steps for label 1 or unlabeled tracks, 30 otherwise).

    The `test` flag selects the indexing mode and may be toggled after
    construction:
      * test=False: one item per crop -> {'id', 'txy', 'label'}.
      * test=True:  one item per input track -> the track dict with 'txy'
        replaced by the list of that track's crop dicts.
    In both modes the 'label' key is omitted when the label is None.
    """

    def __init__(self, data, test=False, crop_size=50, aug=True):
        # `data` is a list of dicts, each dict contains id, txy, and label
        self.test = test
        self.crop_size = crop_size

        self.data = data
        self.id_to_crop_ids = {}

        # Not read anywhere else in this class.
        self.rotate_angles = np.linspace(0, 2*np.pi)

        all_crops = []
        for track_idx, track in enumerate(data):
            points = track['txy']
            label = track['label']
            track_id = track['id']

            # Denser windows for label-1 and unlabeled tracks.
            stride = 10 if (label == 1 or label is None) else 30

            if len(points) < self.crop_size:
                windows = [self._tile_short_track(points)]
            else:
                windows = self._sliding_windows(points, stride)

            if aug:
                # Jittered rotations and stretches, drawn fresh for every track.
                jittered_angles = np.linspace(0, 2*np.pi, 13)[1:-1] + np.random.normal(0, np.pi/12, 11)
                jittered_scales = np.array([0.5, 0.75, 1.25, 1.5]) + np.random.uniform(-0.1, 0.1, 4)
                samples = augment(
                    windows,
                    rotate_angles=jittered_angles,
                    stretch_space_factors=jittered_scales,
                    flip_t=True,
                    flip_s=True,
                )
            else:
                samples = windows

            # Remember which rows of the flat crop list belong to this track.
            first_row = len(all_crops)
            all_crops.extend({'id': track_id, 'txy': s, 'label': label} for s in samples)
            self.id_to_crop_ids[track_idx] = list(range(first_row, len(all_crops)))

        self.cropped_data = all_crops

    def _tile_short_track(self, points):
        """Extend a short track to `crop_size` rows by chaining shifted copies."""
        anchored = points - points[0, :]  # start at 0
        n_copies = int(self.crop_size / len(anchored)) + 1  # guaranteed to be larger than necessary
        pieces = [anchored]
        for _ in range(1, n_copies):
            # Shift each copy by the last point of the previous piece.
            pieces.append(anchored + pieces[-1][-1, :])
        return np.vstack(pieces)[:self.crop_size, :]

    def _sliding_windows(self, points, stride):
        """Cut evenly spaced `crop_size`-row windows, each re-anchored at the origin."""
        span = len(points) - self.crop_size
        n_windows = int(span / stride) + 1
        starts = np.round(np.linspace(0, span, n_windows)).astype(int)
        windows = []
        for start in starts:
            window = points[start:start + self.crop_size, :]
            windows.append(window - window[0, :])  # start at 0
        return windows

    def __len__(self):
        return len(self.data) if self.test else len(self.cropped_data)

    def __getitem__(self, idx):
        # Raising IndexError past the end (and for negative indices) is what
        # lets plain `for sample in dataset` iteration terminate.
        if not (0 <= idx < len(self)):
            raise IndexError()

        if not self.test:
            crop = self.cropped_data[idx]
            if crop['label'] is None:
                return {k: v for k, v in crop.items() if k != 'label'}
            return crop

        track_crops = [self.cropped_data[j] for j in self.id_to_crop_ids[idx]]
        item = self.data[idx].copy()
        if item['label'] is None:
            item = {k: v for k, v in item.items() if k != 'label'}
        item['txy'] = track_crops
        return item



In [None]:
# Load the training tracks: TRACK_DATA maps track UID -> per-track dict ('txy', 'label').
# NOTE(review): this is the same file already loaded into `track_data` in the exploration cell.
path = "/content/drive/MyDrive/CS155Proj1/train.json"
with open(path, 'r') as f:
  TRACK_DATA = json.load(f)

In [None]:
# Keep only tracks whose UID starts with 'lab'; `[:,1:]` drops the first (t) column so 'txy' holds (x, y) only.
lab_data = [{'id': k, 'txy': np.array(v['txy'], dtype=np.float32)[:,1:], 'label': v['label']} for (k, v) in TRACK_DATA.items() if k.startswith('lab')]  # TODO

# random.shuffle(lab_data)
# Sort by UID string (lexicographic) so the ordering is deterministic.
lab_data = sorted(lab_data, key=lambda x: x['id'])
# Split on the integer between the first and second '_' of the UID
# (presumably the video number, given UIDs like 'lab_0_3' — TODO confirm),
# so tracks sharing that number never straddle train and validation.
trn_vid_nums = range(0, 16)
val_vid_nums = range(16, 19)
trn_data = [x for x in lab_data if int(x['id'].split('_')[1]) in trn_vid_nums]
val_data = [x for x in lab_data if int(x['id'].split('_')[1]) in val_vid_nums]

# trn_set is indexed per crop (test=False); val_set starts in per-track mode (test=True).
# NOTE(review): augmentation is random and unseeded, and is also applied to the validation set.
trn_set = CnnDataset(trn_data, test=False, aug=True)
val_set = CnnDataset(val_data, test=True, aug=True)
trn_loader = torch.utils.data.DataLoader(trn_set, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=32, shuffle=False)

In [None]:
# Number of training crops and of validation crops (after cropping and augmentation).
len(trn_set), len(val_set.cropped_data)

(550320, 86160)

# Model and training

In [None]:
# NOTE(review): `Sigmoid` imported here is never referenced; the model uses nn.Sigmoid below.
from torch.nn.modules.activation import Sigmoid
# 1-D CNN over a track: input is (batch, 2, length) with the x and y coordinates as channels,
# output is (batch, 1) holding a sigmoid probability.
# Assuming 50-step crops (CnnDataset's default crop_size), the unpadded kernel-5 convolutions
# and the two poolings take the length 50 -> 46 -> 42 -> 21 -> 17 -> 13 -> 6,
# which is where the 10 * 6 = 60 input features of the first Linear layer come from.
model = nn.Sequential(
  # Block 1: two conv -> ReLU -> batch-norm stages, then halve the length.
  nn.Conv1d(in_channels=2, out_channels=10, kernel_size=5),
  nn.ReLU(),
  nn.BatchNorm1d(10),
  nn.Conv1d(in_channels=10, out_channels=10, kernel_size=5),
  nn.ReLU(),
  nn.BatchNorm1d(10),

  nn.MaxPool1d(2),


  # Block 2: same layout as block 1.
  nn.Conv1d(in_channels=10, out_channels=10, kernel_size=5),
  nn.ReLU(),
  nn.BatchNorm1d(10),
  nn.Conv1d(in_channels=10, out_channels=10, kernel_size=5),
  nn.ReLU(),
  nn.BatchNorm1d(10),

  nn.MaxPool1d(2),

  # Classifier head.
  nn.Flatten(),
  nn.Linear(60, 40),
  nn.ReLU(),
  nn.Linear(40, 1),

  # Probability output; the training cell pairs this with nn.BCELoss.
  nn.Sigmoid(),
)

In [None]:
def evaluate(model, eval_set, eval_loader, is_test, thres=0.5, eval_every=1, raw_wts=False):
    """Score a binary classifier and return (accuracy, precision, recall, F2).

    Two evaluation modes are supported:

    * ``is_test=True`` (per-track consensus): `eval_set` is indexed per track;
      all crops of a track are scored in one forward pass and combined into a
      single prediction. With ``raw_wts=False`` each crop votes 1 when its
      output exceeds `thres` and the track is positive on a strict majority.
      With ``raw_wts=True`` the track is positive when the mean raw output
      exceeds `thres`.
    * ``is_test=False`` (per-crop): batches are drawn from `eval_loader` and
      every crop is scored on its own against `thres`.

    Args:
        model: module mapping a (batch, 2, length) float tensor to (batch, 1)
            probabilities. It is switched to eval mode and left there.
        eval_set: dataset with a writable `test` flag (see CnnDataset). The
            flag is set to `is_test` for the duration of the call and always
            restored afterwards, even on error.
        eval_loader: iterable of batches with 'txy' and 'label' tensors drawn
            from `eval_set`; only used when ``is_test=False``.
        is_test: selects the mode described above.
        thres: decision threshold on the model output.
        eval_every: evaluate only every `eval_every`-th track (test mode) or
            every `eval_every`-th batch (per-crop mode).
        raw_wts: test mode only; average raw outputs instead of voting.

    Returns:
        Tuple ``(acc, prc, rec, f2)`` of floats. A metric whose denominator
        is zero (e.g. precision with no positive predictions) is 0.0.
    """
    tp = tn = fp = fn = 0

    old_is_test = eval_set.test
    eval_set.test = is_test
    try:
        model.eval()
        with torch.no_grad():
            if is_test:
                for i in range(0, len(eval_set), eval_every):
                    sample = eval_set[i]
                    # Stack this track's crops into one (n_crops, 2, length) batch.
                    crops = np.stack([crop['txy'] for crop in sample['txy']])
                    batch = torch.as_tensor(crops, dtype=torch.float32).swapaxes(1, 2)
                    probs = model(batch)[:, 0]

                    if raw_wts:
                        pred = 1 if probs.sum().item() > len(probs) * thres else 0
                    else:
                        votes = (probs > thres).sum().item()
                        pred = 1 if votes > len(probs) / 2 else 0

                    if pred == sample['label']:
                        if sample['label'] == 1:
                            tp += 1
                        else:
                            tn += 1
                    elif sample['label'] == 1:  # pred = 0 so false negative
                        fn += 1
                    else:
                        fp += 1
            else:
                # Iterate real batches from the loader; indexing eval_set here
                # would yield single un-batched numpy crops.
                for i, sample in enumerate(eval_loader):
                    if i % eval_every:
                        continue
                    labels = sample['label']
                    output = model(sample['txy'].swapaxes(1, 2))
                    predicted = (output[:, 0] > thres).long()
                    hit = predicted == labels
                    tp += (hit & (predicted == 1)).sum().item()
                    tn += (hit & (predicted == 0)).sum().item()
                    fp += (~hit & (predicted == 1)).sum().item()
                    fn += (~hit & (predicted == 0)).sum().item()
    finally:
        eval_set.test = old_is_test

    total = tp + tn + fp + fn
    acc = (tp + tn) / total if total else 0.0
    prc = tp / (tp + fp) if (tp + fp) else 0.0
    rec = tp / (tp + fn) if (tp + fn) else 0.0
    # F-beta with beta = 2, which weights recall above precision.
    f2 = 5 * prc * rec / (4 * prc + rec) if (prc + rec) else 0.0

    return acc, prc, rec, f2


In [None]:
# Binary cross-entropy on the model's sigmoid output, optimised with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0)

n_epochs = 2

# store metrics (one row per epoch)
training_accuracy_history = np.zeros([n_epochs, 1])
training_loss_history = np.zeros([n_epochs, 1])
validation_accuracy_history = np.zeros([n_epochs, 1])
validation_loss_history = np.zeros([n_epochs, 1])

for epoch in range(n_epochs):
    train_total = 0
    train_correct = 0
    # train
    model.train()
    trn_set.test=False  # per-crop indexing so the loader yields individual crops
    for i, sample in enumerate(trn_loader):
        optimizer.zero_grad()
        # forward pass; the model expects (batch, channels, length)
        output = model(sample['txy'].swapaxes(1,2))
        # calculate binary cross entropy loss
        loss = criterion(output[:,0], sample['label'].to(dtype=torch.float32))
        # backward pass
        loss.backward()
        optimizer.step()

        # track training accuracy at a 0.5 decision threshold
        predicted = 1*(output.data > 0.5)[:,0]

        train_total += sample['label'].size(0)
        train_correct += (predicted == sample['label']).sum().item()
        # track training loss
        training_loss_history[epoch] += loss.item()
    training_loss_history[epoch] /= len(trn_loader)
    training_accuracy_history[epoch] = train_correct / train_total
    print(f'Epoch {epoch+1}/{n_epochs}:\tloss: {training_loss_history[epoch,0]:0.4f}, acc: {training_accuracy_history[epoch,0]:0.4f}',end='')

    # Per-track consensus metrics on the validation set.
    acc, prc, rec, f2 = evaluate(model, val_set, val_loader, is_test=True)
    print(f', val consensus acc {acc:0.4f} prc {prc:0.4f} rec {rec:0.4f} f2 {f2:0.4f}', end='')
    # Per-crop metrics on the validation set.
    acc, prc, rec, f2 = evaluate(model, val_set, val_loader, is_test=False)
    validation_accuracy_history[epoch] = acc
    # No end='' here: the newline closes this epoch's line so epochs do not run together.
    print(f', val acc {acc:0.4f} prc {prc:0.4f} rec {rec:0.4f} f2 {f2:0.4f}')


Epoch 1/2:	loss: 0.2970, acc: 0.8649, val consensus acc: 0.9375
Epoch 2/2:	loss: 0.2505, acc: 0.8866, val consensus acc: 0.9375


# Final validation

In [None]:
# Per-track consensus metrics on every 10th track of trn_set, with a per-crop vote threshold of 0.2.
# NOTE(review): this section is titled "Final validation" but scores trn_set, not val_set — confirm intent.
acc, prc, rec, f2 = evaluate(model, trn_set, trn_loader, is_test=True, thres=0.2, eval_every=10)
print(f'consensus acc {acc:0.4f} prc {prc:0.4f} rec {rec:0.4f} f2 {f2:0.4f}', end='')
# acc, prc, rec, f2 = evaluate(model, val_set, val_loader, is_test=False, thres=0.5)
# print(f', val acc {acc:0.4f} prc {prc:0.4f} rec {rec:0.4f} f2 {f2:0.4f}', end='')

consensus acc 0.8919 prc 0.7143 rec 1.0000 f2 0.9259

In [None]:
# Same consensus evaluation (every 10th track of trn_set) with the vote threshold raised to 0.35.
acc, prc, rec, f2 = evaluate(model, trn_set, trn_loader, is_test=True, thres=0.35, eval_every=10)
print(f'consensus acc {acc:0.4f} prc {prc:0.4f} rec {rec:0.4f} f2 {f2:0.4f}', end='')

consensus acc 0.9730 prc 0.9091 rec 1.0000 f2 0.9804

In [None]:
# Threshold 0.35 again, but on every 11th track of trn_set, i.e. a different subset of tracks.
acc, prc, rec, f2 = evaluate(model, trn_set, trn_loader, is_test=True, thres=0.35, eval_every=11)
print(f'consensus acc {acc:0.4f} prc {prc:0.4f} rec {rec:0.4f} f2 {f2:0.4f}', end='')

consensus acc 0.9412 prc 0.9286 rec 0.9286 f2 0.9286

In [None]:
# raw_wts=True: the threshold is applied to each track's averaged raw outputs rather than to per-crop votes.
acc, prc, rec, f2 = evaluate(model, trn_set, trn_loader, is_test=True, thres=0.35, eval_every=11, raw_wts=True)
print(f'consensus acc {acc:0.4f} prc {prc:0.4f} rec {rec:0.4f} f2 {f2:0.4f}', end='')

consensus acc 0.9412 prc 0.9286 rec 0.9286 f2 0.9286

In [None]:
# Sweep the raw-output consensus threshold over every 11th track of trn_set.
# NOTE(review): the original cell held only the unfinished `for` header (a SyntaxError);
# the body below is reconstructed from the neighbouring evaluate(...) calls — confirm the
# intended dataset and settings.
for tt in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]:
    acc, prc, rec, f2 = evaluate(model, trn_set, trn_loader, is_test=True, thres=tt, eval_every=11, raw_wts=True)
    print(f'thres {tt:0.2f}: consensus acc {acc:0.4f} prc {prc:0.4f} rec {rec:0.4f} f2 {f2:0.4f}')

In [None]:
# validate
# Per-crop validation: every crop is scored on its own at a 0.5 threshold.
# NOTE: `epoch` is whatever value the training loop left behind, so the loss/accuracy
# are written into the history row of the last trained epoch.
val_total = 0
val_correct = 0
val_set.test = False  # per-crop indexing so val_loader yields batches of crops
with torch.no_grad():
    model.eval()
    for i, sample in enumerate(val_loader):
        # forward pass
        output = model(sample['txy'].swapaxes(1,2))
        # find accuracy
        # _, predicted = torch.max(output.data, 1)
        predicted = 1*(output.data > 0.5)[:,0]
        val_total += sample['label'].size(0)
        val_correct += (predicted == sample['label']).sum().item()
        # find loss
        loss = criterion(output[:,0], sample['label'].to(dtype=torch.float32))
        validation_loss_history[epoch] += loss.item()
    validation_loss_history[epoch] /= len(val_loader)
    validation_accuracy_history[epoch] = val_correct / val_total
print(f', val loss: {validation_loss_history[epoch,0]:0.4f}, val acc: {validation_accuracy_history[epoch,0]:0.4f}')


# validate 2
# Per-track consensus: each crop votes at a 0.5 threshold and the track is
# predicted positive when strictly more than half of its crops vote 1.

# predictions = []
val_set.test = True  # per-track indexing: sample['txy'] is the list of that track's crops
val_total = 0
val_correct = 0
with torch.no_grad():
    model.eval()
    for i, sample in enumerate(val_set):
        crop_predicted = []
        for crop in sample['txy']:
          # add a batch axis and move the coordinate axis to the channel position
          output = model(torch.Tensor(crop['txy'][None,:,:].swapaxes(1,2)))
          crop_predicted += [1*(output.data > 0.5)[:,0]]
        pred = 1 if sum(crop_predicted) > len(crop_predicted) / 2 else 0
        if (pred == sample['label']):
          val_correct += 1
        val_total += 1
        # predictions += [[sample['id'], pred]]
print(f', consensus val acc: {val_correct/val_total:0.4f}')




, val loss: 0.5921, val acc: 0.8075
, val acc: 0.9375


In [None]:
# Weighted per-track consensus on the validation set: the raw model outputs of a track's
# crops are summed, and the track is predicted positive when that sum exceeds half the
# number of crops (i.e. the mean output is above 0.5).
val_set.test = True  # per-track indexing: sample['txy'] is the list of that track's crops
val_total = 0
val_correct = 0
with torch.no_grad():
    model.eval()
    for i, sample in enumerate(val_set):
        crop_predicted = []
        for crop in sample['txy']:
          # add a batch axis and move the coordinate axis to the channel position
          output = model(torch.Tensor(crop['txy'][None,:,:].swapaxes(1,2)))
          crop_predicted += [output.data[:,0]]
        pred = 1 if sum(crop_predicted) > len(crop_predicted) / 2 else 0
        if (pred == sample['label']):
          val_correct += 1
        val_total += 1
        # predictions += [[sample['id'], pred]]
print(f', wtd consensus val acc: {val_correct/val_total:0.4f}')

, wtd consensus val acc: 0.9375


# Generate test predictions

In [None]:
# Load the test tracks and convert them to the list-of-dicts layout CnnDataset expects.
# `[:,1:]` drops the first (t) column, matching how the training tracks were prepared.
testpath = "/content/drive/MyDrive/CS155Proj1/test.json"
with open(testpath, 'r') as f:
  test_data = json.load(f)
test_data = [{'id': k, 'txy': np.array(v['txy'], dtype=np.float32)[:,1:], 'label': v['label']} for (k, v) in test_data.items()]

# Per-track indexing (test=True); aug=True means each track's crops include augmented copies.
tst_set = CnnDataset(test_data, test=True, aug=True)

In [None]:
# submission
thres=0.15
raw_wts=True
print_update_every=20
print('|'+' '*print_update_every+'|')
print(' ', end='')


predictions = [['UID', 'label']]
outputs = []
with torch.no_grad():
    model.eval()
    for i, sample in enumerate(tst_set):
        crop_predicted = []
        for crop in sample['txy']:
          output = model(torch.Tensor(crop['txy'][None,:,:].swapaxes(1,2)))
          if raw_wts:
            crop_predicted += [output.data[:,0]]
          else:
            crop_predicted += [1*(output.data > thres)[:,0]]

        if raw_wts:
          pred = 1 if sum(crop_predicted) > len(crop_predicted) * thres else 0
        else:
          pred = 1 if sum(crop_predicted) > len(crop_predicted) / 2 else 0
        predictions += [[sample['id'], pred]]
        outputs += [[sample['id'], float(sum(crop_predicted))/len(crop_predicted)]]


        if i in [int(temp) for temp in np.round(np.linspace(-1, len(tst_set)-1, print_update_every+1))]:
            print('*', end='')

        # predictions += [np.array([sample['id'], predicted.numpy()], dtype=str).T]

predictions = np.array(predictions)
outputs = np.array(outputs)
np.savetxt('/content/drive/MyDrive/CS155Proj1/cnn_submission_4.csv', predictions, delimiter=',', fmt='%s')
np.savetxt('/content/drive/MyDrive/CS155Proj1/cnn_submission_4_outputs.csv', outputs, delimiter=',', fmt='%s')

|                    |
 *******************