In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
import gzip

random_state = 59

import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torch.nn as nn
import torchaudio


In [65]:
audio_title, audio_data, sound_label, direction_label = [], [], [], []
left_audio, right_audio = [], []
for file in os.listdir('AudioData/'):
    audio_title.append(file)
    with gzip.open('AudioData/'+file, 'rb') as f:
        data = pickle.load(f)['S_db']
        audio_data.append(np.stack((data[0], data[1]), axis=2))
        left_audio.append(data[0])
        right_audio.append(data[1])
    if 'generated' in file:
        direction_label.append(file.split('_')[1])
        sound_label.append(file.split('_')[3])
    else:
        direction_label.append(file.split('_')[0])
        sound_label.append(file.split('_')[2])

df = pd.DataFrame({#'audio title': audio_title,
                    'audio_data': audio_data,
                    'left_audio': left_audio,
                    'right_audio': right_audio,
                    'direction': direction_label,
                    'sound': sound_label})

print(df['direction'].value_counts())
print(df['sound'].value_counts())
df.head(2)

hardRight    296
hardLeft      51
Name: direction, dtype: int64
gunshot     296
footstep     51
Name: sound, dtype: int64


Unnamed: 0,audio_data,left_audio,right_audio,direction,sound
0,"[[[77.67500827779001, 78.82224374018566], [63....","[[77.67500827779001, 63.40599804807859, 65.211...","[[78.82224374018566, 67.60445251709298, 68.197...",hardLeft,footstep
1,"[[[57.43526651761323, 60.82275704575457], [56....","[[57.43526651761323, 56.80122292698559, 61.468...","[[60.82275704575457, 57.769317207876924, 64.14...",hardLeft,footstep


In [69]:
#print(df.iloc[0])

conversion = {'hardLeft': 'hardRight',
              'hardRight': 'hardLeft'}


for index, (audio_data, left_audio, right_audio, direction, sound) in df.iterrows():
    
    if direction in conversion.keys():
        
        audio_data = [audio_data[1], audio_data[0]]
        left_audio, right_audio = right_audio, left_audio
        direction = direction.replace(direction, conversion[direction]) # hard coded
        
        generated = {'audio_data': audio_data,
                    'left_audio': left_audio,
                    'right_audio': right_audio,
                    'direction': direction,
                    'sound': sound}
        
        df = df.append(generated, ignore_index=True)
        
print(df['direction'].value_counts())
print(df['sound'].value_counts())

hardLeft     347
hardRight    347
Name: direction, dtype: int64
gunshot     592
footstep    102
Name: sound, dtype: int64


Establish transforms, segment dataset and apply transforms, apply weights according to class. Load dataloaders with weighted random sampler. Initialize model (CNN). Train model, record losses/history.


# Create weights for ALL classes (with variable # of classes)
weights = {'direction': [], 'sound': []}

for output in ['direction', 'sound']:
    
    for count in df[output].value_counts():
        
        weights[output].append(1/count)
     
for k, v in weights.items():
    print(k, v)

In [87]:
# Apply transformations to dataset

#data_transforms = transforms.Compose([
#    transforms.ToTensor()
#    #torchaudio.transforms.TimeMasking(time_mask_param=0.5)
#    ])

data_transforms = transforms.Compose([])

class SegmentData(Dataset):
    def __init__(self, dataset, transform):
        self.dataset = dataset
        self.transform = transform

        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, index):
             
        audio_array = self.dataset.iloc[index]['audio_data']
        left_audio = self.dataset.iloc[index]['left_audio']
        right_audio = self.dataset.iloc[index]['right_audio']
        direction = self.dataset.iloc[index]['direction']
        sound = self.dataset.iloc[index]['sound']
        
        if self.transform is not None:
            audio_data = self.transform(audio_array)
            left_audio = self.transform(left_audio)
            right_audio = self.transform(right_audio)
        
        data = (index, audio_data, left_audio, right_audio, direction, sound)
            
        return data
    

# Dataframe containing center, left, and right

LR_df = df[df['direction'].str.contains('Left|Right|center')]

LR_df['direction'] = LR_df['direction'].replace(['hardRight', 'softRight'], 'Right')
LR_df['direction'] = LR_df['direction'].replace(['hardLeft', 'softLeft'], 'Left')
LR_df['direction'] = LR_df['direction'].replace(['center'], 'Center')


LR_df['direction'].value_counts()


In [89]:
df['audio_data'][50].shape

train_df, test_df = train_test_split(df, test_size=0.33,
                                     random_state=random_state)



direction_counts = train_df['direction'].value_counts()

weights = 1. / torch.tensor(direction_counts.values, dtype=torch.float)
sample_weights = weights[pd.Categorical(train_df['direction']).codes]
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(train_df), replacement=True)

train_ds = SegmentData(train_df, transform=data_transforms)
test_ds = SegmentData(test_df, transform=data_transforms)

train_dataloader = DataLoader(train_ds, batch_size=4, sampler=sampler, pin_memory=True)
test_dataloader = DataLoader(test_ds, batch_size=4, shuffle=True, pin_memory=True)

type(train_dataloader)

torch.utils.data.dataloader.DataLoader

In [100]:
train_df['audio_data'][671][0].shape

(25, 2)

In [91]:
train_ds[0][1]

[array([[73.58742193, 62.83314382],
        [89.1373303 , 67.08851018],
        [79.62172209, 75.62653901],
        [83.59113853, 83.73321438],
        [86.10320344, 73.11124288],
        [81.37218171, 79.63314626],
        [85.05338455, 80.6814704 ],
        [86.90159107, 76.74652398],
        [94.87304884, 76.66214027],
        [89.2600541 , 81.14821115],
        [74.73701908, 90.2392904 ],
        [84.89809631, 85.1241222 ],
        [91.09712474, 77.02196268],
        [92.09372223, 80.17463721],
        [92.2044529 , 87.03913523],
        [85.26055048, 85.85567817],
        [88.02887407, 87.06334012],
        [83.49897723, 79.63491674],
        [77.8657095 , 83.28174698],
        [81.84777137, 79.3222743 ],
        [77.48364757, 69.48314325],
        [75.80808385, 77.856764  ],
        [78.06493898, 73.57284698],
        [75.1951908 , 67.95649909],
        [70.01477676, 71.90072176]]),
 array([[73.74510138, 62.47175362],
        [89.33767041, 66.29670658],
        [79.09813159, 75.8

In [41]:
next(iter(train_dataloader))

RuntimeError: each element in list of batch should be of equal size

In [69]:
import models

model = models.CNNv2


In [76]:
# Model initializer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#model = CNN(num_directions=len(['Left', 'Right', 'Center']))
model = models.CNNv2(input_shape=2, 
    hidden_units=10, 
    output_shape=len(['Left', 'Right'])).to(device)
model = model.to(device)


# Optimizer and criterion
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Training loop

num_epochs = 10

for epoch in range(num_epochs):
    train_loss, val_loss = [], []
    train_correct, val_correct = 0, 0
    
    # Training loop
    model.train()
    for idx, (i, audio_data, left_audio, right_audio, direction, sound) in enumerate(train_dataloader):

        audio_data = audio_data.to(device)
        direction = pd.Categorical(direction).codes
        direction = torch.tensor(direction, dtype=torch.long, device=device)

        optimizer.zero_grad()

        output = model(audio_data.float())
        loss = criterion(output, direction)
        
        #print("output:", output, "direction:", direction)

        # Compute accuracy
        pred_direction = output.argmax(dim=1)
        train_correct += pred_direction.eq(direction).sum().item()

        train_loss.append(loss.item())

        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    with torch.no_grad():
        for idx, (i, audio_data, left_audio, right_audio, direction, sound) in enumerate(test_dataloader):

            audio_data = audio_data.to(device)
            direction = pd.Categorical(direction).codes
            direction = torch.tensor(direction, dtype=torch.long, device=device)

            output = model(audio_data.float())
            loss = criterion(output, direction)

            # Compute accuracy
            pred_direction = output.argmax(dim=1)
            val_correct += pred_direction.eq(direction).sum().item()

            val_loss.append(loss.item())

    # Calculate epoch accuracy and loss
    train_acc = train_correct / len(train_dataloader.dataset)
    val_acc = val_correct / len(test_dataloader.dataset)
    avg_train_loss = sum(train_loss) / len(train_loss)
    avg_val_loss = sum(val_loss) / len(val_loss)

    # Print epoch results
    print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Train Accuracy = {train_acc:.4f}, Val Loss = {avg_val_loss:.4f}, Val Accuracy = {val_acc:.4f}")

TypeError: pic should be PIL Image or ndarray. Got <class 'list'>

In [189]:
train_loss

[0.8411638736724854,
 0.8455745577812195,
 0.8294551372528076,
 1.1197551488876343,
 0.8305132985115051,
 1.4494388103485107,
 0.8282938599586487,
 1.1231935024261475,
 1.1389272212982178,
 1.457764744758606,
 0.7950509786605835,
 1.4688360691070557,
 0.8393155336380005,
 0.8368718028068542,
 1.4483870267868042,
 0.8113885521888733,
 0.8495606780052185,
 1.1132034063339233,
 0.8890126943588257,
 1.3911525011062622,
 0.8582789897918701,
 0.8186428546905518,
 1.367274284362793,
 0.8260793685913086,
 0.8659787774085999,
 0.8701518177986145,
 0.9499830007553101]

In [162]:
val_loss

[tensor(0.8014, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(1.3014, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.8014, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(1.3014, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(1.3014, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(1.3014, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(1.3014, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.8014, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(1.3014, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(1.3014, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(1.3014, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(1.0514, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(1.3014, device='cuda:0', grad_fn=<NllLossBackward0>)]