In [1]:
from pathlib import Path
import pickle
import pandas as pd
import numpy as np
from torchvision import transforms
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Subset, Dataset
from torch import nn, optim
import torch
from sklearn.metrics import accuracy_score
from fastprogress.fastprogress import master_bar, progress_bar
import os
from tqdm import tqdm
from torch.nn.functional import softmax
from zipfile import ZipFile 
from PIL import Image

# Configs 

In [2]:
batch_size = 8
lr = 1e-3
tmax = 10
eta_min = 1e-5
num_epochs = 10
num_classes = 80
debug = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset Class

In [3]:
trainsforms_dict = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(0.5),
        transforms.ToTensor(),
    ]),
    'test': transforms.Compose([
        transforms.RandomHorizontalFlip(0.5),
        transforms.ToTensor(),
    ])
}


class FreeShoundTrainDataset(Dataset):
    def __init__(self, mels, labels, transform, time_mask=0.1, freq_mask=0.1, spec_aug=True):
        self.mels = mels  # (num_samples, 128, var, 3)
        self.labels = labels
        self.tfms = transform
        self.time_mask = time_mask
        self.freq_mask = freq_mask
        self.spec_aug = spec_aug

    def __len__(self):
        return len(self.mels)

    def __getitem__(self, idx):
        mel = self.mels[idx]  # (128, 451, 3)
        base_dim, time_dim, _ = mel.shape
        crop = np.random.randint(0, time_dim-base_dim)  # (97)
        image = mel[:, crop: crop + base_dim, ...]  # (128, 128, 3)

        if self.spec_aug:
            freq_mask_begin = int(np.random.uniform(
                0, 1 - self.freq_mask) * base_dim)
            image[freq_mask_begin:freq_mask_begin +
                  int(self.freq_mask * base_dim), ...] = 0
            time_mask_begin = int(np.random.uniform(
                0, 1 - self.time_mask) * base_dim)
            image[:, time_mask_begin:time_mask_begin +
                  int(self.time_mask * base_dim), ...] = 0

        image = Image.fromarray(image[..., 0], mode='L')  # (128, 128)
        image = self.tfms(image).div_(255)  # (1, 128, 128)
        if self.labels is not None:
            label = np.asarray(self.labels)[idx]
            label = torch.from_numpy(label).float()
        return (image, label) if self.labels is not None else image

# Model 

In [4]:
import torch.nn as nn
import torchvision.models as models


class Model(nn.Module):
    def __init__(self, num_classes, pretrained=False):
        super().__init__()

        # Load the model with pretrained weights if specified
        self.model = models.resnet18(weights=None)

        # Modify the first convolution layer for grayscale input (1 channel)
        self.model.conv1 = nn.Conv2d(
            1, 64, kernel_size=7, stride=2, padding=3, bias=False
        )

        # Get the input features of the final fully connected layer
        self.in_features = self.model.fc.in_features

        # Replace the final fully connected layer
        self.model.fc = nn.Linear(self.in_features, num_classes)

    def forward(self, x):
        return self.model(x)

# Utilities 

In [5]:
def transform_labels(df):
    df["labels"] = df["labels"].str.split(",")
    unique_labels = set()
    for labels in df["labels"]:
        unique_labels.update(labels)
    unique_labels = sorted(list(unique_labels))

    for label in unique_labels:
        df[label] = df["labels"].apply(lambda x: 1 if label in x else 0)

    df = df.drop("labels", axis=1)

    return df


def make_prediction(model, test_dl):
    model.eval()
    preds = []
    for xb in tqdm(test_dl):
        with torch.no_grad():
            logits = model(xb.to(device))
            pred = softmax(logits, dim=1)
        preds.extend(pred.cpu().numpy())
    return preds


def create_unique_labels(all_labels):
    label_dict = {}
    all_labels_set = []
    first_labels_set = []
    for labs in all_labels:
        lab = labs.split(",")
        for l in lab:
            if l in label_dict:
                label_dict[l] = label_dict[l] + 1
            else:
                label_dict[l] = 0

        all_labels_set.append(set(lab))
        first_labels_set.append(lab[0])
    classes = list(label_dict.keys())

    return label_dict, classes, all_labels_set, first_labels_set

In [6]:
path = Path("/kaggle/input/freesound-audio-tagging-2019")
data_dir = Path('data')
(data_dir/'test_files').mkdir(parents=True, exist_ok=True)

In [7]:
%%time
# loading the temp.zip and creating a zip object 
with ZipFile(path/'test.zip', 'r') as zObject: 
    zObject.extractall(path=data_dir/'test_files') 
zObject.close() 

CPU times: user 20.7 s, sys: 3.5 s, total: 24.2 s
Wall time: 35.8 s


In [8]:
# load input data
path = Path("/kaggle/input/freesound-audio-tagging-2019")
train_df = pd.read_csv(path / "train_curated.csv")
train_df = transform_labels(train_df)
y_train = train_df.iloc[:, 1:]

test_images = sorted(os.listdir("data/test_files"))
sub = pd.read_csv(path / "sample_submission.csv")

# load preprocessed audios and corresponding labels
path = Path('/kaggle/input/data-preprocessing/data')
processed_test = pickle.load(
    open(path / "mels_test.pkl", "rb")
)  # (4970, 128, var, 3)
y_train = train_df.iloc[:, 1:]


test_ds = FreeShoundTrainDataset(processed_test, None, trainsforms_dict["train"])
if debug:
    test_ds = Subset(test_ds, range(100))
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
print("=======Data Loader Prepared=========")

# load model
model = Model(num_classes=80)
model.load_state_dict(torch.load("/kaggle/input/sound-classification-training/model.pt",weights_only=True, map_location=device))
model.to(device)
print("=======Model Loaded=========")

preds = make_prediction(model, test_dl)
sub = pd.DataFrame(preds, columns=y_train.columns)
if debug:
    test_fns = sorted(os.listdir("data/test_files"))[:100]
else:
    test_fns = sorted(os.listdir("data/test_files"))
sub["fname"] = test_fns
sub = sub[["fname"] + [col for col in sub.columns if col != "fname"]]
sub.to_csv('submission.csv', index=False)



100%|██████████| 421/421 [00:03<00:00, 113.31it/s]


In [9]:
sub.shape

(3361, 81)

In [10]:
sub.head()

Unnamed: 0,fname,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,...,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,4260ebea.wav,0.0005526355,0.0001168164,0.0009200795,0.0013,0.000352,0.006471157,0.0005433353,0.171324,0.011739,...,0.01060497,0.000683,0.046363,0.004191,0.05046605,0.001014,0.040781,0.017554,0.000657,0.001418
1,426eb1e0.wav,0.0005287796,5.855016e-06,8.593398e-06,0.543166,0.000434,3.973585e-05,8.584962e-06,0.005555,3.7e-05,...,0.003574183,0.00233,0.001743,0.004118,0.0003404551,0.004296,0.002681,0.000232,0.000381,0.002426
2,428d70bb.wav,0.0008677715,0.0008808777,0.000445382,0.001728,0.000923,5.742359e-05,1.928717e-05,0.006147,0.000701,...,0.002746448,0.003746,0.002694,0.001234,0.001094117,0.000503,0.018195,0.000507,0.005958,0.004192
3,4292b1c9.wav,9.737381e-07,9.461909e-07,5.647109e-07,7e-06,3e-06,1.16376e-07,1.095956e-08,2.3e-05,2e-06,...,6.321749e-07,9e-06,3.6e-05,1.1e-05,7.773207e-07,1.1e-05,0.002051,4.8e-05,0.002757,0.000362
4,429c5071.wav,0.01757704,0.000695109,0.001466429,0.001714,0.029338,0.01375458,0.001992168,0.003632,0.011808,...,0.003827383,0.003707,0.002108,0.012698,0.001684943,0.004943,0.011436,0.009589,0.002562,0.01234
