In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

In [2]:
df = pd.read_pickle('crowd_train_40_120_with_audio_feature.pkl')

In [3]:
freq = df.source_id.value_counts()
freq = freq[(100 <= freq) & (freq <= 120)]
df2 = df[df.source_id.isin(freq.index)]
df2 = df2.reset_index(drop=True)

In [4]:
df2 = df.copy()

In [5]:
# np.vstack(df.mfcc.to_numpy()).reshape(55744 , -1, 256).shape
X = np.vstack(df2.audio_feature.to_numpy()).reshape(df2.shape[0] , -1)
le = LabelEncoder()
y = le.fit_transform(df2.source_id)
X.shape, y.shape

((55744, 498), (55744,))

In [39]:
df2['y'] = y

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train.shape, X_test.shape

((44595, 498), (11149, 498))

In [7]:
class AudioDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.Tensor(X)
        self.y = torch.Tensor(y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

    def __len__(self):
        return len(self.y)

In [8]:
train_dataset = AudioDataset(X_train, y_train)
test_dataset = AudioDataset(X_test, y_test)

In [9]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [28]:
class ClfNN(nn.Module):
    def __init__(self, in_features=498, n_classes=51):
        super(ClfNN, self).__init__()

        self.lin_layers = nn.Sequential(
            nn.Linear(in_features, 1024),
            nn.BatchNorm1d(num_features=1024),
            nn.GELU(),
            nn.Linear(1024, 768),
            nn.BatchNorm1d(num_features=768),
            nn.GELU(),
            nn.Linear(768, n_classes)
        )
        self.sigmoid = nn.Softmax()

    def forward(self, x):
        out = self.lin_layers(x)
        #out = self.sigmoid(out)

        return out

In [29]:
def evaluate_epoch(model, loader, criterion, optimizer, train=True):
    loss_arr = []
    f1_arr = []

    for inputs, labels in loader:
        inputs = inputs.to(device)
        labels = labels.type(torch.LongTensor)
        labels = labels.to(device)
        
        if train:
            model.train()
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            preds = torch.argmax(outputs, 1).cpu()
            loss.backward()
            optimizer.step()
        else:
            model.eval()
            with torch.no_grad():
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                preds = torch.argmax(outputs, 1).cpu()
                    
        loss_arr.append(loss.item())

        f1_arr.append(f1_score(labels.cpu(), preds, average='weighted'))

    return sum(loss_arr)/len(loss_arr), sum(f1_arr)/len(f1_arr)


In [30]:
def train(train_loader, val_loader, model, optim, criterion, epochs):
    history = []
    log_template = "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} \
    val_loss {v_loss:0.4f} train_f1 {t_acc:0.4f} val_f1 {v_acc:0.4f}"

    with tqdm(desc="epoch", total=epochs) as pbar_outer:

        for epoch in range(epochs):
            train_loss, train_acc = evaluate_epoch(model, train_loader, criterion, optim, train=True)
            print("loss", train_loss)

            val_loss, val_acc = evaluate_epoch(model, val_loader, criterion, optim, train=False)
            history.append((train_loss, train_acc, val_loss, val_acc))

            pbar_outer.update(1)
            tqdm.write(log_template.format(ep=epoch+1, t_loss=train_loss,\
                                           v_loss=val_loss, t_acc=train_acc, v_acc=val_acc))

    return history

In [31]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = ClfNN(X.shape[1], len(set(y)))
model.to(device)

optim = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [32]:
history = train(train_dataloader, test_dataloader, model, optim, criterion, 10)

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

loss 2.785521099522945


epoch:  10%|█         | 1/10 [00:06<01:00,  6.69s/it]


Epoch 001 train_loss: 2.7855     val_loss 1.2290 train_f1 0.4385 val_f1 0.7236
loss 0.7595423553005353


epoch:  20%|██        | 2/10 [00:13<00:52,  6.53s/it]


Epoch 002 train_loss: 0.7595     val_loss 0.8135 train_f1 0.8051 val_f1 0.8004
loss 0.40682184903573787


epoch:  30%|███       | 3/10 [00:19<00:45,  6.43s/it]


Epoch 003 train_loss: 0.4068     val_loss 0.6892 train_f1 0.8934 val_f1 0.8260
loss 0.25128942645733465


epoch:  40%|████      | 4/10 [00:25<00:38,  6.40s/it]


Epoch 004 train_loss: 0.2513     val_loss 0.6246 train_f1 0.9374 val_f1 0.8419
loss 0.17050969120184523


epoch:  50%|█████     | 5/10 [00:32<00:31,  6.37s/it]


Epoch 005 train_loss: 0.1705     val_loss 0.6391 train_f1 0.9601 val_f1 0.8446
loss 0.12858006739093678


epoch:  60%|██████    | 6/10 [00:38<00:25,  6.36s/it]


Epoch 006 train_loss: 0.1286     val_loss 0.6255 train_f1 0.9693 val_f1 0.8470
loss 0.10905365614485064


epoch:  70%|███████   | 7/10 [00:44<00:19,  6.36s/it]


Epoch 007 train_loss: 0.1091     val_loss 0.6330 train_f1 0.9738 val_f1 0.8506
loss 0.09158446694159054


epoch:  80%|████████  | 8/10 [00:51<00:12,  6.40s/it]


Epoch 008 train_loss: 0.0916     val_loss 0.6316 train_f1 0.9783 val_f1 0.8575
loss 0.08688425998653751


epoch:  90%|█████████ | 9/10 [00:57<00:06,  6.40s/it]


Epoch 009 train_loss: 0.0869     val_loss 0.6036 train_f1 0.9773 val_f1 0.8563
loss 0.06968708512395984


epoch: 100%|██████████| 10/10 [01:04<00:00,  6.40s/it]


Epoch 010 train_loss: 0.0697     val_loss 0.6062 train_f1 0.9824 val_f1 0.8623





In [33]:
import IPython.display as ipd

In [34]:
def one_predict(features, model):
    tns = torch.tensor(features).type(torch.float).unsqueeze(0)
    model.eval()
    sf = nn.Softmax(dim=1)
    return torch.argmax(sf(model(tns.to(device)))).cpu().item()

In [61]:
tns = torch.tensor(X).type(torch.float)
model.eval()

res = model(tns.to(device))
sf = nn.Softmax(dim=1)
preds = torch.argmax(sf(res), dim=1).cpu().numpy()

In [68]:
print(classification_report(y, preds))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93        72
           1       0.95      0.92      0.94        79
           2       1.00      0.94      0.97        54
           3       1.00      1.00      1.00        59
           4       0.99      1.00      0.99        89
           5       0.99      0.96      0.97        71
           6       1.00      0.98      0.99        46
           7       0.99      0.77      0.86        90
           8       1.00      0.93      0.96        85
           9       1.00      1.00      1.00        48
          10       1.00      1.00      1.00        83
          11       1.00      1.00      1.00        82
          12       1.00      0.90      0.95        49
          13       1.00      0.94      0.97        71
          14       0.99      1.00      0.99        78
          15       1.00      0.98      0.99        60
          16       1.00      0.78      0.88        79
          17       0.90    

In [69]:
tns = torch.tensor(X_test).type(torch.float)
model.eval()

res = model(tns.to(device))
sf = nn.Softmax(dim=1)
preds = torch.argmax(sf(res), dim=1).cpu().numpy()

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.83      0.71      0.77        14
           1       0.82      0.88      0.85        16
           2       1.00      0.82      0.90        11
           3       1.00      1.00      1.00        12
           4       1.00      1.00      1.00        18
           5       1.00      0.86      0.92        14
           6       1.00      0.89      0.94         9
           7       1.00      0.50      0.67        18
           8       1.00      0.76      0.87        17
           9       1.00      1.00      1.00        10
          10       1.00      1.00      1.00        17
          11       1.00      1.00      1.00        16
          12       1.00      0.60      0.75        10
          13       1.00      0.79      0.88        14
          14       0.94      1.00      0.97        16
          15       1.00      0.92      0.96        12
          16       1.00      0.44      0.61        16
          17       0.65    

In [71]:
df2[df2['y'] == 7]

Unnamed: 0,audio_path,source_id,audio_feature,y
593,crowd_train\wavs/4dd4c7d5381b9bb8f5ff62323943c...,01436d6512d60bb305b7c7ae1cbca224,"[0.49071264266967773, 0.3666366636753082, 0.32...",7
2060,crowd_train\wavs/60336d19378c78b72414f7cf2e46e...,01436d6512d60bb305b7c7ae1cbca224,"[0.5164117217063904, 0.5142006874084473, 0.524...",7
3513,crowd_train\wavs/423dbca3afcc56874d4a867d3fcc6...,01436d6512d60bb305b7c7ae1cbca224,"[0.4808836579322815, 0.42027533054351807, 0.42...",7
4220,crowd_train\wavs/935ecc08103c4dfd57a93f79ee258...,01436d6512d60bb305b7c7ae1cbca224,"[0.38926491141319275, 0.33664944767951965, 0.3...",7
4550,crowd_train\wavs/5964a9e81981190bb48efa96ffd80...,01436d6512d60bb305b7c7ae1cbca224,"[0.5735481381416321, 0.5685892701148987, 0.535...",7
...,...,...,...,...
37738,crowd_train\wavs/1cd720553487a0df6c831b57fbb49...,01436d6512d60bb305b7c7ae1cbca224,"[0.3092426359653473, 0.30572178959846497, 0.31...",7
38183,crowd_train\wavs/7f64ff7efd71239fc9e6fd5210ab1...,01436d6512d60bb305b7c7ae1cbca224,"[0.4315500259399414, 0.4176945090293884, 0.426...",7
38509,crowd_train\wavs/638474fa154058d789bf908871ade...,01436d6512d60bb305b7c7ae1cbca224,"[0.525324285030365, 0.4852810204029083, 0.4356...",7
38853,crowd_train\wavs/22b0afc0e44913a7ac47a81585951...,01436d6512d60bb305b7c7ae1cbca224,"[0.44340845942497253, 0.42662084102630615, 0.4...",7


In [76]:
data = df2.loc[37738]
display(ipd.Audio(data.audio_path))
one_predict(data.audio_feature, model)

7

In [75]:
data = df2.loc[2060]
display(ipd.Audio(data.audio_path))
one_predict(data.audio_feature, model)

7

In [78]:
data = df2.loc[4550]
display(ipd.Audio(data.audio_path))
one_predict(data.audio_feature, model)

764