In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score
 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm.notebook import tqdm

In [2]:
df = pd.read_pickle('crowd_train_all_data_embedded.pkl')

In [3]:
def get_dataframe(df, min_count, max_count):
    df_temp = df.copy()

    freq = df_temp.source_id.value_counts()
    freq = freq[min_count <= freq]
    df_temp = df_temp[df.source_id.isin(freq.index)]

    df_temp = df_temp.groupby('source_id').apply(
        lambda x: x.nlargest(max_count, 'duration')
    ).reset_index(drop=True).drop(columns=['duration', 'hash_id', 'annotator_emo', 'golden_emo', 'annotator_id', 'speaker_text', 'speaker_emo'])

    df_temp.dropna(inplace=True)
    
    print(df_temp.source_id.nunique())
    
    return df_temp

In [4]:
df_t = get_dataframe(df, 200, 10)
df_t = get_dataframe(df, 100, 10)

102
354


In [5]:
# df_t = pd.concat([df_t, df_t, df_t])

In [6]:
X = np.vstack(df_t.audio_feature.to_numpy()).reshape(df_t.shape[0] , -1)
le = LabelEncoder()
y = le.fit_transform(df_t.source_id)
X.shape, y.shape

((3540, 498), (3540,))

In [7]:
# X = X[:, 36 + 384: 36 + 384 + 60] #mfcc only

In [8]:
df_t['y'] = y

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train.shape, X_test.shape

((1770, 498), (1770, 498))

In [10]:
X_train = np.concatenate([X_train, X_train])
y_train = np.concatenate([y_train, y_train])

In [11]:
X_train.shape, X_test.shape

((3540, 498), (1770, 498))

In [12]:
class AudioDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.Tensor(X)
        self.y = torch.Tensor(y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

    def __len__(self):
        return len(self.y)

In [13]:
train_dataset = AudioDataset(X_train, y_train)
test_dataset = AudioDataset(X_test, y_test)

In [14]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

In [16]:
import torch.nn as nn

class ClfNN(nn.Module):
    def __init__(self, in_features=498, n_classes=51, layer_sizes=[512, 768, 512], dropouts=[0.2, 0.2, 0.2]):
        super(ClfNN, self).__init__()
        assert len(layer_sizes) == len(dropouts), "Each layer must have a corresponding dropout rate."

        layers = []
        for i in range(len(layer_sizes)):
            if i == 0:
                layers.append(nn.Linear(in_features, layer_sizes[i]))
            else:
                layers.append(nn.Linear(layer_sizes[i-1], layer_sizes[i]))
            layers.append(nn.BatchNorm1d(num_features=layer_sizes[i]))
            layers.append(nn.GELU())
            layers.append(nn.Dropout(p=dropouts[i]))

        layers.append(nn.Linear(layer_sizes[-1], n_classes))
        
        self.lin_layers = nn.Sequential(*layers)
        # self.sigmoid = nn.Softmax(dim=1)

    def forward(self, x):
        out = self.lin_layers(x)

        return out


In [17]:
def evaluate_epoch(model, loader, criterion, optimizer, train=True):
    loss_arr = []
    acc = []
    f1_w = []
    f1_micro = []

    for inputs, labels in tqdm(loader, leave=False):
        inputs = inputs.to(device)
        labels = labels.type(torch.LongTensor)
        labels = labels.to(device)
        
        if train:
            model.train()
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            preds = torch.argmax(outputs, 1).cpu()
            loss.backward()
            optimizer.step()
        else:
            model.eval()
            with torch.no_grad():
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                preds = torch.argmax(outputs, 1).cpu()
                    
        loss_arr.append(loss.item())

        f1_w.append(f1_score(labels.cpu(), preds, average='weighted'))
        f1_micro.append(f1_score(labels.cpu(), preds, average='micro'))
        acc.append(accuracy_score(labels.cpu(), preds))

    return sum(loss_arr)/len(loss_arr), sum(acc)/len(acc), sum(f1_w)/len(f1_w), sum(f1_micro)/len(f1_micro)


In [18]:
def train(train_loader, val_loader, model, optim, criterion, epochs):
    history = []
    log_template = "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} \
    val_loss {v_loss:0.4f} train_f1 {t_acc:0.4f} val_f1 {v_acc:0.4f}"

    with tqdm(desc="epoch", total=epochs, leave=False) as pbar_outer:

        for epoch in range(epochs):
            train_loss, train_acc, train_f1_w, train_f1_micro = evaluate_epoch(model, train_loader, criterion, optim, train=True)
            print("loss", train_loss)

            val_loss, val_acc, val_f1_w, val_f1_micro = evaluate_epoch(model, val_loader, criterion, optim, train=False)
            history.append((train_loss, train_acc, train_f1_w, train_f1_micro, val_loss, val_acc, val_f1_w, val_f1_micro))
            # if len(history) > 3 and val_acc < history[-2][-1] and val_acc < history[-3][-1] and val_acc < history[-4][-1]:
            #     break

            pbar_outer.update(1)
            tqdm.write(log_template.format(ep=epoch+1, t_loss=train_loss,\
                                           v_loss=val_loss, t_acc=train_acc, v_acc=val_acc))

    return history

In [40]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = ClfNN(in_features=X.shape[1], n_classes=len(set(y)), layer_sizes=[500, 400], dropouts=[0.3, 0.3])
model.to(device)

optim = torch.optim.Adam(model.parameters(), lr=3*10**-4)
criterion = nn.CrossEntropyLoss()

In [None]:
history = train(train_dataloader, test_dataloader, model, optim, criterion, 50)

In [42]:
history = np.array(history)
max_ep_f1_w_id = np.argmax(history[:, -2])
history[max_ep_f1_w_id]

array([0.02126611, 0.99916295, 0.99916295, 0.99916295, 1.15980335,
       0.76865434, 0.76600854, 0.76865434])

In [None]:
import mlflow
from sklearn import metrics

mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"{len(set(y))} classes, 5 per class")

run_name = "fully-connected normalized"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
        "in_features": X.shape[1],
        "n_classes": len(set(y)),
        "layer_sizes": [500, 400],
        "dropouts": [0.3, 0.3]
    }
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = ClfNN(in_features=X.shape[1], n_classes=len(set(y)), layer_sizes=[350, 200], dropouts=[0.2, 0.2])
    model.to(device)

    model_params["optim"] = 'Adam'
    model_params["criterion"] = 'CrossEntropyLoss'
    model_params["epochs"] = 50
    optim = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    history = train(train_dataloader, test_dataloader, model, optim, criterion, model_params["epochs"])

    history = np.array(history)
    max_ep_f1_w_id = np.argmax(history[:, -2])
    _, _, train_f1_w, _, _, val_acc, val_f1_w, val_f1_micro = history[max_ep_f1_w_id]

    mlflow.log_metric("train f1_weighted", train_f1_w)
    mlflow.log_metric("f1_weighted", val_f1_micro)
    mlflow.log_metric("f1_micro", val_f1_w)
    mlflow.log_metric("accuracy", val_acc)
    
    mlflow.log_params(model_params)

    # mlflow.sklearn.log_model(
    #     sk_model=mlflow, 
    #     input_example=X_test[:10], 
    #     artifact_path=f"mlflow/{run_name}/model"
    # )

In [21]:
# catboost_clf_all = catboost.CatBoostClassifier(
#     iterations=1_000, task_type="GPU", devices="0"
# )

# catboost_clf_all.fit(
#     X_train,
#     y_train,
#     verbose=100,
# )

In [32]:
import IPython.display as ipd

In [33]:
def one_predict(features, model):
    tns = torch.tensor(features).type(torch.float).unsqueeze(0)
    model.eval()
    sf = nn.Softmax(dim=1)
    return torch.argmax(sf(model(tns.to(device)))).cpu().item()

In [60]:
tns = torch.tensor(X).type(torch.float)
model.eval()

res = model(tns.to(device))
sf = nn.Softmax(dim=1)
preds = torch.argmax(sf(res), dim=1).cpu().numpy()

In [61]:
print(classification_report(y, preds))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95        72
           1       0.82      0.91      0.86        79
           2       1.00      1.00      1.00        10
           3       1.00      0.81      0.90        54
           4       0.91      1.00      0.95        10
           5       1.00      0.85      0.92        39
           6       1.00      1.00      1.00        59
           7       1.00      0.87      0.93        30
           8       1.00      0.99      0.99        89
           9       0.98      0.90      0.94        71
          10       0.98      1.00      0.99        46
          11       1.00      0.90      0.95        10
          12       0.98      0.99      0.98        90
          13       0.96      1.00      0.98        25
          14       0.95      0.91      0.93        22
          15       0.94      0.95      0.95        85
          16       0.90      0.90      0.90        10
          17       1.00    

In [62]:
tns = torch.tensor(X_test).type(torch.float)
model.eval()

res = model(tns.to(device))
sf = nn.Softmax(dim=1)
preds = torch.argmax(sf(res), dim=1).cpu().numpy()

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89        14
           1       0.65      0.81      0.72        16
           2       1.00      1.00      1.00         2
           3       1.00      0.73      0.84        11
           4       0.67      1.00      0.80         2
           5       1.00      0.62      0.77         8
           6       1.00      1.00      1.00        12
           7       1.00      0.83      0.91         6
           8       1.00      1.00      1.00        18
           9       0.92      0.79      0.85        14
          10       0.90      1.00      0.95         9
          11       1.00      0.50      0.67         2
          12       0.94      0.94      0.94        18
          13       0.83      1.00      0.91         5
          14       0.67      0.50      0.57         4
          15       0.83      0.88      0.86        17
          16       0.67      1.00      0.80         2
          17       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [73]:
t_f_preds = y_test - preds
t_f_preds[t_f_preds!=0]
false_ids = np.nonzero(t_f_preds)

In [87]:
X_test[false_ids].shape

(2848, 498)

In [90]:
df_with_false_preds = pd.DataFrame({
    'audio_feature_false': list(X_test[false_ids])
})
df_with_false_preds

Unnamed: 0,audio_feature_false
0,"[0.21550001204013824, 0.18788830935955048, 0.3..."
1,"[0.35597920417785645, 0.42777353525161743, 0.3..."
2,"[0.5100259780883789, 0.5315358638763428, 0.559..."
3,"[0.6011212468147278, 0.5051518082618713, 0.462..."
4,"[0.41035521030426025, 0.45205679535865784, 0.4..."
...,...
2843,"[0.35095277428627014, 0.48138725757598877, 0.6..."
2844,"[0.4015119969844818, 0.34858793020248413, 0.34..."
2845,"[0.33429470658302307, 0.44800877571105957, 0.5..."
2846,"[0.45880335569381714, 0.4921809434890747, 0.51..."


In [96]:
df_with_false_preds['merge'] = df_with_false_preds.audio_feature_false.astype(str)

In [97]:
df_t['merge'] = df_t.audio_feature.astype(str)

In [100]:
df_with_false_preds.shape, df_t.shape

((2848, 2), (118994, 5))

In [105]:
df_false = pd.merge(df_t, df_with_false_preds, on='merge', how='right')

In [106]:
df_false

Unnamed: 0,audio_path,source_id,audio_feature,y,merge,audio_feature_false
0,crowd_train\wavs/2f361e787dddfe8626981c5007f4e...,7dce79b9fcbb8f4d54b7582db14cc4e3,"[0.21550001204013824, 0.18788830935955048, 0.3...",810,[ 2.15500012e-01 1.87888309e-01 3.10878068e-...,"[0.21550001204013824, 0.18788830935955048, 0.3..."
1,crowd_train\wavs/06ea3d5fb4761671e3a2af4d4f6de...,6150a8e97eb693d1eeb3c708d36692be,"[0.35597920417785645, 0.42777353525161743, 0.3...",627,[ 3.55979204e-01 4.27773535e-01 3.69624972e-...,"[0.35597920417785645, 0.42777353525161743, 0.3..."
2,crowd_train\wavs/982c99a30d185f41808d312064d43...,73f31088c3ffd295e40a88c175baa9ef,"[0.5100259780883789, 0.5315358638763428, 0.559...",735,[ 5.10025978e-01 5.31535864e-01 5.59709072e-...,"[0.5100259780883789, 0.5315358638763428, 0.559..."
3,crowd_train\wavs/24b8b30a18e47522ad6b3e6fcc34b...,f8fb39176715e186dcf05b94bb0bcba5,"[0.6011212468147278, 0.5051518082618713, 0.462...",1567,[ 6.01121247e-01 5.05151808e-01 4.62675869e-...,"[0.6011212468147278, 0.5051518082618713, 0.462..."
4,crowd_train\wavs/a74253f7c0f22cd769c9b45f5f34a...,384763fcce09a2caa5deac6ce566cdc4,"[0.41035521030426025, 0.45205679535865784, 0.4...",344,[ 4.10355210e-01 4.52056795e-01 4.42063093e-...,"[0.41035521030426025, 0.45205679535865784, 0.4..."
...,...,...,...,...,...,...
3357,crowd_train\wavs/f6dd372462d06e0336a82afe25f3a...,182f8886f0758fb7e1c244b1eeeb22c4,"[0.35095277428627014, 0.48138725757598877, 0.6...",148,[ 3.50952774e-01 4.81387258e-01 6.15000725e-...,"[0.35095277428627014, 0.48138725757598877, 0.6..."
3358,crowd_train\wavs/8226b09530a408703b22572643053...,d37bed637c9e4a23a81e7ca177ed84fc,"[0.4015119969844818, 0.34858793020248413, 0.34...",1341,[ 4.01511997e-01 3.48587930e-01 3.43610287e-...,"[0.4015119969844818, 0.34858793020248413, 0.34..."
3359,crowd_train\wavs/cb616044016e0059a295d91265461...,80ab45e9ccb51c96f5fbcd6eee290127,"[0.33429470658302307, 0.44800877571105957, 0.5...",832,[ 3.34294707e-01 4.48008776e-01 5.85476816e-...,"[0.33429470658302307, 0.44800877571105957, 0.5..."
3360,crowd_train\wavs/200030530896d01b1a1f97dead436...,5363bf091bfb3cd97a9c4b2bb787c15e,"[0.45880335569381714, 0.4921809434890747, 0.51...",529,[ 4.58803356e-01 4.92180943e-01 5.14282107e-...,"[0.45880335569381714, 0.4921809434890747, 0.51..."


In [112]:
with open(df_false.loc[0].audio_path, 'rb') as old_file:
    with open('hihi.wav', 'wb') as new_file:
        new_file.write(old_file.read())

In [107]:
data = df_false.loc[0]
display(ipd.Audio(data.audio_path))
one_predict(data.audio_feature, model)

621

In [108]:
data = df_t.loc[3361]
display(ipd.Audio(data.audio_path))
one_predict(data.audio_feature, model)

52

In [109]:
data = df_t.loc[3357]
display(ipd.Audio(data.audio_path))
one_predict(data.audio_feature, model)

52