In [1]:
import os

# Pin this kernel to GPU index 3. The variable is set here, before torch is
# imported in the next cell, so the value is already in place when CUDA is
# first initialised. NOTE(review): the index is machine-specific — confirm it
# is still the right device before re-running on another host.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2FeatureExtractor
from src.models import Wav2Vec2ForSpeechClassification, HubertForSpeechClassification

# Checkpoint directory; the config, the feature extractor and the model
# weights below are all loaded from this single location.
model_name_or_path = "/home/danielkim/aihub/output/checkpoint-13750"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `config.id2label` is what predict() uses to name each output class.
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
# Rate the feature extractor is configured for; every clip is resampled to it
# in speech_file_to_array_fn() before being fed to the model.
sampling_rate = feature_extractor.sampling_rate

# Wav2Vec2 classifier. HubertForSpeechClassification is imported above as well
# but is not used in the cells visible here.
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)


def speech_file_to_array_fn(path, sampling_rate):
    """Load an audio file as a 1-D mono NumPy array at ``sampling_rate``.

    Args:
        path: Audio file location, passed unchanged to ``torchaudio.load``.
        sampling_rate: Target sampling rate in Hz.

    Returns:
        numpy.ndarray: The resampled waveform with a single (frames) axis.
        Multi-channel recordings are averaged down to one channel; mono
        recordings are returned with their samples unchanged apart from
        resampling.
    """
    waveform, source_rate = torchaudio.load(path)

    # torchaudio yields a (channels, frames) tensor. A bare ``squeeze()`` only
    # drops the channel axis for mono files, so a stereo clip used to come
    # back 2-D and would be treated as a batch of two clips downstream.
    # Averaging the channels first guarantees a single-channel signal.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    resampler = torchaudio.transforms.Resample(source_rate, sampling_rate)
    # squeeze(0) removes only the channel axis, never the time axis.
    return resampler(waveform).squeeze(0).numpy()


def predict(path, sampling_rate):
    """Score a single audio file against every class of the loaded model.

    Args:
        path: Audio file to classify.
        sampling_rate: Rate (Hz) the clip is resampled to and that is reported
            to the feature extractor.

    Returns:
        list[dict]: One ``{"Class": label, "Score": probability}`` entry per
        class, in class-index order. Labels are looked up in
        ``config.id2label``; scores are the softmax of the model logits for
        the first (only) item in the batch.
    """
    waveform = speech_file_to_array_fn(path, sampling_rate)
    encoded = feature_extractor(waveform, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    # Move every tensor produced by the feature extractor onto the model's device.
    batch = {}
    for name in encoded:
        batch[name] = encoded[name].to(device)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        logits = model(**batch).logits

    probabilities = F.softmax(logits, dim=1).detach().cpu().numpy()[0]

    outputs = []
    for class_index, probability in enumerate(probabilities):
        outputs.append({"Class": config.id2label[class_index], "Score": probability})
    return outputs


# Smoke test: classify one clip before committing to the full test set.
# `outputs` is displayed and reused by the next two cells.
path = "/home/danielkim/data/test_data/1.wav"
outputs = predict(path, sampling_rate)    

2021-09-30 14:24:33.732229: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ../aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [3]:
# Display the per-class scores returned for the single smoke-test clip.
outputs

[{'Class': 1, 'Score': 0.053375043},
 {'Class': 2, 'Score': 0.5953251},
 {'Class': 3, 'Score': 0.19300097},
 {'Class': 4, 'Score': 0.028130386},
 {'Class': 5, 'Score': 0.033860095},
 {'Class': 6, 'Score': 0.028211843},
 {'Class': 7, 'Score': 0.06809646}]

In [4]:
# Keep only the probabilities from the smoke-test result, in class order.
scores = [entry['Score'] for entry in outputs]

In [5]:
def get_max_idx(item):
    """Return the ``'Class'`` label of the highest-scoring entry.

    Implemented with the builtin ``max`` so the function does not depend on
    NumPy having been imported by a later cell.

    Args:
        item: Non-empty iterable of dicts, each with ``'Class'`` and
            ``'Score'`` keys (the structure ``predict`` returns).

    Returns:
        The ``'Class'`` value of the entry whose ``'Score'`` is largest. When
        several entries tie for the maximum, the first one wins.

    Raises:
        ValueError: If ``item`` is empty.
    """
    best_entry = max(item, key=lambda element: element['Score'])
    return best_entry['Class']

In [6]:
import pandas as pd

In [7]:
# Test-set manifest. The inference loop below unpacks each row as
# `(_, filename)`, so this file is expected to have exactly two columns with
# the clip's file name (without the `.wav` extension) in the second one.
df = pd.read_csv('/home/danielkim/data/old/test.csv')

In [8]:
# Collects one predict() result (a list of per-class score dicts) per test
# clip. Re-run this cell before re-running the inference loop, otherwise the
# loop appends to the results of the previous run.
predictions: list = []

In [9]:
from tqdm import tqdm

In [None]:
# Run inference on every clip listed in the manifest.
#
# The list is rebuilt from scratch on each execution, so re-running this cell
# can no longer leave duplicate entries behind (which would make
# `predictions` longer than `df` and break the later column assignment).
#
# File names are taken from the second CSV column by position — the same
# value the previous `(_, filename)` row unpacking produced — which keeps the
# cell runnable even after extra columns have been added to `df`.
filenames = df.iloc[:, 1]
predictions = [
    predict('/home/danielkim/data/test_data/' + str(filename) + '.wav', sampling_rate)
    # An explicit total lets tqdm show a real progress bar and an ETA.
    for filename in tqdm(filenames, total=len(filenames))
]

1862it [01:21, 25.23it/s]

In [14]:
import numpy as np

In [15]:
# Will hold the single top-scoring class label for each test clip, in the same
# order as `predictions`. Re-run this cell before re-running the loop that
# fills it, otherwise labels from the previous run are kept.
real_predictions: list = []

In [16]:
# Reduce each clip's per-class score list to its top-scoring class label.
# Built in a single expression so that re-running the cell replaces the list
# instead of appending a second copy of the labels to the previous run's.
real_predictions = [get_max_idx(prediction) for prediction in predictions]

In [17]:
# Attach the predicted class labels as a new `age_` column. This modifies `df`
# in place and relies on `real_predictions` holding exactly one label per row,
# in row order.
df['age_'] = real_predictions

In [18]:
# Persist the manifest together with the new `age_` column; the DataFrame
# index is not written. An existing file at this path is overwritten.
df.to_csv('/home/danielkim/prediction_new_new.csv', index=False)