In [1]:
# !unzip -n "DATA/classification-of-short-noisy-audio-speech.zip" -d "DATA/"
# !unzip -n "/home/ipynb/notebooks/wav2vec2/DATA/train_not_noise.zip" -d "/home/ipynb/notebooks/wav2vec2/DATA/"

# 1. Загрузка и установка бибилотек. Установка параметров

In [2]:
# !pip install transformers
# !pip install librosa
# !pip install datasets
# !pip install nlpaug
# !git clone https://github.com/facebookresearch/denoiser
# !pip install julius

In [3]:
import nlpaug.augmenter.audio as naa
from nlpaug.util.audio.visualizer import AudioVisualizer
import librosa
import librosa.display as librosa_display
import matplotlib.pyplot as plt
from types import SimpleNamespace
from datasets import load_dataset, load_metric, Dataset, DatasetDict, concatenate_datasets
import random
from IPython.display import display, Audio
from transformers import AutoFeatureExtractor
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
import os
# from google.colab import files, drive
import soundfile as sf
import torch
from pathlib import Path
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

In [4]:
# drive.mount('/content/drive')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
params = SimpleNamespace()
params.sr = 16000 #sample rate
params.model_chekpoint = "OthmaneJ/distil-wav2vec2"
params.batch_size = 32
params.max_duration = 1.0
params.PATH_TO_NOISE = '/home/ipynb/notebooks/wav2vec2/DATA/hackaton_ds/train/'
params.PATH_NOT_NOISE = '/home/ipynb/notebooks/wav2vec2/DATA/train/'

params.PATH_TEST = '/home/ipynb/notebooks/wav2vec2/DATA/hackaton_ds/test'
params.PATH_NOT_NOISE_TEST = '/home/ipynb/notebooks/wav2vec2/DATA/test/'

params.SAVE_PATH = '/home/ipynb/notebooks/wav2vec2/DATA/'
params.CLASS_NAMES2ID = {'yes':0, 'no':1, 'up':2, 'down':3, 'left':4, 'right':5, 'on':6, 'off':7, 'stop':8, 'go':9}
params.ID2CLASS_NAMES = {0:'yes', 1:'no', 2:'up', 3:'down', 4:'left', 5:'right', 6:'on', 7:'off', 8:'stop', 9:'go'}

# 2. Загрузка и обработка данных

## 2.1. Очистка файлов от шума

In [6]:
# %cd ./denoiser

In [7]:
# for class_folder in tqdm_notebook(os.listdir(params.PATH_NOT_NOISE)):
#     dir_noise = params.PATH_NOT_NOISE + class_folder
#     dir_not_noise = params.PATH_NOT_NOISE + class_folder
#     !python -m denoiser.enhance --master64 --noisy_dir=$dir_noise --out_dir=$dir_not_noise --device 'cuda'

## 2.2. Загрузка файлов из папки

In [8]:
def load_audiodata(load_path):
    audio_dataset = {}            
    for class_files in tqdm_notebook(os.listdir(load_path)):     
        for audio_file in tqdm_notebook(os.listdir(os.path.join(load_path, class_files))):
            # if 'enhanced.wav' in (audio_file.split('_')):
            array_file, _ = librosa.load(os.path.join(load_path, class_files, audio_file), sr=16000)
            if class_files in params.CLASS_NAMES2ID.keys():
                label_audio = params.CLASS_NAMES2ID[class_files]
            elif class_files == '_background_noise_':
                label_audio = params.CLASS_NAMES2ID['_silence_']
            else:
                label_audio = params.CLASS_NAMES2ID['_unknown_']
            try:
                audio_dataset['input_values'].append(array_file)
                audio_dataset['label'].append(label_audio)
            except:
                audio_dataset['input_values'] = [array_file]
                audio_dataset['label'] = [label_audio]                
    return audio_dataset

In [9]:
train_dataset = load_audiodata(params.PATH_TO_NOISE)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for class_files in tqdm_notebook(os.listdir(load_path)):


  0%|          | 0/10 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for audio_file in tqdm_notebook(os.listdir(os.path.join(load_path, class_files))):


  0%|          | 0/8895 [00:00<?, ?it/s]

  0%|          | 0/8875 [00:00<?, ?it/s]

  0%|          | 0/8820 [00:00<?, ?it/s]

  0%|          | 0/8875 [00:00<?, ?it/s]

  0%|          | 0/8910 [00:00<?, ?it/s]

  0%|          | 0/8925 [00:00<?, ?it/s]

  0%|          | 0/8905 [00:00<?, ?it/s]

  0%|          | 0/8835 [00:00<?, ?it/s]

  0%|          | 0/8845 [00:00<?, ?it/s]

  0%|          | 0/8905 [00:00<?, ?it/s]

In [10]:
train_dataset = Dataset.from_dict(train_dataset)

## 2.3. Преобразование данных в наборы для обучения

In [11]:
metric = load_metric("accuracy")
feature_extractor = AutoFeatureExtractor.from_pretrained(params.model_chekpoint)
params.sr = feature_extractor.sampling_rate
feature_extractor



Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [12]:
def preprocess_function(examples):
    audio_arrays = [x for x in examples["input_values"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=params.sr,
        max_length = int(params.sr * params.max_duration),
        truncation=True,
    )
    return inputs

In [13]:
# load_from_cache_file=False
train_dataset = train_dataset.map(preprocess_function, batched=True)
train_dataset

  0%|          | 0/89 [00:00<?, ?ba/s]

  return array(a, dtype, copy=False, order=order)


Dataset({
    features: ['input_values', 'label'],
    num_rows: 88790
})

# 3. Обучение

In [14]:
params.model_chekpoint = '/home/ipynb/notebooks/wav2vec2/checkpoint-693-finetuned-ks-noised_linear/checkpoint-4158'
num_labels = len(params.CLASS_NAMES2ID)
model = AutoModelForAudioClassification.from_pretrained(
    params.model_chekpoint,
    num_labels=num_labels,
    label2id=params.CLASS_NAMES2ID,
    id2label=params.ID2CLASS_NAMES,
)

In [15]:
model_name = params.model_chekpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-ks-noised_linear",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 1e-6,
    per_device_train_batch_size=params.batch_size,
    gradient_accumulation_steps = 4,
    per_device_eval_batch_size=params.batch_size,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # lr_scheduler_type = "constant_with_warmup"
    # push_to_hub=True,
)

In [16]:
def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

In [17]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

In [18]:
trainer.train()

***** Running training *****
  Num examples = 88790
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 6930


Epoch,Training Loss,Validation Loss,Accuracy
0,0.2385,0.071873,0.975515
1,0.2765,0.071615,0.975515
2,0.2729,0.069566,0.975797
3,0.2585,0.06881,0.976045
4,0.2646,0.068495,0.976225
5,0.228,0.068604,0.976225
6,0.2374,0.068641,0.976168


***** Running Evaluation *****
  Num examples = 88790
  Batch size = 32
Saving model checkpoint to checkpoint-693-finetuned-ks-noised_linear/checkpoint-693
Configuration saved in checkpoint-693-finetuned-ks-noised_linear/checkpoint-693/config.json
Model weights saved in checkpoint-693-finetuned-ks-noised_linear/checkpoint-693/pytorch_model.bin
Configuration saved in checkpoint-693-finetuned-ks-noised_linear/checkpoint-693/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 88790
  Batch size = 32
Saving model checkpoint to checkpoint-693-finetuned-ks-noised_linear/checkpoint-1386
Configuration saved in checkpoint-693-finetuned-ks-noised_linear/checkpoint-1386/config.json
Model weights saved in checkpoint-693-finetuned-ks-noised_linear/checkpoint-1386/pytorch_model.bin
Configuration saved in checkpoint-693-finetuned-ks-noised_linear/checkpoint-1386/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 88790
  Batch size = 32
Saving model checkpoin

KeyboardInterrupt: 

# 6. Получение предсказаний модели

In [65]:
# %cd /content/denoiser/

In [64]:
# dir_noise = params.PATH_TEST
# dir_not_noise = params.PATH_NOT_NOISE_TEST
# !python -m denoiser.enhance --master64 --noisy_dir=$dir_noise --out_dir=$dir_not_noise --device 'cuda'

In [18]:
sample_csv = pd.read_csv(params.SAVE_PATH + 'sample.csv')
sample_csv

Unnamed: 0,id,category
0,5d03214e74f5ea475b22,yes
1,686ad9e641964142b6b0,yes
2,eec1364a587997161bb8,yes
3,8fb7ede79446ad22c60f,yes
4,de573903106ba3524064,yes
...,...,...
29615,05ae7715e1499b5a72c8,yes
29616,2e14d09dad51733d4658,yes
29617,267a9b10c2a33d67f084,yes
29618,341143991403fd8ae3c6,yes


In [19]:
def load_audiodata_for_submit(load_path):
    audio_dataset = {}   
    for audio_file in tqdm_notebook(os.listdir(load_path)):
        #if 'enhanced.wav' in (audio_file.split('_')):
        array_file, _ = librosa.load(os.path.join(load_path, audio_file), sr=16000)        
        try:
            audio_dataset['input_values'].append(array_file)
            audio_dataset['id'].append(audio_file.split('.')[0])           
        except:
            audio_dataset['input_values'] = [array_file]
            audio_dataset['id'] = [audio_file.split('.')[0]]                           
    return audio_dataset

pred_data = load_audiodata_for_submit(params.PATH_TEST)

def preprocess_function2(examples):
    audio_arrays = [x for x in examples]
    inputs = feature_extractor(
        audio_arrays, 
        sampling_rate=feature_extractor.sampling_rate, 
        max_length=int(feature_extractor.sampling_rate * params.max_duration), 
        truncation=True, 
    )
    return inputs

pred_data['input_values'] = preprocess_function2(pred_data['input_values'])['input_values']

pred_data_for_model = Dataset.from_dict(pred_data)
dict_predict = trainer.predict(pred_data_for_model)
pred_data['predict'] = np.argmax(dict_predict.predictions, axis=1)
pd_pred_data = pd.DataFrame.from_dict(pred_data)
pd_pred_data['category'] = pd_pred_data.predict.apply(lambda x: params.ID2CLASS_NAMES[x])
pd_pred_data.drop(labels=['input_values', 'predict'], axis=1).to_csv(params.SAVE_PATH + 'submission_linear_final.csv', index=False)
pd_pred_data.to_csv('DATA/pd_pred_data_linear_final.csv')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for audio_file in tqdm_notebook(os.listdir(load_path)):


  0%|          | 0/29620 [00:00<?, ?it/s]

  return array(a, dtype, copy=False, order=order)
The following columns in the test set  don't have a corresponding argument in `Wav2Vec2ForSequenceClassification.forward` and have been ignored: id.
***** Running Prediction *****
  Num examples = 29620
  Batch size = 32


In [20]:
# params.model_chekpoint = '/home/ipynb/notebooks/wav2vec2/checkpoint-1386-finetuned-ks-noised_plus_2_up_epoch/checkpoint-693'
# num_labels = len(params.CLASS_NAMES2ID)
# model = AutoModelForAudioClassification.from_pretrained(
#     params.model_chekpoint,
#     num_labels=num_labels,
#     label2id=params.CLASS_NAMES2ID,
#     id2label=params.ID2CLASS_NAMES,
# )

# model_name = params.model_chekpoint.split("/")[-1]
# args = TrainingArguments(
#     f"{model_name}-finetuned-ks-noised_plus_2_up_epoch",
#     evaluation_strategy = "epoch",
#     save_strategy = "epoch",
#     learning_rate = 1e-6,
#     per_device_train_batch_size=params.batch_size,
#     gradient_accumulation_steps = 4,
#     per_device_eval_batch_size=params.batch_size,
#     num_train_epochs=2,
#     warmup_ratio=0.1,
#     logging_steps=10,
#     load_best_model_at_end=True,
#     metric_for_best_model="accuracy",
#     lr_scheduler_type = "constant_with_warmup"
#     # push_to_hub=True,
# )

# trainer = Trainer(
#     model,
#     args,
#     train_dataset=train_dataset,
#     eval_dataset=train_dataset,
#     tokenizer=feature_extractor,
#     compute_metrics=compute_metrics
# )

# def load_audiodata_for_submit(load_path):
#     audio_dataset = {}   
#     for audio_file in tqdm_notebook(os.listdir(load_path)):
#         #if 'enhanced.wav' in (audio_file.split('_')):
#         array_file, _ = librosa.load(os.path.join(load_path, audio_file), sr=16000)        
#         try:
#             audio_dataset['input_values'].append(array_file)
#             audio_dataset['id'].append(audio_file.split('.')[0])           
#         except:
#             audio_dataset['input_values'] = [array_file]
#             audio_dataset['id'] = [audio_file.split('.')[0]]                           
#     return audio_dataset

# pred_data = load_audiodata_for_submit(params.PATH_TEST)

# def preprocess_function2(examples):
#     audio_arrays = [x for x in examples]
#     inputs = feature_extractor(
#         audio_arrays, 
#         sampling_rate=feature_extractor.sampling_rate, 
#         max_length=int(feature_extractor.sampling_rate * params.max_duration), 
#         truncation=True, 
#     )
#     return inputs

# pred_data['input_values'] = preprocess_function2(pred_data['input_values'])['input_values']

# pred_data_for_model = Dataset.from_dict(pred_data)
# dict_predict = trainer.predict(pred_data_for_model)
# pred_data['predict'] = np.argmax(dict_predict.predictions, axis=1)
# pd_pred_data = pd.DataFrame.from_dict(pred_data)
# pd_pred_data['category'] = pd_pred_data.predict.apply(lambda x: params.ID2CLASS_NAMES[x])
# pd_pred_data.drop(labels=['input_values', 'predict'], axis=1).to_csv(params.SAVE_PATH + 'submission_best_loss.csv', index=False)
# pd_pred_data.to_csv('DATA/pd_pred_data_best_loss.csv')