<a href="https://colab.research.google.com/github/ssuxmin/tmp/blob/main/%ED%8C%8C%EC%9D%B4%EC%8D%AC%20%EB%94%A5%EB%9F%AC%EB%8B%9D/emodel(%EB%94%A5%EB%9F%AC%EB%8B%9D_%ED%94%84%EB%A1%9C%EC%A0%9D%ED%8A%B8).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [9]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [10]:
import pandas as pd
import warnings
import numpy as np
import librosa
import torch, random, os 
import torch.nn as nn
import torch.nn.functional as F
import torchaudio

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold

# from mixup import mixup_data, mixup_criterion
from transformers import AutoModelForAudioClassification
from transformers import AutoConfig, AutoModel, Wav2Vec2FeatureExtractor, PretrainedConfig, HubertForSequenceClassification,AutoProcessor, Wav2Vec2ForCTC

import IPython.display as ipd
from tqdm import tqdm

warnings.filterwarnings(action='ignore')
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [11]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
# Mount Google Drive at /content/drive; the train/test CSV files are read from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Fix the Random Seed

In [14]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), exports ``PYTHONHASHSEED`` and configures
    cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only
    # affects Python processes launched after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which undoes the deterministic flag above; it must stay off.
    torch.backends.cudnn.benchmark = False


seed_everything(42)  # fix the seed

In [15]:
# Training metadata: drop the first character of every stored path and
# prefix what remains with 'data'.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/open/train.csv', index_col=None)
train_path_tail = df['path'].str.slice(start=1)
df['path'] = 'data' + train_path_tail

# Test metadata gets exactly the same path rewrite.
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/open/test.csv', index_col=None)
test_path_tail = test_df['path'].str.slice(start=1)
test_df['path'] = 'data' + test_path_tail

In [22]:
# Pretrained checkpoint shared by the model config and the feature extractor.
model_name_or_path = 'facebook/hubert-large-ll60k'
# num_labels sets the size of the classification head (3 classes).
config = AutoConfig.from_pretrained(model_name_or_path, num_labels = 3)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
# Sampling rate the extractor is configured for; passed to every feature_extractor call below.
sampling_rate = feature_extractor.sampling_rate


In [23]:
def speech_file_to_array_fn(path, target_sampling_rate=None):
    """Load an audio file as a mono, 1-D NumPy waveform at the model's rate.

    Args:
        path: Path of an audio file readable by ``torchaudio.load``.
        target_sampling_rate: Rate in Hz the waveform is resampled to.
            Defaults to the module-level ``sampling_rate`` taken from the
            feature extractor.

    Returns:
        A 1-D ``numpy.ndarray`` with the waveform at the target rate.
    """
    if target_sampling_rate is None:
        target_sampling_rate = sampling_rate

    waveform, file_sampling_rate = torchaudio.load(path)
    # torchaudio returns (channels, frames). Averaging the channel axis gives
    # the same result as squeeze() for mono files and keeps multi-channel
    # files 1-D instead of handing a 2-D array to the feature extractor.
    waveform = waveform.mean(dim=0)

    # The feature extractor is told every clip is at `sampling_rate`, so a
    # file recorded at another rate has to be resampled to match.
    if file_sampling_rate != target_sampling_rate:
        waveform = torchaudio.functional.resample(
            waveform, file_sampling_rate, target_sampling_rate
        )
    return waveform.numpy()

In [24]:
class EModel(nn.Module):
    """Wrapper around a HuBERT sequence classifier that returns raw logits."""

    def __init__(self):
        super().__init__()
        # The attribute name is part of the checkpoint format: saved
        # state_dict keys are prefixed with "backbone.".
        self.backbone = HubertForSequenceClassification.from_pretrained(
            model_name_or_path,
            config=config,
        )

    def forward(self, x):
        """Run the backbone on ``x`` and return the ``logits`` of its output."""
        backbone_output = self.backbone(x)
        return backbone_output.logits

In [25]:
# Build the classifier from the pretrained checkpoint and move it to the selected device.
model = EModel().to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-large-ll60k and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
class EMDataset(Dataset):
    """Training/validation dataset yielding ``(waveform, label)`` pairs.

    Args:
        df: DataFrame with a ``path`` column (audio file path) and a
            ``label`` column (class id).
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        # DataLoader samplers hand out positions 0..len-1, so rows are looked
        # up by position; label-based .loc fails on any frame whose index is
        # not a fresh RangeIndex (e.g. a fold slice that was not reset).
        path = self.df['path'].iloc[idx]
        signal = speech_file_to_array_fn(path)
        label = self.df['label'].iloc[idx]
        return signal, label
        

In [27]:
# 5-fold stratified splitter; shuffling is seeded so the folds are reproducible.
k_split = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [28]:
def collate_fn(batch):
    """Split a batch of ``(signal, label)`` samples into two parallel outputs.

    Signals are returned as a plain list (they are not stacked here), while
    the labels are packed into a single tensor.

    Args:
        batch: Sequence of samples whose first item is the signal and whose
            second item is the label.

    Returns:
        Tuple ``(signals, labels)`` with ``labels`` as a ``torch.Tensor``.
    """
    signals = []
    targets = []
    for sample in batch:
        signals.append(sample[0])
        targets.append(sample[1])
    return signals, torch.tensor(targets)

In [29]:
# Classification loss applied to the logits returned by the model.
loss_fn = nn.CrossEntropyLoss()
# Adam over all model parameters with a learning rate of 1e-5.
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-5)
# Cosine annealing with warm restarts: first cycle is T_0=10 scheduler steps, T_mult=1 keeps later cycles the same length.
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1)

In [30]:
def metrics(labels, preds):
    """Score predicted class ids against the ground truth.

    Args:
        labels: Sequence of true class ids.
        preds: Sequence of predicted class ids, aligned with ``labels``.

    Returns:
        Tuple ``(f1, accuracy)`` where ``f1`` is the weighted-average F1 score.
    """
    y_true = np.array(labels)
    y_pred = np.array(preds)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    accuracy = accuracy_score(y_true, y_pred)
    return weighted_f1, accuracy

In [None]:
def _mixup_data(x, y, alpha=1.0):
    """Blend every sample of a batch with a randomly chosen partner sample.

    Returns the mixed inputs, the original targets, the partners' targets and
    the mixing weight ``lam`` drawn from Beta(alpha, alpha).
    """
    lam = float(np.random.beta(alpha, alpha)) if alpha > 0 else 1.0
    partner = torch.randperm(x.size(0), device=x.device)
    mixed_x = lam * x + (1.0 - lam) * x[partner]
    return mixed_x, y, y[partner], lam


def _mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Weight the loss against both target sets by the mixing weight."""
    return lam * criterion(pred, y_a) + (1.0 - lam) * criterion(pred, y_b)


def trainer(model, train_loader, loss_fn, optimizer, epoch):
    """Train ``model`` for one epoch and print the mean training loss.

    Every fourth batch (``step % 4 == 0``) is trained with mixup; the rest
    use the plain loss. The mixup helpers live in this cell because the
    ``from mixup import ...`` line in the import cell is commented out, which
    left ``mixup_data`` / ``mixup_criterion`` undefined.

    Args:
        model: Network called as ``model(input_values)`` and returning logits.
        train_loader: Iterable yielding ``(list_of_waveforms, label_tensor)``.
        loss_fn: Criterion called as ``loss_fn(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the printed log line.

    Returns:
        Mean training loss over the epoch.
    """
    model.train()
    train_loss = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = feature_extractor(inputs, sampling_rate=sampling_rate, padding=True, return_tensors='pt')['input_values'].to(device)
        labels = labels.to(device)

        if step % 4 == 0:
            x_batch, y_batch_a, y_batch_b, lam = _mixup_data(inputs, labels)
            outputs = model(x_batch)
            loss = _mixup_criterion(loss_fn, outputs, y_batch_a, y_batch_b, lam)
        else:
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Guard the average so an empty loader reports 0 instead of dividing by zero.
    mean_loss = train_loss / max(len(train_loader), 1)
    print(f'EPOCH : {epoch} | train_loss : {mean_loss:.4f}')
    return mean_loss

In [None]:
# Best validation accuracy seen so far, keyed by fold. It is module-level
# because validator() runs once per epoch and has to remember earlier epochs;
# a local variable would be reset on every call.
_BEST_VALID_ACC = {}


def validator(model, valid_loader, loss_fn, epoch, k, scheduler):
    """Evaluate ``model`` for one epoch and checkpoint it when it improves.

    Prints validation loss, weighted F1 and accuracy, saves the weights to
    ``{k}_best.pt`` only when accuracy beats the best value recorded for fold
    ``k``, and advances ``scheduler`` by one step.

    Args:
        model: Network called as ``model(input_values)`` and returning logits.
        valid_loader: Iterable yielding ``(list_of_waveforms, label_tensor)``.
        loss_fn: Criterion called as ``loss_fn(logits, labels)``.
        epoch: Epoch number; ``0`` resets the best score stored for fold ``k``.
        k: Fold id, used as checkpoint file prefix and best-score key.
        scheduler: Learning-rate scheduler stepped once per call.

    Returns:
        Tuple ``(f1, accuracy)`` for this epoch.
    """
    model.eval()
    if epoch == 0:
        # A fold (or a re-run of the training loop) starts from scratch.
        _BEST_VALID_ACC.pop(k, None)

    valid_loss = 0
    valid_labels = []
    valid_preds = []
    for inputs, labels in valid_loader:
        inputs = feature_extractor(inputs, sampling_rate=sampling_rate, padding=True, return_tensors='pt')['input_values'].to(device)
        labels = labels.to(device)
        with torch.no_grad():
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)

        valid_labels.extend(labels.cpu().tolist())
        valid_preds.extend(outputs.argmax(1).cpu().tolist())
        valid_loss += loss.item()
    f1s, acc = metrics(valid_labels, valid_preds)

    # -inf default: the first evaluated epoch of a fold always leaves a
    # checkpoint behind for the inference cell to load.
    if acc > _BEST_VALID_ACC.get(k, float('-inf')):
        _BEST_VALID_ACC[k] = acc
        torch.save(model.state_dict(), f'{k}_best.pt')
    print(f'EPOCH : {epoch} | valid_loss : {valid_loss/max(len(valid_loader), 1):.4f} | f1s : {f1s:.4f} | acc :{acc:.4f}')

    scheduler.step()
    return f1s, acc

In [None]:
for k, (t_idx, v_idx) in enumerate(k_split.split(df, df['label'])):
    # StratifiedKFold yields positional indices, so select rows with .iloc.
    train_df = df.iloc[t_idx].reset_index(drop=True)
    valid_df = df.iloc[v_idx].reset_index(drop=True)

    train_dataset = EMDataset(train_df)
    valid_dataset = EMDataset(valid_df)

    train_loader = DataLoader(train_dataset, num_workers=4, batch_size=8, shuffle=True, collate_fn=collate_fn)
    valid_loader = DataLoader(valid_dataset, num_workers=4, batch_size=8, shuffle=False, collate_fn=collate_fn)

    # Each fold must start from the pretrained checkpoint with a fresh
    # optimizer and scheduler. Reusing one model across folds means later
    # folds validate on samples the model was already trained on, which
    # inflates the scores and makes the five checkpoints near-duplicates.
    # Drop the previous objects first so two models never share GPU memory.
    model = optimizer = scheduler = None
    torch.cuda.empty_cache()
    model = EModel().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1)

    for epoch in range(50):
        trainer(model, train_loader, loss_fn, optimizer, epoch)
        validator(model, valid_loader, loss_fn, epoch, k, scheduler)


In [None]:
class TestDataset(Dataset):
    """Inference dataset yielding ``(waveform, -1)`` pairs.

    The constant ``-1`` stands in for the missing label so the samples have
    the same two-item layout as the training samples and ``collate_fn`` can
    be reused.

    Args:
        df: DataFrame with a ``path`` column (audio file path).
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        # Samplers hand out positions, so look the row up by position rather
        # than by index label (.loc breaks on a non-default index).
        path = self.df['path'].iloc[idx]
        signal = speech_file_to_array_fn(path)
        return signal, -1
        

In [None]:
# Inference loader: shuffle=False keeps predictions in test_df row order; one clip per batch.
test_dataset = TestDataset(test_df)
test_loader = DataLoader(test_dataset, shuffle=False, num_workers=4, batch_size=1, collate_fn=collate_fn)

In [None]:
# Submission template whose 'label' column is filled in by the inference cell.
# NOTE(review): this is read from the relative 'data/' directory while train/test CSVs come from Drive — confirm the file exists there.
sub = pd.read_csv('data/sample_submission.csv', index_col=None)

In [None]:
# Collect the raw logits of every fold model for every test clip.
k_test_preds = []
for k in range(5):
    model = EModel().to(device)
    model.load_state_dict(torch.load(f'{k}_best.pt', map_location='cpu'))
    model.eval()

    test_preds = []
    for inputs, _ in tqdm(test_loader):
        inputs = feature_extractor(inputs, sampling_rate=sampling_rate, padding=True, return_tensors='pt')['input_values'].to(device)

        with torch.no_grad():
            outputs = model(inputs)

        test_preds.extend(outputs.cpu().tolist())
    k_test_preds.append(test_preds)
# Shape: (folds, samples, classes).
k_test_preds = torch.tensor(k_test_preds)

# Softmax has to normalise over the class axis (the last one). dim=1 is the
# sample axis of this 3-D tensor and would normalise each class across all
# test clips instead of producing per-clip class probabilities.
fold_probs = torch.nn.functional.softmax(k_test_preds, dim=-1)
# Average the probabilities over the folds, then pick the most likely class.
sub['label'] = fold_probs.mean(0).argmax(1).tolist()
sub.to_csv('sub.csv', index=False)