In [1]:
import datetime
import warnings
warnings.filterwarnings('ignore')
import torch
from glob import glob
import os
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm
import numpy as np
from torchvision import transforms
import torchvision.models as models
import torch.nn as nn
from torch.nn import functional as F
from sklearn.model_selection import KFold
import time
from efficientnet_pytorch import EfficientNet
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
# from torch_poly_lr_decay import PolynomialLRDecay
import random
from torchvision import models
from sklearn.metrics import accuracy_score, log_loss
import math
import librosa


# Process-wide runtime settings: expose GPUs 0-2 to CUDA, set the number of
# threads torch uses for CPU work to 8, and choose the device that the later
# cells move models and batches onto.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2'
torch.set_num_threads(8)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [2]:
# Dataset locations. Every path hangs off one root folder; the strings built
# below are identical to spelling each path out in full.
_DATA_ROOT = "C:\\Users\\Home\\Desktop\\영어"


def _sorted_train_wavs(accent):
    """Return the sorted list of training .wav paths found for ``accent``."""
    return sorted(glob(_DATA_ROOT + "\\train\\" + accent + "\\*.wav"))


sample_submission = pd.read_csv(_DATA_ROOT + "\\sample_submission.csv")
africa_train_paths = _sorted_train_wavs("africa")
australia_train_paths = _sorted_train_wavs("australia")
canada_train_paths = _sorted_train_wavs("canada")
england_train_paths = _sorted_train_wavs("england")
hongkong_train_paths = _sorted_train_wavs("hongkong")
us_train_paths = _sorted_train_wavs("us")
# Test clips are named 1.wav .. 6100.wav.
test_paths = [_DATA_ROOT + "\\test\\" + str(number) + ".wav" for number in range(1, 6101)]

def load_data(paths, sr=16000):
    """Decode every audio file in ``paths`` with librosa at one sample rate.

    Args:
        paths: iterable of audio file paths.
        sr: sample rate handed to ``librosa.load`` (default 16000).

    Returns:
        ``np.array`` of the decoded clips when they all have the same number
        of samples (or when ``paths`` is empty); otherwise a 1-D
        ``dtype=object`` array holding one waveform array per clip.
    """
    waveforms = []
    for path in tqdm(paths):
        samples, _ = librosa.load(path, sr=sr)
        waveforms.append(samples)

    if len({len(clip) for clip in waveforms}) <= 1:
        return np.array(waveforms)

    # Clips of different lengths: np.array() on a ragged list raises
    # ValueError on NumPy >= 1.24, so fill an object array element by element.
    ragged = np.empty(len(waveforms), dtype=object)
    for position, clip in enumerate(waveforms):
        ragged[position] = clip
    return ragged




def get_feature(data, sr = 16000, n_fft = 2048, win_length = 200, hop_length = 160, n_mels = 64):
    """Compute a dB-scaled mel spectrogram for every waveform in ``data``.

    Args:
        data: iterable of waveforms; each one is passed to
            ``librosa.feature.melspectrogram`` as ``y``.
        sr, n_fft, win_length, hop_length, n_mels: forwarded unchanged to
            ``librosa.feature.melspectrogram``.

    Returns:
        The stacked spectrograms after ``librosa.power_to_db(..., ref=np.max)``.
    """
    mel = np.array([
        # `y` must be passed by keyword: librosa >= 0.10 rejects a positional waveform.
        librosa.feature.melspectrogram(y = clip, sr = sr, n_fft = n_fft, win_length = win_length,
                                       hop_length = hop_length, n_mels = n_mels)
        for clip in tqdm(data)
    ])
    # ref=np.max receives the whole stacked array, not one clip at a time.
    return librosa.power_to_db(mel, ref = np.max)


def set_length(data, d_mini):
    """Force every 1-D clip in ``data`` to exactly ``d_mini`` samples.

    Longer clips are truncated; shorter clips are right-padded with zeros.

    Args:
        data: iterable of 1-D sample sequences.
        d_mini: target number of samples per clip.

    Returns:
        ``np.array`` of the fixed-length clips.
    """
    result = []
    for value in tqdm(data):
        value = np.asarray(value)[:d_mini]
        shortfall = d_mini - len(value)
        if shortfall > 0:
            # Pad with zeros of the clip's own dtype. Appending a list of
            # Python ints would silently promote float32 audio to float64.
            value = np.concatenate([value, np.zeros(shortfall, dtype=value.dtype)])
        result.append(value)
    return np.array(result)

In [3]:
def _load_and_cache(paths, cache_file):
    """Decode ``paths`` with load_data, save the result to ``cache_file``, return it."""
    waveforms = load_data(paths)
    np.save(cache_file, waveforms)
    return waveforms


# Decode each accent's clips once and keep a .npy copy next to the notebook.
africa_train_data = _load_and_cache(africa_train_paths, "./dataset/africa-sorted.npy")
australia_train_data = _load_and_cache(australia_train_paths, "./dataset/australia-sorted.npy")
canada_train_data = _load_and_cache(canada_train_paths, "./dataset/canada-sorted.npy")
england_train_data = _load_and_cache(england_train_paths, "./dataset/england-sorted.npy")
hongkong_train_data = _load_and_cache(hongkong_train_paths, "./dataset/hongkong-sorted.npy")
us_train_data = _load_and_cache(us_train_paths, "./dataset/us-sorted.npy")

# Order matters: the label cell assigns class ids 0..5 in this same order.
train_data_list = [
    africa_train_data,
    australia_train_data,
    canada_train_data,
    england_train_data,
    hongkong_train_data,
    us_train_data,
]

test_data = _load_and_cache(test_paths, "./dataset/test_npy.npy")

In [4]:
# Stack all accents into one array of fixed-length clips (100000 samples each).
train_x = set_length(np.concatenate(train_data_list, axis=0), 100000)

# One mel-spectrogram set per STFT window length. Each gets a trailing
# singleton axis so the four can be joined as channels below.
train_x_200, train_x_400, train_x_800, train_x_1000 = (
    get_feature(data = train_x, win_length = window)[..., np.newaxis]
    for window in (200, 400, 800, 1000)
)

train_x_multi = np.concatenate(
    [train_x_200, train_x_400, train_x_800, train_x_1000],
    axis=-1,
)
np.save('C:\\Users\\Home\\Desktop\\영어\\dataset\\train_x_multi.npy', train_x_multi)


In [5]:
# Build the multi-window mel-spectrogram features for the test clips.
# NOTE(review): the raw test waveforms are written to "./dataset/test_npy.npy"
# earlier in this notebook, but are read back here from a path without the
# "dataset" folder. Confirm which location is intended.
# allow_pickle=True unpickles object arrays; only use it on files you created.
test_x = np.load('C:\\Users\\Home\\Desktop\\영어\\test_npy.npy', allow_pickle=True)
test_x = set_length(test_x, 100000)

# One mel-spectrogram set per STFT window length, each with a trailing
# singleton axis so the four can be joined as channels.
test_x_200, test_x_400, test_x_800, test_x_1000 = (
    get_feature(data = test_x, win_length = window)[..., np.newaxis]
    for window in (200, 400, 800, 1000)
)

test_x_multi = np.concatenate(
    [test_x_200, test_x_400, test_x_800, test_x_1000],
    axis=-1,
)
# Fixed: the original saved the misspelled name `test_x_mulati`, which raised
# NameError before the features were ever written.
np.save('C:\\Users\\Home\\Desktop\\영어\\dataset\\test_x_multi.npy', test_x_multi)

In [6]:
# Integer class ids 0..5, one per clip, in the accent order africa, australia,
# canada, england, hongkong, us (the order used by train_data_list).
_clips_per_accent = [
    len(africa_train_data),
    len(australia_train_data),
    len(canada_train_data),
    len(england_train_data),
    len(hongkong_train_data),
    len(us_train_data),
]
train_y = np.repeat(np.arange(len(_clips_per_accent), dtype = int), _clips_per_accent)
np.save('C:\\Users\\Home\\Desktop\\영어\\dataset\\train_y_sort.npy', train_y)


In [7]:
class conv_bn_relu(nn.Module):
    """Conv2d followed by BatchNorm2d and ReLU.

    The convolution pads by ``kernel_size // 2`` on each side.
    """

    def __init__(self, in_channels, out_channels, kernel_size):
        super().__init__()
        # Attribute names are also the state_dict key prefixes; keep them stable.
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, padding=kernel_size // 2)
        self.BN = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply convolution, batch normalisation and ReLU, in that order."""
        return self.relu(self.BN(self.conv(x)))
    

class Network(nn.Module):
    """CNN classifier taking 4-channel inputs and producing 6 logits.

    Layout: input conv -> 2x2 max-pool -> four identical conv stages
    (N -> 2N -> 4N -> 2N -> N channels) -> pooling head -> linear layer.

    Args:
        N: base channel width.
    """

    def __init__(self, N):
        super().__init__()
        self.N = N
        self.AveragePooling = nn.AvgPool2d(2)  # registered but not used by forward()
        self.MaxPooling = nn.MaxPool2d(2)

        self.input_conv = conv_bn_relu(in_channels=4, out_channels=N, kernel_size=3)

        # Four stages with the same layout; names and order fix the state_dict keys.
        self.block1 = self._stage(N)
        self.block2 = self._stage(N)
        self.block3 = self._stage(N)
        self.block4 = self._stage(N)

        self.pool_block = nn.Sequential(
            conv_bn_relu(in_channels=N, out_channels=N * 2, kernel_size=3),
            nn.Dropout(0.15),
            nn.AvgPool2d(2),
            conv_bn_relu(in_channels=N * 2, out_channels=N * 2, kernel_size=3),
            nn.Dropout(0.15),
            nn.AvgPool2d(2),
            conv_bn_relu(in_channels=N * 2, out_channels=N * 2, kernel_size=3),
            nn.Dropout(0.15),
            nn.AdaptiveAvgPool2d(1),
        )

        self.output = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=N * 2, out_features=6),
        )

    @staticmethod
    def _stage(width):
        """Build one stage: four 3x3 conv_bn_relu layers, width -> 2x -> 4x -> 2x -> width."""
        channels = (width, width * 2, width * 4, width * 2, width)
        return nn.Sequential(*[
            conv_bn_relu(in_channels=c_in, out_channels=c_out, kernel_size=3)
            for c_in, c_out in zip(channels[:-1], channels[1:])
        ])

    def forward(self, x, out=''):
        """Return the 6 logits for ``x``; with ``out='sigmoid'`` return their sigmoid."""
        x = self.MaxPooling(self.input_conv(x))
        for stage in (self.block1, self.block2, self.block3, self.block4):
            x = stage(x)
        x = self.output(self.pool_block(x))
        return F.sigmoid(x) if out == 'sigmoid' else x
    

class VoiceDatasetSimple(Dataset):
    """Dataset that min-max scales each sample and applies ``transform``.

    Args:
        X: indexable collection of feature arrays.
        y: indexable collection of integer class ids (unused when
            ``inference`` is true, so it may be None).
        transform: callable applied to each scaled sample.
        inference: when true, ``__getitem__`` returns only the sample.
        roll: when true, each training sample has a 50% chance of being
            circularly shifted along axis 1 by a random offset in [-200, 200].
        x_min, x_max: scaling bounds. When left as None the module-level
            names ``train_x_min`` / ``train_x_max`` are read at access time,
            which is what this class always did before these arguments existed.
        num_classes: length of the one-hot label vector (default 6).
    """

    def __init__(self, X, y, transform, inference=False, roll=False,
                 x_min=None, x_max=None, num_classes=6):
        self.X = X
        self.y = y
        self.transform = transform
        self.inference = inference
        self.roll = roll
        self.x_min = x_min
        self.x_max = x_max
        self.num_classes = num_classes

    def __len__(self):
        return len(self.X)

    def _bounds(self):
        """Return (min, max) used for scaling, falling back to the notebook globals."""
        lower = train_x_min if self.x_min is None else self.x_min
        upper = train_x_max if self.x_max is None else self.x_max
        return lower, upper

    def __getitem__(self, idx):
        lower, upper = self._bounds()
        X = (self.X[idx] - lower) / (upper - lower)

        if self.inference:
            return self.transform(X)

        if self.roll and random.randint(0, 1) == 1:
            X = np.roll(X, random.randint(-200, 200), axis=1)

        X = self.transform(X)
        onehot = np.zeros(self.num_classes)
        onehot[self.y[idx]] = 1.
        return X, onehot

def model_save(model, path):
    """Save ``model.state_dict()`` under the key ``'model'`` at ``path``.

    Args:
        model: module whose ``state_dict()`` is stored.
        path: destination handed to ``torch.save``. When it is a filesystem
            path, missing parent directories are created first.
    """
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            # torch.save does not create directories; without this the first
            # checkpoint write fails when the folder is missing.
            os.makedirs(parent, exist_ok=True)
    torch.save({'model': model.state_dict()}, path)

In [8]:
# Train 3 x 5 models: the data is split into three interleaved subsets
# ("person" 0..2) and each subset is trained with 5-fold stratified CV.
# The checkpoint with the lowest mean validation loss is kept per (person, fold).

# Load once. The arrays are the same for every subset, so re-reading them
# inside the loop only repeated the disk I/O.
train_x_all = np.load('C:\\Users\\Home\\Desktop\\영어\\dataset\\train_x_multi.npy')
train_y_all = np.load('C:\\Users\\Home\\Desktop\\영어\\dataset\\train_y_sort.npy')
# Global scaling bounds; VoiceDatasetSimple reads these module-level names.
train_x_min = train_x_all.min()
train_x_max = train_x_all.max()

# Checkpoints go to ./model/; create it up front so the first save cannot fail.
os.makedirs('model', exist_ok=True)

for person in range(3):
    # Every third sample starting at `person`. The trailing [:-1] drops the
    # last index, which keeps k+person inside the array for person 1 and 2.
    idx = [k+person for k in range(0, len(train_x_all), 3)][:-1]
    train_x = train_x_all[idx]
    train_y = train_y_all[idx]

    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    folds = list(skf.split(train_x, train_y))

    for fold in range(5):
        epochs = 35
        batch_size = 128
        model_name = f'network-epoch{epochs}-person({person})-fold({fold}).pth'
        train_idx, valid_idx = folds[fold]

        transform = transforms.Compose([
            transforms.ToTensor(),
        ])

        train_dataset = VoiceDatasetSimple(X=train_x[train_idx], y=train_y[train_idx], transform=transform, roll=False)
        train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

        valid_dataset = VoiceDatasetSimple(X=train_x[valid_idx], y=train_y[valid_idx], transform=transform)
        valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)

        # model
        model = Network(16).to(device)
        model = nn.DataParallel(model, device_ids=[0])

        # optimizer and per-batch cosine schedule
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        # T_max is one seventh of (approximately) the total number of batches,
        # and lrs.step() runs after every batch, so the schedule passes T_max
        # several times during training.
        # NOTE(review): confirm the resulting repeated decay/rise is intended.
        Q = math.floor(len(train_dataset)/batch_size+1)*epochs/7
        lrs = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=Q)

        # loss
        criterion = nn.CrossEntropyLoss()

        best = 9999
        for epoch in tqdm(range(epochs)):
            start = time.time()

            # ---- training pass ----
            model.train()
            train_loss = 0
            train_pred_list = []
            train_true_list = []
            for X, y in train_loader:
                # Batches arrive as tensors already; .to() converts dtype and
                # device without the extra copy-construct of torch.tensor(tensor).
                X = X.to(device=device, dtype=torch.float32)
                y = y.to(device=device, dtype=torch.float32)
                target = y.argmax(1)  # labels are one-hot; the loss wants class ids

                optimizer.zero_grad()
                pred = model(X)
                loss = criterion(pred, target)
                loss.backward()
                optimizer.step()
                lrs.step()

                train_pred_list += F.softmax(pred, dim=1).argmax(1).tolist()
                train_true_list += target.tolist()
                train_loss += loss.item()
            train_accuracy = accuracy_score(train_true_list, train_pred_list)

            # ---- validation pass ----
            model.eval()
            valid_loss = 0
            valid_pred_list = []
            valid_true_list = []
            with torch.no_grad():
                for X, y in valid_loader:
                    X = X.to(device=device, dtype=torch.float32)
                    y = y.to(device=device, dtype=torch.float32)
                    target = y.argmax(1)

                    pred = model(X)
                    loss = criterion(pred, target)

                    valid_pred_list += F.softmax(pred, dim=1).argmax(1).tolist()
                    valid_true_list += target.tolist()
                    valid_loss += loss.item()
            valid_accuracy = accuracy_score(valid_true_list, valid_pred_list)

            # Keep only the checkpoint with the best mean validation loss so far.
            if valid_loss/len(valid_loader) < best:
                model_save(model, f'model/{model_name}')
                best = valid_loss/len(valid_loader)

            print(f'===================== Epoch : {epoch+1}/{epochs}    time : {time.time()-start:.0f}s =====================')
            print(f'TRAIN -> loss : {train_loss/len(train_loader):.5f}     accuracy : {train_accuracy:.5f}')
            print(f'VALID -> loss : {valid_loss/len(valid_loader):.5f}     accuracy : {valid_accuracy:.5f}    best : {best:.5f}\n\n')

In [13]:
# Ensemble inference: sum the softmax outputs of all 3 x 5 saved checkpoints.
# This cell reuses `transform`, `epochs`, `model`, `train_x_min` and
# `train_x_max` left behind by the training cell, so that cell must run first.
X_test = np.load('C:\\Users\\Home\\Desktop\\영어\\dataset\\test_x_multi.npy')

test_dataset = VoiceDatasetSimple(X=X_test, y=None, transform=transform, inference=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=30, shuffle=False)

result = 0
with torch.no_grad():
    for person in range(3):
        for fold in range(5):
            # map_location lets checkpoints saved on a GPU load on whatever `device` is.
            weights = torch.load(f'model/network-epoch{epochs}-person({person})-fold({fold}).pth',
                                 map_location=device)
            model.load_state_dict(weights['model'])
            model.eval()
            preds = []
            for X in tqdm(test_loader):
                X = X.to(device=device, dtype=torch.float32)
                # Explicit dim=1 (class axis) instead of the deprecated implicit choice.
                preds += F.softmax(model(X), dim=1).cpu().numpy().tolist()
            preds = np.array(preds)
            result += preds

100%|██████████| 204/204 [00:15<00:00, 13.31it/s]
100%|██████████| 204/204 [00:08<00:00, 24.97it/s]
100%|██████████| 204/204 [00:08<00:00, 24.81it/s]
100%|██████████| 204/204 [00:08<00:00, 24.70it/s]
100%|██████████| 204/204 [00:08<00:00, 23.09it/s]
100%|██████████| 204/204 [00:08<00:00, 24.91it/s]
100%|██████████| 204/204 [00:08<00:00, 24.93it/s]
100%|██████████| 204/204 [00:08<00:00, 25.14it/s]
100%|██████████| 204/204 [00:08<00:00, 24.86it/s]
100%|██████████| 204/204 [00:09<00:00, 22.16it/s]
100%|██████████| 204/204 [00:08<00:00, 24.58it/s]
100%|██████████| 204/204 [00:08<00:00, 24.77it/s]
100%|██████████| 204/204 [00:08<00:00, 24.84it/s]
100%|██████████| 204/204 [00:08<00:00, 24.65it/s]
100%|██████████| 204/204 [00:08<00:00, 25.12it/s]


In [14]:
# Average the 15 summed softmax outputs (3 subsets x 5 folds) and write them
# into every column after the first of the sample submission.
submission = pd.read_csv('C:\\Users\\Home\\Desktop\\영어\\sample_submission.csv')
mean_probabilities = result/15
submission.iloc[:, 1:] = mean_probabilities
submission.to_csv('submission.csv', index=False)