<a href="https://colab.research.google.com/github/ssuxmin/tmp/blob/main/%5BBaseline%5D_MFCC_%EA%B8%B0%EB%B0%98_Feature_%EC%B6%94%EC%B6%9C_%2B_Decision_Tree%EB%A5%BC_%ED%99%9C%EC%9A%A9%ED%95%9C_%EB%B6%84%EB%A5%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import

In [35]:
import random
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
import librosa

from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings(action='ignore') 

## Hyperparameter Setting

In [36]:
# Global settings shared by the feature-extraction and model cells.
CFG = dict(
    SR=16000,   # sampling rate passed to librosa.load
    N_MFCC=32,  # number of MFCC coefficients requested per clip (one feature column each)
    SEED=42,    # seed used by seed_everything and as the classifier's random_state
)

## Fix the Random Seed

In [37]:
def seed_everything(seed):
    """Seed the random number sources this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here likely only affects child processes -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

seed_everything(CFG['SEED']) # Fix the random seed

## Data Pre-Processing

In [38]:
# Mount Google Drive inside the Colab VM at /content/drive; a later cell
# changes the working directory to a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
# Load the train/test metadata tables (id and path columns, plus label for train).
# NOTE(review): these absolute /content paths differ from the Drive folder the
# notebook changes into afterwards -- confirm the CSVs really live in /content.
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

In [54]:
# Switch the working directory to the data folder on Drive. The relative wav
# paths in the `path` column (./train/..., ./test/...) and the relative
# ./sample_submission.csv read later are resolved against this directory.
%cd /content/drive/MyDrive/Colab Notebooks/open/

/content/drive/MyDrive/Colab Notebooks/open


In [55]:
pwd

'/content/drive/MyDrive/Colab Notebooks/open'

In [56]:
# Inspect the training metadata: id, relative wav path and class label.
train_df

Unnamed: 0,id,path,label
0,TRAIN_0000,./train/TRAIN_0000.wav,1
1,TRAIN_0001,./train/TRAIN_0001.wav,2
2,TRAIN_0002,./train/TRAIN_0002.wav,4
3,TRAIN_0003,./train/TRAIN_0003.wav,5
4,TRAIN_0004,./train/TRAIN_0004.wav,4
...,...,...,...
4996,TRAIN_4996,./train/TRAIN_4996.wav,5
4997,TRAIN_4997,./train/TRAIN_4997.wav,0
4998,TRAIN_4998,./train/TRAIN_4998.wav,1
4999,TRAIN_4999,./train/TRAIN_4999.wav,1


In [57]:
# Inspect the test metadata: id and relative wav path (no label column).
test_df

Unnamed: 0,id,path
0,TEST_0000,./test/TEST_0000.wav
1,TEST_0001,./test/TEST_0001.wav
2,TEST_0002,./test/TEST_0002.wav
3,TEST_0003,./test/TEST_0003.wav
4,TEST_0004,./test/TEST_0004.wav
...,...,...
1876,TEST_1876,./test/TEST_1876.wav
1877,TEST_1877,./test/TEST_1877.wav
1878,TEST_1878,./test/TEST_1878.wav
1879,TEST_1879,./test/TEST_1879.wav


In [58]:
def get_mfcc_feature(df):
    """Turn every audio file listed in ``df['path']`` into mean-MFCC features.

    Each file is loaded with librosa at the sampling rate ``CFG['SR']``, its
    MFCC array is computed with ``CFG['N_MFCC']`` coefficients, and each row of
    that array is reduced to its mean.

    Args:
        df: DataFrame with a ``path`` column of audio file paths. Paths are
            passed to ``librosa.load`` unchanged, so relative paths are
            resolved against the current working directory.

    Returns:
        A DataFrame with one row per entry of ``df['path']`` (in the same
        order) and columns ``mfcc_1`` ... ``mfcc_<N_MFCC>``.
    """
    n_mfcc = CFG['N_MFCC']
    rows = []
    for wav_path in tqdm(df['path']):
        # Load the wav file, resampled to the configured rate.
        signal, rate = librosa.load(wav_path, sr=CFG['SR'])
        # Extract the MFCC array for the whole clip.
        coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)
        # Summarise each row of the MFCC array by its mean value.
        rows.append([np.mean(track) for track in coeffs])

    column_names = ['mfcc_' + str(idx) for idx in range(1, n_mfcc + 1)]
    return pd.DataFrame(rows, columns=column_names)

In [59]:
# Build one row of mean-MFCC features per audio file for both splits.
# The `path` values are relative (./train/..., ./test/...), so the working
# directory must already be the data folder when this cell runs.
train_x = get_mfcc_feature(train_df)
test_x = get_mfcc_feature(test_df)

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/1881 [00:00<?, ?it/s]

In [53]:
pwd

'/content'

In [60]:
# Training target: the `label` column, row-aligned with train_x.
train_y = train_df['label']

## Classification Model Fit

In [61]:
# Fit a decision tree on the mean-MFCC features. Only random_state is set
# (to the shared seed); every other hyperparameter keeps its library default.
model = DecisionTreeClassifier(random_state=CFG['SEED'])
model.fit(train_x, train_y)

## Inference

In [62]:
# Predict a class label for each row of the test feature table.
preds = model.predict(test_x)

## Submission

In [63]:
# Fill the submission template's `label` column with the predictions and save it.
# Both paths are relative, so they depend on the current working directory.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test_df -- confirm before submitting.
submission = pd.read_csv('./sample_submission.csv')
submission['label'] = preds
# index=False keeps the DataFrame index out of the written CSV.
submission.to_csv('./baseline_submission.csv', index=False)