## Import dataset & Kaggle API

Si utilizzano le API di Kaggle per scaricare direttamente il dataset in colab e successivamente viene copiato il file .zip sulla cartella drive condivisa 

In [None]:
# Mount Google Drive under /content/gdrive/ so that files can be copied to and from it
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d toponowicz/spoken-language-identification

In [None]:
# Copy the downloaded archive into the Drive folder used by the project
!cp spoken-language-identification.zip '/content/gdrive/MyDrive/DSIM/audio_classification'

Downloading spoken-language-identification.zip to /content
100% 14.9G/14.9G [04:09<00:00, 52.2MB/s]
100% 14.9G/14.9G [04:09<00:00, 64.3MB/s]


# Libraries and Functions

In [None]:
import os
import random
import shutil
import zipfile
from datetime import datetime as dt

import cv2
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
from keras.layers import Dense, Dropout, Activation
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import to_categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

Dopo aver importato le librerie necessarie per l'analisi, si procede copiando in locale il file .zip precedentemente salvato su Drive ed estraendone il contenuto.

In [None]:
# Copy the dataset archive from Drive into the local working directory
shutil.copyfile('/content/gdrive/MyDrive/DSIM/audio_classification/spoken-language-identification.zip', 
                'spoken-language-identification.zip')

'spoken-language-identification.zip'

In [None]:
# Extract the dataset archive into the working directory.
# The context manager closes the archive even if the extraction fails.
with zipfile.ZipFile('spoken-language-identification.zip') as spoken_lang_zipped:
    spoken_lang_zipped.extractall()

Si procede quindi allo sviluppo del data loader (estrazione dei file .flac, divisione nelle partizioni di train e test e normalizzazione) e alla costruzione delle funzioni per estrarre e concatenare le caratteristiche audio che andranno a costituire il dataset su cui verranno addestrati i modelli. I dati erano inizialmente divisi in due cartelle (train e test) ma, per esigenze computazionali, abbiamo creato le partizioni dividendo direttamente la cartella di train, secondo una percentuale di circa 80%-10%-10% 

In [None]:
# mean of the absolute values of the signal
def aavg(input):
    """Return the mean of the absolute values of `input`, keeping the reduced dimensions."""
    magnitudes = np.abs(input)
    return np.mean(magnitudes, keepdims=True)

# standard deviation of the signal values
def sdev(input):
    """Return the standard deviation of `input`, keeping the reduced dimensions."""
    spread = np.std(input, keepdims=True)
    return spread

# energy of the signal
def energy(input):
    """Return the sum of the squared samples of `input`, keeping the reduced dimensions."""
    # multiplying by 1.0 promotes integer samples to float before squaring
    as_float = input * 1.0
    return np.sum(as_float * as_float, keepdims=True)

# Mel-frequency cepstral coefficients
def mfcc(input, rate=22050, sampling=5):
    """Return the flattened MFCC matrix of a decimated copy of `input`.

    Parameters
    ----------
    input : array of audio samples.
    rate : sample rate attributed to `input`; the rate passed to librosa is
        ``int(rate / sampling)``.
    sampling : decimation step; only every `sampling`-th sample is kept.

    Returns
    -------
    One-dimensional array holding the MFCC coefficients of all frames.
    """
    # Keep one sample out of every `sampling` (plain slicing, no low-pass filtering)
    signal = input[::sampling]
    # Compute MFCC coefficients; the signal is passed by keyword because recent
    # librosa releases no longer accept it positionally
    coefficients = librosa.feature.mfcc(y=signal*1.0, sr=int(rate/sampling))
    # Flatten into a one-dimensional vector for the classifiers
    return coefficients.flatten()

# concatenation of the features computed by the functions above
def combo(input):
    """Return one feature vector: mean absolute value, standard deviation, energy, then MFCCs."""
    parts = (aavg(input), sdev(input), energy(input), mfcc(input))
    return np.concatenate(parts)

In [None]:
# no feature extraction
def identity(input):
    """Return `input` unchanged (pass-through feature extractor)."""
    return input

# Data loader
def load_data(feature_extractor=identity, normalize=False, data_dir='./train/train'):
  """Load the .flac files of `data_dir`, extract their features and split them.

  Parameters
  ----------
  feature_extractor : callable applied to the samples of each audio file.
  normalize : when True, the three partitions are standardized with the
      per-feature mean and standard deviation of the training partition.
  data_dir : directory containing the audio files.

  Returns
  -------
  X_train, X_test, X_val, y_train, y_test, y_val, X_train_mean, X_train_std
      The feature/label partitions followed by the training statistics.
      The statistics are None when `normalize` is False.
  """
  labels = []
  features = []

  # iterate over the directory entries in sorted order
  for i, f in enumerate(sorted(os.listdir(data_dir)), start=1):
    # progress indicator: one line every 1000 directory entries
    if i % 1000 == 0:
      print(i)

    if not f.endswith('.flac'):
      continue

    # load the file (the sample rate returned by soundfile is not used) and extract the features
    data, _ = sf.read(os.path.join(data_dir, f))
    features.append(feature_extractor(data))

    # the label is the part of the file name before the first underscore
    labels.append(f.split('_')[0])

  # train - validation - test split: 10% test, then 10% of the remainder for validation
  X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=1)
  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

  # defined up-front so that the return statement also works when normalize is False
  X_train_mean = None
  X_train_std = None

  # standardization
  if normalize:
    eps = 0.001
    train_matrix = np.array(X_train)
    X_train_mean = train_matrix.mean(axis=0)
    X_train_std = train_matrix.std(axis=0)

    # NOTE: eps is added to the numerator as well as to the denominator; the formula is
    # kept unchanged because load_test applies the same one to the scoring data
    def standardize(rows):
      return [row for row in (np.array(rows) - X_train_mean + eps)/(X_train_std + eps)]

    X_train = standardize(X_train)
    X_test = standardize(X_test)
    X_val = standardize(X_val)

  return X_train, X_test, X_val, y_train, y_test, y_val, X_train_mean, X_train_std

# Train-Test Split

Viene richiamata la funzione per le partizioni, salvato il file compresso e infine caricato.

In [None]:
# Extract the combined features for every training file, with standardization enabled
X_train, X_test, X_val, y_train, y_test, y_val, X_train_mean, X_train_std = load_data(feature_extractor=combo, normalize=True)

# Store the partitions and the training statistics in a single compressed .npz file
np.savez_compressed('features_audio_classification', X_train = X_train, X_test = X_test,
                    X_val = X_val, y_train = y_train, y_test = y_test, y_val = y_val,
                    X_train_mean = X_train_mean, X_train_std=X_train_std )

# Copy the feature file to the Drive folder
!cp features_audio_classification.npz '/content/gdrive/MyDrive/DSIM/audio_classification'

In [None]:
# Re-load the saved features from Drive and check the type of one of the stored entries
loaded = np.load('/content/gdrive/MyDrive/DSIM/audio_classification/features_audio_classification.npz')
type(loaded['X_train'])

numpy.ndarray

In [None]:
# Bind every array stored in the .npz archive to the variable of the same name
(X_train, y_train, X_test, y_test,
 X_val, y_val, X_train_mean, X_train_std) = (
    loaded[key] for key in ('X_train', 'y_train', 'X_test', 'y_test',
                            'X_val', 'y_val', 'X_train_mean', 'X_train_std')
)

Si procede con la classificazione dei file audio.

# Random Forest

In [None]:
# Random Forest with default hyper-parameters.
# random_state makes the fitted forest (and the scores below) reproducible across runs;
# n_jobs=-1 only parallelizes the computation.
rf = RandomForestClassifier(random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print('Classification report:')
print(classification_report(y_test, y_pred))

print('Confusion matrix:')
cm = confusion_matrix(y_test, y_pred)
print(cm)

Classification report:
              precision    recall  f1-score   support

          de       0.81      0.89      0.85      2468
          en       0.86      0.84      0.85      2321
          es       0.89      0.83      0.86      2519

    accuracy                           0.85      7308
   macro avg       0.85      0.85      0.85      7308
weighted avg       0.85      0.85      0.85      7308

Confusion matrix:
[[2189  137  142]
 [ 267 1942  112]
 [ 253  171 2095]]


# Neural Network

In [None]:
# one hot encoding del target
le = LabelEncoder()
y_train_transformed = to_categorical(le.fit_transform(y_train))
y_val_transformed = to_categorical(le.fit_transform(y_val))

In [None]:
# Display the one-hot encoded training labels
y_train_transformed

array([[0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]], dtype=float32)

In [None]:
# fully connected ANN: two dense layers, each followed by a dropout layer
# The input and output sizes are read from the data instead of being hard-coded,
# so the cell keeps working if the feature extractor or the label set changes.
num_features = np.shape(X_train)[1]
num_classes = y_train_transformed.shape[1]

model = Sequential()
model.add(Dense(256, input_dim=num_features, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# compile & fit
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

num_epochs = 20
num_batch_size = 32
model.fit(X_train, y_train_transformed, batch_size=num_batch_size,
          epochs=num_epochs, validation_data=(X_val, y_val_transformed), verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f51c6f0d470>

In [None]:
# ANN predictions
# Sequential.predict_classes is no longer available in recent TensorFlow/Keras releases:
# take the arg-max of the softmax output instead.
nn_pred = np.argmax(model.predict(X_test), axis=-1)
# map the class indices back to the language codes with the encoder fitted on the labels
nn_pred = le.inverse_transform(nn_pred)



In [None]:
# ANN performance on the test partition
print('Classification report:', classification_report(y_test, nn_pred), sep='\n')

cm = confusion_matrix(y_test, nn_pred)
print('Confusion matrix:', cm, sep='\n')

Classification report:
              precision    recall  f1-score   support

          de       0.96      0.90      0.93      2468
          en       0.96      0.90      0.93      2321
          es       0.87      0.97      0.92      2519

    accuracy                           0.92      7308
   macro avg       0.93      0.92      0.92      7308
weighted avg       0.93      0.92      0.92      7308

Confusion matrix:
[[2220   45  203]
 [  67 2079  175]
 [  33   39 2447]]


In [None]:
# save the model
# remove a previous export if there is one; ignore_errors avoids a FileNotFoundError
# when the 'nnet' directory does not exist yet (first run)
shutil.rmtree('nnet', ignore_errors=True)
model.save("nnet/nnet")
print("Saved model to disk")

INFO:tensorflow:Assets written to: nnet/nnet/assets
Saved model to disk


In [None]:
# Pack the contents of the 'nnet' directory into nnet.zip
shutil.make_archive('nnet', 'zip', 'nnet')

'/content/nnet.zip'

# Score

Poiché i modelli hanno portato a risultati notevoli, in particolare l'approccio neurale raggiunge il 92% di accuratezza, si procede con lo score sull'intera cartella inizialmente destinata al test secondo la competizione Kaggle. 

In [None]:
# data loader for the test set
def load_test(X_train_mean, X_train_std, feature_extractor=combo, data_dir='./test/test'):
  """Load the .flac files of `data_dir` and standardize their features.

  Parameters
  ----------
  X_train_mean, X_train_std : per-feature statistics of the training partition,
      used to standardize the extracted features.
  feature_extractor : callable applied to the samples of each audio file.
  data_dir : directory containing the audio files.

  Returns
  -------
  features, labels
      List of standardized feature rows and list of labels, in sorted file-name order.
  """
  labels = []
  features = []
  eps = 0.001

  # iterate over the files in sorted order
  for f in sorted(os.listdir(data_dir)):
    if not f.endswith('.flac'):
      continue

    # load the audio (the sample rate returned by soundfile is not used) and extract the features
    data, _ = sf.read(os.path.join(data_dir, f))
    features.append(feature_extractor(data))

    # the label is the part of the file name before the first underscore
    labels.append(f.split('_')[0])

  # standardization, with the same formula (eps in numerator and denominator) used by load_data
  features = [row for row in (np.array(features) - X_train_mean + eps)/(X_train_std + eps)]

  return features, labels

In [None]:
# Extract and standardize the features of the files in the test folder
features, labels  =  load_test(X_train_mean, X_train_std, feature_extractor =combo)

In [None]:
# output labels ANN
predict = model.predict_classes(np.array(features))
print(predict)

In [None]:
# codifica output labels
nn_pred = pd.Series(predict).map({0:'de', 1:'en', 2:'es'})
nn_pred = np.array(nn_pred)

In [None]:
# CF matrix
confusion_matrix(labels, nn_pred)

array([[ 99,  21,  60],
       [ 30,  83,  67],
       [ 54,   2, 124]])

In [None]:
# performances sul test set
print(classification_report(labels, nn_pred))

              precision    recall  f1-score   support

          de       0.54      0.55      0.55       180
          en       0.78      0.46      0.58       180
          es       0.49      0.69      0.58       180

    accuracy                           0.57       540
   macro avg       0.61      0.57      0.57       540
weighted avg       0.61      0.57      0.57       540



In [None]:
# predicted labels RF
predict = rf.predict(np.array(features))
print(predict)

In [None]:
# performances sul test set di RF
print(confusion_matrix(labels, predict))
print(classification_report(labels, predict))

[[113  35  32]
 [101  39  40]
 [ 96   8  76]]
              precision    recall  f1-score   support

          de       0.36      0.63      0.46       180
          en       0.48      0.22      0.30       180
          es       0.51      0.42      0.46       180

    accuracy                           0.42       540
   macro avg       0.45      0.42      0.41       540
weighted avg       0.45      0.42      0.41       540



# Spettrogramma

In [None]:
from scipy.io import wavfile as wav
import librosa
import librosa.display
from matplotlib import pyplot as plt

from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.preprocessing import image 
from tensorflow.keras import backend as K
from tensorflow.keras import applications
from tensorflow.keras import utils
from tensorflow.keras import regularizers
from tensorflow.keras import optimizers
from tensorflow.keras import callbacks 
from tensorflow.keras.models import load_model
import tensorflow
import numpy as np

Si costruisce la funzione per creare lo spettrogramma del file audio. 

In [None]:
def get_spectrogram(path, output_shape = (224, 224)):
  """Return the dB-scaled STFT of an audio file as a 3-channel uint8 image.

  Parameters
  ----------
  path : path of the audio file, read with soundfile.
  output_shape : size passed to cv2.resize (OpenCV interprets it as (width, height)).

  Returns
  -------
  uint8 array with three identical channels, resized to `output_shape`.
  """
  data, _ = sf.read(path)
  # take the magnitude explicitly: amplitude_to_db expects a non-complex input
  # and would otherwise discard the phase with a warning
  magnitude = np.abs(librosa.stft(data.astype(float)))
  audio_stft = librosa.amplitude_to_db(magnitude)
  # shift so that the minimum is exactly 0, whatever its sign
  scaled_stft = audio_stft - np.min(audio_stft)
  peak = np.max(scaled_stft)
  if peak > 0:
    image = scaled_stft/peak*255
  else:
    # constant spectrogram: avoid 0/0 and return a black image
    image = np.zeros_like(scaled_stft)
  # replicate the single channel three times to obtain an RGB-like image
  image = np.repeat(np.expand_dims(image, 2), 3, 2).astype('uint8')
  image = cv2.resize(image, output_shape)
  return image

In [None]:
def build_model(pre_trained, cut, optimizer, regularization = False):
  """Build a classifier from the layers of `pre_trained` that follow the layer named `cut`.

  The new model takes as input the activations of the `cut` layer, applies the
  remaining layers of `pre_trained`, then a 0.5 dropout and a 3-unit softmax layer.

  Parameters
  ----------
  pre_trained : Keras model whose layers are re-used.
  cut : name of the layer after which the architecture is cut.
  optimizer : optimizer passed to `compile`.
  regularization : accepted for backward compatibility; currently not used.

  Returns
  -------
  The compiled Keras model (categorical cross-entropy loss, accuracy metric).

  Raises
  ------
  ValueError if no layer of `pre_trained` is named `cut`.
  """
  # list of the layer names, used to locate the cut point
  layer_names = [layer.name for layer in pre_trained.layers]
  layer_idx = layer_names.index(cut) + 1

  # the input shape is the output shape of the cut layer (batch axis dropped);
  # reading it from the model avoids depending on a global feature array
  cut_output_shape = tuple(pre_trained.layers[layer_idx - 1].output.shape[1:])
  inputs = layers.Input(shape = cut_output_shape)
  x = inputs
  for layer in pre_trained.layers[layer_idx:]:
    x = layer(x)

  x = layers.Dropout(0.5)(x)
  x = layers.Dense(3, activation='softmax')(x)

  model = Model(inputs = inputs, outputs = x)
  model.compile(
      loss = 'categorical_crossentropy',
      optimizer = optimizer,
      metrics = ["accuracy"]
      )

  return model

In [None]:
# Relative paths of every entry in the train and test folders
train_paths = ['train/train/' + x for x in os.listdir('train/train')]
test_paths = ['test/test/' + x for x in os.listdir('test/test')]

Si salvano le immagini degli spettrogrammi con le rispettive etichette.

In [None]:
# start from a clean output directory
if os.path.isdir('spectro_images'):
  shutil.rmtree('spectro_images')

t1 = dt.now()
for path in sorted(train_paths + test_paths):
  # only audio files can be converted into spectrograms
  if not path.endswith('.flac'):
    continue

  split_slash = path.split('/')
  fname = split_slash[-1]
  # the label is the part of the file name before the first underscore
  label = fname.split('_')[0]
  # images are stored as spectro_images/spectro_images/<train|test>/<label>/<name>.jpg
  output_folder = '/'.join(['spectro_images/spectro_images', split_slash[0], label])
  output_fname = output_folder + '/' + os.path.splitext(fname)[0] + '.jpg'

  os.makedirs(output_folder, exist_ok=True)

  # NOTE: the result is deliberately not called `image`, which is the name of the
  # tensorflow.keras.preprocessing module imported in this notebook and used later on
  spectrogram = get_spectrogram(path)
  cv2.imwrite(output_fname, spectrogram)
t2 = dt.now()

# archive the images and store the archive on Drive
shutil.make_archive('spectro_images', 'zip', 'spectro_images')
shutil.copyfile('spectro_images.zip',
                '/content/gdrive/MyDrive/DSIM/audio_classification/spectro_images.zip')

In [None]:
# Copy the spectrogram archive from Drive into the local working directory
shutil.copyfile('/content/gdrive/MyDrive/DSIM/audio_classification/spectro_images.zip',
                'spectro_images.zip')

'spectro_images.zip'

In [None]:
# Extract the spectrogram archive into the working directory.
# The context manager closes the archive even if the extraction fails.
with zipfile.ZipFile('spectro_images.zip') as spectro_zip:
    spectro_zip.extractall()

In [None]:
# Data Generator
processing = image.ImageDataGenerator(
      preprocessing_function = applications.mobilenet.preprocess_input,
      validation_split = 0.2
      )

train_generator = processing.flow_from_directory(
      directory = '/content/spectro_images/train',
      target_size = (128, 128), 
      color_mode = 'rgb', 
      batch_size = 16,
      class_mode = 'categorical',
      shuffle = False,
      subset = 'training',
      seed = 3
      )
valid_generator = processing.flow_from_directory(
      directory = '/content/spectro_images/train',
      target_size = (128, 128), 
      color_mode = 'rgb', 
      batch_size = 16,
      class_mode = 'categorical',
      shuffle = False,
      subset = 'validation',
      seed = 3
      )

Found 58464 images belonging to 3 classes.
Found 14616 images belonging to 3 classes.


In [None]:
# Set the Keras backend default float type to float16
# NOTE(review): a global float16 type may hurt numerical stability during training — confirm this is intended
K.set_floatx('float16')

In [None]:
# Import mobilenet
mobilenet = applications.MobileNet((128, 128,3),include_top=False, pooling='avg')
mobilenet.summary()

Model: "mobilenet_1.00_128"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128, 128, 3)]     0         
_________________________________________________________________
conv1 (Conv2D)               (None, 64, 64, 32)        864       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 64, 64, 32)        128       
_________________________________________________________________
conv1_relu (ReLU)            (None, 64, 64, 32)        0         
_________________________________________________________________
conv_dw_1 (DepthwiseConv2D)  (None, 64, 64, 32)        288       
_________________________________________________________________
conv_dw_1_bn (BatchNormaliza (None, 64, 64, 32)        128       
_________________________________________________________________
conv_dw_1_relu (ReLU)        (None, 64, 64, 32) 

In [None]:
# Taglio dell'architettura
cut = 'conv_dw_12'
mobilenet_pre_trained = Model(mobilenet.input, outputs = mobilenet.get_layer(cut).output)

In [None]:
# Display the current Keras default float type
K.floatx()

'float16'

In [None]:
# mescolo le obs di training
train_obs = len(train_generator.filenames)
shuffle_train = np.random.RandomState(seed = 42).permutation(train_obs)

X_train = mobilenet_pre_trained.predict(train_generator)[shuffle_train]
X_val = mobilenet_pre_trained.predict(valid_generator)

In [None]:
# output labels
y_train = utils.to_categorical(np.reshape(train_generator.classes[shuffle_train], -1), 3)
y_valid = utils.to_categorical(np.reshape(valid_generator.classes, -1), 3)

In [None]:
# Build the classifier from the MobileNet layers that follow the `cut` layer.
# The full `mobilenet` model is passed here; build_model re-uses its layers after `cut`.
# NOTE(review): build_model does not currently act on `regularization`.
base_model = build_model(
    mobilenet,
    cut = cut,
    optimizer = 'adamax',
    regularization = True
    )
base_model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 4, 4, 512)]       0         
_________________________________________________________________
conv_dw_12_bn (BatchNormaliz (None, 4, 4, 512)         2048      
_________________________________________________________________
conv_dw_12_relu (ReLU)       (None, 4, 4, 512)         0         
_________________________________________________________________
conv_pw_12 (Conv2D)          (None, 4, 4, 1024)        524288    
_________________________________________________________________
conv_pw_12_bn (BatchNormaliz (None, 4, 4, 1024)        4096      
_________________________________________________________________
conv_pw_12_relu (ReLU)       (None, 4, 4, 1024)        0         
_________________________________________________________________
conv_dw_13 (DepthwiseConv2D) (None, 4, 4, 1024)        9216

In [None]:
# Train on the extracted activations, validating on the held-out activations
base_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_valid),
    epochs = 20,
    batch_size = 32
    )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

KeyboardInterrupt: ignored

Dopo diversi tentativi questo approccio è stato abbandonato in quanto non si ottenevano i risultati sperati.