<a href="https://colab.research.google.com/github/mehdi-nait/ENSIM_AI_Lab/blob/master/Audio_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature extraction

## Feature exploration

In [None]:
# Extract both dataset archives with IPython shell escapes; the spaces in the
# archive names are backslash-escaped for the shell.
!unzip Dataset\ part\ 1.zip
!unzip Dataset\ part\ 2.zip

In [None]:
import pandas as pd
import librosa as lr
import numpy as np
import librosa.display
import matplotlib.pyplot as plt
import pywt
import random
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from torch.utils.data import DataLoader,TensorDataset,Dataset
import os
from torchvision import transforms
!pip install wandb
import wandb

In [None]:
# Folder that holds the audio clips together with the 0Info.txt index file.
train_path = "Dataset/"

# Load the index file into a dataframe and preview its first rows.
# NOTE(review): read_csv treats the first line as a header by default -
# confirm 0Info.txt really starts with a header row.
df = pd.read_csv(f"{train_path}0Info.txt")
df.head()

In [None]:
# Give the columns explicit names so later cells can query them by name.
column_names = ["filename", "link", "timestamp", "lang"]
df.columns = column_names
df.head()

In [None]:
# Load the first clip listed in the index file and plot its waveform.
filename = train_path+df["filename"][0]
sr = 16000

# `sr` is passed by keyword: recent librosa releases make it keyword-only,
# and the keyword form also works on older releases.
x,freq = lr.load(filename,sr=sr)
print (" The duration of FR_001 .wav in seconds :",len (x)/ freq )
plt.plot(x)
plt.show()

In [None]:
# 40 MFCCs of the loaded clip; the signal is passed as `y=` because recent
# librosa releases accept feature inputs by keyword only.
x_mfcc = lr.feature.mfcc(y=x,sr=freq,n_mfcc=40)
print(x_mfcc.shape)
plt.plot(x_mfcc)
plt.show()

In [None]:
# Spectral centroid of the loaded clip. The sampling rate is passed explicitly:
# without it librosa assumes its default rate instead of the 16 kHz used at
# load time, which mis-scales the centroid frequencies.
s_c = lr.feature.spectral_centroid(y=x,sr=sr)
print(s_c.shape)
plt.plot(s_c[0])
plt.show()

In [None]:
# Spectral roll-off of the loaded clip; the signal is passed as `y=` because
# recent librosa releases accept feature inputs by keyword only.
spect_roll = lr.feature.spectral_rolloff(y=x,sr=sr)
print(spect_roll.shape)
plt.plot(spect_roll[0])
plt.show()

In [None]:
# Chromagram of the loaded clip.
chroma=lr.feature.chroma_stft(y=x, sr=sr)
print(chroma.shape)
# Give specshow the real sampling rate so the time axis is not laid out with
# its default rate.
librosa.display.specshow(chroma, y_axis='chroma', x_axis='time', sr=sr)
plt.colorbar()
plt.title('Chromagram')
plt.tight_layout()
plt.show()

In [None]:
# RMS energy computed from the magnitude spectrogram of the loaded clip.
S = lr.magphase(lr.stft(x, window=np.ones, center=False))[0]
RMSEn= lr.feature.rms(S=S)
# Report the shape of the RMS feature itself (the chroma shape was printed here
# by mistake).
print(RMSEn.shape)
plt.plot(RMSEn[0])
plt.show()

## Dataset preparation helper functions

In [None]:
def feature_extractor(audio_file_dir):
  """Summarise one audio file as a fixed-length MFCC feature vector.

  The file is loaded at 16 kHz, 20 MFCCs are computed, and each coefficient
  is reduced over axis 1 to its mean and its variance.

  Args:
    audio_file_dir: path of the audio file to load.

  Returns:
    A list holding the 20 means followed by the 20 variances.
  """

  SAMPLE_RATE = 16000
  # Keyword arguments are required by recent librosa releases (positional
  # `sr` / signal arguments are rejected) and also work on older ones.
  x,freq = lr.load(audio_file_dir,sr=SAMPLE_RATE)

  mfcc = lr.feature.mfcc(y=x,sr=freq,n_mfcc=20)
  mean_mfccs = np.mean(mfcc,axis=1)
  var_mfccs = np.var(mfcc,axis=1)

  return list(mean_mfccs)+list(var_mfccs)

In [None]:
def trim_audio(X,freq,duration):
  """Force a signal to last exactly `duration` seconds at rate `freq`.

  Longer signals are cut after the target number of samples; shorter ones
  are zero-padded at the FRONT so the original samples stay at the end.

  Args:
    X: 1-D sequence of samples.
    freq: sampling rate in samples per second.
    duration: target duration in seconds (may be fractional).

  Returns:
    The first `freq*duration` samples of X when X is long enough, otherwise
    a numpy array of that length made of leading zeros followed by X.
  """

  # The sample count must be an integer: a float here (fractional duration)
  # would make the slice below raise TypeError.
  target_sample_len = int(round(freq*duration))
  current_len = len(X)

  if current_len >= target_sample_len:
    return X[:target_sample_len]

  pad_len = target_sample_len-current_len
  # Same result as pywt.pad(X, (pad_len, 0), "zero"): zeros before the signal.
  return np.pad(X,(pad_len,0),mode="constant",constant_values=0)


# Classification

In [None]:
# Build the full path of every clip listed in the 0Info.txt dataframe.
SAMPLE_RATE = 16000
root = "Dataset/"
filenames = [root + name for name in df["filename"]]

In [None]:
# Integer code assigned to each language label.
lang_dict = {"EN":0, "FR":1, "AR":2, "JP":3}

# One feature vector per clip, and the encoded language of each clip.
X_train = [feature_extractor(path) for path in filenames]
Y_train = [lang_dict[label] for label in df["lang"]]

# Shuffle features and targets together so the pairs stay aligned.
zipped_list = list(zip(X_train, Y_train))
random.shuffle(zipped_list)
X_train, Y_train = zip(*zipped_list)

## Dummy classifier (Baseline)

The following classifier serves as a baseline: our models are only useful if they score higher than a dummy classifier that always predicts the most frequent class.

In [None]:
# Baseline that always predicts the most frequent class; it is scored on the
# same data it was fitted on.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, Y_train)
print(" Accuracy ", dummy_clf.score(X_train, Y_train))

## Random forest classifier

In [None]:
# Shallow random forest (depth 2, fixed seed), scored on the data it was
# fitted on.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, Y_train)
print(" Accuracy ", clf.score(X_train, Y_train))

In [None]:
# Hyper-parameter grid explored by the grid search.
# 'auto' is no longer listed for max_features: scikit-learn removed that
# value, and for classifiers it was an alias of 'sqrt', which stays in the grid.
param_grid = {
    'n_estimators': [200, 500, 1000],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
}

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over param_grid, starting from the forest `clf`.
CV_rfc = GridSearchCV(clf, param_grid, cv=5)
CV_rfc.fit(X_train, Y_train)

In [None]:
# Display the parameter combination selected by the fitted grid search.
CV_rfc.best_params_

In [None]:
# Random forest with hand-picked hyper-parameters, scored on the data it was
# fitted on. max_features='sqrt' replaces 'auto': scikit-learn removed 'auto',
# and for classifiers it meant 'sqrt'.
rfc1=RandomForestClassifier(random_state=0, max_features='sqrt', n_estimators= 500, max_depth=3, criterion='entropy')
rfc1 = rfc1.fit(X_train,Y_train)
print(" Accuracy ", rfc1.score(X_train,Y_train))

## Dense neural network

In [None]:
def one_hot(Y):
  """Turn language codes into one-hot rows.

  Args:
    Y: iterable of language labels among "EN", "FR", "AR" and "JP".

  Returns:
    A list with one 4-element list per label, holding 1.0 at the index of
    the label's language and 0.0 elsewhere. An unknown label raises KeyError.
  """
  lang_dict = {"EN":0, "FR":1, "AR":2, "JP":3}
  identity = np.eye(len(lang_dict))

  encoded = []
  for label in Y:
    # Row i of the identity matrix is the one-hot vector of class i.
    encoded.append(list(identity[lang_dict[label]]))
  return encoded

In [None]:
# Keep every row except one whose filename is the index file itself.
df = df.loc[df["filename"] != "0Info.txt"]
df

In [None]:
class Audio_Dataset(Dataset):
  """Dataset yielding (MFCC matrix, one-hot language label) pairs.

  The list of clips and their languages is read from a CSV index file that
  lives in the same folder as the audio files.
  """

  def __init__(self,data_dir,txt_filename,sample_rate = 16000):
    """Read the index file and prepare file paths and labels.

    Args:
      data_dir: folder prefix, concatenated as-is with the file names.
      txt_filename: name of the CSV index file inside `data_dir`.
      sample_rate: rate used when loading each clip.
    """
    self.data_dir = data_dir
    self.sample_rate = sample_rate

    df = pd.read_csv(data_dir+txt_filename)
    df.columns = ["filename","link","timestamp","lang"]

    self.filenames = [data_dir+name for name in df["filename"]]
    self.labels = one_hot(list(df["lang"]))

  def __len__(self):
    """Return the number of clips listed in the index file."""
    return len(self.filenames)

  def __getitem__(self,idx):
    """Load clip `idx` and return its 20 MFCCs and its one-hot label as tensors."""
    X,freq = lr.load(self.filenames[idx],sr = self.sample_rate)

    # Compute the MFCCs of the clip that was just loaded. The previous code
    # read the notebook-level variable `x`, so every item was the same clip.
    X = lr.feature.mfcc(y=X,sr=freq,n_mfcc=20)
    Y = self.labels[idx]

    X = torch.Tensor(X)
    Y = torch.Tensor(Y)

    return X,Y

In [None]:
# Folder holding the clips and 0Info.txt; it is defined here because no
# earlier cell defines `data_dir`.
data_dir = "Dataset/"
dataset = Audio_Dataset(data_dir,"0Info.txt")
batch_size = 1

# Hold out 100 clips for validation and train on the rest. Deriving the
# training size from the dataset keeps random_split valid when the number of
# clips changes (it gives the former 299/100 split for 399 clips).
n_validation = 100
n_train = len(dataset) - n_validation
dataset_train,dataset_validation = torch.utils.data.random_split(dataset, [n_train, n_validation], generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(dataset_train,batch_size=batch_size,shuffle = True)
validation_loader = DataLoader(dataset_validation,batch_size=batch_size,shuffle = True)

In [None]:
from torch import nn

class NeuralNetwork(nn.Module):
  """Fully connected classifier over flattened inputs of 20*157 values.

  Three hidden layers (512, 512 and 256 units) with ReLU activations feed a
  4-unit output layer whose activations are passed through a softmax.
  """

  def __init__(self):
    super().__init__()

    self.flatten = nn.Flatten()

    # Input width followed by the hidden widths; consecutive pairs define the
    # hidden Linear layers, each followed by a ReLU.
    widths = (20*157, 512, 512, 256)
    layers = []
    for fan_in, fan_out in zip(widths, widths[1:]):
      layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
    layers.append(nn.Linear(widths[-1], 4))
    self.linear_stack = nn.Sequential(*layers)

    self.softmax = nn.Softmax(dim=1)

  def forward(self,X):
    """Return the softmax over the 4 output units for each row of the batch."""
    return self.softmax(self.linear_stack(self.flatten(X)))

In [None]:
def _count_correct(pred, Y):
  """Count the rows of `pred` whose arg-max equals the target class in `Y`."""
  # Targets may be one-hot rows (2-D) or plain class indices (1-D).
  targets = Y.argmax(1) if Y.dim() > 1 else Y
  return (pred.argmax(1) == targets).type(torch.float).sum().item()


def train(model,train_loader,valid_loader,loss_fn,optimizer,epochs,batch_size =1):
  """Train `model` and print loss and accuracy after every epoch.

  Args:
    model: module mapping a batch X to per-class scores of shape (batch, classes).
    train_loader: iterable of (X, Y) training batches exposing `.dataset`.
    valid_loader: iterable of (X, Y) validation batches exposing `.dataset`.
    loss_fn: callable `loss_fn(pred, Y)` returning a scalar tensor.
    optimizer: optimizer updating the parameters of `model`.
    epochs: number of passes over `train_loader`.
    batch_size: kept for backward compatibility; the number of batches is now
      taken from the loaders themselves.

  Returns:
    None. The metrics are only printed.
  """

  for i in range(epochs):

    model.train()
    total_train_loss = 0
    total_test_loss = 0
    val_correct = 0
    train_correct = 0
    for X, Y in train_loader:

      pred = model(X)
      loss = loss_fn(pred,Y)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      total_train_loss+=loss.item()
      # Compare against the actual label, not against the constant class 1.
      train_correct += _count_correct(pred, Y)

    with torch.no_grad():
      model.eval()

      for X, Y in valid_loader:
        pred = model(X)
        total_test_loss += loss_fn(pred, Y).item()
        val_correct += _count_correct(pred, Y)

    # Average over the batches actually iterated; max(..., 1) avoids a
    # division by zero on an empty loader.
    train_steps = max(len(train_loader), 1)
    test_steps = max(len(valid_loader), 1)

    avgTrainLoss = total_train_loss / train_steps
    avgTestLoss = total_test_loss / test_steps
    # Use the loader given as argument rather than a notebook-level variable.
    val_accuracy = val_correct/max(len(valid_loader.dataset), 1)
    train_accuracy = train_correct/max(len(train_loader.dataset), 1)

    print("[INFO] EPOCH: {}/{}".format(i+ 1, epochs))
    print("Train loss: {:.6f}, Train accuracy {:.3f},Test loss: {:.4f}, Validation accuracy {:.4f}".format(avgTrainLoss, train_accuracy,avgTestLoss,val_accuracy))

In [None]:
# Network, loss and optimiser used for training: mean squared error between
# the network output and the target, optimised by SGD with momentum.
model = NeuralNetwork()
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=0.01,
    momentum=0.99,
)

In [None]:
# Train for 10 epochs with a batch size of 1.
train(model, train_loader, validation_loader, loss_fn, optimizer, epochs=10, batch_size=1)

In [None]:
# Sanity check on a single training batch.
X,Y = next(iter(train_loader))
output = model(X)
# Count predictions that match the one-hot label; the previous version
# compared every prediction with the constant class 1 and never showed the count.
correct = (output.argmax(1)==Y.argmax(1)).type(torch.float).sum().item()
print(output)
print(Y)
print(" Correct predictions in batch:", correct)

In [None]:
# True where the predicted class equals the labelled class (the label is
# one-hot, so its arg-max is the class index).
output.argmax(1)==Y.argmax(1)