In [None]:
#importing packages
import matplotlib.pyplot as plt
from matplotlib.backend_bases import RendererBase
from scipy import signal
from scipy.io import wavfile
import os
import numpy as np
from PIL import Image
from scipy.fftpack import fft
%matplotlib inline
from pydub import AudioSegment
import librosa
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import to_categorical
from keras.preprocessing import image
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import PIL

In [None]:
# Input folder holding the audio clips and output folder for the generated
# spectrogram images. Later cells list the sub-folders of audio_path and create
# a sub-folder with the same name under pict_Path.
# NOTE(review): '..input/...' is a directory literally named "..input" relative
# to the working directory. If '../input/...' was intended, every '..input'
# path in this notebook has to be changed together — confirm.
audio_path = '..input/kan_audio_2'
pict_Path = '..input/picturedata'

loading audio

In [None]:
# Names of the immediate sub-directories of the audio folder.
subFolderList = [
    entry
    for entry in os.listdir(audio_path)
    if os.path.isdir(audio_path + '/' + entry)
]

In [None]:
# Mirror the audio folder layout under pict_Path: one output sub-folder per
# audio sub-folder.
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(pict_Path, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so that slices such as
# subFolderList[:10] select the same folders on every run and every machine.
subFolderList = sorted(
    x for x in os.listdir(audio_path)
    if os.path.isdir(os.path.join(audio_path, x))
)

for x in subFolderList:
    os.makedirs(os.path.join(pict_Path, x), exist_ok=True)

In [None]:
# Flat list with the path of every audio file found in each sub-folder.
# The previous version additionally called find_files() on every single file
# and discarded the result; that call has been removed.
sample_audio = []
for i in subFolderList:
    sample_audio.extend(librosa.util.find_files(audio_path + "/" + i))

converting audio to spectrogram

In [None]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a log-power spectrogram of a 1-D signal.

    Parameters
    ----------
    audio : array_like
        Samples of the signal.
    sample_rate : int or float
        Sampling frequency in Hz.
    window_size : float, optional
        Length of each analysis window in milliseconds.
    step_size : float, optional
        Hop between the starts of consecutive windows in milliseconds.
    eps : float, optional
        Constant added before taking the logarithm so that zero-power bins
        do not produce ``-inf``.

    Returns
    -------
    freqs : ndarray
        Frequencies of the spectrogram bins.
    log_spec : ndarray
        float32 array of shape ``(n_windows, n_freqs)`` with the natural log
        of the spectrogram.

    Raises
    ------
    ValueError
        If the step is not positive or is longer than the window.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    step = int(round(step_size * sample_rate / 1e3))
    if not 0 < step <= nperseg:
        raise ValueError(
            'step_size must be positive and not larger than window_size')
    # scipy expects the number of samples shared by neighbouring windows, not
    # the hop; the two only coincide when the step is half the window (which
    # is the case for the defaults).
    noverlap = nperseg - step
    freqs, _, spec = signal.spectrogram(audio,
                                        fs=sample_rate,
                                        window='hann',
                                        nperseg=nperseg,
                                        noverlap=noverlap,
                                        detrend=False)
    return freqs, np.log(spec.T.astype(np.float32) + eps)

In [None]:
# Preview: log-spectrograms of the first nine audio files on a 3x3 grid.
fig = plt.figure(figsize=(10, 10))

for i, filepath in enumerate(sample_audio[:9]):
    ax = fig.add_subplot(3, 3, i + 1)

    # Title each panel with the name of the folder that contains the file.
    # (Previously the whole list of path components was passed as the title.)
    label = os.path.basename(os.path.dirname(filepath))
    ax.set_title(label)

    # Read the waveform and turn it into a log-spectrogram.
    samplerate, test_sound = wavfile.read(filepath)
    _, spectrogram = log_specgram(test_sound, samplerate)

    # Transpose so that frequency runs along the vertical axis.
    ax.imshow(spectrogram.T, aspect='auto', origin='lower')
    ax.axis('off')

In [None]:
def wav2img(wav_path, targetdir='', figsize=(4,4)):
    """Save the log-spectrogram of a wav file as a JPEG image.

    The image is written to ``<targetdir>/<name>.jpg`` where ``<name>`` is the
    file name of ``wav_path`` up to its first ``.wav``.

    Parameters
    ----------
    wav_path : str
        Path of the wav file to convert.
    targetdir : str, optional
        Directory that receives the image; created if missing. The empty
        default means the current working directory.
    figsize : tuple, optional
        Unused. The array is written directly by ``plt.imsave`` and no figure
        is involved; kept so that existing calls keep working.
    """
    # Read the file that was passed in (the earlier version read the
    # notebook-level variable `filepath` and ignored `wav_path`).
    samplerate, test_sound = wavfile.read(wav_path)
    _, spectrogram = log_specgram(test_sound, samplerate)

    # Build the output path; os.path.join keeps an empty targetdir relative
    # instead of producing '/<name>' at the filesystem root.
    output_name = os.path.basename(wav_path).split('.wav')[0]
    if targetdir:
        os.makedirs(targetdir, exist_ok=True)
    output_file = os.path.join(targetdir, output_name)

    plt.imsave('%s.jpg' % output_file, spectrogram)

In [None]:
# Convert up to 30 wav files from each of the first 10 sub-folders into
# spectrogram images stored under the matching sub-folder of pict_Path.
for i, x in enumerate(subFolderList[:10]):
    print(i, ':', x)
    class_dir = audio_path + '/' + x
    all_files = [y for y in os.listdir(class_dir) if y.endswith('.wav')]
    for file in all_files[:30]:
        # The path separator between audio_path and the sub-folder was missing
        # before, so the wav path pointed at a non-existent location.
        wav2img(class_dir + '/' + file, pict_Path + '/' + x)

Create a .csv file that lists each spectrogram image (`id`) together with its class label.

In [None]:
# Load the image/label table. Later cells read an `id` column (image file name
# without the '.jpg' extension), drop `id` and `label`, and use the remaining
# columns as the target matrix.
# NOTE(review): '..input' may be a typo for '../input' — confirm together with
# the other paths in this notebook.
train=pd.read_csv("..input/image_class.csv")

In [None]:
# Load every image listed in the table and scale it by 1/255.
train_image = []
for i in tqdm(range(len(train))):
    jpg_path = '..input/picturedata' + '/' + train['id'][i] + '.jpg'
    img = image.img_to_array(image.load_img(jpg_path)) / 255
    train_image.append(img)

In [None]:
# Features: stacked image arrays. Targets: every column except `id` and `label`.
X = np.array(train_image)
y = train.drop(columns=['id', 'label']).to_numpy()

# Hold out 20% of the samples; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Convolutional classifier: four conv -> max-pool -> dropout stages followed by
# two dense layers and one sigmoid output unit per label column.
#
# The input shape is taken from the training images so that the model is built
# as soon as it is defined; without it, model.summary() has no layer shapes to
# report. The output width follows the label matrix instead of a hard-coded 10.
input_shape = X_train.shape[1:]
n_classes = y_train.shape[1]

model = Sequential()
for stage, n_filters in enumerate((16, 32, 64, 64)):
    # Only the first layer of a Sequential model needs the input shape.
    first_layer_kwargs = {'input_shape': input_shape} if stage == 0 else {}
    model.add(Conv2D(filters=n_filters, kernel_size=(5, 5), activation='relu',
                     **first_layer_kwargs))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# NOTE(review): sigmoid outputs score each class independently. If every clip
# has exactly one label, softmax (with categorical_crossentropy in the compile
# cell) is the usual choice — confirm before changing both together.
model.add(Dense(n_classes, activation='sigmoid'))

In [None]:
# Print the layer-by-layer overview of the model defined above.
model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
# NOTE(review): binary cross-entropy pairs with the sigmoid output layer; if the
# task is single-label, both should be switched together — confirm.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs in batches of 100; the held-out split is evaluated after
# every epoch.
model.fit(
    X_train,
    y_train,
    batch_size=100,
    epochs=10,
    validation_data=(X_test, y_test),
)

testing the model

In [None]:
# Convert the test recording into a spectrogram image and load it the same way
# the training images were loaded.
voice_file='..input/test_audio_file'
image_file='..input/test_image'
os.makedirs(image_file, exist_ok=True)
wav2img(voice_file,image_file)

# Load exactly the image produced for voice_file. Looping over the whole
# directory kept whichever file os.listdir happened to return last.
# The name mirrors wav2img's rule: file name up to the first '.wav', plus '.jpg'.
test_image_path = image_file + "/" + voice_file.split('/')[-1].split('.wav')[0] + '.jpg'
img = image.load_img(test_image_path)
img = image.img_to_array(img)
img = img/255

In [None]:
# Class names are the label columns, taken the same way the target matrix was
# built (every column except `id` and `label`).
classes = np.array(train.drop(['id', 'label'], axis=1).columns)

# Add a batch axis instead of reshaping to a hard-coded image size.
proba = model.predict(np.expand_dims(img, axis=0))

# Indices of the three highest scores, best first.
top3 = np.argsort(proba[0])[:-4:-1]
for i in top3:
    # `top3` was previously referenced as `top_3`, which raised a NameError.
    print("{} ({:.3})".format(classes[i], proba[0][i]))