In [1]:
import pandas as pd
import librosa
import numpy as np
from tqdm import tqdm_notebook
import os

In [2]:
english_meta = pd.read_csv('./data/en/validated.tsv', sep='\t')
spanish_meta = pd.read_csv('./data/es/validated.tsv', sep='\t')
chinese_meta = pd.read_csv('./data/zh-CN/validated.tsv', sep='\t')

In [3]:
import IPython.display as ipd
ipd.Audio('./data/zh-CN/clips/common_voice_zh-CN_18646658.mp3')

In [4]:
chinese_meta

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
0,0697ece1f99a08477906d0f3b4e74e1d6ffca76c20a7db...,common_voice_zh-CN_18646658.mp3,局部干涩的例子包括有口干、眼睛干燥、及阴道干燥。,2,1,,,
1,077c6d812d0152245b7577ae430775781d98306ed0d264...,common_voice_zh-CN_18776118.mp3,嘉靖三十八年，登进士第三甲第二名。,2,1,,,
2,0bdcd4f3d4450cd23bd4c47f46d645b0e797b083e3f158...,common_voice_zh-CN_18840501.mp3,为了惩罚西扎城和塞尔柱的结盟，盟军在抵达后将外城烧毁。,2,0,,,
3,3b7575fdad7077cfd29a23938fc0bd6c284245b38f92f9...,common_voice_zh-CN_18832787.mp3,滨江县先后隶属于吉林省西北路道和滨江道。,2,1,,,
4,3c288ef58ce0780634a972e1b92b04c444bfa963dfcb2d...,common_voice_zh-CN_18774708.mp3,大使馆全年都为各类政治人物和公众举办了许多活动，包括庆祝以色列独立日独立日。,2,0,twenties,male,440000.0
5,3d28475fd46db162ffa6a79edea556c143a9b0e53f6415...,common_voice_zh-CN_18903199.mp3,感冒茶起源于中国岭南一带。,2,0,,,
6,48292f36099f3b7b3543e24b1f62aa69d71e76ad1bb1dd...,common_voice_zh-CN_18653869.mp3,福茨斯普林斯是位于美国加利福尼亚州科卢萨县的一个非建制地区。,2,1,,,
7,48bf3483a222100483ba249a1fc42a116bd2c530070afc...,common_voice_zh-CN_18733748.mp3,当她去世时，讣告也称她为「永不沉没的布朗夫人」。,2,1,,,
8,4f1b77f2d3f6dd0deaa3355f0c6105cc36d80feee1dd0e...,common_voice_zh-CN_18817154.mp3,有论者批评词中出现现代通俗口语「劲歌」，与整首歌的古典风格格格不入。,2,1,,,
9,57763d3cf9178950a1cf76212d3fd3f4bf8359787715e6...,common_voice_zh-CN_18524189.mp3,正巧母亲往外探头,2,0,,,


In [5]:
chinese_meta[pd.notna(chinese_meta['gender'])][['client_id','gender']].groupby('gender').count()

Unnamed: 0_level_0,client_id
gender,Unnamed: 1_level_1
female,32
male,4909


In [6]:
def sample_split(df, records=1000):
    sample = df[df['gender']=='male'].sort_values('up_votes', ascending=False).head(records)[['path']]
    sample['language'] = sample['path'].apply(lambda x: x.split('_')[2]) # add label
    sample = sample.sample(frac=1).reset_index(drop=True) # randomise
    return sample.iloc[0:0.6*records],
           sample.iloc[0.6*records:0.8*records],
           sample.iloc[0.6*records:records] # returns train, test, valid

IndentationError: unexpected indent (<ipython-input-6-47488611ee63>, line 6)

In [7]:
from os.path import isfile
if isfile('./train.csv') and isfile('./validation.csv') and isfile('./test.csv'):
    train = pd.read_csv('./train.csv')
    test = pd.read_csv('./test.csv')
    valid = pd.read_csv('./validation.csv')
else:
    en_train, en_test, en_valid = sample_split(english_meta)
    es_train, es_test, es_valid = sample_split(spanish_meta)
    cn_train, cn_test, cn_valid = sample_split(chinese_meta)
    train = pd.concat([en_train, es_train, cn_train]).to_csv('./train.csv', index=False)
    test = pd.concat([en_test, es_test, cn_test]).to_csv('./test.csv', index=False)
    valid = pd.concat([en_valid, es_valid, cn_valid]).to_csv('./validation.csv', index=False)

In [13]:
y, sr = librosa.load('./data/zh-CN/clips/common_voice_zh-CN_18772619.mp3')
yt, index = librosa.effects.trim(y, top_db=10)
librosa.feature.melspectrogram(y=yt, sr=sr)

# Passing through arguments to the Mel filters
# S = librosa.feature.melspectrogram(y=yt, sr=sr, n_mels=128, fmin=20, fmax=8000)
S = librosa.feature.melspectrogram(y=yt, sr=sr, n_mels=224, fmin=20, fmax=8000)
S_dB = librosa.power_to_db(S, ref=np.max)


import matplotlib.pyplot as plt
import librosa.display
plt.figure(figsize=(10, 4))
S_dB = librosa.power_to_db(S, ref=np.max)
librosa.display.specshow(S_dB, x_axis='time',
                         y_axis='mel', sr=sr,
                         fmin=20, fmax=8000)
plt.colorbar(format='%+2.0f dB')
plt.title('Mel-frequency spectrogram')
plt.tight_layout()
plt.show()

<Figure size 1000x400 with 2 Axes>

In [14]:
from skimage import io
from sklearn.preprocessing import MinMaxScaler

def scale(input_array, mini, maxi):
    scaler = MinMaxScaler(feature_range=(mini, maxi))
    return scaler.fit_transform(input_array)

def spectrogram_image(in_file, out_file, n_mels=224):
    y, sr = librosa.load(in_file) # load file
    yt, index = librosa.effects.trim(y, top_db=10) # trim file
    
    # use log-melspectrogram
    S = librosa.feature.melspectrogram(y=yt, sr=sr, n_mels=n_mels, fmin=20, fmax=8000)
    S_dB = librosa.power_to_db(S, ref=np.max)

    # RGB grayscale
    img = scale(S_dB, 0, 255).astype(np.uint8)
    img = np.flip(img, axis=0)
    img = 255-img
    assert len(img[img < 0]) == 0, "sub zero"
    assert len(img[img > 255]) == 0, "greater than 255"

    # save as PNG
    io.imsave(out_file, img)

In [51]:
from parallel import batch_multi

train_tuple_list = [(row.language, row.path) for index, row in train.iterrows()]
test_tuple_list = [(row.language, row.path) for index, row in test.iterrows()]
valid_tuple_list = [(row.language, row.path) for index, row in valid.iterrows()]

def create_melspect_img(lang_path_tuple_list, dataset_type):
    for tup_ in lang_path_tuple_list:
        language = tup_[0]   
        path = tup_[1]
        in_file = os.path.join("./data", language, "clips", path)
        out_file = os.path.join(f"./data/{dataset_type}-melspectogram", language, path.split('.')[0]+".png")
        spectrogram_image(in_file=in_file, out_file=out_file)

In [52]:
batch_multi(test_tuple_list, create_melspect_img, batch_size=20, results=False, dataset_type='test')

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

In [53]:
batch_multi(valid_tuple_list, create_melspect_img, batch_size=20, results=False, dataset_type='validation')

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

In [54]:
batch_multi(train_tuple_list, create_melspect_img, batch_size=20, results=False, dataset_type='train')

HBox(children=(IntProgress(value=0, max=90), HTML(value='')))

In [20]:
from keras.preprocessing.image import ImageDataGenerator

Using TensorFlow backend.


In [3]:
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing import image
from keras.utils import np_utils
from sklearn.datasets import load_files
import numpy as np

# define function to load train, test, and validation datasets
def load_dataset(path):
    data = load_files(path)
    img_files = np.array(data['filenames'])
    img_targets = np_utils.to_categorical(np.array(data['target']), 3)
    return img_files, img_targets

# load train, test, and validation datasets
train_files, train_targets = load_dataset('data/train-melspectogram')
valid_files, valid_targets = load_dataset('data/validation-melspectogram')
test_files, test_targets = load_dataset('data/test-melspectogram')

def path_to_tensor(img_path):
    # loads RGB image as PIL.Image.Image type
    img = image.load_img(img_path, target_size=(224, 224))
    # convert PIL.Image.Image type to 3D tensor with shape (224, 224, 3)
    x = image.img_to_array(img)
    # convert 3D tensor to 4D tensor with shape (1, 224, 224, 3) and return 4D tensor
    return np.expand_dims(x, axis=0).astype('float32')/255

def paths_to_tensor(img_paths):
    list_of_tensors = tqdm_notebook([path_to_tensor(img_path) for img_path in img_paths])
    return np.vstack(list_of_tensors)

# calculate the image input. you will learn more about how this works the project!
train_img_input = preprocess_input(paths_to_tensor(train_files))
valid_img_input = preprocess_input(paths_to_tensor(valid_files))
test_img_input = preprocess_input(paths_to_tensor(test_files))

print(train_img_input.shape)

HBox(children=(IntProgress(value=0, max=7200), HTML(value='')))






HBox(children=(IntProgress(value=0, max=2400), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2400), HTML(value='')))


(7200, 224, 224, 3)


In [2]:
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Dropout, Flatten, Dense
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.applications.resnet50 import ResNet50
ResNet50_model = ResNet50(include_top=False)

# freeze layers except last three
for layer in ResNet50_model.layers[:-4]:
    layer.trainable = False

model = Sequential()
model.add(ResNet50_model)
model.add(GlobalAveragePooling2D())
model.add(Dropout(0.2))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

model.summary()

W1107 19:36:46.593142 4697339328 deprecation.py:506] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50 (Model)             (None, None, None, 2048)  23587712  
_________________________________________________________________
global_average_pooling2d_1 ( (None, 2048)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dropout_2 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 3075      
Total params: 25,688,963
Trainable params: 3,155,971
Non-trainable params: 22,532,992
________________________________________________________

In [106]:
test_bottleneck = ResNet50_model.predict(test_img_input)

In [209]:
train_bottleneck.shape

(1800, 7, 7, 2048)

In [3]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

W1107 19:39:53.251195 4697339328 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [4]:
checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.unfreeze.ResNet50.hdf5', 
                               verbose=1, save_best_only=True)

model.fit(train_img_input, train_targets, 
          validation_data=(valid_img_input, valid_targets),
          epochs=100, batch_size=25, callbacks=[checkpointer], verbose=1)

NameError: name 'train_img_input' is not defined

In [184]:
model.load_weights('saved_models/weights.best.unfreeze.ResNet50.hdf5')

In [185]:
predictions = [np.argmax(model.predict(np.expand_dims(feature, axis=0))) for feature in test_img_input]

test_accuracy = 100*np.sum(np.array(predictions)==np.argmax(test_targets, axis=1))/len(predictions)
print('Test accuracy: %.4f%%' % test_accuracy)

Test accuracy: 58.0000%


In [186]:
from sklearn.metrics import confusion_matrix
confusion_matrix(testy, predictions, labels=[0, 1, 2])

array([[ 80,  73,  47],
       [ 44, 134,  22],
       [ 30,  36, 134]])

In [187]:
testy = []
for ray in test_targets:
    for i, el in enumerate(ray):
        if el==1:
            testy.append(i)

In [204]:
language_mapping = {
    0: 'zh-CN',
    1: 'en',
    2: 'es'
}

error_list = []
correct_list = []


for i, test_path in enumerate(test_files):
    if predictions[i] != testy[i]:
        error_list.append((test_path, language_mapping[testy[i]], language_mapping[predictions[i]]))
    else:
        correct_list.append((test_path, language_mapping[testy[i]], language_mapping[predictions[i]]))

In [214]:
from PIL import Image

okay_list = []

h_w_error_list = []
for i in error_list:
    im = Image.open(i[0])
    width, height = im.size
    h_w_error_list.append((i[0],i[1],i[2],width,height))
    if width >= height:
        okay_list.append(i[0])

h_w_correct_list = []
for i in correct_list:
    im = Image.open(i[0])
    width, height = im.size
    h_w_correct_list.append((i[0],i[1],i[2],width,height))
    if width >= height:
        okay_list.append(i[0])

In [207]:
print("average error width {}".format(sum([tup_[3] for tup_ in h_w_error_list])/len(h_w_error_list)))
print("average correct width {}".format(sum([tup_[3] for tup_ in h_w_correct_list])/len(h_w_correct_list)))

average error width 126.15873015873017
average correct width 148.11206896551724


In [216]:
len(okay_list)


88

In [143]:
predictions

[2,
 1,
 1,
 2,
 2,
 2,
 0,
 0,
 1,
 2,
 1,
 0,
 0,
 2,
 0,
 1,
 1,
 0,
 1,
 0,
 2,
 0,
 0,
 1,
 2,
 1,
 2,
 2,
 2,
 2,
 1,
 0,
 1,
 2,
 2,
 2,
 2,
 0,
 0,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 0,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 0,
 0,
 1,
 2,
 2,
 2,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 0,
 0,
 1,
 1,
 0,
 2,
 0,
 0,
 1,
 2,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 1,
 2,
 1,
 0,
 2,
 2,
 0,
 2,
 2,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 2,
 0,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 0,
 2,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 2,
 1,
 1,
 2,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 0,
 2,
 1,
 0,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 2,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 0,
 2,
 1,
 1,
 1,
 0,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 0,
 1,
 1,
 2,
 0,
 1,
 2,
 2,
 1,
 0,
 2,
 1,
 1,
 1,
 0,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 0,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 0,
 2,
 2,
 1,
 2,
 1,
 1,
 0,
 1,
 2,
 2,
 2,
 0,
 0,
 1,
 1,
 2,
 1,


In [144]:
testy

[2,
 0,
 1,
 2,
 1,
 2,
 2,
 2,
 2,
 0,
 1,
 2,
 0,
 1,
 0,
 2,
 2,
 0,
 0,
 1,
 2,
 2,
 1,
 0,
 2,
 0,
 1,
 2,
 2,
 2,
 1,
 2,
 1,
 0,
 0,
 2,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 2,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 2,
 2,
 2,
 0,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 0,
 1,
 0,
 0,
 2,
 0,
 1,
 1,
 2,
 2,
 2,
 2,
 1,
 0,
 2,
 1,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 1,
 2,
 1,
 2,
 0,
 1,
 0,
 0,
 1,
 2,
 1,
 0,
 0,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 0,
 2,
 0,
 1,
 2,
 1,
 2,
 0,
 1,
 0,
 0,
 1,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 0,
 2,
 1,
 1,
 1,
 2,
 0,
 2,
 0,
 1,
 2,
 2,
 0,
 1,
 2,
 0,
 0,
 1,
 2,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 2,
 2,
 1,
 2,
 1,
 1,
 2,
 0,
 1,
 2,
 1,
 1,
 2,
 2,
 0,
 1,
 2,
 2,
 1,
 2,
 2,
 0,
 1,
 1,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 1,
 2,
 1,
 1,
 0,
 2,
 1,
 1,
 0,
 2,
 1,
 1,
 2,
 2,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 2,
 0,
 2,
 2,
 1,
 2,
 1,
 1,
 2,
 2,
 2,
 0,
 1,
 0,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
