# Setting Up

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import librosa, librosa.display
from IPython.display import Audio
import matplotlib.pyplot as plt
import soundfile as sf
from pydub import AudioSegment
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras import regularizers
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.activations import softmax
from tensorflow.keras.optimizers import Adam



# Importing Data

In [2]:
train_df = pd.read_csv('/kaggle/input/common-voice/cv-valid-train.csv')

In [3]:
train_df.shape

(195776, 8)

In [4]:
train_df.head()

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-valid-train/sample-000000.mp3,learn to recognize omens and follow them the o...,1,0,,,,
1,cv-valid-train/sample-000001.mp3,everything in the universe evolved he said,1,0,,,,
2,cv-valid-train/sample-000002.mp3,you came so that you could learn about your dr...,1,0,,,,
3,cv-valid-train/sample-000003.mp3,so now i fear nothing because it was those ome...,1,0,,,,
4,cv-valid-train/sample-000004.mp3,if you start your emails with greetings let me...,3,2,,,,


# Data Preprocessing

In [5]:
def show_cat(df):
    print('teens', df.age.loc[df.age == 'teens'].count())
    print('twenties', df.age.loc[df.age == 'twenties'].count())
    print('thirties', df.age.loc[df.age == 'thirties'].count())
    print('fourties', df.age.loc[df.age == 'fourties'].count())
    print('fifties', df.age.loc[df.age == 'fifties'].count())
    print('sixties', df.age.loc[df.age == 'sixties'].count())
    print('seventies', df.age.loc[df.age == 'seventies'].count())
    print('eighties', df.age.loc[df.age == 'eighties'].count())
    return

In [6]:
# selecting the required fields
train_age_df = train_df.loc[:,['filename','age']]

In [7]:
train_age_df.fillna(0.0, inplace=True)

In [8]:
train_age_df = train_age_df[train_age_df['age']!=0.0]

In [9]:
train_age_df.loc[(train_age_df['age']=='eighties'),'age'] ='seventies'

In [10]:
# selecting 1800 samples from each category
age_groups = train_age_df['age'].unique()

In [11]:
final_df = pd.DataFrame(columns= ['filepath', 'age'])
for age_grp in age_groups:
    final_df = pd.concat([final_df, pd.DataFrame(train_age_df[train_age_df['age']==age_grp].sample(1800))], axis =0, ignore_index=True)
final_df.shape

(12600, 3)

In [12]:
show_cat(train_age_df)

teens 5441
twenties 23003
thirties 18303
fourties 11100
fifties 9466
sixties 4584
seventies 1871
eighties 0


In [13]:
show_cat(final_df)

teens 1800
twenties 1800
thirties 1800
fourties 1800
fifties 1800
sixties 1800
seventies 1800
eighties 0


In [14]:
def length_fixing(dataset):
    
    # each input is defined to have a 1s (1000ms) length
    segment_length = 3000
    
    temp_df = pd.DataFrame(columns= ['filepath', 'age'])
    
    for _,sample in dataset.iterrows():
    
        audio_file_path =  '/kaggle/input/common-voice/cv-valid-train/' + sample['filename']
        file_name = sample['filename'][-17:-4]

        # loading audio using AudioSegment
        audio = AudioSegment.from_file(audio_file_path, format="mp3")

        #print(len(audio))
        if len(audio) < 2500:
            continue
        if len(audio) < 3000:
            padding = AudioSegment.silent(duration=(segment_length - len(audio)))
            audio += padding
            # print(len(audio))
            audio.export("/kaggle/working/"+ file_name +"segment_1.wav", format="wav")
            new_record ={
                    'filepath' : ["/kaggle/working/"+ file_name +"segment_1.wav"],
                    'age' : [sample['age']]
            }
            temp_df = pd.concat([temp_df, pd.DataFrame(new_record)], ignore_index=True)    

        segments = [audio[i:i+segment_length] for i in range(0, len(audio), segment_length)]

        # padding the last segment to match the fixed length
        last_seg_len = len(segments[-1])
        if last_seg_len > 2500:
            padding = AudioSegment.silent(duration=(segment_length - last_seg_len))
            segments[-1] += padding
        else:
            segments = segments[:-1]
        # print(len(segments[-1]))
        for i,segment in enumerate(segments):
            segment.export("/kaggle/working/"+ file_name +f"segment_{i}.wav", format="wav")
            new_record ={
                'filepath' : ["/kaggle/working/"+ file_name +f"segment_{i}.wav"],
                'age' : [sample['age']]
            }
            temp_df = pd.concat([temp_df, pd.DataFrame(new_record)], ignore_index=True)  
    return temp_df

In [15]:
train_df_seg = length_fixing(final_df)
    

In [16]:
show_cat(train_df_seg)

teens 1860
twenties 2013
thirties 2021
fourties 2166
fifties 2355
sixties 2455
seventies 2568
eighties 0


In [17]:
show_cat(train_df_seg)

teens 1860
twenties 2013
thirties 2021
fourties 2166
fifties 2355
sixties 2455
seventies 2568
eighties 0


## Train Val Split

In [18]:
train_df = pd.DataFrame(columns= ['filepath', 'age'])
for age_grp in age_groups:
    train_df = pd.concat([train_df, pd.DataFrame(train_df_seg[train_df_seg['age']==age_grp].sample(1440))], axis =0, ignore_index=True)
train_df.shape

(10080, 2)

In [19]:
test_df = train_df_seg.merge(train_df, how='left', indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
test_df.shape

(5358, 2)

In [20]:
val_df = pd.DataFrame(columns= ['filepath', 'age'])
for age_grp in age_groups:
    val_df = pd.concat([val_df, pd.DataFrame(test_df[test_df['age']==age_grp].sample(360))], axis =0, ignore_index=True)
val_df.shape

(2520, 2)

In [21]:
def extract_features(dataset):
    
    temp_feature_list = []
    temp_label_list = []
    
    # loop through th erows of dataframe
    
    for _, row in dataset.iterrows():
        
        file_path = row['filepath']
        label = row['age']
        sample,_ = librosa.load(file_path, sr=28000)
        # feature extraction using stft
        features = librosa.amplitude_to_db(np.abs(librosa.core.stft(sample, n_fft=1024, hop_length=256)))
        #print(features.shape)
        temp_feature_list.append(features)
        temp_label_list.append(label)
        
    return np.array(temp_feature_list), np.array(temp_label_list)

In [22]:
X_train, Y_train = extract_features(train_df)

In [23]:
X_train.shape, Y_train.shape

((10080, 513, 329), (10080,))

In [24]:
X_val, Y_val = extract_features(val_df)

In [25]:
X_val.shape, Y_val.shape

((2520, 513, 329), (2520,))

In [26]:
X_train_ = X_train.reshape(10080,171,329,3)

In [27]:
X_val_ = X_val.reshape(2520, 171,329,3)

In [28]:
# converting the target class into one-hot-encoded vectors
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()

Y_train_lb = lb.fit_transform(Y_train)
Y_val_lb = lb.fit_transform(Y_val)

In [29]:
Y_train_lb.shape, Y_val_lb.shape

((10080, 7), (2520, 7))

# Models

In [72]:
vgg_19 = VGG19(
    include_top = False, 
    weights = 'imagenet',
    input_shape=(171,329) + (3,),
)

# freeze layers
for layer in vgg_19.layers:
    layer.trainable = False
    
x = Flatten()(vgg_19.output)
pred_layer = Dense(7, activation='sigmoid')(x)

model = Model(inputs=vgg_19.input, outputs=pred_layer)
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 171, 329, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 171, 329, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 171, 329, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 85, 164, 64)       0         
                                                                 
 block2_conv1 (Conv2D)       (None, 85, 164, 128)      73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 85, 164, 128)      147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 42, 82, 128)       0   

# Training the Model

In [73]:
from sklearn.metrics import accuracy_score
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy']
)

In [74]:
fit_history = model.fit(X_train_, Y_train_lb, epochs=7, validation_data=(X_val_, Y_val_lb))

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [41]:
import gc

In [71]:
del vgg_19, model
gc.collect()

NameError: name 'vgg_19' is not defined

In [78]:
from sklearn.metrics import confusion_matrix

In [98]:
predictions = model.predict(X_val_)
predicted_classes = np.argmax(predictions, axis=1)



In [99]:
Y_val_lb

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

In [101]:
output = tf.reshape(tf.argmax(Y_val_lb, axis=1), [-1]).numpy()

In [92]:
predicted_classes

array([6, 3, 6, ..., 5, 1, 5])

In [102]:
confusion = confusion_matrix(output, predicted_classes)
print("Confusion Matrix:")
print(confusion)

Confusion Matrix:
[[148  37  26  32  26  38  53]
 [ 35 146  21  29  20  53  56]
 [ 27  30 214  24  16  25  24]
 [ 36  57  54 134  13  24  42]
 [ 24  52  24  14 124  62  60]
 [ 25  47  25  23  24 165  51]
 [ 28  46  33  17  33  40 163]]


# Saving the model

# Downloading Models

In [33]:
import joblib

In [75]:
model.save('VGG_age_7532_4341.h5') # 0.44 -> 0.40
joblib.dump(model, 'VGG_age_7532_4341.pkl')

['VGG_age_7532_4341.pkl']

In [35]:
from IPython.display import FileLink

In [76]:
FileLink(r'VGG_age_7532_4341.h5')

In [77]:
FileLink(r'VGG_age_7532_4341.pkl')

# Reference

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8434188/#:~:text=Several%20methods%20use%20machine%20learning,cancellation%20during%20the%20preprocessing%20phase.

75% - 90% accuracy

~9 million parameters