# Setting Up

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import torch
import librosa, librosa.display
from IPython.display import Audio
import matplotlib.pyplot as plt
import soundfile as sf
from pydub import AudioSegment
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchvision import models, datasets, transforms

# Importing Data

In [2]:
train_df = pd.read_csv('/kaggle/input/common-voice/cv-valid-train.csv')

In [3]:
train_df.shape

(195776, 8)

In [4]:
train_df.head()

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-valid-train/sample-000000.mp3,learn to recognize omens and follow them the o...,1,0,,,,
1,cv-valid-train/sample-000001.mp3,everything in the universe evolved he said,1,0,,,,
2,cv-valid-train/sample-000002.mp3,you came so that you could learn about your dr...,1,0,,,,
3,cv-valid-train/sample-000003.mp3,so now i fear nothing because it was those ome...,1,0,,,,
4,cv-valid-train/sample-000004.mp3,if you start your emails with greetings let me...,3,2,,,,


# Data Preprocessing

In [5]:
def show_cat(df):
    print('teens', df.age.loc[df.age == 'teens'].count())
    print('twenties', df.age.loc[df.age == 'twenties'].count())
    print('thirties', df.age.loc[df.age == 'thirties'].count())
    print('fourties', df.age.loc[df.age == 'fourties'].count())
    print('fifties', df.age.loc[df.age == 'fifties'].count())
    print('sixties', df.age.loc[df.age == 'sixties'].count())
    print('seventies', df.age.loc[df.age == 'seventies'].count())
    print('eighties', df.age.loc[df.age == 'eighties'].count())
    return

In [6]:
# selecting the required fields
train_age_df = train_df.loc[:,['filename','age']]

In [7]:
train_age_df.fillna(0.0, inplace=True)

In [8]:
train_age_df = train_age_df[train_age_df['age']!=0.0]

In [9]:
train_age_df.loc[(train_age_df['age']=='eighties'),'age'] ='seventies'

In [10]:
# selecting 1800 samples from each category
age_groups = train_age_df['age'].unique()

In [11]:
final_df = pd.DataFrame(columns= ['filepath', 'age'])
for age_grp in age_groups:
    final_df = pd.concat([final_df, pd.DataFrame(train_age_df[train_age_df['age']==age_grp].sample(1800))], axis =0, ignore_index=True)
final_df.shape

(12600, 3)

In [12]:
show_cat(train_age_df)

teens 5441
twenties 23003
thirties 18303
fourties 11100
fifties 9466
sixties 4584
seventies 1871
eighties 0


In [13]:
show_cat(final_df)

teens 1800
twenties 1800
thirties 1800
fourties 1800
fifties 1800
sixties 1800
seventies 1800
eighties 0


In [14]:
def length_fixing(dataset):
    
    # each input is defined to have a 1s (1000ms) length
    segment_length = 3000
    
    temp_df = pd.DataFrame(columns= ['filepath', 'age'])
    
    for _,sample in dataset.iterrows():
    
        audio_file_path =  '/kaggle/input/common-voice/cv-valid-train/' + sample['filename']
        file_name = sample['filename'][-17:-4]

        # loading audio using AudioSegment
        audio = AudioSegment.from_file(audio_file_path, format="mp3")

        #print(len(audio))
        if len(audio) < 2500:
            continue
        if len(audio) < 3000:
            padding = AudioSegment.silent(duration=(segment_length - len(audio)))
            audio += padding
            # print(len(audio))
            audio.export("/kaggle/working/"+ file_name +"segment_1.wav", format="wav")
            new_record ={
                    'filepath' : ["/kaggle/working/"+ file_name +"segment_1.wav"],
                    'age' : [sample['age']]
            }
            temp_df = pd.concat([temp_df, pd.DataFrame(new_record)], ignore_index=True)    

        segments = [audio[i:i+segment_length] for i in range(0, len(audio), segment_length)]

        # padding the last segment to match the fixed length
        last_seg_len = len(segments[-1])
        if last_seg_len > 2500:
            padding = AudioSegment.silent(duration=(segment_length - last_seg_len))
            segments[-1] += padding
        else:
            segments = segments[:-1]
        # print(len(segments[-1]))
        for i,segment in enumerate(segments):
            segment.export("/kaggle/working/"+ file_name +f"segment_{i}.wav", format="wav")
            new_record ={
                'filepath' : ["/kaggle/working/"+ file_name +f"segment_{i}.wav"],
                'age' : [sample['age']]
            }
            temp_df = pd.concat([temp_df, pd.DataFrame(new_record)], ignore_index=True)  
    return temp_df

In [None]:
train_df_seg = length_fixing(final_df)
    

In [None]:
show_cat(train_df_seg)

In [None]:
show_cat(train_df_seg)

## Train Val Split

In [None]:
train_df = pd.DataFrame(columns= ['filepath', 'age'])
for age_grp in age_groups:
    train_df = pd.concat([train_df, pd.DataFrame(train_df_seg[train_df_seg['age']==age_grp].sample(1440))], axis =0, ignore_index=True)
train_df.shape

In [None]:
test_df = train_df_seg.merge(train_df, how='left', indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
test_df.shape

In [None]:
val_df = pd.DataFrame(columns= ['filepath', 'age'])
for age_grp in age_groups:
    val_df = pd.concat([val_df, pd.DataFrame(test_df[test_df['age']==age_grp].sample(360))], axis =0, ignore_index=True)
val_df.shape

In [None]:
def extract_features(dataset):
    
    temp_feature_list = []
    temp_label_list = []
    
    # loop through th erows of dataframe
    
    for _, row in dataset.iterrows():
        
        file_path = row['filepath']
        label = row['age']
        #print('filepath:', file_path)
        #print('label:', label)
        # loading the audion file
        audio, sr = librosa.load(file_path, sr=28000)
        if len(audio) < sr*3:
            audio = librosa.util.pad_center(audio, size=sr*3)
        # print(len(audio))
        MFCCs = librosa.feature.mfcc(y=audio[:sr*3],sr=sr, n_fft=1024,hop_length=128,n_mfcc=128)
        #print(MFCCs_.shape)
        # log spectro of the MFCCs
        MFCCs_ = librosa.amplitude_to_db(MFCCs)
        temp_feature_list.append(MFCCs_)
        temp_label_list.append(label)
        
    return np.array(temp_feature_list), np.array(temp_label_list)

In [None]:
X_train, Y_train = extract_features(train_df)

In [None]:
X_train.shape, Y_train.shape

In [None]:
X_val, Y_val = extract_features(val_df)

In [None]:
X_val.shape, Y_val.shape

In [None]:
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()


## Creating Data Loaders

In [33]:
X_test = X_test.view(2520,-1)

In [32]:
X_train = X_train.view(10080,-1)

In [29]:
# converting the target class into one-hot-encoded vectors
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()

Y_train_lb = lb.fit_transform(Y_train)
Y_val_lb = lb.fit_transform(Y_val)

In [30]:
Y_train_lb.shape, Y_val_lb.shape

((10080, 7), (2520, 7))

In [34]:
X_train = scaler.fit_transform(X_train)

In [35]:
X_test = scaler.fit_transform(X_test)

In [31]:
X_train = torch.tensor(X_train, dtype=torch.float32)
Y_train = torch.tensor(Y_train_lb, dtype=torch.float32)
X_test = torch.tensor(X_val, dtype=torch.float32)
Y_test = torch.tensor(Y_val_lb, dtype=torch.float32)

In [38]:
#X_train_double = torch.as_tensor(X_train, dtype=torch.double)

In [39]:
#X_test_double = torch.as_tensor(X_test, dtype=torch.double)

In [37]:
#tensor_double

tensor([[ 0.8439,  0.8364,  0.8346,  ..., -0.9276,  0.0349,  0.0226],
        [ 0.9250,  0.9250,  0.9250,  ..., -0.2030, -0.1443, -0.7518],
        [ 0.9264,  0.9264,  0.9264,  ..., -0.0344,  0.0128, -0.5601],
        ...,
        [ 0.9265,  0.9265,  0.9265,  ..., -0.1544, -0.4243, -0.3394],
        [ 0.9208,  0.9208,  0.9208,  ...,  0.2419,  0.0352, -0.1892],
        [ 0.9455,  0.9455,  0.9455,  ..., -0.8526, -0.7845, -0.8432]],
       dtype=torch.float64)

In [41]:
train_loader = DataLoader(TensorDataset(X_train_double,Y_train),batch_size=64, shuffle=True)
val_loader = DataLoader(TensorDataset(X_test_double,Y_test), batch_size=64)

In [50]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7dbe154b7a90>

In [36]:
# freeing up memory using garbage collector
import gc

#X_val = None
#X_train = None
#X_test = None
#gc.collect()

1663

# Models

In [42]:
# importing wav2vec base model
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Load the pre-trained Wav2Vec model
model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base')

# Modify the model head for classification (replace the last layer)
# Here is a simple example, you may need to adapt this according to your requirements
#

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.config.hidden_size

In [None]:
model_2 = nn.Sequential(# 102,256,3--> 3channels
    
    nn.Conv2d(3, 32,  kernel_size=3, stride=1, padding=1),#102,256
    nn.MaxPool2d(2, stride=2), # 51,128
    
    nn.Conv2d(32, 64,  kernel_size=3, stride=1, padding=1),#51,128
    nn.MaxPool2d((3,2), stride=2), # 25,64
    
    nn.Conv2d(64, 32,  kernel_size=3, stride=1, padding=1),#25,64
    nn.MaxPool2d((3,2), stride=2), # 12,32
    
    nn.Flatten(),
    
    nn.Linear(in_features= 32*12*32, out_features=4096),
    nn.ReLU(),
    nn.Linear(in_features=4096, out_features=1024),
    nn.ReLU(),
    nn.Linear(1024, 7)  # Output 7 classes
)

## Resnet Model Pretrained

In [43]:
resnet = models.resnet50(pretrained=True)

# Freeze the layers except the final fully connected layers
for param in resnet.parameters():
    param.requires_grad = False


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 90.4MB/s]


In [44]:
resnet.fc = nn.Linear(resnet.fc.in_features, 7)


In [48]:
# defining the optimizers and loss functions
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(resnet.parameters(), lr=0.00003)

# Training the Model

In [45]:
def evaluate(model_1,model_2, iterator, criterion):
    model_2.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch
            batch_size = src.shape[0]
            src = src.view(batch_size,-1)
            logits = model_1.wav2vec2(src)
            input_2 = logits.last_hidden_state.view(batch_size,3,102,256)
            output = model_2(input_2)
            loss = criterion(output, trg)
            total_loss += loss.item()
    return total_loss / len(iterator)

In [46]:
# Training loop
model_2.train()
num_epochs = 5
for epoch in range(num_epochs):
    for batch in train_loader:
        src, trg = batch
        batch_size = src.shape[0]
        src = src.view(batch_size,-1)
        #print(src.shape)
        optimizer.zero_grad()
        logits = 0
        with torch.no_grad():
            logits = model.wav2vec2(src)
        #print(logits)
        print(logits.last_hidden_state.shape)
        input_2 = logits.last_hidden_state.view(batch_size,3,102,256)
        output = model_2(input_2)
        loss = criterion(output, trg)  # Adapt loss computation based on your setup
        loss.backward()
        optimizer.step()
        print('loss:',loss)
        
    val_loss = evaluate(model, model_2, val_loader, criterion)
    print(f'\tValidation Loss: {val_loss:.4f}')

NameError: name 'model_2' is not defined

## Training ResNet + Wav2vec

In [49]:
# defining the optimizers and loss functions
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(resnet.parameters(), lr=0.00003)

In [50]:
batch_size = 64

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
resnet = resnet.to(device)

In [1]:
def evaluate_2(model_1,model_2, iterator, criterion):
    model_2.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch
            src = src.to(device)
            trg = trg.to(device)
            logits = model_1.wav2vec2(src)
            input_2 = logits.last_hidden_stateview(batch_size,3,262,256)
            output = model_2(input_2)
            loss = criterion(output, trg)
            total_loss += loss.item()
    return total_loss / len(iterator)

In [None]:
# Training loop
resnet.train()
num_epochs = 5
for epoch in range(num_epochs):
    for batch in train_loader:
        src, trg = batch
        src = src.to(device)
        trg = trg.to(device)
        optimizer.zero_grad()
        logits = 0
        print(type(src))
        with torch.no_grad():
            logits = model.wav2vec2(src)
        #print(logits)
        #print(logits.last_hidden_state.shape)
        input_2 = logits.last_hidden_state.view(batch_size,3,262,256)
        #print(input_2.shape)
        output = resnet(input_2)
        loss = criterion(output, trg)  # Adapt loss computation based on your setup
        loss.backward()
        optimizer.step()
        print('loss:',loss)
        
    val_loss = evaluate_2(model, resnet, val_loader, criterion)
    print(f'\tValidation Loss: {val_loss:.4f}')

# Saving the model

In [None]:
import pickle

In [None]:
torch.save(CNN_model, '/kaggle/working/Age_wav2vec_CNN.pth')

with open('/kaggle/working/Age_wav2vec_CNN.pkl', 'wb') as file:
    pickle.dump(CNN_model, file)
    
torch.save(CNN_model, '/kaggle/working/Age_wav2vec_CNN.h5')

# Downloading Models

In [25]:
from IPython.display import FileLink

In [None]:
FileLink(r'Age_wav2vec_CNN.pth')

In [None]:
FileLink(r'Age_wav2vec_CNN.pkl')

In [None]:
FileLink(r'Age_wav2vec_CNN.h5')

In [None]:
!zip -r file.zip /kaggle/working

In [28]:
FileLink(r'file.zip')