<a href="https://colab.research.google.com/github/romitbarua/MultiModalDeepFake/blob/main/TestingNotebooks/tSNE_MFCC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# t-SNE for MFCC

In [44]:
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib.cm as cm
import fnmatch
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
import librosa.display
from sklearn.manifold import TSNE
import json
# Importing library 
import csv 
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import IPython.display as ipd
import plotly.express as px

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
filename = '/content/drive/MyDrive/FakeAVCeleb/FakeAVCeleb_v1.2/wav2lip_metadata.csv'
metadata_df = pd.read_csv(filename)

In [6]:
id76_df = metadata_df[metadata_df.source=='id00076']

In [9]:
id76_wav_filepaths = id76_df.wav_filepath.to_list()

In [71]:
def generate_scaled_mfcc_features(audio_file_path, plot=False):
  #generate audio time series
  audio,sample_rate = librosa.load(audio_file_path)

  #truncating feature for now 
  #may need to consider padding or interpolation
  #audio = audio[0:(3*sample_rate)]

  #replaced above with taking average across time

  #generate mfcc features
  mfcc_features = librosa.feature.mfcc(y=audio,sr=sample_rate,n_mfcc=50)
  #convert to double to avoid numeric computation issues
  mfcc_features = mfcc_features.astype(np.double)
  #scale the MFCCs such that each coefficient dimension has zero mean and unit variance
  scaled_mfcc_features = sklearn.preprocessing.scale(mfcc_features, axis=1)

  if plot==True:
    fig, ax = plt.subplots(nrows=1, figsize=(15, 7), sharex=True, sharey=True)
    mfcc_plot = display.specshow(mfcc_features, sr=sample_rate, x_axis='time', ax=ax)
    ax.set(title='MFCC spectrogram for {}'.format(audio_file_path.split('/')[-2]))
    fig.colorbar(mfcc_plot, ax=ax)
    return scaled_mfcc_features, mfcc_plot
  
  return scaled_mfcc_features

In [39]:
feature_vectors = []
avg_mfccs = []
#iterate through all wav files for id
for wav_file in id76_wav_filepaths:
  #generate mfcc
  mfccs = generate_scaled_mfcc_features(wav_file)
  #full mfcc feature vector without padding or truncation
  feature_vectors.append(mfccs)
  #average mfcc feature vector
  avg_mfccs.append(np.mean(mfccs,1))

In [40]:
len(feature_vectors), feature_vectors[0].shape

(21, (20, 435))

In [41]:
#averaging for tSNE
for vector in feature_vectors:
  print(vector.shape)
  avg_mfcc = np.mean(vector,1)
  print(avg_mfcc.shape)

(20, 435)
(20,)
(20, 280)
(20,)
(20, 392)
(20,)
(20, 340)
(20,)
(20, 202)
(20,)
(20, 183)
(20,)
(20, 247)
(20,)
(20, 199)
(20,)
(20, 190)
(20,)
(20, 221)
(20,)
(20, 280)
(20,)
(20, 435)
(20,)
(20, 359)
(20,)
(20, 314)
(20,)
(20, 321)
(20,)
(20, 185)
(20,)
(20, 223)
(20,)
(20, 354)
(20,)
(20, 255)
(20,)
(20, 221)
(20,)
(20, 158)
(20,)


In [42]:
len(avg_mfccs), avg_mfccs[0].shape

(21, (20,))

In [43]:
model = TSNE(n_components=2, 
             learning_rate=200, #default value 
             perplexity=5, #must be lower than num samples
             verbose=2).fit_transform(avg_mfccs)



[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 21 samples in 0.000s...
[t-SNE] Computed neighbors for 21 samples in 0.003s...
[t-SNE] Computed conditional probabilities for sample 21 / 21
[t-SNE] Mean sigma: 0.000000
[t-SNE] Computed conditional probabilities in 0.010s
[t-SNE] Iteration 50: error = 45.4027634, gradient norm = 0.4395739 (50 iterations in 0.020s)
[t-SNE] Iteration 100: error = 50.8502464, gradient norm = 0.3700581 (50 iterations in 0.017s)
[t-SNE] Iteration 150: error = 47.4991875, gradient norm = 0.5061514 (50 iterations in 0.015s)
[t-SNE] Iteration 200: error = 44.1463051, gradient norm = 0.4709113 (50 iterations in 0.018s)
[t-SNE] Iteration 250: error = 44.5103683, gradient norm = 0.4816028 (50 iterations in 0.017s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 44.510368
[t-SNE] Iteration 300: error = 0.5664833, gradient norm = 0.0009703 (50 iterations in 0.015s)
[t-SNE] Iteration 350: error = 0.5042081, gradient norm = 0.0003030 (50 i

In [45]:
x_axis=model[:,0]
y_axis=model[:,1]
fig = px.scatter(x=x_axis, y=y_axis)
fig.show()

## For multiple IDs


In [49]:
metadata_df.columns

Index(['Unnamed: 0', 'source', 'target1', 'target2', 'method', 'category',
       'type', 'race', 'gender', 'filename', 'directory_path', 'mp4_filepath',
       'utterance_file', 'wav_filepath'],
      dtype='object')

In [67]:
all_uniq_ids = metadata_df.source.unique()
#select 20
analysis_ids = all_uniq_ids[:20]
#list for filepaths
wav_filepaths = []
sources = []
color = []
targets = []
category = []
#get data
col = 1
for id in analysis_ids:
  temp_df = metadata_df[metadata_df.source==id]
  wav_filepaths.extend(temp_df.wav_filepath.to_list())
  sources.extend(temp_df.source.to_list())
  targets.extend(temp_df.target1.to_list())
  category.extend(temp_df.category.to_list())
  color.extend(len(temp_df.source.to_list())*[col])
  col += 1

In [65]:
len(wav_filepaths), len(sources), len(targets), len(category), len(color)

(427, 427, 427, 427, 427)

In [72]:
feature_vectors = []
avg_mfccs = []
#iterate through all wav files for id
for wav_file in wav_filepaths:
  #generate mfcc
  mfccs = generate_scaled_mfcc_features(wav_file)
  #full mfcc feature vector without padding or truncation
  feature_vectors.append(mfccs)
  #average mfcc feature vector
  avg_mfccs.append(np.mean(mfccs,1))

In [73]:
model = TSNE(n_components=2, 
             learning_rate=200, #default value 
             perplexity=5, #must be lower than num samples
             verbose=2).fit_transform(avg_mfccs)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.



[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 427 samples in 0.001s...
[t-SNE] Computed neighbors for 427 samples in 0.017s...
[t-SNE] Computed conditional probabilities for sample 427 / 427
[t-SNE] Mean sigma: 0.000000
[t-SNE] Computed conditional probabilities in 0.025s
[t-SNE] Iteration 50: error = 77.3051910, gradient norm = 0.4947553 (50 iterations in 0.133s)
[t-SNE] Iteration 100: error = 79.7716980, gradient norm = 0.4509508 (50 iterations in 0.051s)
[t-SNE] Iteration 150: error = 80.2058029, gradient norm = 0.4194918 (50 iterations in 0.050s)
[t-SNE] Iteration 200: error = 81.1755905, gradient norm = 0.4384472 (50 iterations in 0.052s)
[t-SNE] Iteration 250: error = 79.1245804, gradient norm = 0.4274250 (50 iterations in 0.049s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 79.124580
[t-SNE] Iteration 300: error = 1.5308634, gradient norm = 0.0023963 (50 iterations in 0.042s)
[t-SNE] Iteration 350: error = 1.3689334, gradient norm = 0.0016609 (

In [80]:
x_axis=model[:,0]
y_axis=model[:,1]
fig = px.scatter(x=x_axis, 
                 y=y_axis,
                 symbol=category,
                 color=sources)
fig.show()

## 3D t-SNE

In [77]:
model = TSNE(n_components=3, learning_rate=150, perplexity=10, verbose=2, angle=0.1).fit_transform(avg_mfccs)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.



[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 427 samples in 0.001s...
[t-SNE] Computed neighbors for 427 samples in 0.010s...
[t-SNE] Computed conditional probabilities for sample 427 / 427
[t-SNE] Mean sigma: 0.000000
[t-SNE] Computed conditional probabilities in 0.045s
[t-SNE] Iteration 50: error = 89.0854263, gradient norm = 0.2862971 (50 iterations in 0.218s)
[t-SNE] Iteration 100: error = 99.8505173, gradient norm = 0.2539310 (50 iterations in 0.156s)
[t-SNE] Iteration 150: error = 104.3126755, gradient norm = 0.2343051 (50 iterations in 0.137s)
[t-SNE] Iteration 200: error = 108.0907288, gradient norm = 0.2083364 (50 iterations in 0.138s)
[t-SNE] Iteration 250: error = 110.4643250, gradient norm = 0.1868647 (50 iterations in 0.177s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 110.464325
[t-SNE] Iteration 300: error = 3.4251502, gradient norm = 0.0008573 (50 iterations in 0.132s)
[t-SNE] Iteration 350: error = 2.8467317, gradient norm = 0.00029

In [81]:
x_axis=model[:,0]
y_axis=model[:,1]
z_axis=model[:,2]
import plotly.express as px
fig = px.scatter_3d(x=x_axis, y=y_axis, z=z_axis,color=sources,symbol=category,opacity=0.7)

fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()


## Kaggle Version 

[Source](https://www.kaggle.com/code/ashkhagan/audio-dataset-analysis-4)

In [83]:
def get_features(y, sr):
    y = y[0:sr]  # analyze just first second
    S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)
    log_S = librosa.amplitude_to_db(S, ref=np.max)
    mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13)
    delta_mfcc = librosa.feature.delta(mfcc, mode='nearest')
    delta2_mfcc = librosa.feature.delta(mfcc, order=2, mode='nearest')
    feature_vector = np.concatenate((np.mean(mfcc,1), np.mean(delta_mfcc,1), np.mean(delta2_mfcc,1)))
    feature_vector = (feature_vector-np.mean(feature_vector)) / np.std(feature_vector)
    return feature_vector

In [84]:
feat_vecs = []
for wav in wav_filepaths:
  y, sr = librosa.load(wav)
  feat = get_features(y, sr)
  feat_vecs.append(feat)

In [87]:
model4 = TSNE(n_components=2, learning_rate=150, perplexity=5, verbose=2, angle=0.1).fit_transform(feat_vecs)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.



[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 427 samples in 0.000s...
[t-SNE] Computed neighbors for 427 samples in 0.009s...
[t-SNE] Computed conditional probabilities for sample 427 / 427
[t-SNE] Mean sigma: 0.000000
[t-SNE] Computed conditional probabilities in 0.014s
[t-SNE] Iteration 50: error = 75.8653717, gradient norm = 0.5754599 (50 iterations in 0.154s)
[t-SNE] Iteration 100: error = 65.5800476, gradient norm = 0.5759994 (50 iterations in 0.069s)
[t-SNE] Iteration 150: error = 61.5863533, gradient norm = 0.5477695 (50 iterations in 0.065s)
[t-SNE] Iteration 200: error = 57.8048401, gradient norm = 0.5764118 (50 iterations in 0.059s)
[t-SNE] Iteration 250: error = 56.6864967, gradient norm = 0.5700104 (50 iterations in 0.056s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 56.686497
[t-SNE] Iteration 300: error = 0.4064005, gradient norm = 0.0042203 (50 iterations in 0.059s)
[t-SNE] Iteration 350: error = 0.2299969, gradient norm = 0.0010469 (

In [88]:
x_axis=model4[:,0]
y_axis=model4[:,1]
fig = px.scatter(x=x_axis, 
                 y=y_axis,
                 symbol=category,
                 color=sources)
fig.show()