In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The IndicTTS Deepfake Challenge is a competition focused on detecting AI-generated speech across 16 Indian languages. Participants are asked to develop a model that predicts the probability that an audio sample was synthesized.

In [None]:
import torch
import torchaudio
import numpy as np
import pandas as pd
from datasets import load_dataset
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load the labelled training split eagerly (streaming disabled).
dataset = load_dataset(
    "SherryT997/IndicTTS-Deepfake-Challenge-Data",
    split="train",
    streaming=False,
)

def zero_crossing_rate(waveform):
    """Return the mean absolute change in sign between consecutive samples.

    The function used to be defined twice in a row with identical bodies; the
    second definition silently replaced the first, so a single definition is
    kept here.

    Args:
        waveform: An object exposing ``.numpy()`` (e.g. a ``torch.Tensor``)
            that yields the audio samples.

    Returns:
        A NumPy scalar: the mean of ``|diff(sign(samples))|``. ``np.sign``
        maps samples to -1, 0 or 1, so a direct flip between positive and
        negative contributes 2 and a step to or from an exact zero
        contributes 1.
    """
    samples = waveform.numpy()
    sign_steps = np.abs(np.diff(np.sign(samples)))
    return np.mean(sign_steps)

def extract_features(sample):
    """Turn one labelled dataset record into a summary feature vector.

    Reads ``sample["audio"]["array"]`` and ``sample["audio"]["sampling_rate"]``,
    zero-pads clips shorter than 2048 samples at the end, and stacks:
    the axis-1 means of 20 MFCCs, of their deltas and of their delta-deltas,
    the mean spectral centroid, and the zero-crossing statistic.

    NOTE(review): the padding and ``axis=1`` averaging assume the audio array
    is 1-D (mono), which makes axis 1 the time axis -- confirm against the
    dataset.

    Args:
        sample: Mapping with an ``"audio"`` entry and an ``"is_tts"`` label.

    Returns:
        tuple: ``(features, label)`` where ``features`` is the stacked NumPy
        vector and ``label`` is ``sample["is_tts"]``.
    """
    min_len = 2048

    clip = sample["audio"]
    signal = torch.tensor(clip["array"], dtype=torch.float32)
    sr = clip["sampling_rate"]

    # Right-pad very short clips with zeros so a 2048-point frame always fits.
    shortfall = min_len - signal.shape[0]
    if shortfall > 0:
        signal = torch.nn.functional.pad(signal, (0, shortfall))

    # Cepstral coefficients plus first- and second-order deltas.
    mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sr, n_mfcc=20)
    mfcc = mfcc_transform(signal).squeeze(0).numpy()
    delta = torchaudio.functional.compute_deltas(torch.tensor(mfcc)).numpy()
    delta_delta = torchaudio.functional.compute_deltas(torch.tensor(delta)).numpy()

    # Spectral centroid over Hann-windowed frames with 50% hop.
    frame = min(min_len, signal.shape[0])
    centroid = torchaudio.functional.spectral_centroid(
        signal.unsqueeze(0),
        sample_rate=sr,
        pad=0,
        window=torch.hann_window(frame),
        n_fft=frame,
        hop_length=frame // 2,
        win_length=frame,
    )
    centroid = centroid.numpy().flatten()

    summary = [
        mfcc.mean(axis=1),
        delta.mean(axis=1),
        delta_delta.mean(axis=1),
        centroid.mean(),
        zero_crossing_rate(signal),
    ]
    return np.hstack(summary), sample["is_tts"]

# Extract (features, label) for every training record, then split the pairs
# into a feature matrix and a label vector.
extracted = [extract_features(sample) for sample in dataset]
X = np.array([feats for feats, _ in extracted])
y = np.array([label for _, label in extracted])

# Replace NaN feature values with the per-column mean.
# NOTE(review): the imputer is fitted on all rows before the split below, so
# the hold-out rows contribute to the fill values.
imputer = SimpleImputer(strategy="mean")
X = imputer.fit_transform(X)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted tree classifier with 200 estimators.
model = xgb.XGBClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Score hard predictions on the hold-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")


# --- Predicting on the competition test split ---

# Load the test split eagerly (streaming disabled).
test_dataset = load_dataset(
    "SherryT997/IndicTTS-Deepfake-Challenge-Data",
    split="test",
    streaming=False,
)

def zero_crossing_rate(waveform):
    """Mean absolute difference between the signs of neighbouring samples.

    Same computation as the ``zero_crossing_rate`` defined earlier in this
    cell; it is redefined here before the test-set features are extracted.

    Args:
        waveform: Object with a ``.numpy()`` method returning the samples.

    Returns:
        NumPy scalar equal to ``mean(|diff(sign(samples))|)``.
    """
    signs = np.sign(waveform.numpy())
    jumps = np.diff(signs)
    return np.mean(np.abs(jumps))

def extract_test_features(sample):
    """Build the feature vector for one unlabelled test record.

    Performs the same steps as ``extract_features``: zero-pad clips shorter
    than 2048 samples, compute 20 MFCCs with deltas and delta-deltas, the
    spectral centroid and the zero-crossing statistic, then stack the axis-1
    means of the cepstral features with the two scalar summaries.

    NOTE(review): assumes ``sample["audio"]["array"]`` is 1-D (mono) --
    confirm against the dataset.

    Args:
        sample: Mapping with an ``"audio"`` entry and an ``"id"`` entry.

    Returns:
        tuple: ``(features, sample_id)`` where ``sample_id`` is
        ``sample["id"]``.
    """
    target_len = 2048

    record = sample["audio"]
    wav = torch.tensor(record["array"], dtype=torch.float32)
    rate = record["sampling_rate"]

    # Zero-pad at the end so the clip holds at least one full analysis frame.
    n_samples = wav.shape[0]
    if n_samples < target_len:
        wav = torch.nn.functional.pad(wav, (0, target_len - n_samples))

    to_mfcc = torchaudio.transforms.MFCC(sample_rate=rate, n_mfcc=20)
    mfcc = to_mfcc(wav).squeeze(0).numpy()

    compute_deltas = torchaudio.functional.compute_deltas
    delta = compute_deltas(torch.tensor(mfcc)).numpy()
    delta_delta = compute_deltas(torch.tensor(delta)).numpy()

    # Frame length is capped at 2048; hop is half a frame.
    n_fft = min(target_len, wav.shape[0])
    centroid_frames = torchaudio.functional.spectral_centroid(
        wav.unsqueeze(0),
        sample_rate=rate,
        pad=0,
        window=torch.hann_window(n_fft),
        n_fft=n_fft,
        hop_length=n_fft // 2,
        win_length=n_fft,
    ).numpy().flatten()

    zcr = zero_crossing_rate(wav)

    parts = [mfcc.mean(axis=1), delta.mean(axis=1), delta_delta.mean(axis=1)]
    parts.extend([centroid_frames.mean(), zcr])
    features = np.hstack(parts)

    return features, sample["id"]


# Build the feature matrix and id list for the competition test split.
# Note: this rebinds X_test, which previously held the hold-out features.
X_test, ids = [], []
for sample in test_dataset:
    features, sample_id = extract_test_features(sample)
    X_test.append(features)
    ids.append(sample_id)

X_test = np.array(X_test)

# Apply the imputer that was already fitted on the training features.
# The original called fit_transform here, which refitted the imputer on the
# test data: fill values then differed from those seen during training, and
# a column that was entirely NaN in the test set would have been dropped,
# changing the feature count the model expects.
X_test = imputer.transform(X_test)

# Column 1 of predict_proba is the probability of the second class in
# model.classes_, i.e. the positive class for 0/1 labels.
y_proba = model.predict_proba(X_test)[:, 1]

# Write the submission file with one row per test id.
predictions_df = pd.DataFrame({"id": ids, "is_tts": y_proba})
predictions_df.to_csv('submission.csv', index=False)

README.md:   0%|          | 0.00/2.81k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/35 [00:00<?, ?files/s]

train-00000-of-00035.parquet:   0%|          | 0.00/453M [00:00<?, ?B/s]

train-00001-of-00035.parquet:   0%|          | 0.00/461M [00:00<?, ?B/s]

train-00002-of-00035.parquet:   0%|          | 0.00/464M [00:00<?, ?B/s]

train-00003-of-00035.parquet:   0%|          | 0.00/443M [00:00<?, ?B/s]

train-00004-of-00035.parquet:   0%|          | 0.00/470M [00:00<?, ?B/s]

train-00005-of-00035.parquet:   0%|          | 0.00/475M [00:00<?, ?B/s]

train-00006-of-00035.parquet:   0%|          | 0.00/447M [00:00<?, ?B/s]

train-00007-of-00035.parquet:   0%|          | 0.00/516M [00:00<?, ?B/s]

train-00008-of-00035.parquet:   0%|          | 0.00/557M [00:00<?, ?B/s]

train-00009-of-00035.parquet:   0%|          | 0.00/521M [00:00<?, ?B/s]

train-00010-of-00035.parquet:   0%|          | 0.00/491M [00:00<?, ?B/s]

train-00011-of-00035.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

train-00012-of-00035.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

train-00013-of-00035.parquet:   0%|          | 0.00/473M [00:00<?, ?B/s]

train-00014-of-00035.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

train-00015-of-00035.parquet:   0%|          | 0.00/467M [00:00<?, ?B/s]

train-00016-of-00035.parquet:   0%|          | 0.00/532M [00:00<?, ?B/s]

train-00017-of-00035.parquet:   0%|          | 0.00/510M [00:00<?, ?B/s]

train-00018-of-00035.parquet:   0%|          | 0.00/471M [00:00<?, ?B/s]

train-00019-of-00035.parquet:   0%|          | 0.00/501M [00:00<?, ?B/s]

train-00020-of-00035.parquet:   0%|          | 0.00/559M [00:00<?, ?B/s]

train-00021-of-00035.parquet:   0%|          | 0.00/541M [00:00<?, ?B/s]

train-00022-of-00035.parquet:   0%|          | 0.00/558M [00:00<?, ?B/s]

train-00023-of-00035.parquet:   0%|          | 0.00/599M [00:00<?, ?B/s]

train-00024-of-00035.parquet:   0%|          | 0.00/576M [00:00<?, ?B/s]

train-00025-of-00035.parquet:   0%|          | 0.00/547M [00:00<?, ?B/s]

train-00026-of-00035.parquet:   0%|          | 0.00/537M [00:00<?, ?B/s]

train-00027-of-00035.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

train-00028-of-00035.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00029-of-00035.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00030-of-00035.parquet:   0%|          | 0.00/282M [00:00<?, ?B/s]

train-00031-of-00035.parquet:   0%|          | 0.00/688M [00:00<?, ?B/s]

train-00032-of-00035.parquet:   0%|          | 0.00/613M [00:00<?, ?B/s]

train-00033-of-00035.parquet:   0%|          | 0.00/309M [00:00<?, ?B/s]

train-00034-of-00035.parquet:   0%|          | 0.00/424M [00:00<?, ?B/s]

test-00000-of-00004.parquet:   0%|          | 0.00/356M [00:00<?, ?B/s]

test-00001-of-00004.parquet:   0%|          | 0.00/364M [00:00<?, ?B/s]

test-00002-of-00004.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

test-00003-of-00004.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31102 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2635 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/35 [00:00<?, ?it/s]



Model Accuracy: 0.9765


Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

