<a href="https://colab.research.google.com/github/pvst07/Fake-Talk-Detector-Project/blob/main/deploy_gradio_to_huggingface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gradio huggingface_hub torchvision librosa moviepy pydub matplotlib Pillow


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchvision)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchvision)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86

In [10]:
with open("requirements.txt", "w") as f:
    f.write("""
gradio
torch
torchvision
librosa
moviepy
pydub
matplotlib
Pillow
    """)

In [14]:
code = '''
import gradio as gr
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
import librosa
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import io
import os
import tempfile
from pydub import AudioSegment

def extract_audio_from_video(video_path):
    from moviepy.editor import VideoFileClip  # <--- lazy import
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio_file:
        audio_path = tmp_audio_file.name
    clip = VideoFileClip(video_path)
    clip.audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000, verbose=False, logger=None)
    return audio_path

def get_resnet34_model():
    model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT)
    model.fc = nn.Sequential(
        nn.Linear(model.fc.in_features, 256),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(256, 64),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(64, 2)
    )
    return model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = get_resnet34_model().to(device)
model.load_state_dict(torch.load("final_model.pt", map_location=device))
model.eval()


def get_loudest_segment(audio_path, sr=16000, window_size=5):
    audio, _ = librosa.load(audio_path, sr=sr)
    duration = len(audio) / sr
    window_len = int(window_size * sr)
    if len(audio) < window_len:
        return 0, duration
    max_energy = 0
    best_start = 0
    for start in range(0, len(audio) - window_len, sr):
        end = start + window_len
        energy = np.sum(audio[start:end] ** 2)
        if energy > max_energy:
            max_energy = energy
            best_start = start
    return best_start / sr, (best_start + window_len) / sr

def crop_audio_segment(audio_path, start_sec, end_sec):
    audio = AudioSegment.from_file(audio_path).set_frame_rate(16000).set_channels(1)
    cropped = audio[start_sec * 1000:end_sec * 1000]
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        cropped.export(tmp.name, format="wav")
        return tmp.name

def audio_to_melspectrogram(audio_path, sr=16000, n_mels=128, fmax=8000):
    y, _ = librosa.load(audio_path, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, fmax=fmax)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    mel_norm = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min())
    mel_img = (mel_norm * 255).astype(np.uint8)
    return mel_img

def preprocess_mel(mel_img):
    img = Image.fromarray(mel_img).convert("RGB").resize((224, 224))
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    return transform(img).unsqueeze(0)

def predict_audio_file(audio_path):
    mel_img = audio_to_melspectrogram(audio_path)
    input_tensor = preprocess_mel(mel_img).to(device)
    with torch.no_grad():
        outputs = model(input_tensor)
        probs = torch.softmax(outputs, dim=1).cpu().numpy()[0]
    classes = ["Real", "Fake"]
    pred_idx = np.argmax(probs)
    pred_label = classes[pred_idx]
    confidence = round(probs[pred_idx] * 100, 2)
    buf = io.BytesIO()
    plt.figure(figsize=(4, 4))
    plt.imshow(mel_img, cmap='magma', origin='lower')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(buf, format='png')
    plt.close()
    buf.seek(0)
    return pred_label, confidence, Image.open(buf)

def run_full_predict(audio_file):
    if audio_file is None:
        return None, "<b style='color:red;'>❌ No file</b>", "", None
    ext = os.path.splitext(audio_file)[-1].lower()
    is_video = ext in [".mp4", ".mov", ".avi", ".mkv"]
    extracted_path = extract_audio_from_video(audio_file) if is_video else audio_file
    duration = librosa.get_duration(filename=extracted_path)
    if duration > 5.1:
        start_sec, end_sec = get_loudest_segment(extracted_path)
        cropped_path = crop_audio_segment(extracted_path, start_sec, end_sec)
        trim_info = f"<div style='color:gray;'>⏱️ Auto-trimmed to {start_sec:.2f}–{end_sec:.2f} sec</div>"
    else:
        cropped_path = extracted_path
        trim_info = ""
    label, conf, mel_img = predict_audio_file(cropped_path)
    label_color = "green" if label == "Real" else "red"
    html_label = f"<div style='font-size:28px; font-weight:bold; color:{label_color};'>{label}</div>"
    html_conf = f"<div style='font-size:28px; font-weight:bold;'>{conf:.2f}%</div>"
    final_html = f"<div style='display:flex; justify-content:center; gap:40px;'>{html_label}{html_conf}</div>{trim_info}"
    return cropped_path, final_html, mel_img

with gr.Blocks() as demo:
    gr.Markdown("## FakeTalk Detector")
    with gr.Row():
      file_input = gr.File(label="🎵 Upload Audio/Video", file_types=[".wav", ".mp3", ".mp4", ".mov", ".avi", ".mkv"])
    with gr.Row():
        trimmed_audio = gr.Audio(label="🔊 Trimmed 5s Preview", type="filepath", interactive=False)
    with gr.Row():
        submit_btn = gr.Button("Submit for Analysis", variant="primary")
    result_html = gr.HTML()
    mel_output = gr.Image(type="pil", label="Log Mel Spectrogram")
    def show_trimmed_audio(audio_file):
        if audio_file is None:
            return None
        ext = os.path.splitext(audio_file)[-1].lower()
        is_video = ext in [".mp4", ".mov", ".avi", ".mkv"]
        extracted_path = extract_audio_from_video(audio_file) if is_video else audio_file
        duration = librosa.get_duration(filename=extracted_path)
        if duration > 5.1:
            start_sec, end_sec = get_loudest_segment(extracted_path)
            cropped_path = crop_audio_segment(extracted_path, start_sec, end_sec)
        else:
            cropped_path = extracted_path
        return cropped_path

    file_input.change(fn=show_trimmed_audio, inputs=file_input, outputs=trimmed_audio)
    submit_btn.click(fn=run_full_predict, inputs=file_input, outputs=[trimmed_audio, result_html, mel_output])

demo.launch()
'''
with open("app.py", "w") as f:
    f.write(code)

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi

api = HfApi()
api.create_repo(
    repo_id="pvs-tw/fake-talk-detector",
    repo_type="space",
    space_sdk="gradio"
)


In [16]:
api = HfApi()
repo_id = "pvs-tw/fake-talk-detector"

api.upload_file(
    path_or_fileobj="app.py",
    path_in_repo="app.py",
    repo_id=repo_id,
    repo_type="space"
)


CommitInfo(commit_url='https://huggingface.co/spaces/pvs-tw/fake-talk-detector/commit/3ea51a9c3b021bc114e941282a8d394496b2f545', commit_message='Upload app.py with huggingface_hub', commit_description='', oid='3ea51a9c3b021bc114e941282a8d394496b2f545', pr_url=None, repo_url=RepoUrl('https://huggingface.co/spaces/pvs-tw/fake-talk-detector', endpoint='https://huggingface.co', repo_type='space', repo_id='pvs-tw/fake-talk-detector'), pr_revision=None, pr_num=None)