In [None]:
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.oauth2.service_account import Credentials
import io
import zipfile
import os
!pip install gdown

# Authenticate using the service account JSON file
def authenticate_with_service_account(json_keyfile, scopes=None):
    """Build a Google Drive v3 API client from a service-account key file.

    Args:
        json_keyfile: Path to the service-account JSON key file.
        scopes: Optional iterable of OAuth scope URLs to request. When omitted,
            the full Drive scope is requested, as this function always did.
            NOTE(review): a narrower scope (e.g. ".../auth/drive.readonly")
            is presumably enough when the client is only used for downloads;
            confirm against the Drive API scope documentation.

    Returns:
        The service object returned by ``build('drive', 'v3', ...)``.
    """
    if scopes is None:
        scopes = ["https://www.googleapis.com/auth/drive"]
    credentials = Credentials.from_service_account_file(json_keyfile, scopes=list(scopes))
    return build('drive', 'v3', credentials=credentials)

# Download file by its ID
def download_file(service, file_id, output_path):
    """Stream the media content of a Drive file to a local path.

    Args:
        service: Object exposing ``files().get_media(fileId=...)``.
        file_id: ID of the file to fetch.
        output_path: Local path opened for binary writing (truncated if it exists).

    A progress line is printed after every chunk returned by the downloader.
    """
    media_request = service.files().get_media(fileId=file_id)
    with io.FileIO(output_path, 'wb') as sink:
        fetcher = MediaIoBaseDownload(sink, media_request)
        while True:
            progress, finished = fetcher.next_chunk()
            # progress() is a fraction; report it as a whole-number percentage.
            print(f"Download {int(progress.progress() * 100)}%.")
            if finished:
                break

# Main script
if __name__ == "__main__":
    !gdown "https://drive.google.com/uc?id=############################333"
    # Path to your service account JSON key file
    json_keyfile = "avss7.json"

    file_list = [
    {"id": "1-3DA8qf4j3pj4HzK_z1HttDqThjYaNyk", "output": "group_7.zip"},
    {"id": "1-6WsiR5lDBr9AxNlAGaTBBJIxI5TcPL6", "output": "group_8.zip"},
    {"id": "1-3CJKgNtF9fZqj5bUeIhGQ2PLlC-J99W", "output": "group_9.zip"},

    ]


    # Directory to store the extracted files
    extraction_dir = "dataset"
    os.makedirs(extraction_dir, exist_ok=True)

    # Authenticate and download
    service = authenticate_with_service_account(json_keyfile)

    for file in file_list:
        file_id = file["id"]
        output_path = file["output"]

        download_file(service, file_id, output_path)

        # Check if the file is a ZIP file
        if output_path.endswith(".zip"):
            print(f"Extracting {output_path}...")
            with zipfile.ZipFile(output_path, 'r') as zip_ref:
                zip_ref.extractall(extraction_dir)

            # Remove the ZIP file after extraction
            print(f"Removing {output_path}...")
            os.remove(output_path)

    print(f"Download of {output_path} complete.")
print(f"All files downloaded and extracted to {extraction_dir}.")

Downloading...
From: https://drive.google.com/uc?id=#################################33
To: /content/avss7.json
100% 2.38k/2.38k [00:00<00:00, 9.31MB/s]
Download 1%.
Download 2%.
Download 3%.
Download 4%.
Download 5%.
Download 6%.
Download 7%.
Download 8%.
Download 9%.
Download 10%.
Download 11%.
Download 12%.
Download 13%.
Download 14%.
Download 15%.
Download 16%.
Download 17%.
Download 18%.
Download 19%.
Download 20%.
Download 21%.
Download 22%.
Download 23%.
Download 24%.
Download 25%.
Download 26%.
Download 27%.
Download 28%.
Download 29%.
Download 30%.
Download 31%.
Download 32%.
Download 33%.
Download 34%.
Download 35%.
Download 36%.
Download 37%.
Download 38%.
Download 39%.
Download 40%.
Download 41%.
Download 42%.
Download 43%.
Download 44%.
Download 45%.
Download 46%.
Download 47%.
Download 48%.
Download 49%.
Download 50%.
Download 51%.
Download 52%.
Download 53%.
Download 54%.
Download 55%.
Download 56%.
Download 57%.
Download 58%.
Download 59%.
Download 60%.
Download 61%.
Do

In [None]:
# Recursively delete the "dataset" directory (relative to the current working directory).
# NOTE(review): a later cell reads its input from /content/dataset; if that is the
# same directory, running this cell first in a top-to-bottom run removes that input -- confirm the intended order.
!rm -rf dataset

In [None]:
# Clone the zexupan/reentry repository into ./reentry.
!git clone https://github.com/zexupan/reentry.git

Cloning into 'reentry'...
remote: Enumerating objects: 199, done.[K
remote: Counting objects: 100% (199/199), done.[K
remote: Compressing objects: 100% (142/142), done.[K
remote: Total 199 (delta 94), reused 149 (delta 53), pack-reused 0 (from 0)[K
Receiving objects: 100% (199/199), 1.08 MiB | 3.45 MiB/s, done.
Resolving deltas: 100% (94/94), done.


In [None]:
import os
import subprocess
import json

def has_audio(video_file):
    """
    Report whether ffprobe lists a first audio stream for the given file.

    ffprobe is asked for stream 'a:0' in JSON form. The result is True when the
    parsed 'streams' list is non-empty. If the output cannot be interpreted,
    the error is printed and False is returned.
    """
    probe_args = [
        'ffprobe',
        '-v', 'error',
        '-select_streams', 'a:0',
        '-show_entries', 'stream=codec_type',
        '-of', 'json',
        video_file,
    ]
    probe = subprocess.run(probe_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    try:
        audio_streams = json.loads(probe.stdout).get('streams', [])
        found = len(audio_streams) > 0
    except Exception as err:
        print(f"Error checking audio for {video_file}: {err}")
        found = False
    return found

def get_video_duration(video_file):
    """
    Ask ffprobe for the 'format=duration' entry of a file and return it in seconds.

    The value is returned as a float. If ffprobe's JSON output cannot be parsed
    or lacks the field, the error is printed and 0 is returned.
    """
    probe = subprocess.run(
        ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'json', video_file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    try:
        return float(json.loads(probe.stdout)['format']['duration'])
    except Exception as err:
        print(f"Error reading duration for {video_file}: {err}")
        return 0

def extract_audio(video_file, output_file, duration=6):
    """
    Extract a loudness-normalized 16 kHz PCM clip from the start of a video using ffmpeg.

    Args:
        video_file: Path of the input video.
        output_file: Path of the audio file to write (overwritten if it exists).
        duration: Length of the clip in seconds, taken from time 0 (default: 6).

    Returns:
        True if ffmpeg exited with status 0, False otherwise. The outcome is
        also printed, including ffmpeg's stderr on failure.
    """
    cmd = [
        'ffmpeg',
        '-y',                   # overwrite output file if it exists
        '-i', video_file,
        '-ss', '0',             # start time at 0 seconds
        '-t', str(duration),    # clip duration
        '-vn',                  # drop the video stream
        '-ar', str(16000),      # set sample rate to 16kHz
        # Exactly one audio-filter option. '-af' is an alias of '-filter:a' and
        # ffmpeg keeps only the last one given for a stream, so the former
        # '-filter:a volume=1.0' was never applied.
        '-af', 'loudnorm',      # apply loudness normalization
        '-acodec', 'pcm_s16le', # 16-bit PCM audio codec
        output_file
    ]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print(f"Error extracting audio from {video_file}:\n{result.stderr}")
        return False
    print(f"Extracted audio to {output_file}")
    return True

def main(input_root, output_audio_dir, clip_duration=6, start_index=60000, end_index=90000):
    """
    Extract fixed-length audio clips from a slice of the .mp4 files under input_root.

    Every .mp4 file found (case-insensitive extension) is numbered 1, 2, 3, ...
    in traversal order. Only files whose number k satisfies
    ``start_index < k < end_index`` are processed. Files without an audio
    stream, or shorter than ``clip_duration`` seconds, are skipped. Clips are
    written as ``audio_c_00000.wav``, ``audio_c_00001.wav``, ... in
    ``output_audio_dir``.

    Args:
        input_root: Directory tree to search for .mp4 files.
        output_audio_dir: Directory for the extracted clips (created if missing).
        clip_duration: Clip length in seconds; also the minimum source duration.
        start_index: Exclusive lower bound on the 1-based file number (default 60000).
        end_index: Exclusive upper bound on the 1-based file number
            (default 90000); None means no upper bound.

    Returns:
        The number of extractions attempted, i.e. output filenames assigned.
    """
    # Create the output directory if it does not exist.
    os.makedirs(output_audio_dir, exist_ok=True)
    mp4_index = 0     # 1-based number of the current .mp4 in traversal order
    file_counter = 0  # number used in the next output filename

    for root, dirs, files in os.walk(input_root):
        # Sort in place so the traversal order, and therefore which files fall
        # inside the index window, is the same on every run.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith('.mp4'):
                continue

            mp4_index += 1
            if end_index is not None and mp4_index >= end_index:
                # Nothing past the window is ever processed, so stop walking.
                return file_counter
            if mp4_index <= start_index:
                continue

            video_path = os.path.join(root, file)
            print(f"Processing: {video_path}")

            # Check if video has an audio track
            if not has_audio(video_path):
                print("  Skipping (no audio track)")
                continue

            # Get video (audio) duration
            video_duration = get_video_duration(video_path)
            if video_duration < clip_duration:
                print(f"  Skipping (duration {video_duration:.2f} sec is less than {clip_duration} sec)")
                continue

            # Define output filename
            output_filename = f"audio_c_{file_counter:05d}.wav"
            file_counter += 1
            output_path = os.path.join(output_audio_dir, output_filename)

            # Extract the first clip_duration seconds of audio
            extract_audio(video_path, output_path, duration=clip_duration)

    return file_counter

if __name__ == "__main__":
    # Change these paths to match your environment.
    # input_root is an absolute path, whereas output_audio_dir is relative to
    # the current working directory.
    input_root = '/content/dataset'
    output_audio_dir = 'audio'

    # Run the extraction with 6-second clips.
    main(input_root, output_audio_dir, clip_duration=6)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  Skipping (duration 5.92 sec is less than 6 sec)
audio_counter 71487
aud 65756
Processing: /content/dataset/id04246/NlYU0lQkcX4/NlYU0lQkcX4#00022#4432-4542_cropped.mp4
  Skipping (duration 4.44 sec is less than 6 sec)
audio_counter 71488
aud 65756
Processing: /content/dataset/id04246/NlYU0lQkcX4/NlYU0lQkcX4#00026#7137-7314_cropped.mp4
Extracted audio to audio/audio_c_05756.wav
audio_counter 71489
aud 65757
Processing: /content/dataset/id04246/NlYU0lQkcX4/NlYU0lQkcX4#00023#4710-4835_cropped.mp4
  Skipping (duration 5.04 sec is less than 6 sec)
audio_counter 71490
aud 65757
Processing: /content/dataset/id04246/bP5O-MrsZrk/bP5O-MrsZrk#00058#9731-9836_cropped.mp4
  Skipping (duration 4.23 sec is less than 6 sec)
audio_counter 71491
aud 65757
Processing: /content/dataset/id04246/bP5O-MrsZrk/bP5O-MrsZrk#00060#10189-10313_cropped.mp4
  Skipping (duration 4.99 sec is less than 6 sec)
audio_counter 71492
aud 65757
Processing: /co

In [None]:
import os
import subprocess

def resample_normalize_and_clip(input_dir, output_dir, target_sample_rate=16000, clip_duration_sec=6):
    """
    Resample, loudness-normalize and clip every .wav file in input_dir using ffmpeg.

    Each converted file is written to output_dir under the same filename, and
    the source file is deleted once ffmpeg has succeeded for it.

    Args:
        input_dir: Directory containing the source .wav files (not searched recursively).
        output_dir: Directory for the converted files (created if missing).
        target_sample_rate: Output sample rate in Hz (default: 16000).
        clip_duration_sec: Maximum output length in seconds (default: 6).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
            The source file of the failing conversion is left in place.
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith(".wav"):
            continue

        file_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)

        command = [
            "ffmpeg",
            # Overwrite leftovers from an interrupted earlier run instead of
            # having ffmpeg prompt for (or refuse) the overwrite.
            "-y",
            "-i", file_path,                    # Input file
            "-ar", str(target_sample_rate),      # Set the output sample rate
            "-t", str(clip_duration_sec),        # Clip the audio
            # Exactly one audio-filter option. '-af' is an alias of '-filter:a'
            # and ffmpeg keeps only the last one given for a stream, so the
            # former '-filter:a volume=1.0' was never applied.
            "-af", "loudnorm",                   # Apply loudness normalization
            output_path                          # Output file
        ]

        # check=True aborts before the delete below if ffmpeg fails.
        subprocess.run(command, check=True)
        print(f"Processed: {filename}")

        # Remove the original file after processing
        os.remove(file_path)
        print(f"Deleted original: {filename}")

# Example usage
# Converts the .wav files found in input_directory into output_directory.
# The function removes each source file after it has been processed, so
# input_directory is emptied of .wav files as the run progresses.
input_directory = '/content/audio'
output_directory = '/content/audio1'
resample_normalize_and_clip(input_directory, output_directory)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed: audio_10123.wav
Deleted original: audio_10123.wav
Processed: audio_03226.wav
Deleted original: audio_03226.wav
Processed: audio_27098.wav
Deleted original: audio_27098.wav
Processed: audio_11756.wav
Deleted original: audio_11756.wav
Processed: audio_02728.wav
Deleted original: audio_02728.wav
Processed: audio_10052.wav
Deleted original: audio_10052.wav
Processed: audio_18326.wav
Deleted original: audio_18326.wav
Processed: audio_10336.wav
Deleted original: audio_10336.wav
Processed: audio_08035.wav
Deleted original: audio_08035.wav
Processed: audio_00173.wav
Deleted original: audio_00173.wav
Processed: audio_26244.wav
Deleted original: audio_26244.wav
Processed: audio_11385.wav
Deleted original: audio_11385.wav
Processed: audio_12155.wav
Deleted original: audio_12155.wav
Processed: audio_24583.wav
Deleted original: audio_24583.wav
Processed: audio_01733.wav
Deleted original: audio_01733.wav
Processed: audio_253

KeyboardInterrupt: 

In [None]:
# Recursively archive /content/audio1 into aud1.zip in the current working directory.
!zip -r aud1.zip /content/audio1

30G	/content/dataset


In [None]:
# Print the total size of /content/audio1 in human-readable units.
!du -sh /content/audio1

1.1G	/content/audio1


In [None]:
# Recursively delete /content/audio.
# NOTE(review): this is the directory the resampling cell reads from; any .wav
# files it has not converted yet are lost -- confirm that step has finished.
!rm -rf /content/audio