# CVSS Dataset Preprocessing - Colab

- **Purpose**: Preprocess the CVSS-T dataset for training
- **GPU**: L4 (to save compute units)
- **Storage**: Google Drive

This notebook handles:
- Dataset upload and extraction
- Metadata creation
- Audio preprocessing
- Google Drive storage setup


In [None]:
# Attach Google Drive to the Colab runtime and prepare the project folders
import os
from google.colab import drive

drive.mount('/content/drive')

# Root folder on Drive under which all project data is stored;
# later cells read this variable.
base_path = '/content/drive/MyDrive/Thesis_Training'

# Make sure every project subfolder exists (no-op when already present)
for subdir in ('datasets', 'models', 'results'):
    os.makedirs(f'{base_path}/{subdir}', exist_ok=True)

print("Google Drive mounted successfully!")
print(f"Base path: {base_path}")


In [None]:
# Install required packages
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers soundfile librosa

import torch
import torchaudio
import torchaudio.transforms as T
import numpy as np
import json
import soundfile as sf
from pathlib import Path
from tqdm import tqdm

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")


In [None]:
# Upload and extract CVSS dataset
from google.colab import files
import zipfile

print("Please upload your cvss_dataset.zip file:")
uploaded = files.upload()

# Extract the dataset
for filename in uploaded.keys():
    if filename.endswith('.zip'):
        print(f"Extracting {filename}...")
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall('.')
        print(f"Extracted {filename} successfully!")

# Move to Google Drive
import shutil
if os.path.exists('professional_cvss_dataset'):
    shutil.move('professional_cvss_dataset', f'{base_path}/datasets/')
    print("Dataset moved to Google Drive!")

# List extracted contents
!find {base_path}/datasets -name "*.wav" | head -10
