
# Dataset Preparation
⚠️
**Purpose:** Prepare and validate the dataset — download, organize, extract, and verify data integrity — before proceeding to any deep learning model training.

In [None]:
# =========================
# 1) Config
# Central settings read by the cells below
# =========================
from pathlib import Path

# Root working directory inside the mounted Google Drive
BASE = Path("/content/drive/MyDrive/")

# One sub-folder per sensor is created under BASE
SENSORS = ["s1asc", "s1dsc", "s2asc", "s2dsc"]

# Per-sensor sub-folder names: downloaded archives / extracted files
ARCHIVE_DIRNAME = "archives"
EXTRACT_DIRNAME = "extracted"

# File suffixes counted by the verification report
VERIFY_EXTS = (".nc", ".tif", ".tiff", ".npz")

print("BASE:", BASE)
print("SENSORS:", SENSORS)


BASE: /content/drive/MyDrive
SENSORS: ['s1asc', 's1dsc', 's2asc', 's2dsc']


In [None]:
# =========================
# 2) Drive Mount
# =========================

import os
from google.colab import drive

def ensure_drive_mount(preferred="/content/drive", fallback="/content/drive_mount"):
    """
    Mount Google Drive and return the mountpoint that was actually used.

    - If ``preferred`` already contains a ``MyDrive`` entry, Drive is treated
      as mounted and nothing is done.
    - If ``preferred`` is not empty, or mounting there raises, Drive is
      mounted at ``fallback`` instead and the reason is printed.

    Args:
        preferred: Mountpoint to try first.
        fallback: Mountpoint used when ``preferred`` cannot be used.

    Returns:
        The mountpoint path (``preferred`` or ``fallback``) as a string.

    Raises:
        Whatever ``drive.mount`` raises if mounting at ``fallback`` fails too.
    """
    # Drive already mounted
    if os.path.isdir(preferred) and "MyDrive" in os.listdir(preferred):
        print("‚úÖ Drive already mounted at", preferred)
        return preferred

    # Try preferred mountpoint
    try:
        os.makedirs(preferred, exist_ok=True)
        if os.listdir(preferred):  # leftover local files: do not mount over them
            raise RuntimeError("Mountpoint not empty")

        drive.mount(preferred)
        return preferred

    except Exception as exc:
        # Best-effort fallback, but say why instead of hiding the failure
        alt = fallback
        os.makedirs(alt, exist_ok=True)
        print(f"‚ö†Ô∏è Using alternative mountpoint: {alt}")
        print(f"   reason: {type(exc).__name__}: {exc}")
        drive.mount(alt)
        return alt

MOUNTPOINT = ensure_drive_mount()
print("MOUNTPOINT:", MOUNTPOINT)

# BASE (config cell) is written against the default mountpoint. When the
# fallback mountpoint was used, re-root BASE so the following cells work on the
# mounted Drive rather than on a plain local folder under /content/drive.
_DEFAULT_MOUNTPOINT = "/content/drive"
if MOUNTPOINT != _DEFAULT_MOUNTPOINT:
    try:
        BASE = Path(MOUNTPOINT) / BASE.relative_to(_DEFAULT_MOUNTPOINT)
        print("BASE re-rooted to:", BASE)
    except ValueError:
        # BASE is not under the default mountpoint (custom location, or it was
        # already re-rooted by an earlier run of this cell): leave it untouched.
        pass


Mounted at /content/drive
MOUNTPOINT: /content/drive


In [None]:
# =========================
# 3) Create Folder Structure
# =========================
# Every sensor gets an archive folder and an extraction folder under BASE.
for sensor_name in SENSORS:
    sensor_root = BASE / sensor_name
    for subdir in (ARCHIVE_DIRNAME, EXTRACT_DIRNAME):
        (sensor_root / subdir).mkdir(parents=True, exist_ok=True)

print("‚úÖ Folder structure created under:", BASE)


‚úÖ Folder structure created under: /content/drive/MyDrive


In [None]:
# =========================
# 4) HuggingFace Login
# =========================

!pip -q install -U huggingface_hub==0.23.4

from huggingface_hub import login
login()  # secure interactive login (no token stored in notebook)


[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/402.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m307.2/402.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m402.6/402.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
peft 0.18.0 requires huggingface_hub>=0.25.0, but you have huggingface-hub 0.23.4 which is incompatible.
transformers 4.57.3 requires huggingface-hub<1.0,>=0.34.0, but you have huggingface-hub 0.23.4 whi

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [None]:
# =========================
# 5a) Download from HuggingFace Hub
# =========================
from huggingface_hub import hf_hub_download
from pathlib import Path

# --- Edit these values for your own Hub repository ---
repo_id = "OWNER/REPO"   # e.g. "username/dataset-name"
repo_type = "dataset"    # or "model"

# Archive file names to request from the repo, keyed by the sensor folder
# under BASE that each file is saved into.
FILES_BY_SENSOR = dict(
    s1asc=["s1asc_part01.tar.gz", "s1asc_part02.tar.gz"],
    s1dsc=["s1dsc_part01.tar.gz"],
    s2asc=["s2asc_part01.tar.gz"],
    s2dsc=["s2dsc_part01.tar.gz"],
)

def download_if_missing(sensor, filename):
    """
    Make sure ``filename`` is present in the sensor's archive folder.

    The file is fetched with ``hf_hub_download`` (using the module-level
    ``repo_id`` / ``repo_type``) and copied to
    ``BASE / sensor / ARCHIVE_DIRNAME / filename``. An existing non-empty file
    at that location is kept and no download is attempted.

    Args:
        sensor: Sensor folder name under ``BASE``.
        filename: File name inside the Hub repository.

    Returns:
        Path of the archive in the sensor's archive folder.
    """
    import shutil  # local import: no other cell of the notebook imports it

    out_dir = BASE / sensor / ARCHIVE_DIRNAME
    out_path = out_dir / filename

    if out_path.exists() and out_path.stat().st_size > 0:
        print("‚úÖ already exists:", out_path.name)
        return out_path

    # Do not rely on the folder-creation cell having been run first
    out_path.parent.mkdir(parents=True, exist_ok=True)

    print("‚¨áÔ∏è downloading:", filename)
    cached = hf_hub_download(
        repo_id=repo_id,
        repo_type=repo_type,
        filename=filename
    )

    # Stream the copy (archives may not fit in RAM) into a temporary name and
    # rename at the end, so an interrupted copy never leaves a non-empty
    # partial file that the existence check above would accept as complete.
    part_path = out_path.with_name(out_path.name + ".part")
    shutil.copyfile(cached, part_path)
    part_path.replace(out_path)

    print("‚úÖ saved:", out_path)
    return out_path

# Fetch every archive listed in FILES_BY_SENSOR, sensor by sensor.
for sensor_name, archive_names in FILES_BY_SENSOR.items():
    for archive_name in archive_names:
        download_if_missing(sensor_name, archive_name)


In [None]:
# =========================
# 6) Safe Extract (idempotent)
# Defines extraction utilities using tarfile
# =========================

import tarfile
from pathlib import Path

def _checked_members(tf, out_dir):
    """Yield the members of ``tf`` that are safe to extract below ``out_dir``.

    Used only when the running Python has no tarfile extraction filters.
    Raises ``tarfile.ExtractError`` for a member whose path resolves outside
    ``out_dir``; link and device entries are skipped.
    """
    root = Path(out_dir).resolve()
    for member in tf.getmembers():
        target = (root / member.name).resolve()
        if target != root and root not in target.parents:
            raise tarfile.ExtractError(f"unsafe path in archive: {member.name!r}")
        if member.issym() or member.islnk() or member.isdev():
            print("skipping link/device entry:", member.name)
            continue
        yield member


def safe_extract_tar(tar_path: Path, out_dir: Path):
    """
    Extract ``tar_path`` into ``out_dir`` once.

    After a successful extraction a marker file
    ``._EXTRACTED_<archive name>.done`` is written into ``out_dir``; when that
    marker exists the archive is skipped. A failed extraction writes no
    marker, so the next call tries again.

    Args:
        tar_path: Archive to extract (any compression ``tarfile`` detects).
        out_dir: Destination directory; created if missing.

    Raises:
        tarfile.TarError: If the archive is unreadable or contains an entry
            that would be written outside ``out_dir``.
    """
    marker = out_dir / f"._EXTRACTED_{tar_path.name}.done"
    if marker.exists():
        print("‚úÖ skip:", tar_path.name)
        return

    out_dir.mkdir(parents=True, exist_ok=True)

    print("üì¶ extracting:", tar_path.name)
    with tarfile.open(tar_path, "r:*") as tf:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters available: let tarfile reject unsafe entries
            tf.extractall(path=out_dir, filter="data")
        else:
            # Older Python: never extract unchecked, validate members ourselves
            tf.extractall(path=out_dir, members=_checked_members(tf, out_dir))

    marker.write_text("ok")
    print("‚úÖ done:", tar_path.name)

def extract_sensor(sensor):
    """Extract all tar archives of one sensor into its extraction folder.

    Looks in ``BASE / sensor / ARCHIVE_DIRNAME`` for ``*.tar``, ``*.tar.gz``
    and ``*.tgz`` files and passes each one, in sorted order, to
    ``safe_extract_tar`` with ``BASE / sensor / EXTRACT_DIRNAME`` as target.
    Prints a notice and returns when no archive is found.
    """
    sensor_root = BASE / sensor
    arch_dir = sensor_root / ARCHIVE_DIRNAME
    out_dir = sensor_root / EXTRACT_DIRNAME

    patterns = ("*.tar", "*.tar.gz", "*.tgz")
    archives = [hit for pattern in patterns for hit in arch_dir.glob(pattern)]

    if len(archives) == 0:
        print(f"‚ö†Ô∏è {sensor}: no archives found")
        return

    archives.sort()
    for archive in archives:
        safe_extract_tar(archive, out_dir)

# Run the extraction for every configured sensor.
for sensor_name in SENSORS:
    extract_sensor(sensor_name)


In [None]:
# =========================
# 7) Verification & Report
# Scans extracted folders and prints a report.
# =========================
import time

def quick_stats(folder, exts=VERIFY_EXTS, sample_n=3):
    """
    Count data files below ``folder`` by suffix.

    The tree is walked once; only regular files whose name ends with one of
    ``exts`` are considered. Files that disappear during the scan are skipped.

    Args:
        folder: Directory (``Path``) to scan recursively.
        exts: Suffixes to count; each becomes a key of the returned counts.
        sample_n: Maximum number of example paths to collect.

    Returns:
        ``(counts, size, samples, seconds)``: per-suffix file counts, total
        size in bytes of the matched files, up to ``sample_n`` example paths
        (as strings, in traversal order), and the scan duration in seconds.
    """
    import stat  # local import: only needed for the regular-file check

    t0 = time.perf_counter()
    counts = {e: 0 for e in exts}
    size = 0
    samples = []

    # Single traversal instead of one full rglob per suffix
    for p in folder.rglob("*"):
        matched = [e for e in exts if p.name.endswith(e)]
        if not matched:
            continue
        try:
            st = p.stat()
        except FileNotFoundError:
            continue
        if not stat.S_ISREG(st.st_mode):
            # a directory named like "x.nc" is not a data file
            continue
        for e in matched:
            counts[e] += 1
        size += st.st_size
        if len(samples) < sample_n:
            samples.append(str(p))

    return counts, size, samples, time.perf_counter() - t0

print("\n=========== DATA VERIFICATION ===========")
# One report entry per sensor: total size, scan time, non-zero counts, samples.
for s in SENSORS:
    extracted_dir = BASE / s / EXTRACT_DIRNAME
    if extracted_dir.exists():
        counts, total_bytes, samples, secs = quick_stats(extracted_dir)
        gb = total_bytes / (1024**3)
        nonzero_counts = {ext: n for ext, n in counts.items() if n}

        print(f"\n{s} | {gb:.2f} GB | scan {secs:.1f}s")
        print(" counts:", nonzero_counts)
        for sample_path in samples:
            print("  sample:", sample_path)
    else:
        print(f"{s}: ‚ùå missing extracted directory")


In [None]:
# =========================
# 8) Verification & Report
# List dataset .nc files
# =========================

from google.colab import drive
drive.mount("/content/drive")

# Sanity check
!ls -la /content/drive | head
!ls -la /content/drive/MyDrive | head

from pathlib import Path

BASE = Path("/content/drive/MyDrive/attention_unet/sen12landslidedata")


for folder in ["s1asc", "s1dsc", "s2"]:
    path = BASE / folder
    nc_files = sorted(path.glob("*.nc"))

    print(f"\nüìÅ {folder}")
    print(f"Total .nc files: {len(nc_files)}")

    for f in nc_files[:10]:
        print("  ", f.name)

In [None]:
# =========================
# 10) Path validation
# Stop early if Drive or one of the dataset folders is missing
# =========================

from pathlib import Path
import os

# Folder that must contain s1asc/, s1dsc/ and s2/
BASE = Path("/content/drive/MyDrive/attention_unet/sen12landslidedata")  # <-- change if needed

# Drive-level checks come first so their messages are the ones shown
assert str(BASE).startswith("/content/drive/"), "BASE must point inside Google Drive (/content/drive/MyDrive/...)"
assert os.path.exists("/content/drive/MyDrive"), "Drive not mounted. Run drive.mount('/content/drive') first."

S1ASC_DIR, S1DSC_DIR, S2_DIR = (BASE / name for name in ("s1asc", "s1dsc", "s2"))

assert S1ASC_DIR.exists(), f"Missing: {S1ASC_DIR}"
assert S1DSC_DIR.exists(), f"Missing: {S1DSC_DIR}"
assert S2_DIR.exists(),    f"Missing: {S2_DIR}"

print("OK ‚úÖ Found folders")
for label, folder_path in (("s1asc:", S1ASC_DIR), ("s1dsc:", S1DSC_DIR), ("s2   :", S2_DIR)):
    print(label, folder_path)
