# Data Collecting

In [None]:
import os
import shutil
import requests
import kagglehub

from tqdm import tqdm
from zipfile import ZipFile
from torchvision import datasets
from torchvision import transforms

In [None]:
# Create the top-level "data" directory; exist_ok=True makes re-running this cell harmless.
os.makedirs("data", exist_ok=True)

## COCO

Source: https://cocodataset.org

In [None]:
def download_and_extract(url, extract_path, *, chunk_size=1024 * 1024, timeout=(10, 60)):
    """Download a zip archive from *url* and unpack it into *extract_path*.

    The archive is streamed to a temporary ``temp.zip`` inside *extract_path*,
    extracted in place, and the temporary file is removed afterwards, whether
    or not the download or extraction succeeded.

    Args:
        url: HTTP(S) address of the zip archive.
        extract_path: Directory to extract into; created if missing.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``requests.get`` so that a stalled server cannot hang forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    os.makedirs(extract_path, exist_ok=True)
    zip_path = os.path.join(extract_path, "temp.zip")

    try:
        # Download
        print(f"Download: {url}")
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail early on 4xx/5xx instead of saving an error page as a zip.
            response.raise_for_status()
            with open(zip_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)

        # Extract
        with ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_path)
    finally:
        # Never leave a partial or already-extracted archive behind.
        if os.path.exists(zip_path):
            os.remove(zip_path)

    print(f"Complete! Extracted all files: {extract_path}")

In [None]:
# Download locations of the COCO 2017 archives, keyed by archive name.
coco_urls = dict(
    train2017="http://images.cocodataset.org/zips/train2017.zip",
    val2017="http://images.cocodataset.org/zips/val2017.zip",
    test2017="http://images.cocodataset.org/zips/test2017.zip",
    annotations="http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
)

In [None]:
# Download and extract each COCO archive into its own sub-directory of output_dir.
output_dir = "data/coco"
for name, url in coco_urls.items():
    archive_dir = os.path.join(output_dir, name)
    download_and_extract(url, archive_dir)

## Flickr30k

In [None]:
# Download latest version
# The value returned by kagglehub is kept as the local source directory of the dataset files.
flickr30k_source_directory = kagglehub.dataset_download("eeshawn/flickr30k")

# Show where the files landed so the location can be checked before they are moved.
print("Path to dataset files:", flickr30k_source_directory)

In [None]:
# Project-local destination for the Flickr30k files.
flickr30k_target_directory = "data/flickr30k"
# Create the directory from the variable so the path is defined in exactly one place.
os.makedirs(flickr30k_target_directory, exist_ok=True)

In [None]:
# Notebook display expression: shows the source and target directories as the cell output.
flickr30k_source_directory, flickr30k_target_directory

In [None]:
# Relocate every top-level entry of the downloaded dataset into the target
# directory, showing a progress bar over the entries.
for entry_name in tqdm(os.listdir(flickr30k_source_directory)):
    shutil.move(
        os.path.join(flickr30k_source_directory, entry_name),
        os.path.join(flickr30k_target_directory, entry_name),
    )

## CIFAR-10, CIFAR-100

In [None]:
# Root directory passed to both the CIFAR-10 and CIFAR-100 dataset constructors.
cifar_target_dir = "data/cifar"
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(cifar_target_dir, exist_ok=True)

In [None]:
# Per-channel normalization statistics.
# NOTE(review): these values match the ImageNet mean/std commonly used with
# torchvision pretrained models; confirm that is the intended target.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Preprocessing pipeline: resize to 224x224, convert to a tensor, then normalize.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
transform = transforms.Compose(preprocessing_steps)

In [None]:
# CIFAR-10 with train=False (the non-training split), stored under cifar_target_dir;
# download=True requests a download, and `transform` is the preprocessing passed to the dataset.
cifar10 = datasets.CIFAR10(root=cifar_target_dir, train=False, download=True, transform=transform)

In [None]:
# CIFAR-100 with the same settings as CIFAR-10: train=False, same root directory,
# download requested, and the same `transform` preprocessing.
cifar100 = datasets.CIFAR100(root=cifar_target_dir, train=False, download=True, transform=transform)