In [None]:
# download_and_save_data.ipynb

import torch
from torchvision import datasets, transforms
import pandas as pd
import os
import numpy as np # Import numpy

print("Starting data download and conversion...")

# Both paths are relative to the current working directory.
# '../data' fits a run from a folder one level below the project root
# (e.g. a notebooks/ folder); use 'data' instead when running from the
# project root itself.
data_root_dir = '../data'
raw_data_dir = os.path.join(data_root_dir, 'raw')

# Only a tensor conversion is applied at download time; no other preprocessing.
transform = transforms.ToTensor()

# The CSV output folder must exist before anything is written into it.
os.makedirs(raw_data_dir, exist_ok=True)
print(f"Ensured raw data directory exists at: {os.path.abspath(raw_data_dir)}")

# Arguments shared by both FashionMNIST splits; only `train` differs per split.
_fashion_mnist_kwargs = dict(
    root=data_root_dir,
    download=True,
    transform=transform,
)

# 1. Training split
print("Downloading FashionMNIST training data...")
train_dataset_original = datasets.FashionMNIST(train=True, **_fashion_mnist_kwargs)
print("FashionMNIST training data downloaded.")

# 2. Test split
print("Downloading FashionMNIST test data...")
test_dataset_original = datasets.FashionMNIST(train=False, **_fashion_mnist_kwargs)
print("FashionMNIST test data downloaded.")

# --- Training split: flatten every image and export features/labels to CSV ---
print("Processing and saving training data to CSVs...")

# One pass over the dataset. Each item is an (image tensor, label) pair; the
# tensor is collapsed to a single dimension and converted to a NumPy array
# (per the original note: a 1x28x28 tensor becomes a 784-element row).
_train_samples = [
    (img.reshape(-1).numpy(), label) for img, label in train_dataset_original
]
X_train_list = [pixels for pixels, _ in _train_samples]
y_train_list = [label for _, label in _train_samples]

# Features: one row per image. Labels: a single column, one row per image.
X_train_df = pd.DataFrame(X_train_list)
y_train_df = pd.DataFrame(y_train_list)

X_train_csv_path = os.path.join(raw_data_dir, 'X_train.csv')
y_train_csv_path = os.path.join(raw_data_dir, 'y_train.csv')

# Plain numeric CSVs: neither a header row nor an index column is written.
for _frame, _csv_path in (
    (X_train_df, X_train_csv_path),
    (y_train_df, y_train_csv_path),
):
    _frame.to_csv(_csv_path, header=False, index=False)
print(f"Training data saved to: {X_train_csv_path} and {y_train_csv_path}")


# --- Process and Save Test Data ---
print("Processing and saving test data to CSVs...")

# Build the output paths before the (slow) pass over the dataset so that any
# problem with them shows up immediately rather than after all the work.
# NOTE: the original used `os.Path.join`, which does not exist and raised
# AttributeError; the correct module is `os.path`.
X_test_csv_path = os.path.join(raw_data_dir, 'X_test.csv')
y_test_csv_path = os.path.join(raw_data_dir, 'y_test.csv')

X_test_list = []
y_test_list = []
for img, label in test_dataset_original:
    # Collapse the image tensor to one dimension and convert it to a NumPy
    # array (per the original note: a 28x28 image becomes 784 values).
    X_test_list.append(img.view(-1).numpy())
    y_test_list.append(label)

# Features: one row per image. Labels: a single column, one row per image.
X_test_df = pd.DataFrame(X_test_list)
y_test_df = pd.DataFrame(y_test_list)

# Plain numeric CSVs: neither a header row nor an index column is written.
X_test_df.to_csv(X_test_csv_path, index=False, header=False)
y_test_df.to_csv(y_test_csv_path, index=False, header=False)
print(f"Test data saved to: {X_test_csv_path} and {y_test_csv_path}")

print("\nData download and conversion complete. Your CSVs are in data/raw/.")