In [7]:
import numpy as np
import boto3
import os
from pathlib import Path
import sagemaker
from sklearn.datasets import fetch_openml



In [8]:
# Target S3 bucket for the prepared dataset -- replace with your own bucket name.
bucket_name = 'sagemaker-mlops-demo-dev-2024'

# SageMaker session and a low-level S3 client (the client performs the uploads
# and the listing later in this notebook).
session = sagemaker.Session()
s3 = boto3.client('s3')


In [9]:
# Echo the destination bucket so the notebook output records which bucket this run targets.
print(f"Using bucket: {bucket_name}")

Using bucket: sagemaker-mlops-demo-dev-2024


In [10]:
# Download the OpenML dataset 'mnist_784' (version 1) and unpack it into
# the feature matrix X and the label vector y.
print("Downloading MNIST dataset from scikit-learn...")
X, y = fetch_openml(
    'mnist_784',
    version=1,
    return_X_y=True,
    as_frame=False,
)


Downloading MNIST dataset from scikit-learn...


In [11]:
# Scale pixel intensities into [0, 1] by dividing by 255.
# Assumes the raw values are 8-bit intensities in [0, 255] (standard for MNIST).
#
# The guard makes this cell idempotent: the statement rebinds X to a scaled
# copy of itself, so without it a second execution of the cell would divide
# the already-scaled data by 255 again and silently corrupt everything
# computed from X afterwards.
if X.max() > 1.0:
    X = X / 255.0



In [12]:
# Row index separating the training portion from the test portion.
split_at = 60000

# Images: leading rows go to training, the remainder to testing.
train_images, test_images = X[:split_at], X[split_at:]

# Labels: same split, cast to unsigned 8-bit integers.
train_labels, test_labels = (
    y[:split_at].astype(np.uint8),
    y[split_at:].astype(np.uint8),
)

# Report the resulting shapes, one per line.
print(
    "Dataset shapes:",
    f"Train images: {train_images.shape}",
    f"Train labels: {train_labels.shape}",
    f"Test images: {test_images.shape}",
    f"Test labels: {test_labels.shape}",
    sep="\n",
)


Dataset shapes:
Train images: (60000, 784)
Train labels: (60000,)
Test images: (10000, 784)
Test labels: (10000,)


In [13]:
# Make sure the local output directory exists (no error if it is already there).
os.makedirs('processed_data', exist_ok=True)

print("\nSaving processed data...")

# Persist each array as its own .npy file.
for npy_path, array in (
    ('processed_data/train_images.npy', train_images),
    ('processed_data/train_labels.npy', train_labels),
    ('processed_data/test_images.npy', test_images),
    ('processed_data/test_labels.npy', test_labels),
):
    np.save(npy_path, array)

print("Data processing complete!")



Saving processed data...
Data processing complete!


In [14]:
# Announce the upload step; the leading "\n" puts a blank line before the message.
print(f"\nUploading data to S3 bucket: {bucket_name}")



Uploading data to S3 bucket: sagemaker-mlops-demo-dev-2024


In [15]:
# Upload every .npy file found in the local output directory to the bucket,
# storing each one under the 'mnist/data/' key prefix with its original file name.
data_dir = Path('processed_data')
for file_path in data_dir.glob('*.npy'):
    s3_key = f'mnist/data/{file_path.name}'
    print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
    s3.upload_file(Filename=str(file_path), Bucket=bucket_name, Key=s3_key)

print(
    "\nUpload complete!",
    f"Data is now available in s3://{bucket_name}/mnist/data/",
    sep="\n",
)


Uploading processed_data/train_images.npy to s3://sagemaker-mlops-demo-dev-2024/mnist/data/train_images.npy
Uploading processed_data/train_labels.npy to s3://sagemaker-mlops-demo-dev-2024/mnist/data/train_labels.npy
Uploading processed_data/test_images.npy to s3://sagemaker-mlops-demo-dev-2024/mnist/data/test_images.npy
Uploading processed_data/test_labels.npy to s3://sagemaker-mlops-demo-dev-2024/mnist/data/test_labels.npy

Upload complete!
Data is now available in s3://sagemaker-mlops-demo-dev-2024/mnist/data/


In [16]:
# List the objects stored under the dataset prefix to confirm the upload.
# NOTE: a single list_objects_v2 call returns at most 1000 keys; use a
# paginator if the prefix can ever hold more than that.
response = s3.list_objects_v2(
    Bucket=bucket_name,
    Prefix='mnist/data/'
)

print("Files available in S3:")
# The 'Contents' key is omitted from the response when nothing matches the
# prefix, so fall back to an empty list instead of raising KeyError.
objects = response.get('Contents', [])
for obj in objects:
    print(f"- {obj['Key']}")
if not objects:
    print("(no objects found)")

Files available in S3:
- mnist/data/test_images.npy
- mnist/data/test_labels.npy
- mnist/data/train_images.npy
- mnist/data/train_labels.npy
