# Data Exploration

In this notebook, we will explore the fashion dataset to understand its structure, visualize some samples, and gather statistics.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from src.data.dataset import FashionDataset

# Apply seaborn's white-grid theme to every figure drawn in this notebook.
# set_theme() is the documented entry point; sns.set() is only a legacy alias for it.
sns.set_theme(style='whitegrid')

In [2]:
# Location of the dataset on disk. This is a placeholder: replace it with the
# real path before running the notebook.
data_dir = '/content/drive/MyDrive/path_to_your_dataset/'

# Wrap the directory in the project's dataset class, then pull the samples
# into a table that the rest of the notebook works from.
# NOTE(review): assumes load_data() returns a pandas DataFrame — confirm
# against src.data.dataset.
fashion_data = FashionDataset(data_dir)
fashion_data_df = fashion_data.load_data()

# Preview the first rows (bare last expression -> rich notebook display).
fashion_data_df.head()

In [3]:
# Visualize some samples from the dataset
def visualize_samples(data, num_samples=5):
    """Show up to ``num_samples`` leading samples side by side in one row.

    Parameters
    ----------
    data : sequence of mappings
        Must support ``len()`` and integer indexing. Each item is read
        through its ``'image'`` and ``'label'`` keys.
        NOTE(review): assumes ``'image'`` holds something ``imshow`` accepts
        (e.g. an array or PIL image) — confirm against the dataset loader.
    num_samples : int, default 5
        Maximum number of samples to draw. If ``data`` holds fewer items,
        only the available ones are shown instead of raising ``IndexError``.
    """
    # Never index past the end of `data`.
    count = min(num_samples, len(data))
    if count <= 0:
        # Nothing to draw; plt.subplots() also rejects a zero-column grid.
        return

    # squeeze=False keeps `axes` two-dimensional even when count == 1,
    # so the loop below does not need a special case.
    fig, axes = plt.subplots(1, count, figsize=(15, 5), squeeze=False)
    for i, ax in enumerate(axes[0]):
        sample = data[i]
        ax.imshow(sample['image'])
        ax.set_title(sample['label'])
        ax.axis('off')
    plt.show()

# Draw a reproducible random preview. The size is capped at the number of rows
# because DataFrame.sample raises ValueError when n exceeds the frame length
# (without replacement), and random_state pins the selection across re-runs.
n_preview = min(5, len(fashion_data_df))
preview_records = fashion_data_df.sample(n_preview, random_state=42).to_dict(orient='records')
visualize_samples(preview_records, num_samples=n_preview)

In [4]:
# Gather statistics about the dataset
def dataset_statistics(data, label_col='label'):
    """Print headline statistics for a labelled dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with one row per sample; must contain the column ``label_col``.
    label_col : str, default 'label'
        Name of the column holding the class label.

    Prints the number of rows, the distinct values of the label column,
    and the per-class counts as returned by ``value_counts()``.
    """
    print(f'Total samples: {len(data)}')
    labels = data[label_col]
    # Evaluate outside the f-string: reusing the enclosing quote character
    # inside a replacement field is a SyntaxError before Python 3.12.
    classes = labels.unique()
    print(f'Classes: {classes}')
    print('Class distribution:')
    print(labels.value_counts())

# Print the summary for the full table produced by the loading cell.
dataset_statistics(fashion_data_df)

In [5]:
# Horizontal bar chart of samples per class, most frequent class at the top.
class_order = fashion_data_df['label'].value_counts().index

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(y='label', data=fashion_data_df, order=class_order, ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Number of Samples')
ax.set_ylabel('Class')
plt.show()