In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import cv2
import matplotlib.image as mpimg

import os
import PIL
import PIL.Image
import pathlib


## Dataset Extraction

In [None]:
# Root directory of the dataset (a Kaggle input mount, judging by the path).
DATASET_PATH = '/kaggle/input/fashion-product-images-dataset/fashion-dataset/fashion-dataset'

# List the top-level entries so we can see which files and folders are available.
print(os.listdir(DATASET_PATH))

The dataset contains the following files:

- **images.csv**: A CSV file containing each image's filename and the link from which the image was scraped.
- **images**: A folder that contains all the images listed in the CSV file.
- **styles.csv**: A CSV file containing the textual details of each image like product name, colour, etc.
- **styles**: A folder containing one JSON file per product, storing that product's style attributes separately.

## Dataset Exploration (Exploratory Data Analysis)

In [None]:
# Load the product metadata. Some rows in styles.csv are malformed (wrong field count).
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines="warn"` is its replacement and keeps the same behaviour:
# malformed rows are skipped and a warning is emitted for each of them.
styles_csv = pd.read_csv(os.path.join(DATASET_PATH, "styles.csv"), on_bad_lines="warn")

In [None]:
# Preview the first rows of the styles table right after loading it.
styles_csv.head()

We can see that a few lines were skipped while reading this CSV. The warning output isn't very intuitive, so we'll work out the number of skipped lines later.

In [None]:
# Load the image index (filename + source link).
# `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines="warn"` is the
# equivalent replacement (skip malformed rows and warn about them).
images_csv = pd.read_csv(os.path.join(DATASET_PATH, "images.csv"), on_bad_lines="warn")

# Compare the (rows, columns) of both tables to spot rows dropped while parsing.
styles_csv.shape, images_csv.shape

Ideally both files should have the same number of rows, but styles_csv has *44424* rows while images_csv has *44446*. This suggests that around *22* rows were skipped while reading styles_csv.

We can skip the images that don't have textual information when recommending based on text.

In [None]:
# Look at the styles table again, this time to inspect which columns describe a product.
styles_csv.head()

We can see that styles_csv contains all the textual information required to define a product. It has gender data, various categories, colour, display name, etc.



In [None]:
# Preview the first rows of the image index table.
images_csv.head()

## Textual EDA

We'll now explore the textual information from styles.csv.

### Gender Distribution

In [None]:
import plotly.express as px

# Optional helper, left disabled: uncomment to render plotly's built-in continuous
# colour scales when picking a palette for the charts below.
# fig = px.colors.sequential.swatches_continuous()
# fig.show()

In [None]:
# Pie chart of the share of products per gender label.
fig = px.pie(
    styles_csv,
    names=styles_csv['gender'],  # slice labels; slice sizes are the label counts
    color_discrete_sequence=px.colors.sequential.dense,
    opacity=0.9,
)
fig.show()

### Master Category Distribution


This feature tells us about the primary category that the product belongs to (Apparel, Accessories, Footwear, etc.)

In [None]:
import plotly.graph_objects as go

# Number of products per master category, most frequent first.
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated since pandas 2.1; the resulting counts are identical.
catcounts = styles_csv['masterCategory'].value_counts()
fig = go.Figure([go.Bar(x=catcounts.index, y=catcounts.values, text=catcounts.values, marker_color=px.colors.sequential.Aggrnyl)])
# Show each count above its bar, abbreviated to 2 significant digits with an SI prefix.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
# Label the figure so it can be read on its own when skimming the notebook.
fig.update_layout(title_text='Products per master category', xaxis_title='masterCategory', yaxis_title='Number of products')
fig.show()

### Sub Category Distribution

In [None]:
# Number of products per sub category, most frequent first.
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated since pandas 2.1; the resulting counts are identical.
catcounts = styles_csv['subCategory'].value_counts()
fig = go.Figure([go.Bar(x=catcounts.index, y=catcounts.values, text=catcounts.values, marker_color=px.colors.sequential.Aggrnyl)])
# Show each count above its bar, abbreviated to 2 significant digits with an SI prefix.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
# Label the figure so it can be read on its own when skimming the notebook.
fig.update_layout(title_text='Products per sub category', xaxis_title='subCategory', yaxis_title='Number of products')
fig.show()

### Article Type Distribution

In [None]:
# Number of products per article type, most frequent first.
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated since pandas 2.1; the resulting counts are identical.
catcounts = styles_csv['articleType'].value_counts()
fig = go.Figure([go.Bar(x=catcounts.index, y=catcounts.values, text=catcounts.values, marker_color=px.colors.sequential.Aggrnyl)])
# Show each count above its bar, abbreviated to 2 significant digits with an SI prefix.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
# Label the figure so it can be read on its own when skimming the notebook.
fig.update_layout(title_text='Products per article type', xaxis_title='articleType', yaxis_title='Number of products')
fig.show()

### Season Distribution

In [None]:
# Number of products per season, most frequent first.
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated since pandas 2.1; the resulting counts are identical.
seasons = styles_csv['season'].value_counts()

# One marker per season: x is the season label, y is its product count.
# The opacity and size lists hold four entries, one per plotted point in order.
fig = go.Figure(data=[go.Scatter(
    x=seasons.index, y=seasons.values,
    mode='markers',
    marker=dict(
        color=px.colors.sequential.Aggrnyl,
        opacity=[1, 0.8, 0.6, 0.4],
        size=[40, 60, 80, 100])
)]
               )
# Label the figure so it can be read on its own when skimming the notebook.
fig.update_layout(title_text='Products per season', xaxis_title='season', yaxis_title='Number of products')

fig.show()

### Usage Distribution

In [None]:
# Number of products per usage label, most frequent first.
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated since pandas 2.1; the resulting counts are identical.
catcounts = styles_csv['usage'].value_counts()
fig = go.Figure([go.Bar(x=catcounts.index, y=catcounts.values, text=catcounts.values, marker_color=px.colors.sequential.Aggrnyl)])
# Show each count above its bar, abbreviated to 2 significant digits with an SI prefix.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
# Label the figure so it can be read on its own when skimming the notebook.
fig.update_layout(title_text='Products per usage', xaxis_title='usage', yaxis_title='Number of products')
fig.show()

### Base Colour Distribution

In [None]:
# Number of products per base colour, most frequent first.
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated since pandas 2.1; the resulting counts are identical.
catcounts = styles_csv['baseColour'].value_counts()
fig = go.Figure([go.Bar(x=catcounts.index, y=catcounts.values, text=catcounts.values, marker_color=px.colors.sequential.Aggrnyl)])
# Show each count above its bar, abbreviated to 2 significant digits with an SI prefix.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
# Label the figure so it can be read on its own when skimming the notebook.
fig.update_layout(title_text='Products per base colour', xaxis_title='baseColour', yaxis_title='Number of products')
fig.show()

## Image EDA

We'll now explore the image data.

In [None]:
data_dir = pathlib.Path(DATASET_PATH).with_suffix('')
# Collect every .jpg exactly one directory below the dataset root.
# glob() yields paths in arbitrary, filesystem-dependent order, so sort them:
# otherwise images[0] / images[1] shown below could differ between runs.
images = sorted(data_dir.glob('*/*.jpg'))

In [None]:
# Open the first collected image; as the cell's last expression it is rendered inline.
PIL.Image.open(str(images[0]))

In [None]:
# Open the second collected image; as the cell's last expression it is rendered inline.
PIL.Image.open(str(images[1]))

We'll now modify the styles_csv to contain the image path column.

In [None]:
# Add an 'image' column holding each product's image path: <DATASET_PATH>/images/<id>.jpg.
# Vectorised string concatenation replaces the row-wise apply(axis=1): it builds the
# same strings without calling a Python lambda once per row.
# os.path.join(..., '') ends the directory with the platform's path separator.
styles_csv['image'] = os.path.join(DATASET_PATH, 'images', '') + styles_csv['id'].astype(str) + ".jpg"
styles_csv.head()

We'll look at a few sample images for each category.

In [None]:
def load_image(img_path, resized_fac = 0.1):
    """Read an image from disk and return a scaled-down copy.

    Parameters
    ----------
    img_path : str or path-like
        Location of the image file, passed to ``plt.imread``.
    resized_fac : float, default 0.1
        Factor applied to both the width and the height.

    Returns
    -------
    numpy.ndarray
        The resized image array as returned by ``cv2.resize``.
    """
    img_object = plt.imread(img_path)
    # The array shape is (rows, cols) for grayscale and (rows, cols, channels) for
    # colour images. Slicing the first two entries supports both; unpacking three
    # values raised ValueError on grayscale files.
    height, width = img_object.shape[:2]
    # cv2.resize takes the target size as (width, height). Clamp to 1 px so very
    # small images do not produce an invalid zero-sized target.
    target_size = (max(1, int(width * resized_fac)), max(1, int(height * resized_fac)))
    return cv2.resize(img_object, target_size)

In [None]:
def plot_grid(image_list, group):
    """Plot a single row of sample images for one group.

    Parameters
    ----------
    image_list : sequence of (image_path, caption) pairs
        Each entry's first element is the image path handed to ``load_image``;
        the second is the text shown as that image's title.
    group : object
        Label used as the figure's overall title.
    """
    samples = len(image_list)
    # Keep the original 4-column layout for up to four images, but widen the grid
    # when more are passed: a fixed 1x4 grid raised an error on the fifth image.
    n_cols = max(samples, 4)

    plt.figure(figsize=(40, 10), facecolor="#e1ddbf")
    plt.suptitle(group, fontsize=40)

    for i in range(samples):
        plt.subplot(1, n_cols, i + 1)
        plt.imshow(load_image(image_list[i][0]))
        plt.title(image_list[i][1], fontsize=8)
        plt.axis("off")
    plt.show()

In [None]:
def plot_grouped_images(dataframe, column, samples=4, random_state=None):
    """Plot a row of randomly sampled product images for every value of ``column``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``column`` plus the 'image' and 'productDisplayName' columns.
    column : str
        Column whose distinct values define the groups.
    samples : int, default 4
        Maximum number of images drawn per group. Groups with fewer rows show
        all of their rows instead of falling back to a single image.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for reproducible picks.
    """
    # Iterating the groupby directly visits groups in order of first appearance
    # (sort=False) and skips missing keys, for which get_group() would raise.
    for group, group_frame in dataframe.groupby(column, sort=False):
        # Never request more rows than the group holds; this replaces the bare
        # `except:` that retried with one sample and hid every other error.
        n_pick = min(samples, len(group_frame))
        image_list = group_frame.sample(n_pick, random_state=random_state)[['image', 'productDisplayName']].values

        try:
            plot_grid(image_list, group)
        except (OSError, ValueError, cv2.error) as err:
            # One missing or unreadable image should not abort the whole sweep:
            # discard the half-built figure, report the problem and move on.
            plt.close()
            print(f"Skipping group {group!r}: could not plot sample images ({err})")

In [None]:
# Show sample product images for every master category.
plot_grouped_images(styles_csv, 'masterCategory')

In [None]:
# Show sample product images for every sub category.
plot_grouped_images(styles_csv, 'subCategory')