In [27]:
import glob
import os
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn
from plotly.subplots import make_subplots

In [130]:
config = {
    "data_folder": "../data/products/",
    "output_folder": "../output/",
}

In [131]:
def get_product_names():
    """Get the product names from the folders in the data folder."""
    folders = glob.glob(config["data_folder"] + "*/")
    product_names = [
        (folder, " ".join(folder.rstrip("/").split("_")[1:])) for folder in folders
    ]
    return product_names

In [197]:
products = get_product_names()
print(products)

[('../data/products/03_bead_bracelets_and_necklaces/', 'bead bracelets and necklaces'), ('../data/products/08_chinese_mid_autumn_gift_sets/', 'chinese mid autumn gift sets'), ('../data/products/12_chinese_pottery/', 'chinese pottery'), ('../data/products/07_chinese_incense/', 'chinese incense'), ('../data/products/13_chinese_magnets/', 'chinese magnets'), ('../data/products/04_paper_lanterns/', 'paper lanterns'), ('../data/products/06_chinese_bamboo_art/', 'chinese bamboo art'), ('../data/products/11_chinese_washi_tape/', 'chinese washi tape'), ('../data/products/10_chinese_art_stickers/', 'chinese art stickers'), ('../data/products/05_brushes_and_calligraphy_tools/', 'brushes and calligraphy tools'), ('../data/products/01_calligraphy_prints/', 'calligraphy prints'), ('../data/products/09_chinese_bookmarks/', 'chinese bookmarks'), ('../data/products/02_name_seals/', 'name seals')]


In [199]:
product_names = [product[1] for product in products]
product_names

['bead bracelets and necklaces',
 'chinese mid autumn gift sets',
 'chinese pottery',
 'chinese incense',
 'chinese magnets',
 'paper lanterns',
 'chinese bamboo art',
 'chinese washi tape',
 'chinese art stickers',
 'brushes and calligraphy tools',
 'calligraphy prints',
 'chinese bookmarks',
 'name seals']

In [200]:
chinese_translations = {
    "bead bracelets and necklaces": "珠子手链和项链",
    "chinese mid autumn gift sets": "中秋节礼品套装",
    "chinese pottery": "中国陶器",
    "chinese incense": "中国香",
    "chinese magnets": "中国冰箱贴",
    "paper lanterns": "纸灯笼",
    "chinese bamboo art": "中国竹艺",
    "chinese washi tape": "中国和纸胶带",
    "chinese art stickers": "中国艺术贴纸",
    "brushes and calligraphy tools": "笔和书法工具",
    "calligraphy prints": "书法印刷品",
    "chinese bookmarks": "中国书签",
    "name seals": "印章",
}

In [219]:
def get_search_term_from_file_name(file_name: str) -> str:
    """Get the search term from the file name."""
    return file_name.replace("_product_detail.csv", "").replace("_", " ")


def get_product_data(data_folder: str, product_name: str) -> pd.DataFrame:
    """Get the data for a product, loaded into a Polars DataFrame."""
    files = glob.glob(data_folder + "/*.csv")
    dataframes = []
    for file in files:
        df = pd.read_csv(file)
        # Add the search term and product name to the dataframe
        df["search_term"] = get_search_term_from_file_name(os.path.basename(file))
        df["product_name"] = product_name
        df["Tags"] = df["Tags"].apply(lambda x: x.split(","))
        df['Price("$")'] = df['Price("$")'].astype(str)
        df["price"] = df['Price("$")'].str.replace(",", "").astype(float)
        df["has_sales"] = df["Total Sales"] > 0
        df["product_name_chinese_name"] = (
            df["product_name"]
            + " ("
            + df["product_name"].map(chinese_translations)
            + ")"
        )
        dataframes.append(df)

    # Concatenate the dataframes
    joined_df = pd.concat(dataframes)
    # Drop duplicates by URL
    joined_df = joined_df.drop_duplicates(subset=["Product URL"])
    return joined_df


def get_all_product_data(products: List[str]) -> pd.DataFrame:
    """Get all the product data for a list of products."""
    dataframes = []
    for product in products:
        dataframes.append(get_product_data(product[0], product[1]))
    return pd.concat(dataframes)

In [220]:
all_product_data = get_all_product_data(products)

In [221]:
all_product_data

Unnamed: 0,Title,Category,"Price(""$"")",7-day sales,Total Sales,Total Reviews,7-day Reviews,Total Favorites,7-day Favorites,Tags,...,Etsy Pick,Raving,Store Name,Product URL,Image URL,search_term,product_name,price,has_sales,product_name_chinese_name
0,"Jade Plate, Type A Genuine Jade, Customizable ...",Craft Supplies & Tools,176.12,0,0,0,0,0,0,"[Jade, Handmade, Traditional, Loose Stone, Loo...",...,False,False,GranskyJewellery,https://www.etsy.com/listing/1632492895/jade-p...,Upgrade Pro to Unlock,traditional stone bead jewelry,bead bracelets and necklaces,176.12,False,bead bracelets and necklaces (珠子手链和项链)
1,"Green/Pink Bangle set, 22k Gold Plated, White ...",Jewelry > Bracelets > Bangles,59.0,0,7,0,0,16,0,"[Bangles Set, Indian Bangles, Traditional Bang...",...,False,False,NemaliJewelry,https://www.etsy.com/listing/762738828/greenpi...,Upgrade Pro to Unlock,traditional stone bead jewelry,bead bracelets and necklaces,59.00,True,bead bracelets and necklaces (珠子手链和项链)
2,Catholic Rosary Beads. Semi Precious Turquoise...,Home & Living > Spirituality & Religion > Pray...,59.99,0,0,0,0,44,0,"[Miraculous Mary, Womans Rosary, Catholic Gift...",...,False,False,RosariesByHeidi,https://www.etsy.com/listing/587645665/catholi...,Upgrade Pro to Unlock,traditional stone bead jewelry,bead bracelets and necklaces,59.99,False,bead bracelets and necklaces (珠子手链和项链)
3,Seed Bead Stitching - Creative Variations On T...,Craft Supplies & Tools,8.75,0,5,0,0,4,0,"[Beading Book, Beaded Necklace, Making Jewelry...",...,False,False,NeedANeedle,https://www.etsy.com/listing/1024845656/seed-b...,Upgrade Pro to Unlock,traditional stone bead jewelry,bead bracelets and necklaces,8.75,True,bead bracelets and necklaces (珠子手链和项链)
4,"Jade Plate, Type A Genuine Jade, Customizable ...",Jewelry > Necklaces > Charm Necklaces,251.88,0,0,0,0,3,0,"[Jade, Handmade, Traditional, Gemstones, Loose...",...,False,False,GranskyJewellery,https://www.etsy.com/listing/1475770717/jade-p...,Upgrade Pro to Unlock,traditional stone bead jewelry,bead bracelets and necklaces,251.88,False,bead bracelets and necklaces (珠子手链和项链)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,Vintage Boxed Pair of Chinese Stone Chop Seals...,Home & Living > Home Decor > Ornaments & Accen...,63.27,0,3,0,0,12,0,[False],...,False,False,VintageVarietyStudio,https://www.etsy.com/listing/1292904295/vintag...,Upgrade Pro to Unlock,chinese calligraphy seals,name seals,63.27,True,name seals (印章)
94,Craft Chinese Chop Cinnabar Stamp Ink Pad,Craft Supplies & Tools,9.0,0,0,0,0,0,0,"[Craft Supplies & Tools, Stamps & Seals, Stamp...",...,False,False,WorldofBacara,https://www.etsy.com/listing/1533034599/craft-...,Upgrade Pro to Unlock,chinese calligraphy seals,name seals,9.00,False,name seals (印章)
43,Personalized Korean Name Stamp Dojang Customiz...,Craft Supplies & Tools,45.99,0,16,0,0,65,0,"[Square, Korean Name Stamp, Dojang, Korean Sta...",...,False,False,SemliCalligraphy,https://www.etsy.com/listing/703128950/persona...,Upgrade Pro to Unlock,chinese name stamps,name seals,45.99,True,name seals (印章)
215,Customized Chinese name stone seal (Oval)/ Chi...,Craft Supplies & Tools,53.21,0,0,4,0,94,0,"[Stone Seal, Seal, Stamps, Chinese, Oriental, ...",...,False,False,Baisimu,https://www.etsy.com/listing/603844244/customi...,Upgrade Pro to Unlock,chinese name stamps,name seals,53.21,False,name seals (印章)


In [222]:
# Output the data to a CSV file
all_product_data.to_csv(config["output_folder"] + "all_product_data.csv", index=False)

In [225]:
def format_col_for_title(col: str) -> str:
    """Format a column name for a title."""
    return " ".join(col.split("_")).title()


def plot_violin_plotly(data, x, y, remove_outliers=False):
    if remove_outliers:
        # Calculate the 1st and 3rd quartiles
        q1 = data[x].quantile(0.25)
        q3 = data[x].quantile(0.75)

        # Calculate the interquartile range (IQR)
        iqr = q3 - q1

        # Define the lower and upper bounds for outliers
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Filter the data to remove outliers
        data = data[(data[x] >= lower_bound) & (data[x] <= upper_bound)]

    fig = go.Figure(
        data=go.Violin(
            x=data[x],
            y=data[y],
            orientation="h",
            box_visible=True,
            meanline_visible=True,
            points=None,
            spanmode="hard",
        )
    )

    x_label = format_col_for_title(x)
    y_label = format_col_for_title(y)

    fig.update_layout(
        title=f"Violin Plot of {x_label} by {y_label}",
        xaxis_title=x_label,
        yaxis_title=y_label,
        template="simple_white",
    )

    fig.show()


def plot_bar_chart_plotly(
    data, x, y, sorted=True, title=None, x_label=None, y_label=None
):
    if sorted:
        data = data.sort_values(by=y, ascending=False)
    fig = go.Figure(go.Bar(x=data[x], y=data[y], name=format_col_for_title(y)))

    # Add annotations for the bars
    for i, value in enumerate(data[y]):
        fig.add_annotation(
            x=data[x][i],
            y=value,
            text=str(value),
            showarrow=True,
            arrowhead=1,
            font=dict(size=10),
            xanchor="center",
            yanchor="bottom",
        )

    fig.update_layout(
        title=f"Bar Chart of {format_col_for_title(y)} by {format_col_for_title(x)}",
        xaxis_title=format_col_for_title(x) if x_label is None else x_label,
        yaxis_title=format_col_for_title(y) if y_label is None else y_label,
        template="simple_white",
    )
    if title:
        fig.update_layout(title=title)

    fig.show()

In [227]:
# Median price by product
plot_bar_chart_plotly(
    all_product_data.groupby("product_name_chinese_name")
    .agg({"price": "median"})
    .reset_index(),
    x="product_name_chinese_name",
    y="price",
    title="Median Price by Product (All Data)",
    x_label="Product Name",
    y_label="Median Price ($)",
)

In [207]:
# Median total sales by product
plot_bar_chart_plotly(
    all_product_data.groupby("product_name")["Total Sales"].median().reset_index(),
    x="product_name",
    y="Total Sales",
    title="Median Total Sales by Product (All Data)",
)

# Median total sales by product with sales
plot_bar_chart_plotly(
    all_product_data[all_product_data["has_sales"] == True]
    .groupby("product_name")["Total Sales"]
    .median()
    .reset_index(),
    x="product_name",
    y="Total Sales",
    title="Median Total Sales by Product (With Sales)",
)

In [192]:
# Number of unique product URLs by product
plot_bar_chart_plotly(
    all_product_data.groupby("product_name")["Product URL"].nunique().reset_index(),
    x="product_name",
    y="Product URL",
    title="Number of Unique Products",
)

# With sales
plot_bar_chart_plotly(
    all_product_data[all_product_data["has_sales"] == True]
    .groupby("product_name")["Product URL"]
    .nunique()
    .reset_index(),
    x="product_name",
    y="Product URL",
    title="Number of Unique Products (With Sales)",
)


# Plot Percentage of products with sales per product
plot_bar_chart_plotly(
    all_product_data.groupby("product_name")["has_sales"].mean().reset_index(),
    x="product_name",
    y="has_sales",
    title="Percentage of Products with Sales",
)

In [195]:
# Number of unique stores by product
plot_bar_chart_plotly(
    all_product_data.groupby("product_name")["Store Name"].nunique().reset_index(),
    x="product_name",
    y="Store Name",
    title="Number of Unique Stores by Product",
)

# Number of unique stores by product with sales
plot_bar_chart_plotly(
    all_product_data[all_product_data["has_sales"] == True]
    .groupby("product_name")["Store Name"]
    .nunique()
    .reset_index(),
    x="product_name",
    y="Store Name",
    title="Number of Unique Stores by Product (With Sales)",
)

# Percentage of stores with sales by product
# This should basically be the division of the two previous plots
stores_with_sales = (
    all_product_data[all_product_data["has_sales"] == True]
    .groupby("product_name")["Store Name"]
    .nunique()
    .reset_index()
)
stores = all_product_data.groupby("product_name")["Store Name"].nunique().reset_index()
stores_with_sales = stores_with_sales.merge(stores, on="product_name")
stores_with_sales["percentage"] = (
    stores_with_sales["Store Name_x"] / stores_with_sales["Store Name_y"]
)
plot_bar_chart_plotly(
    stores_with_sales,
    x="product_name",
    y="percentage",
    title="Percentage of Stores with Sales by Product",
)