In [2]:
import pandas as pd
import random
from faker import Faker
import os
from dotenv import load_dotenv
from openai import OpenAI
import csv


In [3]:
# Faker instance; none of the cells visible in this notebook section use it after creation.
fake = Faker()

# Cities a generated shoe record can be assigned to
ont_cities = ["Toronto", "Ottawa"]

# Value pools for the categorical shoe attributes
brands = [
    'Zentrax',
    'FootFlex',
    'StrideOne',
    'Loopic',
    'RunXpress',
]
types = ['Running', 'Walking']
classes = ['Men', 'Women']
materials = ['Synthetic', 'Knit']
colors = ['Black', 'White']
arch_supports = ['High', 'Flat']
weather_resistances = ['Waterproof', 'Resistant']

# Whole sizes 6..12 (ints) followed by half sizes 6.5..12.5 (floats)
sizes = list(range(6, 13)) + [whole + 0.5 for whole in range(6, 13)]

# Store identifiers 1 through 20
store_ids = range(1, 21)

# Helper: build a synthetic product name
def create_product_name(brand, shoe_type):
    """Return "<brand> <series> <shoe_type>", where the series word is picked at random."""
    series = random.choice(['Ultra', 'Flex', 'Pro', 'X', 'Max'])
    return f"{brand} {series} {shoe_type}"

# Helper: build a comma-separated keyword string
def generate_keywords(shoe_type, material):
    """Return the lowercased type and material followed by three random descriptors.

    The five terms are joined with ", " into a single string.
    """
    base_terms = [shoe_type.lower(), material.lower()]
    descriptors = random.sample(
        ['lightweight', 'durable', 'breathable', 'cushioned', 'supportive', 'flexible'],
        3,
    )
    return ', '.join(base_terms + descriptors)

def generate_shoe_data(n=500, seed=None):
    """Build a DataFrame of ``n`` synthetic shoe records.

    Each attribute is drawn independently with ``random.choice`` from the
    module-level pools (``brands``, ``types``, ``classes``, ``materials``,
    ``sizes``, ``colors``, ``arch_supports``, ``weather_resistances``,
    ``store_ids``, ``ont_cities``). PRICE is uniform in [29.99, 149.99]
    rounded to 2 decimals, RATING is uniform in [3.0, 5.0] rounded to
    1 decimal, and PRODUCT_NAME comes from ``create_product_name``.

    Args:
        n: Number of rows to generate. Zero or a negative value yields an
            empty frame that still carries the full column set.
        seed: Optional seed. When not ``None``, ``random.seed(seed)`` is
            called first so the generated rows are reproducible. ``None``
            (the default) leaves the global generator untouched.

    Returns:
        pandas.DataFrame with columns SKU, PRODUCT_NAME, BRAND, CLASS, TYPE,
        MATERIAL, COLOR, WEATHER_RESISTANCE, ARCH_SUPPORT, SIZE, PRICE,
        RATING, STORE_ID, CITY. SKU values are unique within the frame.

    Raises:
        ValueError: If a row needs a new SKU for a brand prefix whose whole
            number range (1000-9999) has already been issued.
    """
    if seed is not None:
        random.seed(seed)

    columns = [
        'SKU', 'PRODUCT_NAME', 'BRAND', 'CLASS', 'TYPE', 'MATERIAL', 'COLOR',
        'WEATHER_RESISTANCE', 'ARCH_SUPPORT', 'SIZE', 'PRICE', 'RATING',
        'STORE_ID', 'CITY',
    ]

    sku_low, sku_high = 1000, 9999
    skus_per_prefix = sku_high - sku_low + 1

    data = []
    used_skus = set()
    issued_per_prefix = {}  # prefix -> number of SKUs already handed out

    for _ in range(n):
        brand = random.choice(brands)
        shoe_type = random.choice(types)
        shoe_class = random.choice(classes)
        material = random.choice(materials)
        size = random.choice(sizes)
        color = random.choice(colors)
        arch = random.choice(arch_supports)
        weather = random.choice(weather_resistances)
        store_id = random.choice(store_ids)
        city = random.choice(ont_cities)

        price = round(random.uniform(29.99, 149.99), 2)
        rating = round(random.uniform(3.0, 5.0), 1)
        product_name = create_product_name(brand, shoe_type)

        # Without this guard the retry loop below would never terminate once
        # every number for the prefix is taken.
        prefix = brand[:3].upper()
        issued = issued_per_prefix.get(prefix, 0)
        if issued >= skus_per_prefix:
            raise ValueError(
                f"No unused SKU left for prefix '{prefix}' "
                f"({skus_per_prefix} available per prefix)"
            )

        # Ensure SKU uniqueness: redraw until an unused number comes up
        while True:
            sku = f"{prefix}-{random.randint(sku_low, sku_high)}"
            if sku not in used_skus:
                used_skus.add(sku)
                break
        issued_per_prefix[prefix] = issued + 1

        data.append({
            'SKU': sku,
            'PRODUCT_NAME': product_name,
            'BRAND': brand,
            'CLASS': shoe_class,
            'TYPE': shoe_type,
            'MATERIAL': material,
            'COLOR': color,
            'WEATHER_RESISTANCE': weather,
            'ARCH_SUPPORT': arch,
            'SIZE': size,
            'PRICE': price,
            'RATING': rating,
            'STORE_ID': store_id,
            'CITY': city
        })

    # Passing columns keeps the schema stable even when no rows were generated
    return pd.DataFrame(data, columns=columns)

# Generate the synthetic dataset (500 rows)
df_shoes = generate_shoe_data(500)
# Optional: persist the raw dataset to disk (left disabled)
# df_shoes.to_csv("shoes.csv", index=False)
# print("Dataset saved as 'shoes.csv'")

In [4]:
# Preview the first rows of the generated dataset
df_shoes.head()

Unnamed: 0,SKU,PRODUCT_NAME,BRAND,CLASS,TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT,SIZE,PRICE,RATING,STORE_ID,CITY
0,LOO-5830,Loopic Flex Walking,Loopic,Men,Walking,Synthetic,White,Resistant,Flat,10.5,40.89,4.1,6,Toronto
1,STR-1748,StrideOne Pro Walking,StrideOne,Women,Walking,Synthetic,White,Waterproof,Flat,8.5,63.11,4.3,16,Ottawa
2,STR-1771,StrideOne Flex Running,StrideOne,Women,Running,Synthetic,White,Waterproof,Flat,10.5,117.42,4.9,17,Toronto
3,ZEN-6307,Zentrax Flex Running,Zentrax,Men,Running,Knit,Black,Waterproof,High,6.0,64.91,3.7,11,Ottawa
4,ZEN-1373,Zentrax Pro Running,Zentrax,Women,Running,Knit,White,Resistant,Flat,12.5,112.09,4.5,4,Toronto


In [5]:
# Attribute columns whose values are combined into the text that gets embedded
embedding_cols = [
    'TYPE',
    'MATERIAL',
    'COLOR',
    'WEATHER_RESISTANCE',
    'ARCH_SUPPORT',
]

In [6]:
# Preview only the columns that feed the embedding text
df_shoes.loc[:, embedding_cols].head()

Unnamed: 0,TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT
0,Walking,Synthetic,White,Resistant,Flat
1,Walking,Synthetic,White,Waterproof,Flat
2,Running,Synthetic,White,Waterproof,Flat
3,Running,Knit,Black,Waterproof,High
4,Running,Knit,White,Resistant,Flat


# Generating embedding vectors for the shoes

Combine all embedding columns into a single string for each row, prefixing each value with its column name and joining the parts with ` [SEP] `.

In [7]:
# One "NAME: value" fragment per embedding column, joined with ' [SEP] ' for each row
df_shoes['COMBINED'] = df_shoes.apply(
    lambda shoe: ' [SEP] '.join(
        f"{field}: {shoe[field]}" for field in embedding_cols
    ),
    axis=1,
)

In [8]:
# Source attributes next to the combined text built from them
cols_to_show = [
    'TYPE',
    'MATERIAL',
    'COLOR',
    'WEATHER_RESISTANCE',
    'ARCH_SUPPORT',
    'COMBINED',
]
df_shoes.loc[:, cols_to_show].head()

Unnamed: 0,TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT,COMBINED
0,Walking,Synthetic,White,Resistant,Flat,TYPE: Walking [SEP] MATERIAL: Synthetic [SEP] ...
1,Walking,Synthetic,White,Waterproof,Flat,TYPE: Walking [SEP] MATERIAL: Synthetic [SEP] ...
2,Running,Synthetic,White,Waterproof,Flat,TYPE: Running [SEP] MATERIAL: Synthetic [SEP] ...
3,Running,Knit,Black,Waterproof,High,TYPE: Running [SEP] MATERIAL: Knit [SEP] COLOR...
4,Running,Knit,White,Resistant,Flat,TYPE: Running [SEP] MATERIAL: Knit [SEP] COLOR...


In [9]:
# Full combined text of the first row (the table preview truncates it)
df_shoes['COMBINED'].iloc[0]

'TYPE: Walking [SEP] MATERIAL: Synthetic [SEP] COLOR: White [SEP] WEATHER_RESISTANCE: Resistant [SEP] ARCH_SUPPORT: Flat'

Setting up the `OpenAI` embedding API connection

In [10]:
# Read settings from the .env file in the current working directory;
# override=True lets values from that file replace variables already set in the process.
load_dotenv(dotenv_path=os.path.join(os.getcwd(), ".env"), override=True)

# Embedding model name: OPENAI_EMBED_MODEL when set, otherwise the default below
embed_model = os.environ.get("OPENAI_EMBED_MODEL", "text-embedding-3-large")

# OpenAI client; API key and endpoint are taken from the environment
client = OpenAI(
    base_url=os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1"),
    api_key=os.environ.get("OPENAI_API_KEY"),
)


Computing an embedding vector for every row, then showing a few sample rows with their embedding vectors

In [11]:
# Texts to embed, one per row
row_combined = df_shoes['COMBINED'].tolist()

# Single embeddings request covering every row text
response = client.embeddings.create(input=row_combined, model=embed_model)

# Pull the raw vector out of each returned item
shoe_vectors = [entry.embedding for entry in response.data]

# Store the vectors, then render each one as a bracketed, comma-separated string
df_shoes['EMBEDDING'] = shoe_vectors
df_shoes['EMBEDDING'] = df_shoes['EMBEDDING'].apply(
    lambda vec: f"[{', '.join(str(component) for component in vec)}]"
)

# The combined text is no longer needed once the vectors are stored
df_shoes.drop(columns=['COMBINED'], inplace=True)


In [12]:
# Source attributes next to the stringified embedding vector
cols_to_show = [
    'TYPE',
    'MATERIAL',
    'COLOR',
    'WEATHER_RESISTANCE',
    'ARCH_SUPPORT',
    'EMBEDDING',
]
df_shoes.loc[:, cols_to_show].head()

Unnamed: 0,TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT,EMBEDDING
0,Walking,Synthetic,White,Resistant,Flat,"[-0.035650141537189484, -0.013747972436249256,..."
1,Walking,Synthetic,White,Waterproof,Flat,"[-0.027450963854789734, -0.025129564106464386,..."
2,Running,Synthetic,White,Waterproof,Flat,"[-0.008436889387667179, -0.04816874861717224, ..."
3,Running,Knit,Black,Waterproof,High,"[0.003705444047227502, -0.029707785695791245, ..."
4,Running,Knit,White,Resistant,Flat,"[-0.01030382513999939, -0.043492391705513, -0...."


In [13]:
# df_shoes.iloc[0]['EMBEDDING']

In [14]:
# List the frame's columns after the embedding step (EMBEDDING added, COMBINED dropped)
df_shoes.columns

Index(['SKU', 'PRODUCT_NAME', 'BRAND', 'CLASS', 'TYPE', 'MATERIAL', 'COLOR',
       'WEATHER_RESISTANCE', 'ARCH_SUPPORT', 'SIZE', 'PRICE', 'RATING',
       'STORE_ID', 'CITY', 'EMBEDDING'],
      dtype='object')

Save the shoes dataframe into a .csv file

In [15]:
# Write the frame to 'shoes-vectors.csv' without the index column.
# QUOTE_NONNUMERIC makes the writer quote every non-numeric field.
df_shoes.to_csv(
    path_or_buf='shoes-vectors.csv',
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)