In [16]:
import numpy as np
import pandas as pd

# Define product categories and their weights
products = ['soaps', 'detergents', 'body wash', 'pain relievers', 'foot cream', 'canes', 'toilet paper', 'ipads', 'headphones', 'TVs', 'Jewelry']
weights = [0.1, 0.1, 0.1, 0.05, 0.05, 0.02, 0.1, 0.1, 0.1, 0.1, 0.08]

# Define customer segments based on income and age
segments = {
    'high_income_older': {'income': (75000, 150000), 'age': (50, 100)},
    'high_income_younger': {'income': (75000, 150000), 'age': (18, 49)},
    'low_income_older': {'income': (0, 74999), 'age': (50, 100)},
    'low_income_younger': {'income': (0, 74999), 'age': (18, 49)}
}

# Define locations
locations = ['New York', 'California', 'Texas']

# Generate customer data
num_customers = 10000
customer_data = []

for i in range(num_customers):
    # Randomly select customer segment and location
    segment = np.random.choice(list(segments.keys()))
    location = np.random.choice(locations)
    
    # Assign income and age based on segment
    income = np.random.randint(segments[segment]['income'][0], segments[segment]['income'][1])
    age = np.random.randint(segments[segment]['age'][0], segments[segment]['age'][1])
    
    # Assign total amount spent based on income and age
    if income > 100000:
        if age > 40:
            total_spent = np.random.normal(loc=300, scale=50)
        else:
            total_spent = np.random.normal(loc=250, scale=50)
    else:
        if age > 40:
            total_spent = np.random.normal(loc=200, scale=50)
        else:
            total_spent = np.random.normal(loc=150, scale=50)
    
    # Assign amount spent for each product based on weights
    product_spending = np.round(np.random.dirichlet(np.ones(len(products)) * 10) * total_spent * np.random.uniform(0.8, 1.2, len(products)), decimals=2)
    
    # Add data to list
    customer_data.append([age, income, location, total_spent] + list(product_spending))
    
# Convert to DataFrame
customer_df = pd.DataFrame(customer_data, columns=['Age', 'Income', 'Location', 'Total Spent'] + products)



In [4]:
customer_data.head()

Unnamed: 0,Income,Age,Location,Total Spent,soaps,detergents,body wash,pain relievers,foot cream,canes,toilet paper,ipads,headphones,TVs,Jewelry
0,37826.229821,30.532061,California,462.026307,375.090189,38.949251,12.749409,168.780845,279.074131,554.091485,468.137471,367.365524,45.392167,17.527464,323.353069
1,37834.03152,32.075799,California,489.160637,465.327223,43.832968,6.669093,198.716016,418.384452,76.918812,692.960365,-7.709464,38.179279,56.285365,380.899616
2,56270.895376,47.052293,California,523.552207,442.285876,31.622144,12.505727,236.978624,317.211526,457.734826,442.609645,488.405331,44.394806,32.934979,248.01161
3,42759.966824,38.146583,New York,542.725423,337.109552,39.194341,9.07784,130.665778,271.694004,341.73133,434.632198,326.86319,49.267176,66.813134,319.045552
4,32292.428055,29.997007,New York,422.100306,489.230388,27.192577,12.944115,118.693898,88.701085,330.549814,219.06839,77.162692,17.838922,23.308431,396.361383


In [18]:
import plotly.express as px

# Cluster visualization based on age and total spending
fig1 = px.scatter(customer_df, x='Age', y='Total Spent',
                  hover_data=['Income', 'Location'])
fig1.show()

# Cluster visualization based on price and quantity for each product
fig2 = px.scatter(customer_df, x='Total Spent', y='Income',  hover_data=['Age', 'Location'])
fig2.show()

In [9]:
customer_data.columns
customer_df = customer_data 

In [19]:
from sklearn.preprocessing import MinMaxScaler

# Define the features to scale
features_to_scale = ['Age', 'Income', 'Location', 'Total Spent', 'soaps', 'detergents', 'body wash', 
                     'pain relievers', 'foot cream', 'canes', 'toilet paper', 'ipads', 'headphones', 
                     'TVs', 'Jewelry']

# Create a MinMaxScaler object
scaler = MinMaxScaler()

customer_df['Location'] = customer_df['Location'].astype('category')
customer_df['Location']  = customer_df['Location'] .cat.codes
# Scale the features in the customer segmentation data
customer_seg_scaled = customer_df.copy()
customer_seg_scaled[features_to_scale] = scaler.fit_transform(customer_df[features_to_scale])

# Display the scaled data
print(customer_seg_scaled.head())

        Age    Income  Location  Total Spent     soaps  detergents  body wash  \
0  0.296296  0.719834       0.5     0.654211  0.421961    0.478123   0.472515   
1  0.592593  0.895099       0.5     0.608378  0.419134    0.315894   0.406373   
2  0.098765  0.545649       0.0     0.371109  0.270793    0.145888   0.326774   
3  0.654321  0.318363       1.0     0.360970  0.195804    0.249736   0.169100   
4  0.283951  0.948433       1.0     0.677388  0.543520    0.676726   0.316745   

   pain relievers  foot cream     canes  toilet paper     ipads  headphones  \
0        0.338869    0.289875  0.477055      0.369470  0.444264    0.272416   
1        0.365889    0.410074  0.388418      0.210829  0.368935    0.398937   
2        0.213391    0.207072  0.177684      0.368779  0.099642    0.233904   
3        0.220509    0.280590  0.193253      0.251037  0.291545    0.176728   
4        0.324634    0.293055  0.492215      0.250000  0.193314    0.415239   

        TVs   Jewelry  
0  0.405049  0

In [7]:
print(customer_seg_scaled.columns)

Index(['Age', 'Income', 'Location', 'Total Spent', 'soaps', 'detergents',
       'body wash', 'pain relievers', 'foot cream', 'canes', 'toilet paper',
       'ipads', 'headphones', 'TVs', 'Jewelry'],
      dtype='object')


In [22]:
from sklearn.cluster import KMeans
X = customer_seg_scaled[['Age', 'Income', 'Location', 'Total Spent', 'soaps', 'detergents',
       'body wash', 'pain relievers', 'foot cream', 'canes', 'toilet paper',
       'ipads', 'headphones', 'TVs', 'Jewelry']]
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
customer_df['labels'] = kmeans.labels_

In [23]:
import plotly.express as px

# Cluster visualization based on age and total spending
fig1 = px.scatter(customer_df, x='Age', y='Total Spent',
                  hover_data=['Income', 'Location'], color='labels')
fig1.show()


fig2 = px.scatter(customer_df, x='Total Spent', y='Income',  hover_data=['Age', 'Location'], color='labels')
fig2.show()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c4b20a38-75f0-406e-873e-9c308a030295' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>