In [16]:
from pymongo import MongoClient
import numpy as np
from sklearn.cluster import KMeans
from joblib import dump


# Where the product documents live: the `products` collection of the
# `florence` database on a MongoDB server running on localhost.
CONNECTION_STRING = "mongodb://localhost:27017"
DATABASE_NAME = "florence"
COLLECTION_NAME = "products"

# Handles used by the following cells to query the product documents.
client = MongoClient(CONNECTION_STRING)
db = client[DATABASE_NAME]
collection = db[COLLECTION_NAME]

### Fetching data

In [17]:
def _round_floats(colors):
    """Return a copy of ``colors`` with float values rounded to 2 decimals.

    Values that are not ``float`` instances are passed through unchanged.
    """
    return {
        name: (round(share, 2) if isinstance(share, float) else share)
        for name, share in colors.items()
    }


# Read every document in the collection and keep only the three fields used
# downstream. "fragnance" (sic) is the field name as stored in the documents.
data = [
    {
        "id": doc["id"],
        "fragnance": doc["fragnance"],
        "colors": _round_floats(doc["colors"]),
    }
    for doc in collection.find({})
]

# Same records, but with the "Undefined" entry removed from each colour map.
all_products = [
    {
        "id": record["id"],
        "fragnance": record["fragnance"],
        "colors": {
            name: share
            for name, share in record["colors"].items()
            if name != "Undefined"
        },
    }
    for record in data
]

### Labels

In [18]:
# Fragrance labels checked against each product's "fragnance" field.
# The feature-building cell emits one 0/1 column per entry, in this order,
# appended after the colour columns.
ALL_FRAGRANCES = ["floral", "rosy", "orchid", "lilies"]

# Colour names looked up in each product's "colors" mapping.
# The feature-building cell emits one column per entry, in this order, so
# reordering or resizing this list changes the layout of the feature vectors
# (19 colour columns + 4 fragrance columns = 23 features).
COLORS_LIST = [
    "Orange",
    "Yellow-Orange",
    "Red",
    "Red-Orange",
    "Yellow",
    "Yellow-Green",
    # NOTE(review): the data-fetching cell removes the "Undefined" key from
    # every product in `all_products`, so this column is always 0 in the
    # feature matrix. Dropping it would change the feature-vector length;
    # confirm with whatever consumes the saved model before removing it.
    "Undefined",
    "Magenta",
    "Green",
    "Purple-Magenta",
    "Purple",
    "Pink",
    "Grayish Purple",
    "Cyan",
    "Light Pink",
    "Red-Magenta",
    "Pale Pink",
    "Blue",
    "Blue-Cyan",
]

### Preparing data points

In [19]:
def _feature_row(product):
    """Build the numeric feature row for a single product.

    The row holds the product's value for each entry of ``COLORS_LIST``
    (0 when the colour is absent from ``product["colors"]``), followed by a
    0/1 flag per entry of ``ALL_FRAGRANCES`` telling whether that label
    occurs in ``product["fragnance"]``. Every entry is converted with
    ``float``.
    """
    palette = product["colors"]
    scents = product["fragnance"]
    colour_part = [float(palette.get(colour, 0)) for colour in COLORS_LIST]
    scent_part = [float(scent in scents) for scent in ALL_FRAGRANCES]
    return colour_part + scent_part


# Stack one feature row per product into the matrix used for clustering.
X = np.array([_feature_row(product) for product in all_products])
X

array([[0.3 , 0.12, 0.43, ..., 1.  , 0.  , 0.  ],
       [0.29, 0.  , 0.4 , ..., 1.  , 0.  , 0.  ],
       [0.  , 0.  , 0.84, ..., 1.  , 0.  , 0.  ],
       ...,
       [0.11, 0.44, 0.  , ..., 0.  , 0.  , 0.  ],
       [0.37, 0.3 , 0.  , ..., 1.  , 0.  , 0.  ],
       [0.31, 0.1 , 0.29, ..., 0.  , 0.  , 0.  ]])

### Model training process

In [20]:
# Training settings: number of clusters, the seed passed to KMeans as
# `random_state`, and the file the fitted estimator is written to.
num_clusters = 7
RANDOM_STATE = 42
MODEL_PATH = "model.bird"

# Fit k-means on the feature matrix built in the previous cell.
kmeans = KMeans(random_state=RANDOM_STATE, n_clusters=num_clusters)
kmeans.fit(X)

# Persist the fitted estimator with joblib; the value returned by `dump`
# is shown as the cell output.
dump(kmeans, MODEL_PATH)



['model.bird']