In [1]:
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
import pickle
import numpy as np
from wordcloud import WordCloud, STOPWORDS
import gc
import time
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [2]:
# Load the encoded recipe feature table that the clustering cells below train
# and predict on (presumably one-hot encoded, per the variable name — confirm).
# NOTE: read_pickle unpickles the file, so only load pickles from a trusted source.
df_recipes_one_hot = pd.read_pickle('../cleaned_data/recipe_encoded_df.pkl')

# Train models

In [3]:
# Sweep of candidate cluster counts: range(MIN_CLUSTERS, MAX_CLUSTERS, STEP),
# so MAX_CLUSTERS itself is excluded by the training loop.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 30
STEP = 2

# Passed to MiniBatchKMeans as max_iter.
MAX_ITER = 10_000

# Directory where the fitted models are pickled.
ROOT_FOLDER = '../temp_models'

In [4]:
# Train one MiniBatchKMeans model per candidate cluster count and pickle each
# fitted model under ROOT_FOLDER. `model_locs` collects the saved paths in
# training order so later cells can reload the models.
TRAIN_BATCH_SIZE = 10000  # rows passed to each partial_fit call
model_locs = []
# NOTE: range() excludes MAX_CLUSTERS itself.
for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS, STEP):
    print(f"Training {n_clusters} clusters")
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, max_iter=MAX_ITER, verbose=1)
    # Single pass over the rows in fixed-size chunks; .iloc makes the
    # positional row slicing explicit. partial_fit updates the model in place.
    for start in range(0, df_recipes_one_hot.shape[0], TRAIN_BATCH_SIZE):
        kmeans.partial_fit(df_recipes_one_hot.iloc[start:start + TRAIN_BATCH_SIZE])
    filename = f'{ROOT_FOLDER}/trained_kmeans_{n_clusters}_{time.strftime("%Y%m%d-%H%M%S")}.pkl'
    # The context manager closes (and flushes) the file even if pickling fails;
    # the previous bare open() left the handle to the garbage collector.
    with open(filename, 'wb') as model_file:
        pickle.dump(kmeans, model_file)
    model_locs.append(filename)
    # Release the fitted model before the next one is trained.
    del kmeans
    gc.collect()

Training 2 clusters
[MiniBatchKMeans] Reassigning 1 cluster centers.
Training 4 clusters
[MiniBatchKMeans] Reassigning 1 cluster centers.
[MiniBatchKMeans] Reassigning 1 cluster centers.
Training 6 clusters
Training 8 clusters
[MiniBatchKMeans] Reassigning 1 cluster centers.
Training 10 clusters
[MiniBatchKMeans] Reassigning 1 cluster centers.
[MiniBatchKMeans] Reassigning 1 cluster centers.
Training 12 clusters
[MiniBatchKMeans] Reassigning 1 cluster centers.
[MiniBatchKMeans] Reassigning 1 cluster centers.
Training 14 clusters
[MiniBatchKMeans] Reassigning 1 cluster centers.
[MiniBatchKMeans] Reassigning 1 cluster centers.
Training 16 clusters
[MiniBatchKMeans] Reassigning 1 cluster centers.
[MiniBatchKMeans] Reassigning 1 cluster centers.
[MiniBatchKMeans] Reassigning 1 cluster centers.
[MiniBatchKMeans] Reassigning 1 cluster centers.
Training 18 clusters
[MiniBatchKMeans] Reassigning 1 cluster centers.
[MiniBatchKMeans] Reassigning 1 cluster centers.
[MiniBatchKMeans] Reassigning 1

In [5]:
# Display the paths of the models saved by the training loop.
model_locs

['../temp_models/trained_kmeans_2_20221117-223707.pkl',
 '../temp_models/trained_kmeans_4_20221117-223709.pkl',
 '../temp_models/trained_kmeans_6_20221117-223711.pkl',
 '../temp_models/trained_kmeans_8_20221117-223713.pkl',
 '../temp_models/trained_kmeans_10_20221117-223715.pkl',
 '../temp_models/trained_kmeans_12_20221117-223717.pkl',
 '../temp_models/trained_kmeans_14_20221117-223719.pkl',
 '../temp_models/trained_kmeans_16_20221117-223721.pkl',
 '../temp_models/trained_kmeans_18_20221117-223723.pkl',
 '../temp_models/trained_kmeans_20_20221117-223725.pkl',
 '../temp_models/trained_kmeans_22_20221117-223727.pkl',
 '../temp_models/trained_kmeans_24_20221117-223730.pkl',
 '../temp_models/trained_kmeans_26_20221117-223732.pkl',
 '../temp_models/trained_kmeans_28_20221117-223734.pkl']

# Evaluate models

In [6]:
# Fixed-seed sample of 5000 rows so every model is scored on identical data.
# sample() raises if the frame holds fewer than 5000 rows.
random_sample_df = df_recipes_one_hot.sample(n=5000, random_state=0)

In [7]:
# Score every saved model on the same random sample. score() returns the
# negated k-means objective, so values closer to zero mean a tighter fit.
for filename in model_locs:
    # Close the file deterministically instead of leaking the handle.
    # NOTE: pickle.load can execute arbitrary code; only load files written by
    # this notebook.
    with open(filename, 'rb') as model_file:
        kmeans = pickle.load(model_file)
    print(filename, kmeans.score(random_sample_df))

../temp_models/trained_kmeans_2_20221117-223707.pkl -38770.714574583704
../temp_models/trained_kmeans_4_20221117-223709.pkl -36168.421819326366
../temp_models/trained_kmeans_6_20221117-223711.pkl -35086.00392231671
../temp_models/trained_kmeans_8_20221117-223713.pkl -34843.05256679404
../temp_models/trained_kmeans_10_20221117-223715.pkl -34386.477152431304
../temp_models/trained_kmeans_12_20221117-223717.pkl -34133.12269232504
../temp_models/trained_kmeans_14_20221117-223719.pkl -33627.7808503231
../temp_models/trained_kmeans_16_20221117-223721.pkl -33463.02081205641
../temp_models/trained_kmeans_18_20221117-223723.pkl -33317.28857578833
../temp_models/trained_kmeans_20_20221117-223725.pkl -33238.70020138005
../temp_models/trained_kmeans_22_20221117-223727.pkl -33022.87582957478
../temp_models/trained_kmeans_24_20221117-223730.pkl -32885.23856720834
../temp_models/trained_kmeans_26_20221117-223732.pkl -32730.121696977767
../temp_models/trained_kmeans_28_20221117-223734.pkl -32565.49527

# Visualize content of groups

In [8]:
# Load the raw recipe table indexed by its 'id' column; get_labeled_recipes
# joins the cluster labels onto this index.
recipe_df = pd.read_csv('../data/RAW_recipes.csv', index_col='id')

In [9]:
# String copy of the recipe name, used as word-cloud input in create_wordclouds.
# astype(str) turns any missing name into the literal string 'nan'.
recipe_df['str_name'] = recipe_df['name'].astype(str)

In [10]:
def get_labeled_recipes(filename: str, batch_size: int = 10000):
    """Attach the cluster labels predicted by a saved model to the recipe table.

    Parameters
    ----------
    filename : str
        Path to a pickled, fitted clustering model exposing ``predict``.
        NOTE: unpickling can execute arbitrary code; only pass files written
        by this notebook.
    batch_size : int, default 10000
        Number of rows of ``df_recipes_one_hot`` passed to each ``predict``
        call.

    Returns
    -------
    labeled_recipes_df : pandas.DataFrame
        ``recipe_df`` left-joined (on index) with a ``"label"`` column.
        Recipes whose index is absent from ``df_recipes_one_hot`` get NaN.
    labels : numpy.ndarray
        Sorted distinct labels present in ``labeled_recipes_df``; NaN
        placeholders introduced by the join are excluded.
    """
    # Close the model file deterministically instead of leaking the handle.
    with open(filename, 'rb') as model_file:
        kmeans = pickle.load(model_file)

    # Predict in chunks; .iloc makes the positional row slicing explicit.
    labels = []
    for start in range(0, df_recipes_one_hot.shape[0], batch_size):
        labels.extend(kmeans.predict(df_recipes_one_hot.iloc[start:start + batch_size]))

    # Only the label column is needed for the join, so build a Series that
    # shares the feature frame's index rather than copying the whole frame.
    label_column = pd.Series(labels, index=df_recipes_one_hot.index, name="label")
    labeled_recipes_df = recipe_df.join(label_column)

    # Drop NaN so unmatched recipes are not reported as an extra cluster.
    labels = np.unique(labeled_recipes_df["label"].dropna())

    return labeled_recipes_df, labels

In [11]:
def create_wordclouds(df, labels):
    """Draw one word cloud per cluster label and save the grid as an SVG.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"label"`` column and a ``"str_name"`` column of
        strings.
    labels : sequence
        Labels to plot, one panel each, in the given order.

    The figure is laid out five panels per row and written to
    ``words_{max(labels) + 1}.svg`` in the current working directory.
    """
    n_labels = max(labels) + 1

    # One cloud per label, built from the distinct recipe names in that cluster.
    wordclouds = []
    for label in labels:
        names = np.unique(df.loc[df["label"] == label, "str_name"])
        wordclouds.append(WordCloud(stopwords=STOPWORDS).generate(" ".join(names)))

    n_cols = 5
    n_rows = -(-len(labels) // n_cols)  # ceiling division

    # squeeze=False always yields a 2-D axes array, so the single-row and
    # multi-row layouts share one code path. No separate plt.figure() call is
    # made: it only produced an extra, empty figure.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 20), squeeze=False)

    # Hide the frame of every panel, including unused trailing ones.
    for ax in axs.flat:
        ax.axis("off")

    for ax, wordcloud in zip(axs.flat, wordclouds):
        ax.imshow(wordcloud, interpolation='bilinear')

    fig.savefig(f'words_{n_labels}.svg', format='svg', bbox_inches="tight")

In [12]:
import math
def create_tree(df, labels):
    """Fit a shallow decision tree that predicts cluster labels and save its plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"label"`` column whose index matches
        ``df_recipes_one_hot``'s index values.
    labels : sequence
        Cluster labels; ``max(labels) + 1`` sets the tree depth
        (``ceil(sqrt(...))``) and the output file name.

    The tree is trained on ``df_recipes_one_hot`` and written to
    ``tree_{max(labels) + 1}.svg`` in the current working directory.
    """
    n_labels = max(labels) + 1
    depth = int(math.ceil(math.sqrt(n_labels)))

    # scikit-learn pairs X and y by position, not by index. `df` may be ordered
    # differently from the feature frame, so align the target explicitly.
    # Rows missing from `df` become NaN and make fit() fail loudly.
    target = df["label"].reindex(df_recipes_one_hot.index)
    decision_tree = DecisionTreeClassifier(random_state=0, max_depth=depth).fit(df_recipes_one_hot, target)

    plt.figure(figsize=(depth*10, depth*3))
    # Pass a plain list: some scikit-learn releases reject a pandas Index here.
    plot_tree(decision_tree, feature_names=list(df_recipes_one_hot.columns), filled=True, fontsize=10)
    plt.savefig(f'tree_{n_labels}.svg',format='svg',bbox_inches = "tight")

In [14]:
# Label the recipes with every saved model and persist each labeled table.
# NOTE(review): the output name uses the number of distinct labels observed,
# not the model's cluster count; two models yielding the same count would
# overwrite each other — confirm this is intended.
for file in model_locs:
    labeled_recipes_df, labels = get_labeled_recipes(file)
    labeled_recipes_df.to_pickle(f"../cleaned_data/labeled_recipes_df_{len(labels)}.pkl")
    # Optional visualizations, left disabled as in the original run:
    # create_wordclouds(labeled_recipes_df, labels)
    # create_tree(labeled_recipes_df, labels)