In [None]:
import msa
# Load the data set returned by msa.datasets.load_mcdonalds() and print its
# column labels.
# NOTE(review): a later cell rebinds `mcdonalds` from "mcdonalds.csv" --
# confirm which of the two loaders is meant to be authoritative.
mcdonalds = msa.datasets.load_mcdonalds()
print(mcdonalds.columns)

In [None]:
import pandas as pd
# Print the `shape` attribute of `mcdonalds` (presumably rows x columns of a
# DataFrame -- confirm against the loader used above).
print(mcdonalds.shape)

In [None]:
# Read "mcdonalds.csv" (path is relative to the working directory) and rebind
# `mcdonalds` to the result, replacing whatever the name held before.
mcdonalds = pd.read_csv("mcdonalds.csv")
# Print the first three rows as a quick sanity check of the load.
print(mcdonalds.head(3))

In [None]:
import numpy as np

# Take the first eleven columns of `mcdonalds` and encode them as 0/1:
# a cell becomes 1 when it equals the string "Yes", 0 otherwise.
MD_x = (np.array(mcdonalds.iloc[:, :11]) == "Yes").astype(int)

# Column means of the 0/1 matrix (share of "Yes" per column), two decimals.
np.round(MD_x.mean(axis=0), 2)

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with all components kept on the 0/1 matrix MD_x.
MD_pca = PCA()
MD_pca.fit(MD_x)

# Share of variance explained by each component, in component order.
print(MD_pca.explained_variance_ratio_)

# components_ holds one component per row; transpose so that each column is
# a component and each row is an input variable.
loadings = MD_pca.components_.T
print(loadings)

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Fit a PCA (all components kept) on the 0/1 matrix MD_x.
# The previous version called pca.fit(MD.pca); `MD` is not a Python name
# (it is the R identifier `MD.pca` left untranslated) and raised NameError.
pca = PCA()
pca.fit(MD_x)

# Print the component matrix (one component per row) with one decimal place
print(np.round(pca.components_, decimals=1))

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Synthetic data: 1000 samples, 5 features, drawn around 3 blob centres.
X, y = make_blobs(n_samples=1000, centers=3, n_features=5, random_state=42)

# Project the samples onto the first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Scatter the projected samples: first component on x, second on y.
first_component = X_pca[:, 0]
second_component = X_pca[:, 1]
plt.scatter(first_component, second_component, c='grey')

# The projection axes are the rows of components_ (one row per component).
proj_axes = pca.components_
print(proj_axes)

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn_extra.cluster import KMedoids

# Synthetic data: 100 samples, 4 features, drawn around 3 blob centres.
X, y = make_blobs(n_samples=100, centers=3, n_features=4, random_state=42)

# Repeated k-medoids: for every k in 2..8 run 10 randomly seeded fits and keep
# the single fit with the lowest inertia_ seen so far.
# NOTE(review): the score stored in `bic` is KMedoids.inertia_, not a BIC, and
# the minimum is taken across different k -- confirm that is the intended
# selection rule.
np.random.seed(1234)
cluster_range = range(2, 9)
best_model, best_bic = None, np.inf

for k in cluster_range:
    for _ in range(10):
        seed = np.random.randint(1, 1000)
        model = KMedoids(n_clusters=k, random_state=seed)
        model.fit(X)
        bic = model.inertia_
        if bic < best_bic:
            best_model, best_bic = model, bic

# Cluster label of every sample under the retained model.
cluster_labels = best_model.predict(X)

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn_extra.cluster import KMedoids
import matplotlib.pyplot as plt

# Synthetic data: 100 samples, 4 features, drawn around 3 blob centres.
X, y = make_blobs(n_samples=100, centers=3, n_features=4, random_state=42)

# Repeated k-medoids: 10 randomly seeded fits for every k in 2..8.
# `bic_scores` keeps one (k, inertia_) pair per fit, exactly as before.
np.random.seed(1234)
cluster_range = range(2, 9)
bic_scores = []

for k in cluster_range:
    for _ in range(10):
        model = KMedoids(n_clusters=k, random_state=np.random.randint(1, 1000))
        model.fit(X)
        bic = model.inertia_
        bic_scores.append((k, bic))

# Reduce the 10 restarts of each k to the best (lowest) value. Plotting all
# restarts as one connected line repeated every x value ten times and drew
# vertical zig-zags instead of a readable curve.
best_per_k = {}
for n_segments, score in bic_scores:
    best_per_k[n_segments] = min(score, best_per_k.get(n_segments, np.inf))

# plot the results
# The plotted quantity is KMedoids.inertia_ (a sum of distances to the
# assigned medoid), not a BIC, so the y-axis label now says so.
plt.plot(list(best_per_k.keys()), list(best_per_k.values()), marker='o')
plt.xlabel('number of segments')
plt.ylabel('sum of within-cluster distances')
plt.show()

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn_extra.cluster import KMedoids
from sklearn.utils import resample

# Synthetic data: 100 samples, 4 features, drawn around 3 blob centres.
X, y = make_blobs(n_samples=100, centers=3, n_features=4, random_state=42)

# Bootstrapped k-medoids: for every k in 2..8, repeat 10 times "fit on 100
# bootstrap resamples of X and average the resulting inertia_ values".
# NOTE(review): the value kept in `bic` is KMedoids.inertia_, not a BIC.
np.random.seed(1234)
cluster_range = range(2, 9)
boot_bic_scores = []

for k in cluster_range:
    for _ in range(10):
        k_bic_scores = []
        for i in range(100):
            # Seeds are drawn in the order they are consumed: resample first,
            # then the model.
            resample_seed = np.random.randint(1, 1000)
            X_resampled = resample(X, random_state=resample_seed)
            model_seed = np.random.randint(1, 1000)
            model = KMedoids(n_clusters=k, random_state=model_seed)
            model.fit(X_resampled)
            bic = model.inertia_
            k_bic_scores.append(bic)
        mean_score = np.mean(k_bic_scores)
        boot_bic_scores.append((k, mean_score))

# One (k, mean inertia_) pair per repetition, ten pairs per k.
print(boot_bic_scores)

In [None]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn_extra.cluster import KMedoids
from sklearn.utils import resample
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

# Synthetic data: 100 samples, 4 features, drawn around 3 blob centres.
X, y = make_blobs(n_samples=100, centers=3, n_features=4, random_state=42)

# Bootstrapped k-medoids: for every k in 2..8, repeat 10 times "fit on 100
# bootstrap resamples of X", recording for each fit its inertia_ and the
# adjusted Rand index between the blob labels `y` and the fit's labels on X.
# NOTE(review): the value kept in `bic` is KMedoids.inertia_, not a BIC.
np.random.seed(1234)
cluster_range = range(2, 9)
boot_bic_scores, boot_ari_scores = [], []

for k in cluster_range:
    for _ in range(10):
        k_bic_scores, k_ari_scores = [], []
        for i in range(100):
            # Seeds are drawn in the order they are consumed: resample first,
            # then the model.
            resample_seed = np.random.randint(1, 1000)
            X_resampled = resample(X, random_state=resample_seed)
            model_seed = np.random.randint(1, 1000)
            model = KMedoids(n_clusters=k, random_state=model_seed)
            model.fit(X_resampled)
            bic = model.inertia_
            ari = adjusted_rand_score(y, model.predict(X))
            k_bic_scores.append(bic)
            k_ari_scores.append(ari)
        boot_bic_scores.append((k, np.mean(k_bic_scores)))
        boot_ari_scores.append((k, np.mean(k_ari_scores)))

# Mean adjusted Rand index against the number of segments. The x values are
# read from boot_bic_scores and the y values from boot_ari_scores; both lists
# are appended to in lock-step, so their k entries line up.
segment_counts = [entry[0] for entry in boot_bic_scores]
mean_ari = [entry[1] for entry in boot_ari_scores]
plt.plot(segment_counts, mean_ari)
plt.xlabel('number of segments')
plt.ylabel('adjusted Rand index')
plt.show()


In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Synthetic data: 100 samples, 4 features, drawn around 4 blob centres.
X, y = make_blobs(n_samples=100, centers=4, n_features=4, random_state=42)

# Fit k-means with four clusters.
model = KMeans(n_clusters=4, random_state=42)
model.fit(X)

# KMeans is a hard-assignment model and has no predict_proba(); calling it
# raised AttributeError. Derive a soft membership score instead:
# transform() returns every sample's distance to every cluster centre, and a
# softmax over the negated distances turns each row into weights in [0, 1]
# that sum to 1 (closer centre -> larger weight).
distances = model.transform(X)
# Shift by the row minimum first so exp() cannot underflow to an all-zero row.
shifted = distances - distances.min(axis=1, keepdims=True)
weights = np.exp(-shifted)
membership = weights / weights.sum(axis=1, keepdims=True)

# plot the histogram of the membership scores for the 4th cluster
plt.hist(membership[:, 3], bins=10, range=(0, 1))
plt.xlim((0, 1))
plt.xlabel('Probability')
plt.ylabel('Count')
plt.show()

In [None]:
# Pick the entry stored under the string key '4' of MD_km28 and bind it to
# MD_k4.
# NOTE(review): MD_km28 is not created in any cell visible here; unless it is
# defined elsewhere this line raises NameError on a fresh kernel -- confirm.
# NOTE(review): later cells use MD_k4 both in an element-wise comparison
# (`MD_k4 == 1`) and as the argument of `clusters(...)`; confirm which kind of
# object (label array vs. fitted model) MD_km28['4'] is supposed to be.
MD_k4 = MD_km28['4']

In [None]:
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances

# assume that MD_x and MD_k4 are already computed
# NOTE(review): MD_k4 is treated here as one cluster label per row of MD_x --
# confirm against the cell that produces it.

# compute the full pairwise Euclidean dissimilarity matrix of MD_x
dissimilarity = euclidean_distances(MD_x)

# Restrict to the observations whose label equals 1. Rows AND columns must be
# subset together: with dissimilarity='precomputed' MDS needs a square matrix,
# and the previous column-only slice dissimilarity[:, mask] was rectangular.
in_segment = np.asarray(MD_k4) == 1
segment_dissimilarity = dissimilarity[np.ix_(in_segment, in_segment)]

# perform metric scaling using multidimensional scaling (MDS)
model = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
MD_r4 = model.fit_transform(segment_dissimilarity)

In [None]:
# Scatter the second column of MD_r4 against the row position, on the current
# axes, with the y-axis clipped to [0, 1].
# NOTE(review): the axis labels call these values "segment stability"; confirm
# that MD_r4[:, 1] really holds such a quantity.
axes = plt.gca()
row_positions = range(len(MD_r4))
axes.scatter(row_positions, MD_r4[:, 1], c='b')
axes.set_ylim((0, 1))
axes.set_xlabel('Segment number')
axes.set_ylabel('Segment stability')
plt.show()

In [None]:
from flexmix import flexmix
from sklearn.preprocessing import scale

# assume that MD_x is already computed
# Rescale MD_x with sklearn.preprocessing.scale (default arguments).
# NOTE(review): the model name 'FLXMCmvbinary' below suggests binary inputs,
# while the output of scale() is generally no longer 0/1 -- confirm that
# scaling is wanted here.
MD_x_scaled = scale(MD_x)

# Fit mixture models through flexmix for k = 2..8 with nrep=10, keeping the
# result in MD_m28.
# NOTE(review): `flexmix` is best known as an R package; confirm that an
# importable Python module with this exact call signature is available.
MD_m28 = flexmix(MD_x_scaled, k=range(2, 9), nrep=10, model='FLXMCmvbinary', verbose=False)

In [None]:
import matplotlib.pyplot as plt

# assume that MD_m28 is already computed
# Draw MD_m28.logLik on the current axes and label the figure.
# NOTE(review): the legend names three series ('AIC', 'BIC', 'ICL') while the
# only thing plotted is `logLik`; confirm which quantities are meant.
axes = plt.gca()
axes.plot(MD_m28.logLik)
axes.set_xlabel('Number of clusters')
axes.set_ylabel('Log-likelihood')
axes.set_title('Model-based clustering results')
axes.legend(['AIC', 'BIC', 'ICL'])
plt.show()

In [None]:
# assume that MD_m28, MD_k4 are already computed

# get the model for 4 clusters
# NOTE(review): confirm that which=3 selects the 4-cluster fit -- that depends
# on whether `which` is a position in the fitted k = 2..8 sequence or a
# cluster count.
MD_m4 = MD_m28.get_model(which=3)

# compute the cross-tabulation of cluster assignments
# (rows: values from clusters(MD_k4), columns: values from clusters(MD_m4))
# NOTE(review): `clusters` is not defined or imported in any cell visible
# here; unless it is provided elsewhere this raises NameError -- confirm.
import pandas as pd
pd.crosstab(index=clusters(MD_k4), columns=clusters(MD_m4), rownames=['KMeans'], colnames=['Mixture'])

In [None]:

# fit a flexible mixture model with the KMeans clustering as the initial cluster assignments
from sklearn.mixture import BayesianGaussianMixture
import numpy as np

# One-hot encoder for integer cluster labels.
def get_indicators(clusters, num_clusters):
    """Return one-hot indicator rows for integer cluster labels.

    Parameters
    ----------
    clusters : int or sequence/array of int
        Cluster labels, used as row indices into an identity matrix.
    num_clusters : int
        Size of the identity matrix, i.e. the length of every indicator row.

    Returns
    -------
    numpy.ndarray
        For a sequence of n labels, an (n, num_clusters) float array whose
        i-th row has a 1.0 in column clusters[i]; for a single integer label,
        the matching 1-D row.
    """
    indicator_rows = np.identity(num_clusters)
    return indicator_rows[clusters]

# convert cluster assignments to binary indicators (4 columns)
# NOTE(review): `clusters` is not defined or imported in any cell visible
# here; unless it is provided elsewhere this raises NameError -- confirm.
kmeans_indicators = get_indicators(clusters(MD_k4), 4)

# fit a 4-component Bayesian Gaussian mixture on MD_x (full covariances,
# 10 initialisations, k-means initialisation)
# NOTE(review): kmeans_indicators is passed as the second positional argument
# of fit(); scikit-learn mixture estimators appear to ignore that argument, in
# which case the indicators do not seed this fit -- confirm.
MD_m4a = BayesianGaussianMixture(n_components=4, covariance_type='full', n_init=10, init_params='kmeans').fit(MD_x, kmeans_indicators)

# compute the cross-tabulation of cluster assignments
# (rows: values from clusters(MD_k4), columns: mixture labels predicted on MD_x)
import pandas as pd
pd.crosstab(index=clusters(MD_k4), columns=MD_m4a.predict(MD_x), rownames=['KMeans'], colnames=['Mixture'])