# Solutions for Unsupervised Machine Learning

In [None]:
%matplotlib inline

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Principal Component Analysis

In [None]:
from bisect import bisect_right

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Load the data set; index_col=0 makes the first CSV column the row index.
df = pd.read_csv("./olympics.csv", index_col=0)

In [None]:
# Summary statistics of the columns, shown as the cell output.
df.describe()

In [None]:
# Feature matrix: every column except "score".
X = df.drop(columns="score").values
# Standardize each feature to zero mean and unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Check: per-column variance of the scaled data (expected to be ~1).
X_scaled.var(axis=0)

In [None]:
# Fit a PCA that keeps all components on the standardized features.
# random_state only matters if a randomized solver is used; it is set once so
# the result is reproducible either way.
pca = PCA(random_state=42).fit(X_scaled)
# Use the same column selection that built X, so the labels line up with the
# columns of components_ wherever "score" sits in the frame.
feature_names = df.drop("score", axis=1).columns
# Each row of components_ is one principal axis expressed in the input features.
out = pd.DataFrame(pca.components_, columns=feature_names)
# Number the components 1..n instead of 0..n-1.
out.index += 1
out.index.name = "Component"

In [None]:
# Show the loadings table: one row per principal component, one column per feature.
print(out)

In [None]:
# Plot the loadings as a heatmap; "PiYG" is a diverging colormap, so positive
# and negative loadings get different hues.
sns.heatmap(out, cmap="PiYG")

In [None]:
# Cumulative share of variance explained by the first 1, 2, ... components.
cumulated = pca.explained_variance_ratio_.cumsum()
THRES = 0.9
# side="left" finds the first position whose cumulative share is >= THRES, so a
# component that hits the threshold exactly already counts ("at least");
# +1 turns the 0-based position into a component count. min() guards against
# float round-off leaving the final cumulative sum marginally below THRES.
n_components = min(int(cumulated.searchsorted(THRES, side="left")) + 1, len(cumulated))
print(f"You need {n_components} components to explain at least {THRES:.0%} of the variance.")

## Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.datasets import load_iris
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the iris data set and keep its feature matrix.
iris = load_iris()
X = iris.data

In [None]:
# Learn per-feature mean and scale from X, then apply them to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Number of clusters requested from K-Means and agglomerative clustering.
k = 3

kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_scaled)

agg = AgglomerativeClustering(n_clusters=k)
agg.fit(X_scaled)

# DBSCAN derives the number of clusters itself from eps / min_samples.
dbscan = DBSCAN(eps=1, min_samples=2)
dbscan.fit(X_scaled)

# One column of cluster labels per algorithm, one row per sample.
labels_by_algorithm = {
    "kmeans": kmeans.labels_,
    "agglomerative": agg.labels_,
    "DBSCAN": dbscan.labels_,
}
out = pd.DataFrame(labels_by_algorithm)

In [None]:
# Silhouette score of each clustering on the standardized data.
kmeans_score = silhouette_score(X_scaled, kmeans.labels_)
print(f"K-Means: {kmeans_score:.3f}")

agg_score = silhouette_score(X_scaled, agg.labels_)
print(f"Agglomerative clustering: {agg_score:.3f}")

# DBSCAN labels noise points -1; score only the points assigned to a cluster.
mask = dbscan.labels_ >= 0
dbscan_score = silhouette_score(X_scaled[mask], dbscan.labels_[mask])
print(f"DBSCAN: {dbscan_score:.3f}")

In [None]:
# Attach the features at positions 1 and 2 (unscaled) next to the cluster
# labels so the assignments can be plotted against them.
feature_slice = slice(1, 3)
add = pd.DataFrame(X[:, feature_slice],
                   columns=iris["feature_names"][feature_slice])
out = pd.concat([out, add], axis=1, sort=True)
out

In [None]:
# Give DBSCAN's noise label (-1) a readable name for the plot legend.
out["DBSCAN"] = out["DBSCAN"].replace({-1: "Noise"})

In [None]:
# Reshape to long format: one row per (sample, algorithm) pair, keeping the two
# feature columns as identifiers, so the plot can be faceted by algorithm.
feature_cols = iris["feature_names"][1:3]
out = out.melt(id_vars=feature_cols,
               var_name="Cluster algorithm", value_name="assignment")
# One panel per algorithm, points coloured by assigned cluster.
sns.catplot(x="sepal width (cm)", y="petal length (cm)",
            col="Cluster algorithm", hue="assignment", data=out)
# savefig does not create missing directories, so make sure the target exists.
output_path = Path("./output/cluster_petal.pdf")
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path)