In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
from matplotlib.ticker import FixedLocator, FixedFormatter
import seaborn as sns
import os


In [None]:
# List the contents of the Kaggle input directory for this dataset.
dataset_dir = '/kaggle/input/unsupervised-learning-on-country-data'
os.listdir(dataset_dir)

In [None]:
# Load the country dataset from the Kaggle input directory into a DataFrame.
country_csv_path = '/kaggle/input/unsupervised-learning-on-country-data/Country-data.csv'
country_df = pd.read_csv(filepath_or_buffer=country_csv_path)

In [None]:
# Preview the first five rows of the raw data.
country_df.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the raw data.
country_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
country_df.describe()

In [None]:
# Count the missing values in every column.
country_df.isnull().sum()

# Keeping only the numeric features

In [None]:
# Remove the 'country' column; the remaining columns are the features
# that get scaled and clustered below.
only_features_df = country_df.drop(columns=['country'])
only_features_df.head(n=5)

In [None]:
# Pairwise correlation matrix of the features.
only_features_df_corr = only_features_df.corr()

# Mask the upper triangle (diagonal included): the matrix is symmetric, so
# those cells only repeat what the lower triangle already shows.
mask = np.triu(np.ones_like(only_features_df_corr, dtype=bool))

# The colour bar is disabled, so write each coefficient inside its cell;
# without either one the colours cannot be mapped back to correlation values.
plt.figure(figsize=(8, 6))
sns.heatmap(only_features_df_corr, mask=mask, cbar=False, cmap="BuGn",
            linewidths=0.3, annot=True, fmt=".2f")
plt.show()

# Scaling

In [None]:
# Standardise the features (StandardScaler removes each column's mean and
# scales it to unit variance) so that no single feature dominates the
# distances K-Means relies on.
sc = StandardScaler().fit(only_features_df)
scaled_features = sc.transform(only_features_df)

# Show the first five scaled rows.
scaled_features[:5]

# Elbow method

In [None]:
# Fit one K-Means model for every candidate k in 1..9, so that
# kmeans_per_k[i] is the model fitted with k = i + 1.
# n_init is pinned explicitly: its default changed from 10 to "auto" in
# scikit-learn 1.4 (with a FutureWarning in 1.2/1.3), so relying on the
# default makes the clustering depend on the installed version.
kmeans_per_k = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(scaled_features)
                for k in range(1, 10)]

# Inertia of each fitted model, used for the elbow plot.
inertias = [model.inertia_ for model in kmeans_per_k]

In [None]:
# Elbow plot: inertia against the number of clusters (k = 1..9).
fig, ax = plt.subplots(figsize=(8, 3.5))
ax.plot(range(1, 10), inertias, "bo-")
ax.set_xlabel("Number of clusters", fontsize=14)
ax.set_ylabel("Inertia", fontsize=14)
plt.show()

# Silhouette scores

In [None]:
# Skip the k=1 model (index 0), so silhouette_scores[i] belongs to k = i + 2.
models_from_k2 = kmeans_per_k[1:]
silhouette_scores = [
    silhouette_score(scaled_features, fitted.labels_)
    for fitted in models_from_k2
]

In [None]:
# Mean silhouette score against the number of clusters (k = 2..9).
fig, ax = plt.subplots(figsize=(8, 3))
ax.plot(range(2, 10), silhouette_scores, "bo-")
ax.set_xlabel("Number of Clusters", fontsize=14)
ax.set_ylabel("Silhouette score", fontsize=14)
plt.show()

# Silhouette diagram

In [None]:
# One silhouette diagram per candidate k, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(11, 9))

for k, ax in zip((3, 4, 5, 6), axes.flat):
    # kmeans_per_k starts at k=1, so the model for k sits at index k - 1.
    y_pred = kmeans_per_k[k - 1].labels_
    silhouette_coefficients = silhouette_samples(scaled_features, y_pred)

    # Vertical gap left below the first cluster and between consecutive ones.
    gap = len(scaled_features) // 30
    y_start = gap
    tick_positions = []
    for cluster_id in range(k):
        # Sorted coefficients of the samples assigned to this cluster.
        cluster_coeffs = np.sort(silhouette_coefficients[y_pred == cluster_id])
        y_stop = y_start + len(cluster_coeffs)

        shade = mpl.cm.Spectral(cluster_id / k)
        ax.fill_betweenx(np.arange(y_start, y_stop), 0, cluster_coeffs,
                         facecolor=shade, edgecolor=shade, alpha=0.7)

        # Put the cluster's tick label at the vertical middle of its band.
        tick_positions.append(y_start + len(cluster_coeffs) // 2)
        y_start = y_stop + gap

    ax.yaxis.set_major_locator(FixedLocator(tick_positions))
    ax.yaxis.set_major_formatter(FixedFormatter(range(k)))

    # Only the left-hand panels carry a y-axis label.
    if k in (3, 5):
        ax.set_ylabel("Cluster")

    # Only the bottom panels carry x tick labels and an x-axis label.
    if k in (5, 6):
        ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
        ax.set_xlabel("Silhouette Coefficient")
    else:
        ax.tick_params(labelbottom=False)

    # Dashed red line at the mean silhouette score for this k;
    # silhouette_scores starts at k=2, hence the k - 2 offset.
    ax.axvline(x=silhouette_scores[k - 2], color="red", linestyle="--")
    ax.set_title("$k={}$".format(k), fontsize=16)

plt.show()

### Even though K = 4 has the highest silhouette score, the diagram shows that one of its clusters is smaller than the rest. Meanwhile, K = 3 has clusters of similar size.

In [None]:
# Final model: reuse the k=3 model that was already fitted on scaled_features
# during the sweep over k instead of fitting an identical one again. This
# avoids duplicated work and guarantees the labels used below are exactly the
# ones shown in the silhouette diagram.
N_CLUSTERS = 3
clusters = kmeans_per_k[N_CLUSTERS - 1]  # kmeans_per_k[i] was fitted with k = i + 1
assert clusters.n_clusters == N_CLUSTERS, "kmeans_per_k is not indexed by k - 1"

# Number of countries assigned to each cluster.
pd.Series(clusters.labels_).value_counts()

In [None]:
# Work on an independent copy so the feature frame itself stays untouched
# when cluster labels are attached.
kmeans_df = only_features_df.copy(deep=True)
kmeans_df.head(n=5)

In [None]:
# Attach the cluster assigned to each row as a 'label' column.
kmeans_df = kmeans_df.assign(label=clusters.labels_)
kmeans_df.head(n=5)

In [None]:
# Box plots of four features, grouped by cluster label, on a 2x2 grid.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))

# (column to plot, panel title), in row-major panel order.
panels = [
    ('child_mort', 'child mortality'),
    ('inflation', 'Inflation'),
    ('life_expec', 'Life Expectancy'),
    ('gdpp', 'GDP per capita'),
]
for panel_ax, (column, title) in zip(ax.flat, panels):
    sns.boxplot(x='label', y=column, data=kmeans_df, ax=panel_ax)
    panel_ax.set_title(title)

fig.tight_layout()

## Based on the figures, we can make the following ranking:
1. Cluster-0
2. Cluster-2
3. Cluster-1

In [None]:
# Bring the country names back (aligned on the row index) so each cluster
# can be inspected by name.
kmeans_df = kmeans_df.assign(country=country_df['country'])

In [None]:
# Print the first five countries of each cluster, in the order 0, 2, 1.
for cluster_label in (0, 2, 1):
    in_cluster = kmeans_df['label'] == cluster_label
    first_countries = kmeans_df.loc[in_cluster, 'country'].head(5)
    print(f'Label {cluster_label} Countries\n', first_countries)