<h2>Clustering: Country Analysis</h2>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

In [None]:
# Load the country dataset from the CSV file into a DataFrame
X = pd.read_csv(filepath_or_buffer="countryanalysis.csv")
# Preview the first five rows
X.head(n=5)

In [None]:
# Drop the 'country' column so that only the feature columns remain,
# then summarize the remaining columns
X_nocountry = X.drop(columns='country')
X_nocountry.describe()

In [None]:
# Feature scaling: remove the mean and scale every feature to unit variance
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_nocountry),
    # Take the labels from the input frame instead of a hand-typed list, so the
    # column names cannot drift out of sync with the actual order/number of
    # columns in the CSV (a mismatch would mislabel features or raise)
    columns=X_nocountry.columns,
    # Keep the original row index so rows still line up with X
    index=X_nocountry.index,
)
X_scaled

In [None]:
# Compute the pairwise correlations among the scaled features and draw them
# as an annotated heatmap
print('Correlation among features')
feature_corr = X_scaled.corr()
sns.heatmap(feature_corr, annot=True);

In [None]:
# Draw an annotated clustered heatmap (Ward linkage, Euclidean distance) of the
# first 30 rows of the scaled data -- this is not a correlation plot
print('Co-clustering')
first_rows = X_scaled.head(30)
sns.clustermap(first_rows, method='ward', metric='euclidean', annot=True);

In [None]:
# Restrict the scaled data to the three features shown in the scatter plots
selected_features = ['child_mort', 'imports', 'gdpp']
X_colsubset = X_scaled.loc[:, selected_features]
X_colsubset

In [None]:
# Scatter plot of the selected (scaled) data: gdpp on x, child_mort on y
sns.scatterplot(data=X_colsubset, x='gdpp', y='child_mort')

In [None]:
# Scatter plot of the selected (scaled) data: gdpp on x, imports on y
sns.scatterplot(data=X_colsubset, x='gdpp', y='imports')

<b>K-means Clustering</b>

In [None]:
# Elbow method: fit K-means for K = 2..8 on the scaled data and record the
# SSE (inertia) of each fit to help choose the number of clusters.
# KMeans is already imported in the notebook's import cell.

# Settings shared by every fit; only the number of clusters varies
kmeans_params = dict(
    init='random',  # or 'k-means++'
    n_init=10,
    max_iter=100,
    random_state=0,
)

range_n_clusters = range(2, 9)
SSE = []
for nclust in range_n_clusters:
    # Fit K-means with the current number of clusters and keep its SSE
    km = KMeans(n_clusters=nclust, **kmeans_params).fit(X_scaled)
    SSE.append(km.inertia_)

# SSE as a function of K
fig, ax = plt.subplots()
ax.plot(range_n_clusters, SSE, marker='o')
ax.set_xlabel('Number of clusters K')
ax.set_ylabel('Sum of Squared Distances (SSE)')
plt.show()

In [None]:
# Fit the final K-means model with 5 clusters on the scaled features and
# store each row's cluster label as a new column of X
final_kmeans_params = dict(
    n_clusters=5,
    init='random',
    n_init=10,
    max_iter=100,
    random_state=0,
)
km = KMeans(**final_kmeans_params)
cluster_labels = km.fit_predict(X_scaled)
X['kmeans_labels'] = cluster_labels
X

In [None]:
# Plot the features of interest (unscaled values from X), colored by K-means label
kmeans_plot_cols = ['child_mort', 'imports', 'gdpp', 'kmeans_labels']
X_colsubset = X.loc[:, kmeans_plot_cols]
sns.scatterplot(data=X_colsubset, x='gdpp', y='child_mort', hue='kmeans_labels', palette='bright')

In [None]:
# gdpp against imports, colored by K-means label
sns.scatterplot(data=X_colsubset, x='gdpp', y='imports', hue='kmeans_labels', palette='bright')

<b>DBSCAN</b>

In [None]:
# Selection of the size (eps) of the neighborhood: sweep eps over a grid and
# record, for each value, the percentage of points DBSCAN labels as outliers
outlier_perc = []

for eps in np.linspace(0.001, 3, 50):  # 50 evenly spaced eps values in [0.001, 3]
    # min_samples is kept equal to the value used by the final DBSCAN model
    # (min_samples = 5), so the curve describes the model that is actually fitted
    dbscan = DBSCAN(eps=eps, min_samples=5)
    dbscan.fit(X_scaled)
    # Outliers carry the label -1; the mean of the boolean mask is their fraction.
    # The label arrays are not printed: 50 full arrays would flood the output.
    perc_outliers = 100 * np.mean(dbscan.labels_ == -1)
    outlier_perc.append(perc_outliers)

In [None]:
# Plot the outlier percentage against eps; the x grid repeats the one used
# when outlier_perc was computed
eps_grid = np.linspace(0.001, 3, 50)
fig, ax = plt.subplots(figsize=(8, 4), dpi=100)
sns.lineplot(x=eps_grid, y=outlier_perc, ax=ax)
ax.set_ylabel("Percentage of points labeled as outliers")
ax.set_xlabel("Epsilon")
plt.show()

In [None]:
# Fit the final DBSCAN model (eps = 1.2, min_samples = 5) on the scaled
# features and store each row's label as a new column of X
final_dbscan_params = dict(eps=1.2, min_samples=5)
dbscan = DBSCAN(**final_dbscan_params)
cluster_labels = dbscan.fit_predict(X_scaled)
X['dbscan_labels'] = cluster_labels
X

In [None]:
# Plot the features of interest (unscaled values from X), colored by DBSCAN label
dbscan_plot_cols = ['child_mort', 'imports', 'gdpp', 'dbscan_labels']
X_colsubset = X.loc[:, dbscan_plot_cols]
sns.scatterplot(data=X_colsubset, x='gdpp', y='child_mort', hue='dbscan_labels', palette='bright')

In [None]:
# gdpp against imports, colored by DBSCAN label
sns.scatterplot(data=X_colsubset, x='gdpp', y='imports', hue='dbscan_labels', palette='bright')