In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the data dictionary that describes each column of the country data set.
DATA_DICT_PATH = '../input/country-socioeconomic-data/data-dictionary.csv'
dataDict = pd.read_csv(DATA_DICT_PATH)
dataDict

In [None]:
# Load the per-country socio-economic indicators and preview the first rows.
COUNTRY_DATA_PATH = '../input/country-socioeconomic-data/Country-data.csv'
df = pd.read_csv(COUNTRY_DATA_PATH)
df.head(n=5)

In [None]:
def shape(x):
    """Print the number of rows and columns of ``x``.

    Parameters
    ----------
    x : object exposing a two-element ``.shape`` (e.g. a pandas DataFrame)
        The table to describe.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    # Use the argument itself; the previous version silently read the
    # notebook-global ``df`` and ignored whatever was passed in.
    rows, cols = x.shape
    print(f'The dataframe has {rows} rows and {cols} cols!')

In [None]:
# Report the dimensions of the loaded country data.
shape(df)

In [None]:
# Column dtypes and non-null counts (used below to check for missing values).
df.info()

<p style='font-family:verdana; color:green'><b>There is one categorical column and nine numerical columns. There are no missing values in the dataset.</b></p>

In [None]:
# Summary statistics of the numeric columns, to compare their scales.
df.describe()

<p style='font-family:verdana; color:green'><b>The columns are on very different scales, so we'll need to apply standard scaling before proceeding with the clustering.</b></p>

In [None]:
# Summary of the string-typed column(s): count, number of unique values, most frequent value.
df.describe(include='object')

<p style='font-family:verdana; color:green'><b>This dataset contains records for 167 different countries</b></p>

In [None]:
#Covariance table of every column after the first (the first column is skipped)
feature_names = df.columns[1:]
cov_matrix = np.cov(df.iloc[:, 1:].T)
pd.DataFrame(cov_matrix, index=feature_names, columns=feature_names)

In [None]:
#correlation table
# df still holds the string 'country' column at this point; pandas >= 2.0 raises
# when corr() meets a non-numeric column, so restrict to numeric columns explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# Lower-triangle heatmap of the pairwise correlations between numeric columns.
# The matrix is computed once on numeric columns only: the string 'country'
# column makes a bare df.corr() raise on pandas >= 2.0.
corr_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(16,6))
mask = np.triu(corr_matrix, k=1)  # hide the redundant upper triangle, keep the diagonal
sns.heatmap(corr_matrix, annot=True, linewidths=1, mask=mask, cmap='coolwarm')
plt.title('Correlation Plot')
plt.show()

<p style='font-family:verdana; color:green'><b>There is some multicollinearity present in the data, as "total_fer" is highly correlated with "child_mort", and "gdpp" is highly correlated with "income".<br>
Also, "imports" and "exports" are correlated with each other, and "health" and "gdpp" have a positive correlation.<br>
"life_expec" has a high negative correlation with "child_mort" and "total_fer".</b></p>

In [None]:
# Pairwise scatter plots of the raw features, with KDE curves on the diagonal.
pair_grid = sns.pairplot(data=df, diag_kind='kde', palette='Pastel1')
plt.show()

In [None]:
# Total number of missing cells across the whole frame.
df.isna().sum().sum() #there are no missing values in this data

In [None]:
# Number of distinct country names.
df['country'].nunique()

In [None]:
# Keep the country names aside (they are re-attached after clustering) and
# continue with the remaining columns only.
country = df['country']
# `columns=` replaces the positional axis argument, which pandas 2.0 removed;
# re-binding instead of inplace=True keeps the statement explicit.
df = df.drop(columns=['country'])

In [None]:
#standardizing the data: fit the scaler on df, then transform the same frame
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(df)
Xsc = sc.transform(df)

In [None]:
# Wrap the scaled array back into a frame with the original column names for inspection.
dfsc = pd.DataFrame(data=Xsc, columns=df.columns)
dfsc.head(n=2)

<p style='font-family:verdana; color:green'><b>As we have seen from the correlation matrix, there is some mild to high correlation between the variables, which results in multicollinearity. To reduce the effect of multicollinearity, and to reduce the dimension, we use PCA. </b></p>

In [None]:
from sklearn.decomposition import PCA

# First pass: PCA with default settings (no n_components given) on the scaled
# data, used below to inspect the explained-variance ratios.
pca = PCA()
# pcadf holds the projected data returned by fit_transform; the next cell wraps it in a DataFrame.
pcadf = pca.fit_transform(Xsc)

In [None]:
# Label the projected columns PC1, PC2, ... and preview two rows.
pc_names = [f'PC{idx}' for idx in range(1, pcadf.shape[1] + 1)]
pcadf = pd.DataFrame(pcadf, columns=pc_names)
pcadf.head(n=2)

In [None]:
# Running total of the explained-variance ratio, component by component.
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# Scree plot: explained-variance ratio per component, drawn as bars plus a line.
var_ratio = pca.explained_variance_ratio_
plt.figure(figsize=(18,5))
# Bars are grey while the cumulative ratio is still below 0.90, orange afterwards.
clrs = []
for cum_ratio in np.cumsum(var_ratio):
    clrs.append('grey' if cum_ratio < 0.90 else 'orange')
g = sns.barplot(x=pcadf.columns, y=var_ratio, palette=clrs)
sns.lineplot(x=pcadf.columns, y=var_ratio, color='black')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance')
plt.yticks(np.arange(0,0.7,0.1))
# Write each bar's height just above it.
for bar in g.patches:
    bar_height = bar.get_height()
    label_xy = (bar.get_x()+0.35, bar_height+0.003)
    g.annotate(f'{bar_height:.3f}', label_xy, ha='center', va='bottom', color='black')
plt.show()

In [None]:
# Step plot of the cumulative explained variance with guide lines at 0.90 and at x=4.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(18,5))
sns.lineplot(x=pcadf.columns, y=cumulative_ratio, drawstyle='steps-pre', color='orange')
plt.axhline(0.90, color='green')
plt.axvline(4, color='green')
plt.title('Cumulative Variance Plot')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance')
plt.grid()
plt.show()

<p style='font-family:verdana; color:green'><b>As we can see from the cumulative variance plot, we need 5 principal components to retain >90% variance</b></p>

In [None]:
#refitting the pca with 5 principal components
# Both `pca` and `pcadf` are re-bound here; everything below uses the 5-component versions.
pca = PCA(n_components=5)
pcadf = pca.fit_transform(Xsc)

In [None]:
# Label the retained components PC1, PC2, ... and preview two rows.
pc_names = [f'PC{idx}' for idx in range(1, pcadf.shape[1] + 1)]
pcadf = pd.DataFrame(pcadf, columns=pc_names)
pcadf.head(n=2)

In [None]:
# Correlation between the principal components; the upper triangle and the
# diagonal are masked out.
pc_corr = pcadf.corr()
mask = np.triu(pc_corr)
sns.heatmap(pc_corr, annot=True, linewidths=1, mask=mask, cmap='coolwarm', vmax=1)
plt.title('PCA - Correlation Plot')
plt.show()

<p style='font-family:verdana; color:green'><b>As we can see, the principal components are uncorrelated with each other, so there is no multicollinearity in the transformed dataset.</b></p>

In [None]:
# Pairwise scatter plots of the principal components.
sns.pairplot(pcadf)

In [None]:
# One box plot per principal component on a 2x3 grid, to spot outliers.
plt.figure(figsize=(18,8))
for i, pc in enumerate(pcadf.columns, start=1):
    plt.subplot(2, 3, i)
    sns.boxplot(x=pcadf[pc], palette='Pastel2')
plt.suptitle('Outliers in the principal components', color='darkgreen', fontsize=16)
plt.show()

In [None]:
#Since we have a few outliers in the PCs, we will cap them as KMeans are sensitive to outliers
# Values outside the 1.5*IQR fences are replaced by the column's 10th
# (low side) or 90th (high side) percentile; everything else is left untouched.
for pc in pcadf:
    column = pcadf[pc]
    q1, q3, q10, q90 = column.quantile([0.25, 0.75, 0.1, 0.90])
    iqr = q3 - q1
    ul = q3 + 1.5*iqr
    ll = q1 - 1.5*iqr
    capped = np.where(column < ll, q10, np.where(column > ul, q90, column))
    pcadf[pc] = capped

In [None]:
# Same 2x3 grid of box plots, drawn again after the outlier capping.
plt.figure(figsize=(18,8))
for i, pc in enumerate(pcadf.columns, start=1):
    plt.subplot(2, 3, i)
    sns.boxplot(x=pcadf[pc], palette='Pastel2')
plt.suptitle('Outliers in the principal components - post treatment', color='darkgreen', fontsize=16)
plt.show()

<p style='font-family:verdana; color:green'><b>All the outliers (values beyond the 1.5 × IQR fences) have been capped at the 10th or 90th percentile of the corresponding principal component.</b></p>

In [None]:
#finding the optimal number of cluster value
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# KMeans starts from random centroids; fixing random_state (and n_init) makes
# the elbow curve identical on every Restart & Run All.
k_values = range(1, 11)
inertia_score = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(pcadf)
    inertia_score.append(kmeans.inertia_)

#Visualizing the inertia vs k plot
plt.figure(figsize=(18,6))
sns.lineplot(x=list(k_values), y=inertia_score, color='green')
plt.xticks(k_values)
plt.xlabel('k values')
plt.ylabel('Inertia')
plt.title('Elbow Plot to find the optimal number of clusters')
plt.grid()
plt.show()

<p style='font-family:verdana; color:green'><b>From the elbow plot, we can see that the optimal number of clusters is 3</b></p>

In [None]:
# Final KMeans model with the k chosen from the elbow plot.
# random_state pins the centroid initialisation so that the cluster assignment
# (and therefore every table and plot built from km.labels_) is reproducible.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
km.fit(pcadf)
print(f'Inertia Score --> {km.inertia_}')
print(f'Silhouette Score --> {silhouette_score(pcadf,km.labels_)}')

In [None]:
#Checking the distribution of cluster between PC1 and PC2
# Each point is one row of pcadf, coloured by its KMeans label.
pc1_values = pcadf['PC1']
pc2_values = pcadf['PC2']
plt.figure(figsize=(18,6))
plt.scatter(pc1_values, pc2_values, c=km.labels_, cmap='viridis')
plt.title('Comparing the clusters between PC1 and PC2')
plt.show()

In [None]:
#Preparing an AGC model
from scipy.cluster.hierarchy import cophenet,dendrogram,linkage
from scipy.spatial.distance import pdist
# The pairwise distances do not depend on the linkage method, so compute them once.
pairwise_distances = pdist(pcadf)
for link in ['single','complete','average','ward']:
    z = linkage(pcadf, method=link)
    c, coph_dist = cophenet(z, pairwise_distances)
    print(f'{link} --> {c}') #closer it is to 1, better the clustering

In [None]:
#As average has the best cophenet index, we will use it for the Agglomerative Clustering
from sklearn.cluster import AgglomerativeClustering
# The `affinity` keyword was deprecated and then removed from scikit-learn
# (its replacement is `metric`). Euclidean distance is the default under either
# name, so leaving it out works on both old and new releases.
m2 = AgglomerativeClustering(n_clusters=3, linkage='average')
m2.fit(pcadf)

In [None]:
#Plotting a Dendrogram
# Build the average-linkage tree first, then draw it truncated to 5 levels.
z = linkage(pcadf, method='average')
plt.figure(figsize=(18,8))
plt.title('Agglomerative Hierarchical Clustering - Dendrogram')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
dendrogram(
    z,
    p=5,
    truncate_mode='level',
    color_threshold=4,
    leaf_rotation=90.0,
    leaf_font_size=8,
)
plt.tight_layout()

In [None]:
#silhouette score for the AGC model
# Computed on the same pcadf features that the model was fitted on.
silhouette_score(pcadf,m2.labels_)

In [None]:
#calculating the inertia for the AGC model by hand (this cell prepares the per-cluster frames and centroids)
# Attach the AGC labels as a trailing 'class' column on a copy of pcadf.
pcadf_labelled = pcadf.assign(**{'class': m2.labels_})
agc_clusters = pcadf_labelled.groupby('class')
# One frame per cluster id 0, 1, 2.
df0, df1, df2 = (agc_clusters.get_group(cluster_id) for cluster_id in (0, 1, 2))
#Calculating the centroids (mean of every column except the trailing 'class' column)
c0, c1, c2 = (np.array(members.iloc[:, :-1].mean()) for members in (df0, df1, df2))

In [None]:
def _within_cluster_ss(members, centroid):
    """Return the sum of squared distances from each row of `members` to `centroid`.

    `members` is a cluster frame whose last column is the cluster label; that
    column is dropped before the distances are computed.
    """
    features = members.iloc[:, :-1].to_numpy()
    return np.sum((features - centroid) ** 2)


# One vectorised computation per cluster replaces three copy-pasted row-by-row loops.
agc_inert0 = _within_cluster_ss(df0, c0)
agc_inert1 = _within_cluster_ss(df1, c1)
agc_inert2 = _within_cluster_ss(df2, c2)
# Inertia = total within-cluster sum of squares over all clusters.
agc_inertia = agc_inert0+agc_inert1+agc_inert2
print(agc_inertia)

In [None]:
#Comparing the inertia score of kmeans and agc model
kmeans_inertia = km.inertia_
print(f'KMeans Inertia Score --> {kmeans_inertia}')
print(f'AGC Inertia Score --> {agc_inertia}')

#Comparing the silhouette score of kmeans and agc models
kmeans_silhouette = silhouette_score(pcadf, km.labels_)
print(f'KMeans Silhouette Score --> {kmeans_silhouette}')
agc_silhouette = silhouette_score(pcadf, m2.labels_)
print(f'AGC Silhouette Score --> {agc_silhouette}')

<p style='font-family:verdana; color:green'><b>As we can see, the KMeans model has a lower inertia and a higher silhouette score than the AGC model, which suggests that it is the better model for this dataset.</b></p>

In [None]:
#Plotting a 3D plot between PC1, PC2 and PC3
from mpl_toolkits.mplot3d import Axes3D  # kept so the '3d' projection is registered on older Matplotlib
fig = plt.figure(figsize=(8,6))
# Axes3D(fig, ...) no longer attaches itself to the figure on recent Matplotlib
# releases, which leaves an empty plot; create the axes through the figure and
# set the camera angles explicitly instead.
ax = fig.add_subplot(111, projection='3d')
ax.view_init(elev=-150, azim=100)
ax.scatter(pcadf['PC1'],pcadf['PC2'],pcadf['PC3'],c=km.labels_,cmap='viridis')
ax.set_title('Comparing the clusters between PC1, PC2 and PC3')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
plt.show()

In [None]:
#adding labels to the original dataframe
# assign() returns a new frame, so df itself is left untouched.
# NOTE: this re-binds df1, which earlier held one of the AGC cluster groups.
df1 = df.assign(country=country, labels=km.labels_)
df1.head()

In [None]:
# Number of rows assigned to each KMeans cluster.
df1['labels'].value_counts()

In [None]:
# Share of rows per KMeans cluster, as an exploded pie chart.
label_counts = df1['labels'].value_counts()
label_counts.plot.pie(autopct='%.2f%%', explode=[0.1,0.1,0.1])

In [None]:
# Per-cluster mean of every indicator.
# df1 also carries the string 'country' column, and pandas >= 2.0 raises when
# mean() meets a non-numeric column, so drop it before aggregating.
df1.drop(columns=['country']).groupby('labels').mean()

<p style='font-family:verdana; color:green'><b>As we can see, there is a clear difference between the means of the different labels, which suggests that the clustering is good.</b></p>

In [None]:
# Side-by-side table of inertia and silhouette score for both models.
kmeans_column = [km.inertia_, silhouette_score(pcadf, km.labels_)]
agc_column = [agc_inertia, silhouette_score(pcadf, m2.labels_)]
metrics = pd.DataFrame(
    {'KMeans': kmeans_column, 'AGC': agc_column},
    index=['Inertia', 'Silhouette Score'],
)
metrics

<p style='font-family:verdana; color:green'><b>A lower inertia is considered good, and so is a higher silhouette score.<br>
As we can see above, KMeans has both a lower inertia and a better silhouette score.<br>
So we are choosing KMeans over AGC.<br>
A positive silhouette score indicates a good model. As we can see, our silhouette score is approximately 0.4, which is a good one.<br>
So we can say that it is a good model with good clusters.</b></p>

In [None]:
#Visualizing the same using barplots, separated by labels
# The last two columns of df1 ('country' and 'labels') are skipped; every
# remaining column gets its own panel on a 3x3 grid.
cluster_labels = df1['labels']
plt.figure(figsize=(18,12))
for i, col in enumerate(df1.columns[:-2], start=1):
    plt.subplot(3, 3, i)
    sns.barplot(x=cluster_labels, y=df1[col], palette='Pastel1', ci=None)
plt.suptitle('Distribution of various parameters among different labels', color='darkgreen', fontsize=16, y=1.01)
plt.tight_layout()
plt.show()

In [None]:
#finding out the list of countries in different labels
# Grouped 'country' column, keyed by KMeans label; the cells below pull one group each.
country_clusters = df1.groupby('labels')['country']

In [None]:
#Developed Countries
# KMeans numbers its clusters arbitrarily, so a hard-coded id can point at a
# different group from one run to the next. Pick the cluster with the highest
# mean 'income' instead (assumes df1 has the 'income' column discussed in the
# correlation notes).
developed_label = df1.groupby('labels')['income'].mean().idxmax()
print(country_clusters.get_group(developed_label).unique())

In [None]:
#Developing Countries
# KMeans numbers its clusters arbitrarily, so a hard-coded id can point at a
# different group from one run to the next. With three clusters, take the one
# whose mean 'income' is in the middle (assumes df1 has the 'income' column
# discussed in the correlation notes).
developing_label = df1.groupby('labels')['income'].mean().sort_values().index[1]
print(country_clusters.get_group(developing_label).unique())

In [None]:
#Poor Countries
# KMeans numbers its clusters arbitrarily, so a hard-coded id can point at a
# different group from one run to the next. Pick the cluster with the lowest
# mean 'income' instead (assumes df1 has the 'income' column discussed in the
# correlation notes).
poor_label = df1.groupby('labels')['income'].mean().idxmin()
print(country_clusters.get_group(poor_label).unique())

___