In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

___Our main task is to cluster the countries by the factors mentioned above and then present solution and recommendations to the CEO___

Lets divide the entire process of clustering into steps:

* Step 1: Reading and understanding data
* Step 2: Exploratory Data analysis
* a) Data cleaning
* b) univariate Analysis
* c) Bivariate Analysis
* Step 3: Outlier Treatment
* Step 4: Scaling data
* Step 5: Creating k-means clustering algorithm and visualising clusters formed
* Step 6: Creating Hierarchical algorithm and visualising clusters formed
* Step 7: Reporting 5 or more backward countries

## Step 1: Reading and understanding data

In [None]:
# importing necessary libraries and warnings

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes = True)

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

In [None]:
# Load the country-level dataset from the Kaggle input directory into a DataFrame
data= pd.read_csv("/kaggle/input/unsupervised-learning-on-country-data/Country-data.csv")
# Show the loaded frame as the cell output
data

In [None]:
# Checking shape
data.shape

In [None]:
# Describing data
data.describe()

In [None]:
# data info
data.info()

In [None]:
# checking data types
data.dtypes

## Step 2: Exploratory Data analysis

### a) Data cleaning

In [None]:
# Convert exports, imports and health spending from a percentage of gdpp to absolute values.
# NOTE: this overwrites the columns in place, so re-running the cell rescales them again.
for pct_col in ('exports', 'imports', 'health'):
    data[pct_col] = data[pct_col] * data['gdpp'] / 100

In [None]:
# Checking null values
data.isnull().sum()

Since there are no null values proceeding to further step

### b) Univariate analysis

In [None]:
# Distribution plot (histogram + KDE curve) for every numeric column, laid out on a 3x3 grid
numeric_cols = data.describe().columns
plt.figure(figsize=[20,15])
for position, col in enumerate(numeric_cols, start=1):
    plt.subplot(3, 3, position)
    sns.distplot(data[col], color='darkorange', kde_kws={'color':'royalblue'})
plt.show()

___Inferences___:

1. > From the above plot, most of the average income per person and gdp per capita is observed in the range of 0-15000
2. > On average, we can infer that life expectancy of a person for most of the countries is observed between 60-80
3. > Child mortality (Death of children under 5 years of age per 1000 live births) seems to be below 50 in most of the countries, only few countries child mortality is above 100.


### c) Bivariate Analysis

In [None]:
# Order the countries from lowest to highest gdpp
gdp = data.sort_values('gdpp')
gdp

In [None]:
# Bar charts of the 10 countries with the lowest and the 10 with the highest gdpp.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
lowest_gdp = gdp.head(10)
highest_gdp = gdp.tail(10)

plt.figure(figsize=[15,8])
plt.subplot(2,1,1)
sns.barplot(x=lowest_gdp.country, y=lowest_gdp.gdpp)
plt.title("Top 10 countries having lowest GDP")
plt.subplot(2,1,2)
sns.barplot(x=highest_gdp.country, y=highest_gdp.gdpp)
plt.title("\nTop 10 countries having Highest GDP")
plt.tight_layout()
plt.show()

___Inferences___:

1. > From the above plot, we can see that top 5 countries having lowest GDP are ___Burundi, Liberia, Congo, Dem. Rep., Niger and Sierra Leone.___
2. > Top 5 countries having highest GDP are ___Luxembourg, Norway, Switzerland, Qatar and Denmark.___

In [None]:
# Order the countries from lowest to highest net income per person
income = data.sort_values('income')
income

In [None]:
# Bar charts of the 10 countries with the lowest and the 10 with the highest income.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
lowest_income = income.head(10)
highest_income = income.tail(10)

plt.figure(figsize=[15,8])
plt.subplot(2,1,1)
sns.barplot(x=lowest_income.country, y=lowest_income.income)
plt.title("Top 10 countries having lowest net income per person")
plt.subplot(2,1,2)
sns.barplot(x=highest_income.country, y=highest_income.income)
plt.title("\nTop 10 countries having Highest net income per person")
plt.tight_layout()
plt.show()

___Inferences___:

1. > From the above plot, we can see that the 5 countries with the lowest net income per person are ___Congo, Dem. Rep., Liberia, Burundi, Niger and Central African Republic.___
2. > Top 5 countries having highest net income per person are ___Qatar, Luxembourg, Brunei, Kuwait and Singapore.___

In [None]:
# Order the countries from lowest to highest health spending
health = data.sort_values('health')
health

In [None]:
# Bar charts of the 10 countries with the lowest and the 10 with the highest health spending.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
lowest_health = health.head(10)
highest_health = health.tail(10)

plt.figure(figsize=[15,8])
plt.subplot(2,1,1)
sns.barplot(x=lowest_health.country, y=lowest_health.health)
plt.title("Top 10 countries spent lowest health per capita")
plt.subplot(2,1,2)
sns.barplot(x=highest_health.country, y=highest_health.health)
plt.title("\nTop 10 countries spent Highest health per capita")
plt.tight_layout()
plt.show()

___Inferences___:

1. > From the above plot, we can see that top 5 countries spent lowest health per capita are ___Eritrea, Madagascar,	Central African Republic, Niger and Myanmar___
2. > Top 5 countries spent highest health per capita are ___United States, Switzerland, Norway, Luxembourg and Denmark.___

In [None]:
# Order the countries from highest to lowest child_mort
# (deaths of children under 5 per 1000 live births)
child = data.sort_values('child_mort', ascending=False)
child

In [None]:
# Bar charts of the 10 countries with the highest and the 10 with the lowest child mortality
# (`child` is sorted in descending order, so head() holds the highest values).
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
highest_child_mort = child.head(10)
lowest_child_mort = child.tail(10)

plt.figure(figsize=[15,8])
plt.subplot(2,1,1)
sns.barplot(x=highest_child_mort.country, y=highest_child_mort.child_mort)
plt.title("Top 10 countries high Death of children under 5 years of age per 1000 live births")
plt.subplot(2,1,2)
sns.barplot(x=lowest_child_mort.country, y=lowest_child_mort.child_mort)
plt.title("\nBottom 10 countries low Death of children under 5 years of age per 1000 live births")
plt.tight_layout()
plt.show()

___Inferences___:

1. > From the above plot, we can see that top 5 countries with lowest child mortality are ___Iceland, Luxembourg, Singapore, Sweden and Finland___
2. > Top 5 countries with highest child mortality are ___Haiti, Sierra Leone, Chad, Central African Republic and Mali.___

In [None]:
# Order the countries from highest to lowest life expectancy
life = data.sort_values('life_expec', ascending=False)
life

In [None]:
# Bar charts of the 10 countries with the highest and the 10 with the lowest life expectancy
# (`life` is sorted in descending order, so head() holds the highest values).
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
highest_life = life.head(10)
lowest_life = life.tail(10)

plt.figure(figsize=[15,8])
plt.subplot(2,1,1)
sns.barplot(x=highest_life.country, y=highest_life.life_expec)
plt.title("Top 10 countries with high life expectency\n")
plt.subplot(2,1,2)
sns.barplot(x=lowest_life.country, y=lowest_life.life_expec)
plt.title("\nBottom 10 countries with low life expectency\n")
plt.tight_layout()
plt.show()

___Inferences___:

1. > From the above plot, we can see that top 10 countries have the same life expectancy of 80-82 years approximately. 
2. > Top 5 countries with lowest life expectancy are ___Haiti, Lesotho, Central African Republic,Zambia and Malawi.___

In [None]:
# Pair plot for all numerical variables
sns.pairplot(data)
plt.show()

In [None]:
# Pairwise correlation between the numeric columns.
# The frame still carries the non-numeric 'country' column, and pandas >= 2.0 raises on
# non-numeric columns in corr() instead of silently dropping them, so select numeric dtypes first.
correlation = data.select_dtypes(include='number').corr()
correlation

In [None]:
# Heatmap of the correlation matrix, with the coefficient printed in each cell
plt.figure(figsize=[12, 8])
sns.heatmap(correlation, cmap='RdYlGn', annot=True)
plt.title('Heatmap\n')
plt.show()

**Inferences:**
> 1. From the above plot, we can infer that there is a high correlation between income and GDP, so, if average net income per person increases the GDP of the country will also increase.
> 2. If child mortality (death of children under 5 years of age per 1000 live births) decreases, life expectancy increases: the two are negatively correlated.
> 3. If you spend more on Health , then the child mortality will reduce and it will increase the GDP rate.

In [None]:
# Resetting index before outlier treatment.
# reset_index() returns a new frame rather than modifying `data`, so the result is assigned back;
# drop=True keeps the old index from being added as an extra column.
data = data.reset_index(drop=True)

## Step 3: Outlier Treatment


In [None]:
# checking outliers using a boxplot for each numerical column before outlier treatment
plt.figure(figsize=[20,15])
for position, col in enumerate(data.describe().columns, start=1):
    plt.subplot(3, 3, position)
    # Pass the column as x= explicitly: on seaborn >= 0.12 the first positional argument
    # is `data`, not `x`, so a positional Series no longer guarantees a horizontal box.
    sns.boxplot(x=data[col])

plt.show()

1. > Removing the lower range outliers for countries with low child mortality, low inflation and low total fertility. 
2. > Removing the upper range outliers for countries with high income, GDP, imports, exports, life_expectancy and health.
3. > Here, the outliers are trimmed rather than capped: rows below the 0.05 quantile (lower range) or above the 0.95 quantile (upper range) of the listed columns are dropped from the data.

In [None]:
# Drop the rows that fall below the 5th percentile of child mortality, inflation or total fertility.
# The cut-off for each column is computed on the frame as already filtered by the previous columns.
low_range = ['child_mort','inflation','total_fer']
for col_name in low_range:
    lower_cutoff = data[col_name].quantile(0.05)
    keep_rows = data[col_name].ge(lower_cutoff)
    data = data.loc[keep_rows]

In [None]:
# Drop the rows that lie above the 95th percentile of exports, health, imports, income,
# gdpp or life expectancy. The cut-off for each column is computed on the frame as already
# filtered by the previous columns.
high_range = ['exports', 'health', 'imports','income','gdpp','life_expec']
for col_name in high_range:
    upper_cutoff = data[col_name].quantile(0.95)
    keep_rows = data[col_name].le(upper_cutoff)
    data = data.loc[keep_rows]

In [None]:
# plotting the boxplots again after outlier treatment
plt.figure(figsize=[20,15])
for position, col in enumerate(data.describe().columns, start=1):
    plt.subplot(3, 3, position)
    # Pass the column as x= explicitly: on seaborn >= 0.12 the first positional argument
    # is `data`, not `x`, so a positional Series no longer guarantees a horizontal box.
    sns.boxplot(x=data[col], color='g')
plt.show()

In [None]:
# Checking shape after outlier treatment
data.shape

## Step 4: Scaling data

In [None]:
# Keep every column except the first one, leaving only the numerical features for clustering
clusterdata = data.drop(columns=data.columns[0])
clusterdata.head()

In [None]:
# Standardise every feature: learn the per-column statistics, then apply them to the same frame
scaler = StandardScaler().fit(clusterdata)
cluster_scaled = scaler.transform(clusterdata)
cluster_scaled.shape

In [None]:
data.columns

In [None]:
# Wrap the scaled array in a DataFrame. The column names are taken from the frame that was
# scaled instead of a hand-typed list, so they cannot drift out of sync with the data.
cluster_scaled = pd.DataFrame(cluster_scaled, columns=clusterdata.columns)
cluster_scaled.head()

### Hopkins Statistics:
The Hopkins statistic measures the cluster tendency of a dataset, in other words: how well the data can be clustered.

- If the value is between {0.01, ...,0.3}, the data is regularly spaced.

- If the value is around 0.5, it is random.

- If the value is between {0.7, ..., 0.99}, it has a high tendency to cluster.

In [None]:
# Hopkins test
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X, random_state=None):
    """Estimate the Hopkins statistic (cluster tendency) of a dataset.

    A sample of m = 10% of the rows (at least one) is drawn. For each sampled row the
    distance to its nearest *other* row is recorded (w); for an equal number of synthetic
    points drawn uniformly inside the per-column min/max box of the data, the distance to
    the nearest real row is recorded (u). The statistic is sum(u) / (sum(u) + sum(w)).

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        Numeric feature matrix, one row per observation.
    random_state : int or None, optional
        Seed for the row sample and the synthetic points. None (the default) gives a
        different draw on every call.

    Returns
    -------
    float
        The Hopkins statistic. 0 is returned when the ratio is undefined (NaN), after
        printing the two distance lists.

    Raises
    ------
    ValueError
        If X has fewer than two rows, since a nearest-other-row distance cannot be measured.
    """
    points = np.asarray(X, dtype=float)
    n, d = points.shape  # rows, columns
    if n < 2:
        raise ValueError("hopkins() needs at least two rows")

    # At least one sample, otherwise both sums are empty and the ratio divides by zero
    m = max(1, int(0.1 * n))
    rng = np.random.RandomState(random_state)
    nbrs = NearestNeighbors(n_neighbors=2).fit(points)

    lower = points.min(axis=0)
    upper = points.max(axis=0)
    sampled_rows = rng.choice(n, size=m, replace=False)

    ujd = []
    wjd = []
    for row in sampled_rows:
        # A synthetic point is not part of the fitted data, so its nearest real row is
        # neighbour 0 (taking neighbour 1 here would be the second-nearest row).
        synthetic = rng.uniform(lower, upper, d).reshape(1, -1)
        u_dist, _ = nbrs.kneighbors(synthetic, n_neighbors=1, return_distance=True)
        ujd.append(u_dist[0][0])
        # A sampled row is part of the fitted data, so neighbour 0 is the row itself
        # (distance 0); neighbour 1 is its nearest other row.
        w_dist, _ = nbrs.kneighbors(points[row].reshape(1, -1), n_neighbors=2, return_distance=True)
        wjd.append(w_dist[0][1])

    H = sum(ujd) / (sum(ujd) + sum(wjd))
    if isnan(H):
        print(ujd, wjd)
        H = 0

    return H

In [None]:
# Compute the Hopkins statistic five times on the scaled data
Hopkins = [hopkins(cluster_scaled) for _ in range(5)]

In [None]:
# Average of the collected Hopkins values (arithmetic mean over all runs in the list)
sum(Hopkins)/len(Hopkins)

Since the value is 0.84 there is a high tendency to cluster this data

## Step 5: Creating k-means clustering algorithm and visualising clusters formed

### Finding the Optimal Number of Clusters

#### SSD (Elbow curve method)

In [None]:
# elbow-curve/SSD: fit k-means for k = 2..12 and record the sum of squared distances (inertia).
# KMeans starts from random centroids, so a fixed random_state (the same value as the final
# model) is used to make the curve reproducible from run to run.
range_n_clusters = list(range(2, 13))
ssd = []
for num_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=100)
    kmeans.fit(cluster_scaled)
    ssd.append(kmeans.inertia_)

# plot the SSDs for each n_clusters
plt.figure(figsize=[12,8])
plt.plot(range_n_clusters, ssd)
plt.xticks(range_n_clusters)
plt.title("Plot for k-value vs SSD(Sum of squared Distances)")
plt.xlabel('k')
plt.ylabel('SSD')
plt.show()

From the above plot we can clearly see that elbow has been formed at k = 3 or k = 5

Further, lets proceed with Silhouette Analysis to check the best k value

### Silhouette Analysis

$$\text{silhouette score}=\frac{p-q}{max(p,q)}$$

$p$ is the mean distance to the points in the nearest cluster that the data point is not a part of

$q$ is the mean intra-cluster distance to all the points in its own cluster.

* The value of the silhouette score range lies between -1 to 1. 

* A score closer to 1 indicates that the data point is very similar to other data points in the cluster, 

* A score closer to -1 indicates that the data point is not similar to the data points in its cluster.

In [None]:
# silhouette analysis: average silhouette score of a k-means fit for k = 2..12.
# KMeans starts from random centroids, so a fixed random_state (the same value as the final
# model) is used to make the printed scores reproducible from run to run.
range_n_clusters = list(range(2, 13))

for num_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=100)
    kmeans.fit(cluster_scaled)

    cluster_labels = kmeans.labels_

    # mean silhouette coefficient over all rows for this k
    silhouette_avg = silhouette_score(cluster_scaled, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))

From the above scores, the silhouette score is highest for n = 2. However, with only 2 clusters we won't be able to single out the backward countries, so let's proceed with k = 3 as the number of clusters.

In [None]:
# final model with k=3
# random_state is fixed so the cluster labels stay the same from run to run
kmeans = KMeans(n_clusters=3, max_iter=50, random_state= 100)
# fit on the scaled features; the fitted estimator is the cell's output
kmeans.fit(cluster_scaled)

In [None]:
kmeans.labels_

In [None]:
# Attach the k-means cluster label of each row to the main data as a new column
# (labels_ is a plain array, so it is assigned by position, not by index)
data['K_means_labels'] = kmeans.labels_

In [None]:
data.head()

In [None]:
# gdpp distribution within each k-means cluster
sns.boxplot(data=data, x='K_means_labels', y='gdpp')
plt.show()

In [None]:
# child_mort distribution within each k-means cluster
sns.boxplot(data=data, x='K_means_labels', y='child_mort')
plt.show()

In [None]:
# income distribution within each k-means cluster
sns.boxplot(data=data, x='K_means_labels', y='income')
plt.show()

In [None]:
# scatter plot for income vs gdpp, coloured by the k-means cluster label.
# Columns are passed by keyword: seaborn >= 0.12 no longer accepts x and y positionally.

plt.figure(figsize=[12,8])
sns.scatterplot(data=data, x='income', y='gdpp', hue='K_means_labels', palette='Set1')
plt.show()

In [None]:
# scatter plot for income vs child_mort, coloured by the k-means cluster label.
# Columns are passed by keyword: seaborn >= 0.12 no longer accepts x and y positionally.

plt.figure(figsize=[12,8])
sns.scatterplot(data=data, x='income', y='child_mort', hue='K_means_labels', palette='Set1')
plt.show()

In [None]:
# scatter plot for gdpp vs child_mort, coloured by the k-means cluster label.
# Columns are passed by keyword: seaborn >= 0.12 no longer accepts x and y positionally.

plt.figure(figsize=[12,8])
sns.scatterplot(data=data, x='gdpp', y='child_mort', hue='K_means_labels', palette='Set1')
plt.show()

Finally we can see that using, k-means clustering method, clusters have been formed as

label = 0, low income, low GDP and high child mortality

label = 1, high income, high GDP and low child mortality

label = 2, moderate income, moderate GDP and low child mortality

Inorder to suggest the backward countries, we should focus on data where cluster label = 0

Lets proceed further to check Hierarchical clustering

## Step 6: Creating Hierarchical algorithm and visualising clusters formed


### Single linkage

In [None]:
# Hierarchical clustering with single linkage on euclidean distances
mergings = linkage(cluster_scaled, method="single", metric='euclidean')
# draw the merge tree for the linkage computed above
dendrogram(mergings)
plt.show()

In [None]:
# Cut the single-linkage tree into 3 clusters and flatten the result to a 1-D label array
cluster_labels_single = cut_tree(mergings, n_clusters=3).ravel()
cluster_labels_single

In [None]:
# Attach the single-linkage cluster label of each row to the main data as a new column
data['Hierachical_single'] = cluster_labels_single

In [None]:
# gdpp distribution within each single-linkage cluster
sns.boxplot(data=data, x='Hierachical_single', y='gdpp')
plt.show()

In [None]:
# income distribution within each single-linkage cluster
sns.boxplot(data=data, x='Hierachical_single', y='income')
plt.show()

In [None]:
# child_mort distribution within each single-linkage cluster
sns.boxplot(data=data, x='Hierachical_single', y='child_mort')
plt.show()

In [None]:
# scatter plot for income vs gdpp, coloured by the single-linkage cluster label.
# Columns are passed by keyword: seaborn >= 0.12 no longer accepts x and y positionally.
plt.figure(figsize=[12,8])
sns.scatterplot(data=data, x='income', y='gdpp', hue='Hierachical_single', palette='Set1')
plt.show()

In [None]:
# scatter plot for child_mort vs gdpp, coloured by the single-linkage cluster label.
# Columns are passed by keyword: seaborn >= 0.12 no longer accepts x and y positionally.
plt.figure(figsize=[12,8])
sns.scatterplot(data=data, x='child_mort', y='gdpp', hue='Hierachical_single', palette='Set1')
plt.show()

In [None]:
# scatter plot for child_mort vs income, coloured by the single-linkage cluster label.
# Columns are passed by keyword: seaborn >= 0.12 no longer accepts x and y positionally.
plt.figure(figsize=[12,8])
sns.scatterplot(data=data, x='child_mort', y='income', hue='Hierachical_single', palette='Set1')
plt.show()

We can see that single linkage clustering is not promising since most of the data points are clustered in single cluster label and only few data points are in remaining clusters. Lets proceed further to check the complete linkage

### Complete linkage

In [None]:
# Hierarchical clustering with complete linkage on euclidean distances
# (this overwrites the `mergings` produced by the single-linkage cell)
mergings = linkage(cluster_scaled, method="complete", metric='euclidean')
# draw the merge tree for the linkage computed above
dendrogram(mergings)
plt.show()

In [None]:
# Cut the complete-linkage tree into 3 clusters and flatten the result to a 1-D label array
cluster_labels = cut_tree(mergings, n_clusters=3).ravel()
cluster_labels

In [None]:
# Attach the complete-linkage cluster label of each row to the main data as a new column
data['Hierarchical_complete'] = cluster_labels

In [None]:
data

In [None]:
# gdpp distribution within each complete-linkage cluster
sns.boxplot(data=data, x='Hierarchical_complete', y='gdpp')
plt.show()

In [None]:
# income distribution within each complete-linkage cluster
sns.boxplot(data=data, x='Hierarchical_complete', y='income')
plt.show()

In [None]:
# child_mort distribution within each complete-linkage cluster
sns.boxplot(data=data, x='Hierarchical_complete', y='child_mort')
plt.show()

In [None]:
# scatter plot for income vs gdpp, coloured by the complete-linkage cluster label.
# Columns are passed by keyword: seaborn >= 0.12 no longer accepts x and y positionally.
plt.figure(figsize=[12,8])
sns.scatterplot(data=data, x='income', y='gdpp', hue='Hierarchical_complete', palette='Set1')
plt.show()

In [None]:
# scatter plot for child_mort vs gdpp, coloured by the complete-linkage cluster label.
# Columns are passed by keyword: seaborn >= 0.12 no longer accepts x and y positionally.
plt.figure(figsize=[12,8])
sns.scatterplot(data=data, x='child_mort', y='gdpp', hue='Hierarchical_complete', palette='Set1')
plt.show()

In [None]:
# scatter plot for child_mort vs income, coloured by the complete-linkage cluster label.
# Columns are passed by keyword: seaborn >= 0.12 no longer accepts x and y positionally.
plt.figure(figsize=[12,8])
sns.scatterplot(data=data, x='child_mort', y='income', hue='Hierarchical_complete', palette='Set1')
plt.show()

Finally we can see that using, Hierarchical complete linkage clustering method, clusters have been formed as

label = 0, low income, low GDP and high child mortality

label = 1, high income, high GDP and low child mortality

label = 2, moderate income, moderate GDP and high child mortality

Inorder to suggest the backward countries, we should focus on data where cluster label = 0

## Step 7: Reporting 5 or more backward countries

In [None]:
data.head()

So, we need to check the countries having k-means cluster label = 0 and hierarchical (complete linkage) cluster label = 0

In [None]:
# Names of the countries that k-means placed in cluster 0
in_kmeans_cluster_0 = data['K_means_labels'] == 0
data.loc[in_kmeans_cluster_0, 'country']

In [None]:
# Names of the countries that complete-linkage hierarchical clustering placed in cluster 0
in_complete_cluster_0 = data['Hierarchical_complete'] == 0
data.loc[in_complete_cluster_0, 'country']

In [None]:
# Keep only the countries that both k-means and complete-linkage clustering put in cluster 0
kmeans_is_0 = data['K_means_labels'].eq(0)
complete_is_0 = data['Hierarchical_complete'].eq(0)
backward_countries = data.loc[kmeans_is_0 & complete_is_0]
backward_countries

In [None]:
backward_countries.shape

In [None]:
# Rank the backward countries by gdpp, then income, then child_mort (all ascending)
# and keep the first ten rows

ranked_backward = backward_countries.sort_values(['gdpp', 'income', 'child_mort'])
Top10_backward_countries = ranked_backward.head(10)

In [None]:
Top10_backward_countries

__Top 10 backward countries:__

>1. Burundi
>2. Liberia
>3. Congo, Dem. Rep.
>4. Niger	
>5. Sierra Leone
>6. Madagascar
>7. Mozambique
>8. Central African Republic	
>9. Malawi
>10. Eritrea


This is my solution approach, let me know in case if you have any questions.

Thank you and Keep learning!