# Importing Important Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

pd.set_option("display.max_rows", None, "display.max_columns", None)
pd.options.display.float_format = "{:.2f}".format

# Reading and understanding the Data

In [None]:
df = pd.read_csv("/kaggle/input/unsupervised-learning-on-country-data/Country-data.csv")
df.head()

As per the data dictionary, below mentioned three columns are given as percentage of GDPP

Converting them to actual values for further analysis

In [None]:
# Convert the percentage-of-GDPP columns into absolute per-capita values.
# A percentage p of gdpp is gdpp * p / 100; multiplying by 100 instead would
# inflate every value by a factor of 10,000.
# NOTE(review): assumes the raw columns hold percent points (10 == 10%) as the
# data dictionary states -- confirm against the CSV.
for pct_col in ['exports', 'health', 'imports']:
    df[pct_col] = df['gdpp'] * df[pct_col] / 100

In [None]:
df.head()

### Checking the structure of data

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
print("shape of dataset is" ,df.shape)

In [None]:
df.dtypes

In [None]:
df.columns

### Checking for null values in the data

In [None]:
print('Null values: \n{}'.format(df.isnull().sum()))
print('\nNaN values: \n{}'.format(df.isna().sum()))

There are no Null or NaN values in the dataset

# Data Visualisation

## Visualising numerical variables through pairplot 

In [None]:
# pairplot is a figure-level function that creates its own figure/grid, so a
# preceding plt.figure() call only leaves an empty extra figure behind.
sns.pairplot(df, diag_kind='kde')
plt.show()

### Inferences

 - Linear relation is found between gdpp-income,imports-exports,total_fer-child_mort
 - Rectangular hyperbola curve is generated by gdpp-child_mort.
 - If gdpp is HIGH:
   - child mortality is LOW
   - income is HIGH
   - inflation is LOW
   - life expectancy is HIGH
   - total fertility is LOW
   - health, imports and exports are MEDIUM

## Visualising the correlation between the variables via heatmap

In [None]:
plt.figure(figsize = (25,15))
# Correlation is only defined for numeric columns; select them explicitly so the
# string 'country' column cannot break (or be silently dropped from) df.corr().
ax = sns.heatmap(df.select_dtypes(include='number').corr(), square = True, annot=True, cmap="Blues")
# NOTE(review): the y-limit adjustment below looks like the usual workaround for
# heatmap rows being cropped by one specific matplotlib release -- confirm it is
# still needed in the target environment.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5);

### Inferences
 - exports is highly correlated with imports.
 - health, exports, income,imports are highly correlated with gdpp.
 - child_mort is having high negative correlation with life_expec.
 - total_fer is highly positively correlated with child_mort and negatively correlated with life_expec


## Visualising the top 5 countries for each variable via Barplot

In [None]:
# One panel per indicator, each showing the five countries that look worst on it.
# Tuple layout: (column, panel title, ascending) where ascending=True selects the
# five LOWEST values and ascending=False the five HIGHEST.
panels = [
    # Death of children under 5 years of age per 1000 live births
    ('child_mort', 'Child Mortality Rate', False),
    # Children that would be born to each woman at current age-fertility rates
    ('total_fer', 'Fertility Rate', False),
    # Average years a new born would live if mortality patterns stay the same
    ('life_expec', 'Life Expectancy', True),
    # Total health spending
    ('health', 'Health', True),
    # Total GDP divided by the total population
    ('gdpp', 'GDP per capita', True),
    # Net income per person
    ('income', 'Per capita Income', True),
    # Annual growth rate of the Total GDP
    ('inflation', 'Inflation Rate', False),
    # Exports of goods and services
    ('exports', 'Exports', True),
    # Imports of goods and services
    ('imports', 'Imports', True),
]

fig, axs = plt.subplots(3,3,figsize = (30,30))

# axs.flat walks the 3x3 grid row by row, matching the order of `panels`.
for ax, (column, title, ascending) in zip(axs.flat, panels):
    worst_five = df[['country', column]].sort_values(column, ascending = ascending).head(5)
    sns.barplot(x='country', y=column, data=worst_five, ax=ax)
    # Write each bar's value just above its top-left corner.
    for p in ax.patches:
        ax.annotate(str(p.get_height()), (p.get_x() * 1.01 , p.get_height() * 1.01))
    ax.set(xlabel = '', title= title)

# Rotate the country names so long labels do not overlap.
for ax in fig.axes:
    plt.sca(ax)
    plt.xticks(rotation = 90)

plt.tight_layout()
plt.show()

### Inferences
 - The above plots shows the five countries which are in need of aid individually for all the factors taken in consideration
 - These plots will help us in further analysis when we'll cluster the countries and find top 5

## Visualising the variables through univariate analysis via distplot

In [None]:
plt.figure(figsize = (25,12))
features = ['child_mort', 'exports', 'health', 'imports', 'income','inflation', 'life_expec', 'total_fer', 'gdpp']
# One distribution plot per feature, filling a 3x3 grid left-to-right, top-to-bottom.
for position, column in enumerate(features, start=1):
    plt.subplot(3, 3, position)
    sns.distplot(df[column])

### Inferences
 - life_expec is left-skewed (long tail towards low values) whereas all the other features are right-skewed (long tail towards high values).
 - total_fer and gdpp are bimodal whereas all the rest features are unimodal.

# Outlier Treatment

In [None]:
plt.figure(figsize = (25,25))
features = ['child_mort', 'exports', 'health', 'imports', 'income','inflation', 'life_expec', 'total_fer', 'gdpp']
# One horizontal box plot per feature on a 5x2 grid (the last cell stays empty).
for position, column in enumerate(features, start=1):
    plt.subplot(5, 2, position)
    # Pass the series by keyword: the meaning of the first positional argument
    # of sns.boxplot differs between seaborn releases.
    sns.boxplot(x=df[column])

### Outlier Analysis

 - There is at least one outlier in every feature.
 - In case of gdpp and health, there are too many outliers.
 - As per the data given, all the outliers couldn't be capped as we don't want the countries that needs aid to be treated with a single value.
 - As per business need, The Features are capped as mentioned below:
1. Soft-range capping of upper-end outliers (at the 99th percentile) is done for the following features: 'exports', 'health', 'imports', 'income', 'inflation', 'total_fer', 'gdpp'.
2. There are lower end outliers for feature 'life_expec' but capping is not done as those countries would be in need of aid.
3. There are upper end outliers for feature 'child_mort' but capping is not done as those countries would be in need of aid.
   

In [None]:
# Capping of upper-end outliers at the 99th percentile of each feature.
# 'child_mort' and 'life_expec' are deliberately left out of the list.
features = ['exports', 'health', 'imports', 'income','inflation', 'total_fer', 'gdpp']
for i in features:
    q4 = df[i].quantile(0.99)
    # Assign the capped column back through the frame. Chained indexing such as
    # df[i][mask] = q4 writes to an intermediate object and can silently leave
    # df unchanged (SettingWithCopy / copy-on-write).
    df[i] = df[i].clip(upper=q4)

In [None]:
plt.figure(figsize = (25,25))
features = ['child_mort', 'exports', 'health', 'imports', 'income','inflation', 'life_expec', 'total_fer', 'gdpp']
# One horizontal box plot per feature on a 5x2 grid (the last cell stays empty).
for position, column in enumerate(features, start=1):
    plt.subplot(5, 2, position)
    # Pass the series by keyword: the meaning of the first positional argument
    # of sns.boxplot differs between seaborn releases.
    sns.boxplot(x=df[column])

After the outlier treatment, we now have values that look like outliers in the above plot, this is an artifact of the earlier transformation. These values are within the "normal" expected range of values for the data.

As per the business requirement, not capping the outliers further.

# Hopkins Check


The process to evaluate the data and check if the data is feasible for clustering or not, is known as clustering tendency.

To check clustering tendency, we calculate the hopkins statistics. It determines whether the data points differs significantly from uniformly distributed data in multidimensional space.

In [None]:
#Calculating the Hopkins statistic
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X):
    """Estimate the Hopkins statistic (clustering tendency) of a dataset.

    ``m = max(1, int(0.1 * n))`` rows are sampled from ``X`` and compared with
    ``m`` points drawn uniformly from the bounding box of the data:

    * ``u`` -- distance from each uniform point to its nearest data row
    * ``w`` -- distance from each sampled row to its nearest *other* row

    The statistic is ``H = sum(u) / (sum(u) + sum(w))``. Values close to 1
    mean the data is far more concentrated than uniform noise.

    The sample and the uniform points come from the global ``random`` and
    ``numpy.random`` generators, so the result varies between calls unless
    those generators are seeded.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like of numbers
        One row per observation, one column per feature.

    Returns
    -------
    float
        The Hopkins statistic in ``[0, 1]``; ``0`` when every measured
        distance is zero (e.g. all rows identical).

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or has fewer than two rows.
    """
    # Local import: only this function needs the KD-tree.
    from scipy.spatial import cKDTree

    data = np.asarray(X, dtype=float)
    if data.ndim != 2:
        raise ValueError("X must be two-dimensional (rows x features)")
    n, d = data.shape
    if n < 2:
        raise ValueError("X needs at least two rows to measure neighbour distances")

    # Always sample at least one point so small inputs do not end in 0 / 0.
    m = max(1, int(0.1 * n))
    tree = cKDTree(data)

    # Uniform points are not part of the data, so their NEAREST neighbour (k=1)
    # is the distance of interest.
    uniform_points = uniform(data.min(axis=0), data.max(axis=0), size=(m, d))
    ujd, _ = tree.query(uniform_points, k=1)

    # Sampled rows are in the tree themselves (distance 0 at position 0), so
    # the second neighbour is the nearest *other* row.
    rand_X = sample(range(n), m)
    w_dist, _ = tree.query(data[rand_X], k=2)
    wjd = w_dist[:, 1]

    total = ujd.sum() + wjd.sum()
    if total == 0 or isnan(total):
        # Degenerate input: report the raw distances and fall back to 0.
        print(ujd, wjd)
        return 0

    return float(ujd.sum() / total)

In [None]:
hopkins(df.drop('country' , axis=1))

A higher Hopkins score (closer to 1) indicates a stronger clustering tendency, i.e. that clustering can be performed on the dataset.

## Scaling

Standard-Scaler scales the features around the centre with mean 0 and with a standard deviation of 1.

In [None]:
# Standardise the nine numeric indicators: fit the scaler on the selected
# columns and transform them in one step.
scaler = StandardScaler()
df_scale = scaler.fit_transform(
    df[['child_mort', 'exports', 'health', 'imports', 'income',
        'inflation', 'life_expec', 'total_fer', 'gdpp']]
)
df_scale.shape

In [None]:
# Wrap the scaled values in a DataFrame, restoring the feature names.
df_scale = pd.DataFrame(
    df_scale,
    columns=['child_mort', 'exports', 'health', 'imports', 'income',
             'inflation', 'life_expec', 'total_fer', 'gdpp'],
)
df_scale.head()

Dataframe "df_scale" is ready for clustering

# Clustering

## K-Means Clustering

### Elbow method for determining K

In [None]:
# elbow-curve/SSD: within-cluster sum of squared distances for K = 1..9
n_cluster = list(range(1,10))
ssd = []
for num_clusters in n_cluster:
    # Same seed as the final K=3 model so the curve is reproducible and
    # describes the model that is actually used afterwards.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=14)
    kmeans.fit(df_scale)
    ssd.append(kmeans.inertia_)

# plot the SSDs for each n_clusters
plt.plot(n_cluster,ssd,marker='o',markersize=7)
# Guide lines crossing at the candidate elbow (K = 3, i.e. ssd[2]).
plt.vlines(x=3, ymax=ssd[-1], ymin=ssd[0], colors="g", linestyles="-")
plt.hlines(y=ssd[2], xmax=9, xmin=1, colors="r", linestyles="-")
plt.xlabel('Number of clusters',fontsize=15)
plt.ylabel('Sum of Squared distance',fontsize=15)
plt.title("Elbow Curve")
plt.show()

### Silhouette Score for determining K

In [None]:
# Mean silhouette score for K = 2..10 (the score is undefined for K = 1).
ss = []
for k in range(2,11):
    # Same hyper-parameters and seed as the elbow curve and the final model,
    # so all three describe the same clustering procedure reproducibly.
    kmeans = KMeans(n_clusters = k, max_iter=50, random_state=14).fit(df_scale)
    ss.append([k, silhouette_score(df_scale, kmeans.labels_)])

# Build the plotting frame once instead of once per axis.
ss_df = pd.DataFrame(ss, columns=['k', 'silhouette'])
plt.plot(ss_df['k'], ss_df['silhouette'],marker='o',markersize=7)
plt.xlabel('Number of Clusters',fontsize=15)
plt.ylabel('Silhouette Width',fontsize=15)
plt.title("Silhouette Score")
plt.show()

##### Based on Elbow curve and Silhouette Analysis Curve, we decide to make a trade-off and choose the optimum value of K as 3

## Running K-means with K = 3

In [None]:
kmeans = KMeans(n_clusters=3, max_iter=50,random_state = 14)
kmeans.fit(df_scale)

In [None]:
kmeans.labels_

In [None]:
# Entering the Cluster in the column 'cluster_K' for further analysis

cluster_K = pd.DataFrame(kmeans.labels_, columns = ['cluster_K'])

In [None]:
# Saving the new dataframe for further analysis

df_cluster = df.copy()

In [None]:
# Combing the cluster with cluster labels extracted from K-means

df_cluster = pd.concat([df_cluster, cluster_K ], axis =1)

In [None]:
df_cluster.head()

In [None]:
# To check How many datapoints we have in each cluster
df_cluster.cluster_K.value_counts().reset_index()

### Visualising the clusters 

Analysing the clusters by comparing how 'gdpp', 'child_mort' and 'income' vary for each cluster of countries to recognise and differentiate the clusters of developed countries from the clusters of under-developed countries.

In [None]:
# Scatter-plot:

f, axes = plt.subplots(1, 3, figsize=(20,5))
# (x, y) column pair shown in each panel, left to right, coloured by K-means label.
axis_pairs = [('income', 'child_mort'), ('gdpp', 'income'), ('gdpp', 'child_mort')]
for panel, (x_col, y_col) in zip(axes, axis_pairs):
    sns.scatterplot(x=x_col, y=y_col, hue='cluster_K', data=df_cluster, palette='Set1', ax=panel)

In [None]:
# Box-plot:

f, axes = plt.subplots(1, 3, figsize=(25,7))
# (column, panel title) for each box plot, left to right, grouped by K-means label.
box_panels = [
    ('gdpp', 'GDP per capita'),
    ('income', 'Income per person'),
    ('child_mort', 'Child Mortality rate'),
]
for panel, (column, title) in zip(axes, box_panels):
    sns.boxplot(x='cluster_K', y=column, data=df_cluster, ax=panel)
    panel.set_title(title, fontsize=15)
plt.show()

#### Inferences
 - Countries with low gdpp,income and high child mortality are Under-developed countries (cluster_K = 2)
 - Countries with high gdpp,income and low child mortality are Developed countries (cluster_K = 1)
 - Countries with low gdpp,income and low child mortality are Developing countries (cluster_K = 0)


## Cluster Profiling

Cluster profiling is based on major three factors:
 1. Child Mortality
 2. Income
 3. GDPP

Creating a new dataframe for these variable along with the cluster ID

In [None]:
# Per-cluster mean of the three profiling indicators, indexed by K-means label.
profile_columns = ['child_mort', 'income', 'gdpp']
df_clusterK = df_cluster.groupby('cluster_K')[profile_columns].mean()
df_clusterK

## Visualising the profiled variables of the dataset via barplot 

In [None]:
df_clusterK.plot(kind='bar',logy=True);

#### Inferences:
 - Based on above plot, we know that countries with cluster ID 2 are Under-Developed and requires aid
 - Our main focus will be on the countries extracted below; the top 5 countries in need of aid are reported to the company

In [None]:
# K-means label IDs are arbitrary and can change with the data, seed or library
# version, so locate the under-developed cluster from the data itself (highest
# mean child mortality) instead of hard-coding its ID.
aid_cluster_K = df_cluster.groupby('cluster_K')['child_mort'].mean().idxmax()
df_cluster[df_cluster['cluster_K'] == aid_cluster_K]['country'].reset_index(drop=True)

### Extracting top 5 countries which have low income, gdpp and high child mortality

In [None]:
# K-means label IDs are arbitrary, so pick the under-developed cluster from the
# data (highest mean child mortality) rather than relying on a fixed ID.
aid_cluster_K = df_cluster.groupby('cluster_K')['child_mort'].mean().idxmax()

# Within that cluster rank by lowest income, then lowest gdpp, then highest
# child mortality, and keep the first five countries.
top5_Kmeans = (
    df_cluster[df_cluster['cluster_K'] == aid_cluster_K]
    .sort_values(by = ['income', 'gdpp', 'child_mort'], ascending = [True, True, False])
    .head(5)
)
top5_Kmeans.reset_index(drop=True)

 ###### The above results are based on K-means clustering. We will now perform hierarchical clustering on the same dataframe to compare the results and conclude our analysis

## Hierarchical Clustering

In [None]:
# Taking the already scaled dataset

df_scale.head()

In [None]:
df_cluster.head()

### Performing Single Linkage

In [None]:
# single linkage
mergings = linkage(df_scale, method="single", metric='euclidean')
dendrogram(mergings)
plt.title("Single Linkage")
plt.show()

### Performing Complete Linkage

In [None]:
# complete linkage
mergings = linkage(df_scale, method="complete", metric='euclidean')
dendrogram(mergings)
plt.title("Complete Linkage")
plt.show()

#### Inferences:
 - The dendrogram produced by single linkage is not well structured
 - The dendrogram produced by complete linkage has a proper tree-like structure
 
### Based on the complete linkage above, creating the hierarchical clustering model with n = 3 clusters

In [None]:
# 3 clusters
cluster_H = cut_tree(mergings, n_clusters=3).reshape(-1, )
cluster_H

In [None]:
# assign cluster labels
df_cluster['cluster_H'] = cluster_H
df_cluster.head()

In [None]:
df_cluster.cluster_H.value_counts().reset_index()

### Visualising the clusters 

Analysing the clusters by comparing how 'gdpp', 'child_mort' and 'income' vary for each cluster of countries to recognise and differentiate the clusters of developed countries from the clusters of under-developed countries.

In [None]:
# Scatter-Plot : 

f, axes = plt.subplots(1, 3, figsize=(20,5))
# (x, y) column pair shown in each panel, left to right, coloured by hierarchical label.
axis_pairs = [('income', 'child_mort'), ('gdpp', 'income'), ('gdpp', 'child_mort')]
for panel, (x_col, y_col) in zip(axes, axis_pairs):
    sns.scatterplot(x=x_col, y=y_col, hue='cluster_H', data=df_cluster, palette='Set1', ax=panel)

In [None]:
# Boxplot :
f, axes = plt.subplots(1, 3, figsize=(20,5))
# (column, panel title) for each box plot, left to right, grouped by hierarchical label.
box_panels = [
    ('child_mort', 'Child Mortality Rate'),
    ('gdpp', 'GDP per capita'),
    ('income', 'Income per person'),
]
for panel, (column, title) in zip(axes, box_panels):
    sns.boxplot(x='cluster_H', y=column, data=df_cluster, ax=panel)
    panel.set_title(title, fontsize=15)
plt.show()

#### Inferences
 - Since the sizes of the clusters vary significantly, we can't categorize the countries based on the level of development

## Cluster Profiling

Cluster profiling is based on major three factors:
 1. Child Mortality
 2. Income
 3. GDPP

Creating a new dataframe for these variable along with the cluster ID

In [None]:
# Per-cluster mean of the three profiling indicators, indexed by hierarchical label.
profile_columns = ['child_mort', 'income', 'gdpp']
df_clusterH = df_cluster.groupby('cluster_H')[profile_columns].mean()
df_clusterH

### Visualising the profiled variables of the dataset via barplot 

In [None]:
df_clusterH.plot(kind = 'bar',logy=True);

#### Inferences:
 - Based on above plot, we know that countries with cluster ID 0 are Under-Developed and requires aid
 - Our main focus will be on the countries extracted below; the top 5 countries in need of aid are reported to the company

In [None]:
df_cluster[df_cluster['cluster_H'] ==0]['country'].reset_index(drop=True)

### Extracting the top 5 countries which have low income, gdpp and high child mortality

In [None]:
top5_Hier = df_cluster[df_cluster['cluster_H'] ==0].sort_values(by = ['income', 'gdpp', 'child_mort'], ascending = [True, True, False]).head(5)
top5_Hier.reset_index(drop=True)

# Conclusion

#### I Choose K-Means Clustering Algorithm over Hierarchical Clustering Algorithm:

 - The cluster_K value counts were properly divided and visualizing each cluster was possible.

 - In both the methods, 3 clusters were formed but K-means gave significant plots.

 - After grouping all the countries into 3 groups by using some socio-economic and health factors, we can determine the overall development of the countries.

 - Here, the countries are categorised into list of developed countries, developing countries and under-developed countries.

 - In Developed countries, we can see the GDP per capita and income is high where as Death of children under 5 years of age per 1000 live births i.e. child-mort is very low, which is expected.

 - In Developing countries and Under-developed countries, the GDP per capita and income are low and child-mort is high. Specially, for under-developed countries, the death rate of children is very high.

# Recommendations:

## The top countries that are in need of aid are presented below:

In [None]:
for countries in top5_Kmeans.country:
    print(countries)

### The primary focus for the funding should be on countries of this segment.

+ The major approach should be to provide better health facilities as this would increase the life expectancy and decrease the child mortality.
 
+ Other focus should be on increasing the exports as that would give an increase in income and thus, GDPP will increase.

+ The difference between the average of socio-economic factors of developed countries as compared to under-developed countries is quite huge.This funding would definitely help in improved conditions for above mentioned countries even though it might not show a significant impact