In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this notebook I am going to cluster countries using unsupervised learning. I will use two techniques: the first is **K-means clustering** and the second is **Hierarchical clustering**

The motive for clustering the countries is to help international NGOs decide how much money they need to spend on different countries for their development

Finally, I am going to make a list of under-developing, developing and developed countries based on this analysis

In [None]:
# import libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import scipy.cluster.hierarchy as hcluster
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Relative paths to the two input CSV files:
#   data_dict_path - the data dictionary file
#   country_path   - the per-country data file
data_dict_path = '../input/unsupervised-learning-on-country-data/data-dictionary.csv'
country_path = '../input/unsupervised-learning-on-country-data/Country-data.csv'

In [None]:
# Load both CSV files in one pass: the data dictionary first, then the country data
dict_df, df = (pd.read_csv(csv_path) for csv_path in (data_dict_path, country_path))

In [None]:
# Preview the first five rows of the data-dictionary dataset
dict_df.head()

In [None]:
# Preview the first five rows of the country dataset
df.head()

In [None]:
# Shape of the country dataset as (rows, columns)
df.shape

In [None]:
# Column dtypes and non-null counts for both frames, separated by a dashed rule
separator = '-' * 50
df.info()
print(separator)
dict_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

In [None]:
# Count of missing values per column in the country dataset
df.isnull().sum()

In [None]:
# Count of missing values per column in the data-dictionary dataset
dict_df.isnull().sum()

Let's see the number of unique countries present in our dataset

In [None]:
# Number of distinct values in the country column
df['country'].nunique()

As we are going to use unsupervised learning techniques, we don't need the country column here, so we are going to drop it

In [None]:
# Feature matrix: everything except the country name, which is only an identifier
data = df.drop(columns=['country'])

In [None]:
# Preview the feature frame after dropping the country column
data.head()

In [None]:
# Pairwise correlation between the feature columns (pandas default method),
# drawn as a heatmap with the coefficient written in every cell
corr_matrix=data.corr()
sns.heatmap(corr_matrix,annot=True)

From above we can conclude that:<br>

1. gdpp and income, imports and exports, and child_mort and total_fer are highly positively correlated<br>
2. whereas life_expec and child_mort are highly negatively correlated

In [None]:
# One box plot per indicator on a 3x3 grid, filled in row-major order
panels = [
    ('child_mort', 'Child Mortality Rate'),
    ('health', 'Health'),
    ('income', 'Income per Person'),
    ('inflation', 'Inflation'),
    ('imports', 'Imports'),
    ('life_expec', 'Life Expectancy'),
    ('total_fer', 'Total Fertility'),
    ('gdpp', 'GDP per Capita'),
    ('exports', 'Exports'),
]
fig, ax = plt.subplots(3, 3, figsize=(15, 15))
for axis, (column, title) in zip(ax.flat, panels):
    sns.boxplot(y=df[column], ax=axis)
    axis.set_title(title)
plt.show()

From the box plots above we can see that there are many outliers in our dataset. Most of the outliers are in income per person, GDP per capita and exports. We could remove the outliers, but we are not going to do so because our dataset is very small (only 167 rows)

In [None]:
# Scatter-plot matrix of the numeric columns, one panel per pair of columns
sns.pairplot(df)

# Scaling data

Our dataset is not scaled: some values are much bigger than others. If we do not scale our data, our model will not perform well, so we are now going to scale it using `StandardScaler`<br>
**StandardScaler** transforms the data so that each feature has a mean of 0 and a variance of 1.

**Note:-** Scaling data is necessary only for algorithms that are based on distance, such as K-means clustering and Hierarchical clustering

In [None]:
# Standardise every feature: learn the per-column statistics, then apply them
scaling = StandardScaler()
scaled = scaling.fit(data).transform(data)

In [None]:
# Wrap the scaled array in a DataFrame, reusing the column names of `data`
scaled_df=pd.DataFrame(scaled,columns=data.columns)

# print scaled dataset
scaled_df.head()

# K-Means Clustering

K-means clustering is one of the most popular unsupervised machine learning algorithms. It is a centroid-based (distance-based) algorithm. The algorithm works as follows:<br>
1. First we randomly initialize k points, called means
2. Then we assign each item to its closest mean and update each mean's coordinates to the average of the items assigned to it so far
3. We repeat the process for a given number of iterations, and at the end we have our clusters

To decide how many clusters to use, we are going to apply the popular elbow method

In [None]:
# plot elbow curve
#
# For k = 1..9, fit K-means and record the inertia (within-cluster sum of
# squared distances). The model is fitted on the *scaled* features so the
# curve describes the same feature space as the final model; fitting on the
# raw `data` would let the largest-magnitude columns dominate the distances.
# random_state is fixed so the curve is reproducible between runs.

inertias = []
K = range(1, 10)
for n_clusters in K:
    elbow_model = KMeans(n_clusters=n_clusters, random_state=111)
    elbow_model.fit(scaled_df)
    inertias.append(elbow_model.inertia_)

plt.plot(K, inertias, marker='o')
plt.title('Elbow Method', fontsize=15)
plt.xlabel('Number of clusters', fontsize=15)
plt.ylabel('Sum of Squared distance', fontsize=15)
plt.show()

As we can see, the elbow (the knee-like bend) is at 3, so we choose 3 as the number of clusters

In [None]:
# Fit the final K-means model with 3 clusters on the scaled features.
# random_state is fixed so the cluster assignment is reproducible between runs.
kmeans = KMeans(n_clusters = 3,random_state = 111)
kmeans.fit(scaled_df)

In [None]:
# Number of rows (countries) assigned to each K-means cluster label
pd.Series(kmeans.labels_).value_counts()

Now we are going to check how good our model is, using the **Silhouette Coefficient**

In [None]:
# Evaluate clustering quality with the Silhouette Coefficient for K=3,
# computed on the same scaled features the model was fitted on.
# The score lies in [-1, 1]; higher means tighter, better-separated clusters.

metrics.silhouette_score(scaled_df, kmeans.labels_)

# Prediction

In [None]:
# Assign a cluster label to every row of the scaled data.
# NOTE(review): fit_predict re-fits the model that was already fitted earlier, and
# cluster_labels is not read again in this notebook (the next cell uses kmeans.labels_).
cluster_labels = kmeans.fit_predict(scaled_df)

In [None]:
# Attach the K-means label of each country to a *copy* of the original frame.
# pd.DataFrame(df) does not copy the underlying data (in pandas versions without
# copy-on-write it shares it), so adding a column there also adds it to `df`;
# an explicit copy keeps `df` unchanged and makes this cell safe to re-run.
preds = kmeans.labels_
kmeans_df = df.copy()
kmeans_df['KMeans_Clusters'] = preds
kmeans_df.head(10)

In [None]:
# Save the frame with the K-means labels as a CSV file (relative path, row index omitted)
kmeans_df.to_csv('kmeans_result.csv',index=False)

# Visualization of clusters

In [None]:
# Visualization of clusters: child mortality vs gdpp, coloured by K-means cluster.
# Columns are selected by name through the x=/y= keywords: seaborn >= 0.12 no
# longer accepts x and y as positional arguments and raises a TypeError instead.
sns.scatterplot(data=kmeans_df, x='child_mort', y='gdpp', hue='KMeans_Clusters')
plt.title("Child Mortality vs gdpp", fontsize=15)
plt.xlabel("Child Mortality", fontsize=12)
plt.ylabel("gdpp", fontsize=12)
plt.show()

In [None]:
# Visualization of clusters: inflation vs gdpp, coloured by K-means cluster.
# Columns are selected by name through the x=/y= keywords: seaborn >= 0.12 no
# longer accepts x and y as positional arguments and raises a TypeError instead.
sns.scatterplot(data=kmeans_df, x='inflation', y='gdpp', hue='KMeans_Clusters')
plt.title("inflation vs gdpp", fontsize=15)
plt.xlabel("inflation", fontsize=12)
plt.ylabel("gdpp", fontsize=12)
plt.show()

From the two cluster plots above we can conclude that<br>
1. A country with high child mortality, low GDP per capita and low inflation (the measurement of the annual growth rate of the total GDP) is an **under-developing country**
2. A country with low child mortality, high gdpp and high inflation is a **developed country**


**So here we conclude that**<br>
0 = **under-developing country**<br>
1 = **developing country**<br>
2 = **developed country**<br>

In [None]:
# Split the countries by K-means cluster and count each group.
# Cluster ids follow the mapping stated in the notebook text:
#   0 = under-developing, 1 = developing, 2 = developed.
# NOTE(review): K-means ids are arbitrary; re-check this mapping if the model,
# the seed or the library version changes.
under_developing=kmeans_df[kmeans_df['KMeans_Clusters']==0]['country']
developing=kmeans_df[kmeans_df['KMeans_Clusters']==1]['country']
developed=kmeans_df[kmeans_df['KMeans_Clusters']==2]['country']

# Each label is paired with its own group; previously the "developed" line
# printed the under-developing count and vice versa.
print("Number of deveoped countries",len(developed))
print("Number of developing countries",len(developing))
print("Number of under-developing countries",len(under_developing))

In [None]:
# Names of the countries in the group labelled developed
list(developed)

In [None]:
# Names of the countries in the group labelled developing
list(developing)

**Let's check whether India is present in the developing countries list**

In [None]:
# Report whether India is in the developing group, and say so explicitly when
# it is not (the original loop printed nothing in that case).
# The comparison is made against the values: `'India' in developing` would
# search the Series index rather than the country names.
if developing.eq('India').any():
    print('Yes', 'India', 'is present in developing countries list')
else:
    print('No, India is not present in developing countries list')

In [None]:
# Names of the countries in the group labelled under-developing
list(under_developing)

# Hierarchical Clustering

Also called Hierarchical cluster analysis or HCA is an unsupervised clustering algorithm which involves creating clusters that have predominant ordering from top to bottom.
This clustering technique is divided into two types:
1. Agglomerative Hierarchical Clustering
2. Divisive Hierarchical Clustering

Agglomerative Hierarchical Clustering<br>
The Agglomerative Hierarchical Clustering is the most common type of hierarchical clustering used to group objects in clusters based on their similarity. It’s also known as AGNES (Agglomerative Nesting). It's a “bottom-up” approach: each observation starts in its own cluster, and pairs of clusters are merged as one moves up the hierarchy.

Dendrogram<br>
A Dendrogram is a type of tree diagram showing hierarchical relationships between different sets of data.

In [None]:
# Plot the dendrogram: build the Ward linkage matrix from the scaled features,
# then draw the merge tree on a very wide figure
linkage_matrix = hcluster.linkage(scaled_df, method='ward')
plt.figure(figsize=(50, 12))
dend = hcluster.dendrogram(linkage_matrix)

From the dendrogram above we can take a minimum of 2 clusters and a maximum of 5. As the dendrogram shows, 3 is the right number of clusters, so we are going to use 3 clusters
 


In [None]:
# Getting labels from Agglomerative Hierarchical clustering.
# The estimator gets its own name so it does not overwrite `hcluster`, the
# alias of scipy.cluster.hierarchy used by the dendrogram cell; rebinding that
# name made the dendrogram cell fail when re-run.
# `affinity` is omitted: it was deprecated in scikit-learn 1.2 and removed in
# 1.4, and the default distance is already euclidean, which ward linkage requires.
agg_model = AgglomerativeClustering(n_clusters=3, linkage='ward')
hcluster_label = agg_model.fit_predict(scaled_df)

In [None]:
# Attach the hierarchical-clustering label of each country to a *copy* of the
# original frame. pd.DataFrame(df) does not copy the underlying data (in pandas
# versions without copy-on-write it shares it), so the new column would also be
# written into `df`.
hcluster_df = df.copy()
#adding hcluster labels in hcluster_df
hcluster_df['hcluster'] = hcluster_label
#first few rows of hcluster_df
hcluster_df.head()

# Visualization hcluster

In [None]:
# Visualizing hcluster results: child mortality vs gdpp, coloured by cluster.
# Columns are selected by name through the x=/y= keywords: seaborn >= 0.12 no
# longer accepts x and y as positional arguments and raises a TypeError instead.
sns.scatterplot(data=hcluster_df, x='child_mort', y='gdpp', hue='hcluster')
plt.title("Child Mortality vs gdpp", fontsize=15)
plt.xlabel("Child Mortality", fontsize=12)
plt.ylabel("gdpp", fontsize=12)
plt.show()

In [None]:
# Visualizing hcluster results: inflation vs gdpp, coloured by cluster.
# Columns are selected by name through the x=/y= keywords: seaborn >= 0.12 no
# longer accepts x and y as positional arguments and raises a TypeError instead.
sns.scatterplot(data=hcluster_df, x='inflation', y='gdpp', hue='hcluster')
plt.title("Inflation vs gdpp", fontsize=15)
plt.xlabel("Inflation", fontsize=12)
plt.ylabel("gdpp", fontsize=12)
plt.show()

From above we can conclude that:-<br>
0 = developed country<br>
1 = developing country<br>
2 = under-developing country

In [None]:
# Split the countries by hierarchical cluster id
# (0 = developed, 1 = developing, 2 = under-developing) and report each group's size
cluster_ids = hcluster_df['hcluster']
developed = hcluster_df.loc[cluster_ids == 0, 'country']
developing = hcluster_df.loc[cluster_ids == 1, 'country']
under_developing = hcluster_df.loc[cluster_ids == 2, 'country']

for label, members in (
    ("Number of deveoped countries", developed),
    ("Number of developing countries", developing),
    ("Number of under-developing countries", under_developing),
):
    print(label, len(members))

**If you like the notebook then don't forget to upvote it**