In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the country-level indicator table from the Kaggle input directory.
DATA_PATH = '/kaggle/input/unsupervised-learning-on-country-data/Country-data.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded table.
data.head()

**Attribute Information**


1. country : Name of the country
2. child_mort : Death of children under 5 years of age per 1000 live births
3. exports : Exports of goods and services per capita. Given as %age of the GDP per capita
4. health : Total health spending per capita. Given as %age of GDP per capita
5. imports : Imports of goods and services per capita. Given as %age of the GDP per capita
6. Income : Net income per person
7. Inflation : The measurement of the annual growth rate of the Total GDP
8. life_expec : The average number of years a new born child would live if the current mortality patterns are to remain the same
9. total_fer : The number of children that would be born to each woman if the current age-fertility rates remain the same.
10. gdpp : The GDP per capita. Calculated as the Total GDP divided by the total population.


**Problem Statement :**
Categorise the countries using socio-economic and health factors that determine their overall development, then assign each country to one of the following groups:
1. Under-developing country
2. Developing country
3. Developed country

## EDA 

In [None]:
# Column dtypes and non-null counts.
data.info()

Using exports and imports we can derive the trade balance (stored below as `Trade_Deficiency`), i.e. _exports − imports_. Negative values indicate a trade deficit.

In [None]:
# Derived feature: exports minus imports (both are given as % of GDP per capita).
# Positive values mean exports exceed imports; negative values mean the opposite.
data['Trade_Deficiency'] = data['exports'].sub(data['imports'])

In [None]:
# Summary statistics of the numeric columns, including the derived Trade_Deficiency.
data.describe()

In [None]:
# Count missing values per column.
data.isna().sum()

***There is no missing data***

In [None]:
# Keep a separate copy of the frame (country names plus unscaled indicators).
# `data` is later rebound to the scaled numeric matrix, while the cluster labels
# are attached to this copy for inspection and plotting.
data_copy = data.copy()

In [None]:
# Spearman rank-correlation heatmap of the numeric indicators.
# `data` still contains the text column `country` here; recent pandas versions
# raise when DataFrame.corr meets a non-numeric column, so select the numeric
# columns explicitly (this works on old and new pandas alike).
plt.figure(figsize=(10, 8))
sns.heatmap(
    data.select_dtypes(include='number').corr(method='spearman'),
    square=True,
    cmap='coolwarm',
    annot=True,
)

In [None]:
# Long-form table of pairwise Spearman correlations: one row per unordered pair
# of features, with the absolute value added for ranking.

# Only numeric columns can be correlated; `country` is text and makes
# DataFrame.corr raise on recent pandas versions.
corr_matrix = data.select_dtypes(include='number').corr(method='spearman')

# Keep the strict upper triangle (k=1): each pair appears once and the diagonal
# self-correlations are discarded. Everything else becomes NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# Stack into (feature1, feature2, correlation) rows. The explicit dropna() removes
# the masked entries whether or not stack() drops missing values itself.
corr_values = (corr_matrix
               .where(upper_mask)
               .stack()
               .dropna()
               .to_frame()
               .reset_index()
               .rename(columns={'level_0': 'feature1',
                                'level_1': 'feature2',
                                0: 'correlation'}))

# Absolute value, used for sorting by strength regardless of sign.
corr_values['abs_correlation'] = corr_values.correlation.abs()

In [None]:
# Ten feature pairs with the strongest correlation (by absolute value).
corr_values.sort_values(by = 'abs_correlation', ascending=False).head(10)

**EDA Inferences:-**

>1. Child mortality rate decreases as GDPP increases. The two are very strongly correlated, which is expected: developed countries (with higher GDPP) have better healthcare and hence a better chance of survival.

>2. Child mortality rate rises with total fertility rate. These look like dependent features in my opinion: women tend to give birth to more children where, unfortunately, earlier children did not survive.

>3. Child mortality rate is inversely related to life expectancy. This is again a very dependent feature, because when more children unfortunately die very young it pulls down the overall life expectancy of the country.

>4. Inflation is weakly inversely related to GDPP, which in my opinion might be due to economic saturation in highly developed nations.

>5. Per capita income is heavily correlated with GDPP because one is roughly a function of the other.

>6. Imports and exports increase with one another, which implies that the trading power of the country as a whole grows; i.e., countries that export more are also likely to import more.

>7. Spending on health increases with GDPP and income, which is self-explanatory.

>8. Income rises with exports, which might be because people generate income by producing goods and services that are later exported.

>9. Child mortality rate is inversely related to healthcare expenditure and income, which shows that the circumstances of low-income groups are often responsible for low life expectancy among children.


In [None]:
# Scatter-plot matrix of the indicator columns, with per-column distributions on the diagonal.
sns.pairplot(data = data)

**Outliers Treatment**

In [None]:
# One box plot per indicator, to check for outliers.
# The nine original indicators fill the first three rows of a 4x3 grid in
# row-major order. The derived Trade_Deficiency goes in the middle of the last
# row, and the two unused panels are hidden.
boxplot_panels = [
    ('child_mort', 'Child Mortality Rate'),
    ('health', 'Health'),
    ('income', 'Income per Person'),
    ('inflation', 'Inflation'),
    ('imports', 'Imports'),
    ('life_expec', 'Life Expectancy'),
    ('total_fer', 'Total Fertility'),
    ('gdpp', 'GDP per Capita'),
    ('exports', 'Exports'),
]

fig, ax = plt.subplots(4, 3, figsize=(15, 15))
for panel, (column, title) in zip(ax.flat, boxplot_panels):
    sns.boxplot(y=data[column], ax=panel)
    panel.set_title(title)

sns.boxplot(y=data['Trade_Deficiency'], ax=ax[3, 1])
ax[3, 1].set_title('Trade Deficiency')
ax[3, 0].axis('off')
ax[3, 2].axis('off')
plt.show()

There are some outliers in income, exports, imports and trade deficiency, but because the dataset is small we do not remove them.

In [None]:
# Ten countries with the lowest and ten with the highest GDP per capita.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the earlier positional (x, y) call raises TypeError there.
gdpp = data.sort_values(by=['gdpp'], ascending=True)
plt.figure(figsize=[18, 10])
plt.subplot(2, 1, 1)
sns.barplot(x='country', y='gdpp', data=gdpp.head(10))
plt.title('Top 10 countries having lowest GDP per capita')
plt.subplot(2, 1, 2)
sns.barplot(x='country', y='gdpp', data=gdpp.tail(10))
plt.title('Top 10 countries having highest GDP per capita')
plt.show()

**Countries having the lowest GDP per capita**
>1. Congo, Dem. Rep.
>2. Niger
>3. Madagascar

**Country having highest GDP**
>1. Bahrain
>2. Barbados
>3. Estonia

In [None]:
# Ten countries with the lowest and ten with the highest Trade_Deficiency
# (exports minus imports).
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the earlier positional (x, y) call raises TypeError there.
Trade_Deficiency = data.sort_values(by=['Trade_Deficiency'], ascending=True)
plt.figure(figsize=[18, 10])
plt.subplot(2, 1, 1)
sns.barplot(x='country', y='Trade_Deficiency', data=Trade_Deficiency.head(10))
plt.title('Top 10 countries having lowest Trade Deficiency')
plt.subplot(2, 1, 2)
sns.barplot(x='country', y='Trade_Deficiency', data=Trade_Deficiency.tail(10))
plt.title('Top 10 countries having highest Trade Deficiency')
plt.show()

**Country having lowest Trade Deficiency**
>1. Haiti
>2. Tonga
>3. Tajikistan

**Country having highest Trade Deficiency**
>1. Gabon
>2. Azerbaijan
>3. Libya

In [None]:
# Ten countries with the lowest and ten with the highest life expectancy.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the earlier positional (x, y) call raises TypeError there.
# The titles previously read "Life Experience"; the plotted column is life_expec.
life_expec = data.sort_values(by=['life_expec'], ascending=True)
plt.figure(figsize=[18, 10])
plt.subplot(2, 1, 1)
sns.barplot(x='country', y='life_expec', data=life_expec.head(10))
plt.title('Top 10 countries having lowest Life Expectancy')
plt.subplot(2, 1, 2)
sns.barplot(x='country', y='life_expec', data=life_expec.tail(10))
plt.title('Top 10 countries having highest Life Expectancy')
plt.show()

**Countries having the lowest Life Expectancy**
>1. Haiti
>2. Central African Republic
>3. Zambia

**Countries having the highest Life Expectancy**
>1. Tunisia
>2. Antigua and Barbuda
>3. Barbados

In [None]:
# Distribution plot of every indicator on a 4x3 grid.
# The nine original indicators fill the first three rows in row-major order;
# Trade_Deficiency sits in the middle of the last row and the two unused panels
# are hidden.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the minimum seaborn version is confirmed.
grid_columns = ['child_mort', 'health', 'income',
                'inflation', 'imports', 'life_expec',
                'total_fer', 'gdpp', 'exports']

fig, ax = plt.subplots(4, 3, figsize=(15, 15))
for panel, column in zip(ax.flat, grid_columns):
    dp = sns.distplot(data[column], ax=panel)
dp = sns.distplot(data['Trade_Deficiency'], ax=ax[3, 1])

for unused_panel in (ax[3, 0], ax[3, 2]):
    unused_panel.axis('off')
plt.show()

>1. From the above plot, most of the average income per person and gdp per capita is observed in the range of 0-15000
>2. On average, we can infer that life expectancy of a person for most of the countries is observed between 60-80
>3. Child mortality (Death of children under 5 years of age per 1000 live births) seems to be below 50 in most of the countries, only few countries child mortality is above 100.

***Data Scaling using Standard scaler***

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Prepare for scaling: create the scaler and drop the text column so that only
# the numeric indicators remain.
# NOTE(review): `data` is rebound here, so re-running this cell on its own fails
# once `country` is gone. Country names remain available in `data_copy`.
sc = StandardScaler()
data = data.drop(columns=['country'])

In [None]:
# Standardise every column with the StandardScaler created above.
# NOTE(review): `data` is rebound to the scaler's output (presumably a plain NumPy
# array with default scikit-learn settings - confirm), so DataFrame methods and
# column names are no longer available on it from here on.
data = sc.fit_transform(data)

# Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a PCA without limiting the number of components; its explained-variance
# ratios are plotted below to choose how many components to keep.
pca = PCA().fit(data)

In [None]:
# Cumulative explained variance as a function of the number of components kept.
plt.rcParams['figure.figsize'] = [6, 6]
sns.set_style("whitegrid")
sns.set_context("talk")

# Convert the ratios to percent so the values match the '(%)' axis label, and
# pair them with a 1-based component count. Plotting the bare array would place
# the first component at x=0, making the chart read one component too low.
cumulative_variance_pct = np.cumsum(pca.explained_variance_ratio_) * 100
component_counts = np.arange(1, len(cumulative_variance_pct) + 1)

plt.figure()
plt.plot(component_counts, cumulative_variance_pct, marker='o')
plt.xticks(component_counts)

# define the labels & title
plt.xlabel('Number of Components', fontsize = 15)
plt.ylabel('Variance (%)', fontsize = 15)
plt.title('Explained Variance', fontsize = 20)

# show the plot
plt.show()

We can use **6 components**.

In [None]:
# Project the scaled data onto the first six principal components.
pca_6 = PCA(n_components=6)
pca_6.fit(data)
data_pca = pca_6.transform(data)

In [None]:
# Wrap the projected values in a DataFrame with one labelled column per component.
component_names = [f'principal component {i}' for i in range(1, 7)]
data_pca = pd.DataFrame(data=data_pca, columns=component_names)

data_pca.head()

# Kmeans Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow plot: K-Means inertia (within-cluster sum of squares) for k = 1..10.
# random_state is fixed so the curve is reproducible across runs. n_init is set
# explicitly so the number of restarts does not depend on the scikit-learn
# version's default.
list_num_clusters = list(range(1, 11))
inertia = [
    KMeans(n_clusters=num_clusters, n_init=10, random_state=42).fit(data_pca).inertia_
    for num_clusters in list_num_clusters
]

plt.plot(list_num_clusters, inertia)
plt.scatter(list_num_clusters, inertia)
plt.xticks(list_num_clusters)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Three clusters, matching the three development groups in the problem statement.
# random_state is fixed because K-Means cluster ids depend on the random
# initialisation; without a seed the ids (and the per-cluster interpretation
# written below) can change on every run.
# NOTE(review): after seeding, re-check that the cluster descriptions below still
# match the ids produced.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

In [None]:
# Fit K-Means on the PCA-reduced data, then assign every country to its nearest centroid.
kmeans_pred = kmeans.fit(data_pca).predict(data_pca)

In [None]:
# Attach the K-Means labels to the unscaled copy of the data.
# assumes row order is unchanged between data_copy and data_pca - TODO confirm
data_copy['cluster'] = kmeans_pred

In [None]:
# Child mortality against total fertility, coloured by K-Means cluster.
plt.figure(figsize=(15, 10))
sns.scatterplot(data=data_copy, x='child_mort', y='total_fer', hue='cluster')

In [None]:
# First ten countries assigned to K-Means label 0.
data_copy.loc[data_copy['cluster'] == 0].head(10)

**Cluster 1** (label `0` in the table above) contains the developing countries, most of which are in South America and Asia.

In [None]:
# First ten countries assigned to K-Means label 1.
data_copy.loc[data_copy['cluster'] == 1].head(10)

**Cluster 2** (label `1` in the table above) contains the under-developed countries, most of which are in Africa.

In [None]:
# First ten countries assigned to K-Means label 2.
data_copy.loc[data_copy['cluster'] == 2].head(10)

**Cluster 3** (label `2` in the table above) contains the developed countries, most of which are in Europe, North America and some parts of Asia.

# Hierarchical Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Ward-linkage agglomerative clustering into three groups on the PCA-reduced data.
# fit_predict fits the model and returns the labels in one pass; the separate
# fit() call that preceded it only repeated the same work. `ag` is left fitted
# for later use.
ag = AgglomerativeClustering(n_clusters=3, linkage='ward', compute_full_tree=True)
data_copy['agglom'] = ag.fit_predict(data_pca)

In [None]:
# Child mortality against total fertility, coloured by agglomerative cluster.
plt.figure(figsize=(15, 10))
sns.scatterplot(data=data_copy, x='child_mort', y='total_fer', hue='agglom')

In [None]:
from scipy.cluster import hierarchy

In [None]:
# Dendrogram of the Ward hierarchy over the PCA-reduced observations.
# scipy's linkage() expects the observation matrix (or a condensed distance
# vector). It was previously given `ag.children_`, which holds the merge-tree
# node indices of the fitted sklearn model, so the resulting tree clustered
# index pairs rather than countries. Build the linkage from the data itself.
Z = hierarchy.linkage(np.asarray(data_pca), method='ward')

fig, ax = plt.subplots(figsize=(15,5))

# truncate_mode='lastp' with p=3 collapses the tree to its last three merged
# clusters; the leaf labels then show how many observations each one contains.
den = hierarchy.dendrogram(Z, orientation='top',
                           p=3, truncate_mode='lastp',
                           show_leaf_counts=True, ax=ax)
ax.set_ylabel('Ward linkage distance')