In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA


import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Read the per-country indicator table and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/unsupervised-learning-on-country-data/Country-data.csv'
)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Summary of the frame: column names, dtypes and non-null counts.
df.info()

# Exploratory Data Analysis

In [None]:
# Every column except the country name is treated as a numeric indicator.
# Filtering (instead of list.remove) also keeps this cell re-runnable after
# the 'country' column has been dropped from `df` further down.
num_cols = [col for col in df.columns if col != 'country']

# One histogram + KDE panel per indicator, laid out on a 5x2 grid.
# `sns.distplot` is deprecated and scheduled for removal from seaborn;
# `histplot(..., kde=True, stat='density')` is the documented replacement
# and draws the same density-normalised histogram with a KDE overlay.
plt.figure(figsize=(20, 20))
for i, col in enumerate(num_cols, start=1):
    plt.subplot(5, 2, i)
    sns.histplot(df[col], kde=True, stat='density')


In [None]:
# Exports against GDP per capita.
sns.scatterplot(data=df, x='gdpp', y='exports')

In [None]:
# Income against GDP per capita.
sns.scatterplot(data=df, x='gdpp', y='income')

In [None]:
# Inflation against GDP per capita.
sns.scatterplot(data=df, x='gdpp', y='inflation')

In [None]:
# Health spending against GDP per capita.
sns.scatterplot(data=df, x='gdpp', y='health')

In [None]:
# Life expectancy against GDP per capita.
sns.scatterplot(data=df, x='gdpp', y='life_expec')

In [None]:
plt.figure(figsize=(8, 8))
# `df` still contains the string-valued 'country' column at this point.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
# silently and raises instead, so select the numeric columns explicitly.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')

# Insights
* Child mortality is highly correlated with total fertility
* Child mortality is highly negatively correlated with life expectancy
* Exports and imports are correlated
* Income and GDP per capita (`gdpp`) are highly correlated

# Preprocessing

In [None]:
# Remove the non-numeric identifier before scaling/clustering.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns='country', errors='ignore')

In [None]:
# Remember the column labels so they can be re-attached when the scaled
# values are wrapped in a DataFrame again.
columns=df.columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature with a StandardScaler fitted on the full table.
ss = StandardScaler()
ss.fit(df)
df_scaled = ss.transform(df)

In [None]:
# Wrap the scaled values in a DataFrame carrying the original column labels.
df_scaled = pd.DataFrame(data=df_scaled, columns=columns)

In [None]:
# Preview the scaled feature table.
df_scaled.head()

In [None]:
# Sweep k = 2..9 and record two model-selection diagnostics per k:
#   distortions -> within-cluster sum of squares (inertia), for the elbow plot
#   sil_scores  -> mean silhouette coefficient of the resulting labelling
distortions = []
sil_scores = []

for i in range(2, 10):
    # K-Means starts from random centroids; without a fixed random_state the
    # curves change on every run. n_init is set explicitly because its
    # default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    label = kmeans.fit_predict(df_scaled)
    distortions.append(kmeans.inertia_)
    sil_scores.append(silhouette_score(df_scaled, label))

In [None]:
# Elbow plot: inertia per number of clusters, drawn as a line with a
# separate marker layer on top.
k_values = np.arange(2, 10, 1)
plt.plot(k_values, distortions)
plt.plot(k_values, distortions, 'o')

In [None]:
# Silhouette score per number of clusters, drawn as a line with a separate
# marker layer on top.
k_values = np.arange(2, 10, 1)
plt.plot(k_values, sil_scores)
plt.plot(k_values, sil_scores, 'o')

# K-Means

In [None]:
# Final model with k = 3.
# random_state pins the random centroid initialisation so that the cluster
# numbering (0/1/2) is the same on every run; the conclusions further down
# refer to clusters by number and would otherwise be unreliable.
kmeans = KMeans(
    n_clusters=3,
    n_init=10,
    init='random',
    tol=1e-04,
    max_iter=300,
    random_state=42,
)
# fit_predict is equivalent to fit() followed by predict() on the same data.
y_pred = kmeans.fit_predict(df_scaled)
y_pred

In [None]:
# Attach the predicted cluster id to each scaled row.
df_scaled['clusters'] = y_pred

# The country names were dropped from `df` before scaling, so re-read the
# source file to recover them (rows are in the same order as `df_scaled`).
data = pd.read_csv('/kaggle/input/unsupervised-learning-on-country-data/Country-data.csv')
df_scaled['Country'] = data['country']

# Show the frame that was just modified (the original cell displayed `df`,
# which contains neither of the new columns).
df_scaled.head()

In [None]:
# Scaled income against scaled GDP per capita, coloured by cluster id.
sns.scatterplot(data=df_scaled, x='gdpp', y='income', hue='clusters')

In [None]:
# Scaled health spending against scaled GDP per capita, coloured by cluster id.
sns.scatterplot(data=df_scaled, x='gdpp', y='health', hue='clusters')

In [None]:
# Remove the string column again so only numeric data is passed on.
# Rebinding with errors='ignore' keeps the cell idempotent: re-running it
# no longer raises KeyError once the column is gone.
df_scaled = df_scaled.drop(columns='Country', errors='ignore')

In [None]:
# Project the scaled indicators onto two principal components for plotting.
pca = PCA(n_components=2)
# `df_scaled` also carries the 'clusters' label column by now. Feeding the
# labels into PCA leaks the clustering result into the projection used to
# visualise it, so fit on the feature columns only. ('Country' is listed
# too so the cell works whether or not it has already been dropped.)
pca_features = df_scaled.drop(columns=['clusters', 'Country'], errors='ignore')
df_final = pca.fit_transform(pca_features)

In [None]:
# Wrap the two PCA components in a labelled DataFrame and preview it.
df_final_pca = pd.DataFrame(data=df_final, columns=['pca1', 'pca2'])
df_final_pca.head()

In [None]:
# Carry the cluster id over to the PCA frame (aligned on the row index).
df_final_pca = df_final_pca.assign(cluster=df_scaled['clusters'])
df_final_pca.head()

# Final Outcome

In [None]:
# Clusters in the plane of the two principal components.
plt.figure(figsize=(7, 5))
ax = sns.scatterplot(
    data=df_final_pca,
    x='pca1',
    y='pca2',
    hue='cluster',
    palette='bright',
)

In [None]:
# Attach the country names (aligned on the row index) so that cluster
# members can be listed by name.
df_final_pca = df_final_pca.assign(Country=data['country'])
df_final_pca.head()

In [None]:
# Countries assigned to cluster 0.
in_cluster_0 = df_final_pca['cluster'] == 0
cluster_0 = df_final_pca[in_cluster_0]
cluster_0['Country'].unique()

In [None]:
# Countries assigned to cluster 1.
in_cluster_1 = df_final_pca['cluster'] == 1
cluster_1 = df_final_pca[in_cluster_1]
cluster_1['Country'].unique()

In [None]:
# Countries assigned to cluster 2.
in_cluster_2 = df_final_pca['cluster'] == 2
cluster_2 = df_final_pca[in_cluster_2]
cluster_2['Country'].unique()

# Conclusion

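1. Cluster 0 contains the third-world (poor) countries
2. Cluster 1 contains the developed countries
3. Cluster 2 contains the developing countries, with average values of the parameters

Note: K-Means cluster numbers are arbitrary labels, so check the country lists above to confirm which number corresponds to which group before relying on this mapping.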

# Like and Upvote if you liked my Notebook :)