In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the country dataset from its CSV file into a DataFrame.
csv_path = '../input/unsupervised-learning-on-country-data/Country-data.csv'
data = pd.read_csv(csv_path)


In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Preview the first rows to check the column names and value ranges.
data.head()

***Exploratory Data Analysis***

In [None]:
# One 20-bin histogram per column, skipping the 'country' column.
hist_columns = [col for col in data.columns if col != 'country']
for col in hist_columns:
    sns.histplot(data[col], bins=20)
    plt.show()

In [None]:
# One box plot per column (skipping 'country') to expose outliers.
# The series is passed as `x=` explicitly: seaborn 0.11 warns about a positional
# argument here, and seaborn 0.12+ treats the first positional argument as `data`,
# which changes the orientation of the plot.
for feature in data.columns:
    if feature != 'country':
        sns.boxplot(x=data[feature])
        plt.show()

In [None]:
# Scatter of 'imports' against 'income'.
sns.scatterplot(data=data, x='income', y='imports')

In [None]:
 # Scatter of 'gdpp' against 'income'.
 sns.scatterplot(data=data, x='income', y='gdpp')

In [None]:
# Scatter of 'exports' against 'income'.
sns.scatterplot(data=data, x='income', y='exports')

In [None]:
# Annotated correlation heatmap of the numeric columns.
# 'country' still holds strings at this point, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (older versions silently skipped them),
# so the frame is restricted to numeric dtypes before computing correlations.
plt.figure(figsize=(8, 8))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)

**Observations**

1. Data has outliers.

2. The data is right-skewed and needs to be scaled.

3. Some columns are very highly correlated with each other, so some of them need to be removed.

4. As income increases, gdpp increases as well.

***Data Preprocessing***

In [None]:
# Keep the original 'country' column before it is overwritten with integer codes;
# it is used later to pair country names with their cluster labels.
df=data['country']

In [None]:
from sklearn import preprocessing

# Replace each country name with an integer code, then keep a copy of the
# encoded frame in `data_h`.
encoder = preprocessing.LabelEncoder()
encoder.fit(data['country'])
data['country'] = encoder.transform(data['country'])
data_h = data.copy()

In [None]:
# Remove the highly correlated columns in a single call, then preview the result.
data.drop(columns=['gdpp', 'child_mort'], inplace=True)
data.head()

In [None]:
# Compute the interquartile range (IQR) of 'income' and the 1.5 * IQR fences
# used as outlier boundaries.
q1 = data['income'].quantile(0.25)
q3 = data['income'].quantile(0.75)
IQR = q3 - q1
lower_bridge = q1 - (IQR * 1.5)
upper_bridge = q3 + (IQR * 1.5)
# Two separate statements: `print(a), print(b)` builds a tuple of the two return
# values, so the cell would additionally echo `(None, None)` as its output.
print(lower_bridge)
print(upper_bridge)

In [None]:
# Cap 'income' at the upper IQR fence (`upper_bridge`, computed in the previous
# cell) instead of a hard-coded number, so the cap follows the data.
# clip() returns a new column, which keeps this cell safe to re-run.
data['income'] = data['income'].clip(upper=upper_bridge)
# Pass the series as `x=` explicitly; newer seaborn versions treat the first
# positional argument as `data`.
sns.boxplot(x=data['income'])

****StandardScaler does not perform well with outliers, so the outliers were handled first****

In [None]:
# Scaler that standardises each column; it is fitted on the data in the next cell.
scaler = preprocessing.StandardScaler()

In [None]:
# Standardise the features used for clustering.
# 'country' is only an identifier (an arbitrary integer code from LabelEncoder),
# so it is excluded: as a feature it would add meaningless distance between
# countries in every distance-based step that follows.
feature_data = data.drop(columns='country')
scaled_data = pd.DataFrame(
    scaler.fit_transform(feature_data),
    columns=feature_data.columns,
)
# Show a preview rather than the whole frame.
scaled_data.head()

**K Means Clustering**

 Determine the value of k using the elbow method

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit K-Means for k = 1..19 and record the inertia
# (within-cluster sum of squared distances) of each fit.
Sum_of_squared_distances = []
for k in range(1, 20):
    # A fixed seed and an explicit n_init keep the curve reproducible and
    # consistent with the seeded K-Means runs used for the silhouette analysis.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(scaled_data)
    Sum_of_squared_distances.append(km.inertia_)

In [None]:
# Plot inertia against k; the bend ("elbow") of the curve suggests a suitable k.
k_values = range(1, 20)
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(k_values, Sum_of_squared_distances, 'b--')
ax.set_xlabel('K values')
ax.set_ylabel('Sum_of_squared_distances')
ax.set_title('Elbow Method For Optimal k')
plt.show()


In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

# Draw a silhouette plot for every candidate k from 2 to 7.
for n_clusters in range(2, 8):
    km = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=10,
        max_iter=100,
        random_state=42,
    )
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick')
    print("Silhouette score for k= "+str(n_clusters))
    visualizer.fit(scaled_data)
    visualizer.show()


**According to the elbow method and the silhouette visualization, k=2 is considered the optimal value, with a good silhouette score.**

In [None]:
from sklearn.metrics import silhouette_score

# Print the silhouette score of a seeded K-Means fit for each k from 2 to 7.
for i in range(2, 8):
    cluster = KMeans(n_clusters=i, random_state=42)
    cluster_labels = cluster.fit(scaled_data).labels_
    score = silhouette_score(scaled_data, cluster_labels)
    print("Score for k= {} is {}".format(i, score))

In [None]:
# Final K-Means model with the chosen k=2.
# The seed is fixed (as in the silhouette cells) so the labels are reproducible:
# without it, which group is called 0 and which is called 1 can swap between
# runs, while later cells select `cluster == 0` explicitly.
cluster = KMeans(n_clusters=2, n_init=10, random_state=42)
cluster_labels = cluster.fit_predict(scaled_data)

In [None]:
# Centroids of the fitted clusters, expressed in the scaled feature space.
cluster.cluster_centers_

In [None]:
# Attach each row's K-Means label as a 'clusters' column and preview the result.
scaled_data = scaled_data.assign(clusters=cluster_labels)
scaled_data.head()

**Visualizing**

Because the data has more than two features, we cannot plot the clusters directly, so we use Principal Component Analysis (PCA) to reduce the features to two components that can be plotted on a 2-D graph.

In [None]:
from sklearn.decomposition import PCA
# PCA model configured to keep two components, used for the 2-D visualisation below.
pca=PCA(n_components=2)

In [None]:
# Project the scaled features onto two principal components for plotting.
# The 'clusters' label column is dropped first so the K-Means labels do not leak
# into the projection and exaggerate the separation between the clusters
# (errors='ignore' keeps the cell working if the column has not been added yet).
pca_input = scaled_data.drop(columns='clusters', errors='ignore')
reduced_data = pd.DataFrame(pca.fit_transform(pca_input), columns=['PCA1', 'PCA2'])

In [None]:
# Attach the K-Means labels to the 2-D projection so points can be coloured by cluster.
reduced_data['cluster']=cluster_labels 

In [None]:
# (rows, columns) of the projected frame: PCA1, PCA2 and the cluster label.
reduced_data.shape

In [None]:
# Scatter the two principal components, one colour per K-Means cluster.
plt.figure(figsize=(7, 7))

for label, colour in ((0, 'red'), (1, 'blue')):
    members = reduced_data.loc[reduced_data['cluster'] == label]
    plt.scatter(members['PCA1'], members['PCA2'], color=colour)

plt.xlabel("PC1")
plt.ylabel("PC2")

plt.show()

In [None]:
# One bar chart per scaled feature, grouped by cluster label; the 'country' and
# 'clusters' columns themselves are skipped.
plot_columns = [col for col in scaled_data if col not in ('country', 'clusters')]
for col in plot_columns:
    sns.barplot(x='clusters', y=col, data=scaled_data)
    plt.show()

**Observations**

1. The countries in cluster 0 have high child_mort.

2. The countries in cluster 0 have low gdpp and low income.



In [None]:
# Pair every original country name with its K-Means cluster label.
data_ff = dict(
    country=np.array(df),
    cluster=np.array(scaled_data['clusters']),
)
country = pd.DataFrame(data_ff)

In [None]:
# Display the country-to-cluster mapping.
country

In [None]:
# Keep only the countries assigned to cluster 0.
in_cluster_zero = country['cluster'] == 0
countries = country.loc[in_cluster_zero]

In [None]:
# Display the countries that fall in cluster 0.
countries

**Hierarchical Clustering**

In [None]:
# Work on the two PCA components only: take a new frame without the K-Means labels.
data_red = reduced_data.drop(columns='cluster')

In [None]:
import scipy.cluster.hierarchy as shc

# Build the Ward linkage over the PCA-reduced data and draw its dendrogram;
# long vertical gaps between merges suggest where to cut the tree.
ward_linkage = shc.linkage(data_red, method="ward")
dendro = shc.dendrogram(ward_linkage)
plt.title("Dendrogram Plot")  # title typo ("Dendrogrma") corrected
plt.ylabel("Euclidean Distances")
plt.xlabel("Countries")
plt.show()

Based on the dendrogram, I chose 2 clusters.

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (bottom-up) clustering into 2 groups with Ward linkage.
# The `affinity` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.2 and removed in 1.4 (passing it raises TypeError there), and
# Ward linkage uses the Euclidean metric, which is the default, in every version.
hc = AgglomerativeClustering(n_clusters=2, linkage='ward')

In [None]:
 # Fit the hierarchical model on the PCA-reduced data and show the label assigned to each row.
 hc.fit_predict(data_red)  

In [None]:
# Attach the hierarchical-clustering labels of the fitted model and preview the frame.
data_red['cluster']=hc.labels_
data_red.head()

In [None]:
# Scatter the two principal components, one colour per hierarchical cluster.
plt.figure(figsize=(7, 7))

for label, colour in ((0, 'yellow'), (1, 'pink')):
    members = data_red.loc[data_red['cluster'] == label]
    plt.scatter(members['PCA1'], members['PCA2'], color=colour)

plt.xlabel("PC1")
plt.ylabel("PC2")

plt.show()

***From both clusterings, we can suggest the countries that should be given more focus when providing aid.***