In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import plotly.express as px

In [None]:
# Load the country-level indicators into a DataFrame.
DATA_PATH = '../input/unsupervised-learning-on-country-data/Country-data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to see which columns are available.
df.head()

In [None]:
# Print column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Correlation heatmap of the numeric indicators.
# Restrict to numeric columns first: df also carries the `country` column
# (dropped before scaling further down), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(12,8),dpi=150)
sns.heatmap(data=numeric_df.corr(),annot = True)
plt.title('Correlation between indicators')
plt.show()

In [None]:
# Income vs. life expectancy, one point per row of df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='income', y='life_expec', alpha=0.5, ax=ax)
ax.set_title('Relation between Life Expectancy and Income')
ax.set_xlabel('Income')
ax.set_ylabel('Life Expectancy')
plt.show()

In [None]:
# The scatter plot shows an outlier with a very low life expectancy;
# list every row whose life_expec is below 40.
low_life_expec = df['life_expec'] < 40
df[low_life_expec]

In [None]:
# The scatter plot shows an outlier with a very high income;
# list every row whose income is above 120000.
high_income = df['income'] > 120000
df[high_income]


In [None]:
# Histogram of income across all rows (30 bins).
plt.figure(figsize=(10,6))
sns.histplot(data=df, x='income',bins=30)
# Title typo fixed ('Distirbution' -> 'Distribution'); it is rendered on the figure.
plt.title('Distribution of Income')
plt.xlabel('Income')
plt.show()

In [None]:
# gdpp vs. inflation, one point per row of df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='gdpp', y='inflation', alpha=0.5, ax=ax)
ax.set_title('Relation between GDP and Inflation')
ax.set_xlabel('GDP')
ax.set_ylabel('Inflation')
plt.show()

In [None]:
# Based on the scatter plot, there is an outlier with a very high inflation rate.
# Show the row with the maximum inflation so the observation is backed by data,
# like the life-expectancy and income outlier cells.
df.loc[[df['inflation'].idxmax()]]

In [None]:
# Income vs. inflation, one point per row of df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='income', y='inflation', alpha=0.5, ax=ax)
ax.set_title('Relation between Income and Inflation')
ax.set_xlabel('Income')
ax.set_ylabel('Inflation')
plt.show()

In [None]:
# Income vs. child mortality, one point per row of df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='income', y='child_mort', alpha=0.5, ax=ax)
ax.set_title('Relation between Income and Child Mortality')
ax.set_xlabel('Income')
ax.set_ylabel('Child Mortality')
plt.show()

In [None]:
# Keep only the rows whose child mortality exceeds the dataset mean.
mean_child_mort = df['child_mort'].mean()
child_mortality = df.loc[df['child_mort'].gt(mean_child_mort)]

In [None]:
# Rows with an above-average child mortality rate, highest first.
child_mortality.sort_values(by='child_mort',ascending=False)

In [None]:
# Scaling data: StandardScaler standardizes each feature (zero mean, unit variance),
# so no single indicator dominates the distance-based KMeans clustering used later.
scaler = StandardScaler()


In [None]:
# Standardize the indicator columns.
# Besides `country`, also drop the columns that later cells add to df
# (`labels`, `ISO CODE`). Without this, re-running the cell after the
# clustering/map cells would scale the cluster labels and fail on the string
# ISO codes. errors='ignore' keeps the first run unchanged, when those columns
# do not exist yet.
# NOTE: despite its name, scaled_df is a NumPy array, not a DataFrame.
feature_df = df.drop(columns=['country', 'labels', 'ISO CODE'], errors='ignore')
scaled_df = scaler.fit_transform(feature_df)

# 1. Clustering with KMeans

In [None]:
# Fit KMeans for a range of k and record two quality measures per k:
#   ssd - within-cluster sum of squared distances (model.inertia_), for the elbow plot
#   sil - silhouette score, which needs at least 2 clusters, so k = 1 is skipped
# random_state makes the sweep reproducible across kernel restarts; n_init is
# set explicitly because its default differs between scikit-learn versions.
K_VALUES = range(1, 30)
ssd = []
sil = []
for k in K_VALUES:
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = model.fit_predict(scaled_df)
    ssd.append(model.inertia_)
    if k > 1:
        sil.append(silhouette_score(scaled_df, labels))


In [None]:
# Using the elbow method to select number of clusters.
# The x values are derived from ssd itself (the sweep starts at k = 1), so the
# plot cannot drift out of sync if the sweep range is changed.
cluster_counts = range(1, len(ssd) + 1)
plt.figure(figsize=(12,8),dpi=150)
plt.plot(cluster_counts, ssd, 'o--')
plt.title('Elbow method to find number of clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('KMeans Inertia')
plt.show()

In [None]:
# Silhouette score per number of clusters.
# Scores were only collected for k >= 2, so the x axis starts at 2; its length
# is taken from sil so it always matches the sweep.
sil_cluster_counts = range(2, len(sil) + 2)
plt.figure(figsize=(12,8),dpi=150)
plt.plot(sil_cluster_counts, sil, 'o--')
plt.title('Silhouette Score')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.show()

In [None]:
# 3 clusters chosen.
# random_state pins the centroid initialisation: without it the cluster ids
# (0/1/2) can be assigned differently on every run, which changes the colours
# and the per-cluster summaries further down. n_init is explicit because its
# default differs between scikit-learn versions.
k_model = KMeans(n_clusters=3, n_init=10, random_state=42)
# fit_predict fits the model and returns the label of every training row.
labels = k_model.fit_predict(scaled_df)

In [None]:
# Attach the cluster assignment of each row to df.
df['labels'] = labels

In [None]:
# Preview the frame with the new `labels` column instead of dumping every row.
df.head()

In [None]:
# gdpp vs. inflation again, now coloured by cluster label.
fig, ax = plt.subplots(figsize=(10, 6), dpi=150)
sns.scatterplot(
    data=df,
    x='gdpp',
    y='inflation',
    alpha=0.5,
    hue='labels',
    palette='Set1',
    ax=ax,
)
ax.set_title('Relation between GDP and Inflation with Labels')
ax.set_xlabel('GDP')
ax.set_ylabel('Inflation')
plt.show()

In [None]:
# Correlation of every numeric column with the cluster label, largest first.
# Non-numeric columns (e.g. `country`) are excluded up front because
# DataFrame.corr() raises on them in pandas >= 2.0.
# NOTE: cluster ids are arbitrary category numbers, so these correlations are
# only a rough hint of which indicators separate the clusters.
df.select_dtypes(include='number').corr()['labels'].sort_values(ascending=False)
# Based on labels and correlations life expectancy, GDP and income have the highest impact on segmenting the countries.

# Plotting the cluster labels on a map

In [None]:
# Load the country ISO codes; they are added to the dataframe to visualize it with plotly.
iso_codes = pd.read_csv('../input/country-iso-codes/country_iso_codes.csv')


In [None]:
# Build a country-name -> ISO-code lookup from the ISO table.
iso_map = dict(zip(iso_codes['Country'], iso_codes['ISO Code']))

In [None]:
# Look up each country's ISO code; names that are not keys of iso_map become NaN.
df['ISO CODE']= df['country'].map(iso_map)

In [None]:
# Sanity-check the ISO mapping: list the countries that have no ISO code
# (their name was not found in the lookup) instead of dumping the whole frame.
# An empty result means every country was matched.
df.loc[df['ISO CODE'].isna(), ['country', 'ISO CODE']]

In [None]:
# World map coloured by KMeans cluster.
# Cluster ids are categories, not quantities: cast them to strings so plotly
# draws a discrete legend instead of a continuous colour bar that would suggest
# an ordering between clusters.
# NOTE(review): plotly's default locationmode expects ISO-3 codes - confirm the
# ISO CSV provides three-letter codes.
map_df = df.assign(cluster=df['labels'].astype(str))
fig = px.choropleth(map_df, locations='ISO CODE',
                    color='cluster',
                    hover_name="country",
                    category_orders={'cluster': sorted(map_df['cluster'].unique())},
                    title='KMeans cluster per country')
fig.show()

# Principal Component Analysis

In [None]:
# Fit a PCA with the default number of components on the standardized data,
# then project the same data onto the components.
pca = PCA().fit(scaled_df)
pca_data = pca.transform(scaled_df)

In [None]:
# Percentage of variance explained by each principal component, rounded to one decimal.
per_var = (pca.explained_variance_ratio_ * 100).round(decimals=1)

In [None]:
# Scree plot: one bar per principal component showing its share of the variance.
component_positions = range(1, len(per_var) + 1)
pca_labels = [f'PC{i}' for i in component_positions]
fig, ax = plt.subplots(figsize=(10, 6), dpi=150)
ax.bar(x=component_positions, height=per_var, tick_label=pca_labels)
ax.set_ylabel('Percentage of Variance')
ax.set_xlabel('Principal Component')
ax.set_title('Scree Plot')
plt.show()
                                        
                                           

In [None]:
# Based on the scree plot, keep only PC1 and PC2: refit a 2-component PCA and
# project the scaled data onto it for the PCA plot.
pca_model = PCA(n_components=2)
df_pca = pca_model.fit_transform(scaled_df)

In [None]:
# Wrap the 2-component projection in a DataFrame with named columns.
df_pca_= pd.DataFrame(df_pca, columns=['pca1', 'pca2'])

In [None]:
# Preview the projected coordinates instead of dumping every row.
df_pca_.head()

In [None]:
# Carry the cluster label and country name over to the projected frame.
# Column assignment aligns on the index; both frames use the default 0..n-1
# index (df comes from read_csv without index_col, df_pca_ from a plain array),
# so rows line up one-to-one.
df_pca_['labels']= df['labels']
df_pca_['country'] = df['country']
# Show a preview rather than the whole frame.
df_pca_.head()

In [None]:
# Rows of df_pca_ in the PC1/PC2 plane, coloured by KMeans cluster.
plt.figure(figsize=(10,6),dpi=150)
ax = sns.scatterplot(data=df_pca_,x='pca1', y='pca2', hue='labels',palette='Set1')
# Title and axis labels so the figure stands on its own; plt.show() suppresses
# the Axes repr that the bare assignment cell would otherwise leave behind.
ax.set_title('KMeans clusters on the first two principal components')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
plt.show()

In [None]:
# Mean of every numeric indicator per cluster.
# numeric_only=True skips the string columns (`country`, `ISO CODE`); without it
# groupby().mean() raises a TypeError on pandas >= 2.0.
df.groupby('labels').mean(numeric_only=True)