In [None]:
import os
# Enumerate every file under the Kaggle input directory and print its full path.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
#Modules for EDA
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

#ML modules
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
%matplotlib inline

# **Feature Descriptions**

In [None]:
# Load the data dictionary CSV; later cells read its 'Column Name' and
# 'Description' columns.
dd = pd.read_csv('../input/unsupervised-learning-on-country-data/data-dictionary.csv')
# Display the column labels and the (rows, columns) shape as a quick sanity check.
dd.columns,dd.shape

In [None]:
# Print every feature name followed by its description.
# Iterating over the rows themselves (instead of a hard-coded count of 10)
# keeps this cell correct if the data dictionary gains or loses entries.
for column_name, description in zip(dd['Column Name'], dd['Description']):
    print(column_name,":")
    print(description)
    print('\n')

In [None]:
# Index the data dictionary by feature name so descriptions can be looked up by label.
# Guarded reassignment (rather than inplace=True) makes the cell safe to re-run:
# once 'Column Name' has become the index it is no longer a column, and a second
# set_index call would raise KeyError.
if 'Column Name' in dd.columns:
    dd = dd.set_index('Column Name')

In [None]:
# Load the per-country indicator table; the rest of the notebook works on `df`.
df = pd.read_csv('../input/unsupervised-learning-on-country-data/Country-data.csv')
# Show (rows, columns).
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows of the raw (unscaled) data.
df.head()

In [None]:
# Global summary statistics for the numeric columns (computed before any scaling).
df.describe()

# **Top 5 and Bottom 5 countries of each Category**

In [None]:
def get5(col,return_df=False,place='Top'):
    """Plot the five highest- or lowest-ranked countries for one column of ``df``.

    Rows of the module-level ``df`` are sorted by ``col`` in descending order;
    the first five rows are used for ``place='Top'`` and the last five for
    ``place='Bottom'``. The selection is drawn as a horizontal bar plot.

    Parameters
    ----------
    col : str
        Name of the column of ``df`` to rank by.
    return_df : bool, default False
        If True, return the five selected rows after plotting.
    place : {'Top', 'Bottom'}, default 'Top'
        Which end of the descending ranking to show.

    Returns
    -------
    pandas.DataFrame or None
        The ``['country', col]`` rows that were plotted when ``return_df`` is
        True, otherwise None.

    Raises
    ------
    ValueError
        If ``place`` is neither 'Top' nor 'Bottom'.
    """
    # Validate before doing any work so a bad argument fails fast.
    if place not in ('Top', 'Bottom'):
        raise ValueError('place can be "Top" or "Bottom"')

    ranked = df[['country',col]].sort_values(by=col,ascending=False)
    c5 = ranked.head() if place=='Top' else ranked.tail()

    plt.figure(figsize=(7,7))
    plt.title(f'{place} 5 countries in {col} category')
    sns.barplot(data=c5,y='country',x=col)

    plt.show()
    if return_df:
        # Previously returned the undefined name `top5`, which raised NameError.
        return c5

## **Top 5**

In [None]:
# Draw the top-five chart for every column after the first ('country') column,
# separating consecutive charts with blank output lines.
indicator_columns = df.columns[1:]
for indicator in indicator_columns:
    get5(indicator, place='Top')
    print('\n')

## **Bottom 5**

In [None]:
# Draw the bottom-five chart for every column after the first ('country') column,
# separating consecutive charts with blank output lines.
indicator_columns = df.columns[1:]
for indicator in indicator_columns:
    get5(indicator, place='Bottom')
    print('\n')

# **Overall Rankings (Barplots)**

In [None]:
def getBarplot(col,title,scale='log'):
    """Draw a horizontal bar chart ranking every country by ``col``.

    Rows of the module-level ``df`` are ordered by ``col`` from largest to
    smallest and plotted on a tall (10 x 39) figure with one bar per row.

    Parameters
    ----------
    col : str
        Column of ``df`` to rank and plot.
    title : str
        Title placed above the chart.
    scale : str, default 'log'
        Scale applied to the x axis (for example 'log' or 'linear').
    """
    ranking = df[['country', col]].sort_values(by=col, ascending=False)

    fig, ax = plt.subplots(figsize=(10, 39))
    sns.barplot(data=ranking, y='country', x=col, ax=ax)
    ax.set_title(title)
    ax.set_xscale(scale)
    plt.show()

In [None]:
# Use the feature's description from the data dictionary as the chart title.
# The description is selected by its column label: `dd.loc['child_mort'][0]`
# relied on integer keys being treated as positions on a label-indexed Series,
# which pandas has deprecated.
getBarplot('child_mort',title=dd.loc['child_mort','Description'])

In [None]:
# Same chart for 'health', on a linear x axis instead of the default log scale.
# The description is selected by its column label: `dd.loc['health'][0]` relied
# on integer keys being treated as positions on a label-indexed Series, which
# pandas has deprecated.
getBarplot('health',title=dd.loc['health','Description'],scale='linear')

#### **Similarly, you can plot the other columns in the same way.**

# **Imports vs. Exports**

In [None]:
# One two-bar chart (imports vs. exports) per country.
# Iterate over the frame's own index instead of a hard-coded row count (167),
# so the loop stays correct if rows are added, removed, or filtered.
trade = df[['imports','exports']]
for row_label in df.index:
    exp_imp = trade.loc[row_label]

    exp_imp.plot(kind='bar',color=['blue','red'])
    plt.title(df.loc[row_label,'country'])

    plt.show()

# **Feature scaling**

In [None]:
# Every column except the first one is scaled; the first column of `df`
# ('country', dropped again before clustering) is left out.
cols_to_scale = df.columns[1:]
cols_to_scale

In [None]:
# Fit a min-max scaler on the selected columns and transform them in one step.
scale = MinMaxScaler()
# `scalled` holds the transformed values, one column per entry of cols_to_scale
# (the next cell indexes it as scalled[:, i]).
scalled = scale.fit_transform(df[cols_to_scale])

In [None]:
# Write each scaled column back into `df`.
# enumerate() pairs every column name with its position in `scalled`; the
# previous loop never advanced its counter, so every column was overwritten
# with the scaled values of the first one.
for i, column in enumerate(cols_to_scale):
    df[column] = scalled[:,i]

In [None]:
# Preview the frame after the scaled values have been written back.
df.head()

# **Clustering**

In [None]:
# Feature matrix for clustering: every column of `df` except the country name.
x = df.drop(columns=['country'])

In [None]:
# Summary statistics of the feature matrix used for clustering.
x.describe()

# **Using Elbow Method to find optimal value for 'K'**

In [None]:
# Fit K-Means for K = 1..10 and record the inertia (within-cluster sum of
# squared distances) of each fit for the elbow plot.
sse = []
k_rng = range(1,11)
for k in k_rng:
    # random_state makes the curve reproducible across runs; n_init is pinned
    # explicitly because the library default differs between scikit-learn releases.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    # Fit on the same feature matrix `x` that the final model uses.
    km.fit(x)
    sse.append(km.inertia_)

In [None]:
# Plot inertia against K. Reusing k_rng keeps the x values in step with the
# loop that produced `sse`; axis labels let the figure stand on its own.
plt.title('Elbow Method')
plt.plot(k_rng,sse)
plt.xlabel('K')
plt.ylabel('Sum of squared errors')
plt.show()

# **Value of K is 2**

In [None]:
# Final model with K = 2. A fixed random_state keeps the cluster labels (0/1)
# stable between runs, which matters because later cells select countries by
# label; n_init is pinned explicitly because the library default differs
# between scikit-learn releases.
km = KMeans(n_clusters=2, n_init=10, random_state=42)
predict = km.fit_predict(x)

In [None]:
# Cluster label assigned to each row of `x`.
predict

# **Mapping clusters and countries**

In [None]:
# Pair every country name with the cluster label predicted for its row.
cluster_country = pd.DataFrame({'Country':df['country'],'Cluster':predict})
cluster_country.head()

# **Countries that belong to cluster 1**

In [None]:
# Rows whose predicted label is 1, followed by the size of that subset.
in_cluster_1 = cluster_country['Cluster'].eq(1)
cluster1 = cluster_country.loc[in_cluster_1]
cluster1.shape

In [None]:
# Display the countries assigned to cluster 1.
cluster1

# **Countries that belong to cluster 0**

In [None]:
# Rows whose predicted label is 0, followed by the size of that subset.
in_cluster_0 = cluster_country['Cluster'].eq(0)
cluster0 = cluster_country.loc[in_cluster_0]
cluster0.shape

In [None]:
# Raise the display row limit so the full cluster table below is not truncated.
# The fully qualified key is required: the short form 'max_rows' matches more
# than one option in newer pandas (e.g. 'styler.render.max_rows') and raises
# OptionError.
pd.set_option('display.max_rows',123)
# Display the countries assigned to cluster 0.
cluster0

# **PCA**

In [None]:
# Project the feature matrix onto its first two principal components so the
# clusters can be drawn in 2-D.
pca = PCA(n_components=2)
data = pca.fit_transform(x)

In [None]:
# Combine the two projected coordinates with the cluster labels for plotting.
# NOTE(review): `data` is rebound here from the PCA output to a DataFrame, so
# this cell cannot be re-run on its own (data[:,0] is array-style indexing);
# re-run the PCA cell first.
data = pd.DataFrame({'x':data[:,0],'y':data[:,1],'Clusters':predict})

In [None]:
# Scatter the two PCA coordinates, coloured by cluster label.
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(data=data, x='x', y='y', hue='Clusters', ax=ax)
ax.set_title('PCA Result')
plt.show()