In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json

from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture

## 1. Data selection and Preprocessing


### 1.1 BMI Data

In [None]:
# Load the WHO 2016 obesity-by-country table and preview its first ten rows.
df = pd.read_csv(
    '../input/who-obesity-by-country-2016/WHO_obesityByCountry_2016.csv'
)
df.head(n=10)

### 1.2 GDP Data

In [None]:
# Load the "countries of the world" table (source of GDP per capita) and
# preview its first rows.
df_gdp = pd.read_csv(
    '../input/countries-of-the-world/countries of the world.csv'
)
df_gdp.head()

In [None]:
# Inspect the GDP-per-capita column that is later copied into df_bmi.
df_gdp.loc[:, 'GDP ($ per capita)']

### 1.3 Exceptions for country names that differ between the two datasets

In [None]:
# WHO country name -> name to look for in df_gdp['Country'] when the WHO
# spelling itself is not found there.
except_list = {
    "Bolivia (Plurinational State of)": "Bolivia",
    "Brunei Darussalam": "Brunei",
    "Congo": "Republic of the Congo",
    "Democratic People's Republic of Korea": "Korea, North",
    "Iran (Islamic Republic of)": "Iran",
    "Lao People's Democratic Republic": "Laos",
    "Micronesia (Federated States of)": "Micronesia",
    "Republic of Korea": "Korea, South",
    "Republic of Moldova": "Moldova",
    "Russian Federation": "Russia",
    "Sudan (former)": "Sudan",
    "Syrian Arab Republic": "Syria",
    "Republic of North Macedonia": "Macedonia",
    "Timor-Leste": "East Timor",
    "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
    "United Republic of Tanzania": "Tanzania",
    "United States of America": "United States",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "Viet Nam": "Vietnam",
}

In [None]:
# Positions in df_gdp whose Country is exactly the alias used for "Congo".
np.where(df_gdp['Country'].eq('Republic of the Congo'))

### 1.4 Create new data frames


In [None]:
# Pre-allocate the 186 output rows as float64 zeros.  Building the frame
# without data yields all-NaN object-dtype columns, which then depend on a
# later cell to become numeric; allocating zeros directly gives the numeric
# dtype that the NaN checks and the clustering below need.
df_bmi = pd.DataFrame(0.0, index=range(0, 186), columns=['Both', 'Male', 'Female', 'GDP'])

In [None]:
# Turn the placeholder frame into float64 zeros.  The cast is explicit so the
# result does not depend on pandas silently downcasting object columns while
# replacing values (that implicit downcast is deprecated in pandas 2.x).
df_bmi = df_bmi.astype(float).fillna(0.0)

In [None]:
# Copy obesity figures and GDP per capita into the pre-allocated df_bmi, one
# output row per (WHO country, GDP country) match.
#
# A GDP country matches when its name contains the WHO name, or contains the
# alias registered for that WHO name in except_list.  Substring matching (not
# equality) is kept from the original so the same pairs are selected.
fill_columns = ['Both', 'Male', 'Female', 'GDP']
gdp_countries = list(df_gdp['Country'].values)        # loop-invariant, built once
gdp_per_capita = df_gdp['GDP ($ per capita)'].values
capacity = len(df_bmi)

cnt = 0
for i in range(len(df)):
    if cnt >= capacity:
        # df_bmi is full; further matches are dropped explicitly.  .loc with
        # a missing label would otherwise enlarge the frame.
        break

    name = df["Unnamed: 0"].values[i]
    alias = except_list.get(name)

    for j, gdp_country in enumerate(gdp_countries):
        if cnt >= capacity:
            break

        is_match = name in gdp_country or (alias is not None and alias in gdp_country)
        if not is_match:
            continue

        # Convert the whole row first so a non-numeric cell skips the match
        # without leaving a half-written row behind.
        try:
            row = [
                float(df['Both.sexes'].values[i]),
                float(df['Male'].values[i]),
                float(df['Female'].values[i]),
                float(gdp_per_capita[j]),
            ]
        except (TypeError, ValueError):
            continue

        # Write through .loc on the frame itself; assigning into
        # df_bmi[col].values[...] relies on .values being a writable view.
        df_bmi.loc[cnt, fill_columns] = row
        cnt += 1

In [None]:
# Plain-text dump of the assembled frame for a visual check.
print(df_bmi)

### 1.5 NaN Value Check

In [None]:
# Boolean frame: True wherever df_bmi holds a missing value.
df_NanCheck = df_bmi.isna()
print(df_NanCheck)

In [None]:
# (row positions, column positions) of every NaN cell in df_bmi.
nan_mask = np.isnan(df_bmi)
np.where(nan_mask)

In [None]:
# NOTE(review): the result is bound to `_` and is not read anywhere in the
# visible notebook.  np.isnan(df_bmi) is a boolean mask, so nan_to_num has no
# NaN to replace, and df_bmi itself is not modified by this line.
_ = np.nan_to_num(np.isnan(df_bmi))

In [None]:
# Impute any remaining NaN with its column mean.  The frame is rebound rather
# than mutated with inplace=True, so the statement reads as
# "new df_bmi = imputed old df_bmi".
df_bmi = df_bmi.fillna(df_bmi.mean())

In [None]:
# Re-check after imputation: (row positions, column positions) of NaN cells.
nan_mask = np.isnan(df_bmi)
np.where(nan_mask)

## 2. Clustering

In [None]:
def visualize_cluster_plot(clusterobj, dataframe, label_name, iscenter=True, center_cols=(5, 6)):
    """Scatter-plot clustered rows on the Com1/Com2 plane, optionally with centres.

    Parameters
    ----------
    clusterobj : object
        Fitted clustering model. Only ``cluster_centers_`` is read, and only
        when ``iscenter`` is true.
    dataframe : pandas.DataFrame
        Must contain the columns ``'Com1'``, ``'Com2'`` and ``label_name``.
    label_name : str
        Column holding the integer cluster label of each row; ``-1`` is
        drawn as noise.
    iscenter : bool, default True
        Whether to draw the cluster centres on top of the points.
    center_cols : pair of int, default (5, 6)
        Positions inside each row of ``cluster_centers_`` that hold the
        x (Com1) and y (Com2) coordinates. The default keeps the previously
        hard-coded positions.

    Draws on the current matplotlib figure and calls ``plt.show()``.
    """
    centers = clusterobj.cluster_centers_ if iscenter else None

    unique_labels = np.unique(dataframe[label_name].values)
    markers = ['o', 's', 'o', 's', 'o', 's', 'o', 's', '*']
    isNoise = False

    for label in unique_labels:
        label_cluster = dataframe[dataframe[label_name] == label]
        if label == -1:
            cluster_legend = 'Noise'
            isNoise = True
        else:
            cluster_legend = 'Cluster ' + str(label)

        # Wrap around the marker list so more than nine clusters no longer
        # raises IndexError; labels -1..8 keep the marker they had before
        # (-1 still maps to the last entry, '*').
        marker = markers[int(label) % len(markers)]
        plt.scatter(x=label_cluster['Com1'], y=label_cluster['Com2'], s=70,
                    edgecolor='k', marker=marker, label=cluster_legend)

    if iscenter:
        x_col, y_col = center_cols
        for label in unique_labels:
            if label == -1:
                # Noise has no centre; indexing centers[-1] would draw the
                # last real cluster's centre under the label -1.
                continue
            center_x_y = centers[int(label)]
            marker = markers[int(label) % len(markers)]
            # White disc first, then the cluster number drawn on top of it.
            plt.scatter(x=center_x_y[x_col], y=center_x_y[y_col], s=250, color='white',
                        alpha=0.9, edgecolor='k', marker=marker)
            plt.scatter(x=center_x_y[x_col], y=center_x_y[y_col], s=70, color='k',
                        edgecolor='k', marker='$%d$' % label)

    if isNoise:
        legend_loc = 'upper center'
    else:
        legend_loc = 'upper right'

    plt.legend(loc=legend_loc)
    plt.show()

### 2.1 K-Means

In [None]:
# Cluster on the four measured columns only, so re-running this cell after
# later cells have appended label/PCA columns to df_bmi still fits the same
# data.  random_state pins the centroid initialisation so the cluster
# numbering is reproducible across kernel restarts.
# NOTE(review): features are not standardised, so GDP (the largest-magnitude
# column) presumably dominates the distances — confirm this is intended.
KM = KMeans(n_clusters=3, max_iter=100, random_state=0)
KM.fit(df_bmi[['Both', 'Male', 'Female', 'GDP']])

In [None]:
# Project the four measured columns onto two principal components for
# plotting.  fit_transform already fits the model, so the separate fit() call
# was redundant; selecting the columns explicitly keeps the cell re-runnable
# once df_bmi has gained extra columns.
pca_KM = PCA(n_components=2)
df_bmi_pca_KM = pca_KM.fit_transform(df_bmi[['Both', 'Male', 'Female', 'GDP']])

In [None]:
# Append, in this order, the K-Means label and the two PCA coordinates.
kmeans_columns = {
    'K_cluster': KM.labels_,
    'Com1': df_bmi_pca_KM[:, 0],
    'Com2': df_bmi_pca_KM[:, 1],
}
for column_name, column_values in kmeans_columns.items():
    df_bmi[column_name] = column_values

In [None]:
# First rows of the augmented frame, its shape, and the PCA output shape.
print(df_bmi.head(), df_bmi.shape, df_bmi_pca_KM.shape, sep='\n')

In [None]:
# visualize_cluster_plot reads positions 5 and 6 of each cluster centre, so
# the model is refitted on the full df_bmi, where Com1 and Com2 now sit at
# those column positions.
#
# The refit starts from the per-cluster means of the labels already stored in
# df_bmi['K_cluster'].  With a fresh random initialisation the clusters can be
# numbered differently from those stored labels, and the plotted centre "k"
# would then belong to a different group than the points labelled k.
initial_centers = (
    df_bmi.groupby(df_bmi['K_cluster'].to_numpy())
    .mean()
    .sort_index()
    .to_numpy()
)
KM = KMeans(n_clusters=3, init=initial_centers, n_init=1, max_iter=100)
KM.fit(df_bmi)

In [None]:
# Label the axes and title, then draw the K-Means clusters with their centres.
plt.xlabel('Com1')
plt.ylabel('Com2')
plt.title("K-Means")
visualize_cluster_plot(
    clusterobj=KM,
    dataframe=df_bmi,
    label_name='K_cluster',
    iscenter=True,
)

In [None]:
# Evaluate the K-Means labels on the four measured columns only.  Passing the
# whole df_bmi would also feed the label column, the PCA coordinates and
# (for the average) the freshly written silhouette_coeff column into the
# distance computation, so the average would not even be the mean of the
# per-sample scores.
bmi_features = df_bmi[['Both', 'Male', 'Female', 'GDP']]

score_samples_K = silhouette_samples(bmi_features, df_bmi['K_cluster'])
K_Means_avg_score_bmi = silhouette_score(bmi_features, df_bmi['K_cluster'])

df_bmi['silhouette_coeff'] = score_samples_K

print('K_Means_avg_score_bmi:' , K_Means_avg_score_bmi)

### 2.2 Mean Shift

In [None]:
# Kernel bandwidth for Mean Shift, estimated from every column currently in
# df_bmi.
bandwidth = estimate_bandwidth(X=df_bmi)
print(bandwidth)

In [None]:
# Mean Shift model (not yet fitted) using the bandwidth estimated above.
meanshift = MeanShift(bandwidth=bandwidth)

In [None]:
# Fit Mean Shift on the current df_bmi, store its labels, and plot the
# clusters together with their centres.
Mean_cluster_labels = meanshift.fit_predict(X=df_bmi)
df_bmi['Mean_cluster'] = Mean_cluster_labels

plt.title('Mean_Shift')
visualize_cluster_plot(
    clusterobj=meanshift,
    dataframe=df_bmi,
    label_name='Mean_cluster',
    iscenter=True,
)

In [None]:
# Per-sample silhouette of the Mean Shift labels, measured on the four
# original columns only; the whole df_bmi also contains cluster labels, PCA
# coordinates and an earlier silhouette column, none of which should take
# part in the distance computation.
score_samples_MS = silhouette_samples(df_bmi[['Both', 'Male', 'Female', 'GDP']], df_bmi['Mean_cluster'])

In [None]:
# Store the per-sample scores, then average the silhouette over the four
# measured columns only.  Scoring the whole df_bmi would use the
# silhouette_coeff column written on the line above (plus the label and PCA
# columns) as features.
df_bmi['silhouette_coeff'] = score_samples_MS
Mean_avg_score_bmi = silhouette_score(df_bmi[['Both', 'Male', 'Female', 'GDP']], df_bmi['Mean_cluster'])

In [None]:
# Report the average silhouette score of the Mean Shift clustering.
print('Mean_avg_score_bmi:', Mean_avg_score_bmi)

### 2.3 GMM 

In [None]:
# Three-component Gaussian mixture.  fit_predict fits the model itself, so
# the extra .fit() that used to precede it only repeated the work;
# random_state makes the component numbering reproducible between runs.
# NOTE(review): the model is still fitted on every column currently in
# df_bmi, including earlier cluster labels — confirm that is intended.
GMM = GaussianMixture(n_components=3, random_state=0)
GMM_label = GMM.fit_predict(df_bmi)

df_bmi['GMM_Label'] = GMM_label

plt.title('GMM')
visualize_cluster_plot(GMM, df_bmi, 'GMM_Label' , iscenter = False)

In [None]:
# Per-sample silhouette of the GMM labels, measured on the four original
# columns only; the whole df_bmi also contains cluster labels, PCA coordinates
# and an earlier silhouette column, none of which should take part in the
# distance computation.
score_samples_GMM = silhouette_samples(df_bmi[['Both', 'Male', 'Female', 'GDP']], df_bmi['GMM_Label'])

In [None]:
# Store the per-sample scores, then average the silhouette over the four
# measured columns only.  Scoring the whole df_bmi would use the
# silhouette_coeff column written on the line above (plus the label and PCA
# columns) as features.
df_bmi['silhouette_coeff'] = score_samples_GMM
GMM_avg_score_bmi = silhouette_score(df_bmi[['Both', 'Male', 'Female', 'GDP']], df_bmi['GMM_Label'])

In [None]:
# Report the average silhouette score of the GMM clustering.
print('GMM_avg_score_bmi:',GMM_avg_score_bmi)

### 2.4 DBSCAN

In [None]:
# DBSCAN settings: neighbourhood radius and minimum samples per core point.
eps = 5000
num_min = 5

# Cluster the current df_bmi and store the labels (-1 is drawn as noise).
dbsacn = DBSCAN(eps=eps, min_samples=num_min)
dbsacn_label = dbsacn.fit_predict(df_bmi)
df_bmi['DBSCAN_label'] = dbsacn_label

# Recompute the 2-D projection from the current frame and overwrite Com1/Com2.
pca = PCA(n_components=2)
pca_transformed = pca.fit_transform(df_bmi)
for component_index, component_name in enumerate(('Com1', 'Com2')):
    df_bmi[component_name] = pca_transformed[:, component_index]

plt.title('DBSCAN')
visualize_cluster_plot(
    clusterobj=dbsacn,
    dataframe=df_bmi,
    label_name='DBSCAN_label',
    iscenter=False,
)

In [None]:
# Per-sample silhouette of the DBSCAN labels, measured on the four original
# columns only; the whole df_bmi also contains cluster labels, PCA coordinates
# and an earlier silhouette column, none of which should take part in the
# distance computation.
score_samples_DB = silhouette_samples(df_bmi[['Both', 'Male', 'Female', 'GDP']], df_bmi['DBSCAN_label'])

In [None]:
# Store the per-sample scores, then average the silhouette over the four
# measured columns only.  Scoring the whole df_bmi would use the
# silhouette_coeff column written on the line above (plus the label and PCA
# columns) as features.
df_bmi['silhouette_coeff'] = score_samples_DB
DBSCAN_avg_score_bmi = silhouette_score(df_bmi[['Both', 'Male', 'Female', 'GDP']], df_bmi['DBSCAN_label'])

In [None]:
# Report the average silhouette score of the DBSCAN clustering.
print('DBSCAN_avg_score_bmi:',DBSCAN_avg_score_bmi)

In [None]:
# Mean GDP of the rows carrying DBSCAN labels 0, 1 and 2.
for cluster_id in range(3):
    in_cluster = df_bmi['DBSCAN_label'].eq(cluster_id)
    cluster_gdp = df_bmi.loc[in_cluster, "GDP"].values
    print(np.mean(cluster_gdp))

In [None]:
# Plain-text dump of df_bmi with every label and score column added so far.
print(df_bmi)

In [None]:
# GDP and combined-sex obesity values of DBSCAN cluster 0, with their means.
tmp0 = df_bmi['DBSCAN_label'].eq(0)
GDP_0 = df_bmi.loc[tmp0, "GDP"]
BMI_0 = df_bmi.loc[tmp0, "Both"]

avg_GDP_0, avg_BMI_0 = np.mean(GDP_0), np.mean(BMI_0)

print(GDP_0)
print(BMI_0)
print('Average GDP_0:', avg_GDP_0, 'Average BMI_0:', avg_BMI_0)

In [None]:
# GDP and combined-sex obesity values of DBSCAN cluster 1, with their means.
tmp1 = df_bmi['DBSCAN_label'].eq(1)
GDP_1 = df_bmi.loc[tmp1, "GDP"]
BMI_1 = df_bmi.loc[tmp1, "Both"]

avg_GDP_1, avg_BMI_1 = np.mean(GDP_1), np.mean(BMI_1)

print(GDP_1)
print(BMI_1)
print('Average GDP_1:', avg_GDP_1, 'Average BMI_1:', avg_BMI_1)

In [None]:
# GDP and combined-sex obesity values of DBSCAN cluster 2, with their means.
tmp2 = df_bmi['DBSCAN_label'].eq(2)
GDP_2 = df_bmi.loc[tmp2, "GDP"]
BMI_2 = df_bmi.loc[tmp2, "Both"]

avg_GDP_2, avg_BMI_2 = np.mean(GDP_2), np.mean(BMI_2)

print(GDP_2)
print(BMI_2)
print('Average GDP_2:', avg_GDP_2, 'Average BMI_2:', avg_BMI_2)