In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import json
from math import ceil

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN, KMeans
from sklearn.base import clone
from sklearn.neighbors import NearestNeighbors
from yellowbrick.cluster import KElbowVisualizer

In [None]:
#!pip install --target=/Users/franz/opt/anaconda3/envs/Data_visualization/lib/python3.8/site-packages/ yellowbrick

## Data Loading

In [None]:
# Load the tabular data set and the GeoJSON polygons used for the maps below.
df = pd.read_csv("final_poster.csv")
# Open the GeoJSON through a context manager so the file handle is closed
# as soon as parsing finishes (a bare open() inside json.load leaks it).
with open("european-union-countries.geojson", "r") as geojson_file:
    european_union = json.load(geojson_file)

In [None]:
# Name the unlabeled first CSV column "Country", rename "Czechia" to
# "Czech Republic" (presumably to match the names used in the GeoJSON - verify),
# and index the frame by country.
# The frame is rebuilt by reassignment instead of in-place mutation: calling
# `df["Country"].replace(..., inplace=True)` acts on a column selected from the
# frame (chained assignment), which pandas warns about and which leaves `df`
# unchanged when Copy-on-Write is enabled.
df = (
    df.rename(columns={"Unnamed: 0": "Country"})
    .replace({"Country": {"Czechia": "Czech Republic"}})
    .set_index("Country")
)
df.info()

## Data Preprocessing

In [None]:
from sklearn import preprocessing

# Keep an untouched copy of the unscaled data.
df_before = df.copy()

cols = df.columns
rows = df.index

# Positional index -> original column name; used to restore the column labels
# after scaling, because the scaler returns a bare numpy array.
dic = dict(enumerate(cols))

x = df.values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# df_scaled gets a default positional index; the country labels stay in `rows`.
df_scaled = pd.DataFrame(x_scaled).rename(columns=dic)
df_scaled.head()

### Create Perspectives

In [None]:
# Health perspective: five indicators selected by column name.
health_columns = [
    "Obesity",
    "Diabetes Prevalence",
    "Cardiovascular Death Rate",
    "Life Expectancy",
    "Health Expenditure (% of GDP)",
]
health = df_scaled.loc[:, health_columns]
# Food perspective: the first 20 columns of the scaled frame, selected by position.
food = df_scaled.iloc[:, 0:20]

### Performing PCA for Dimensionality Reduction

In [None]:
# PCA
def explained_variance(dataframe):
    """Fit a PCA with all components and report the cumulative explained variance.

    Parameters
    ----------
    dataframe
        Data passed unchanged to ``PCA.fit``.

    Returns
    -------
    tuple
        ``(cumulative_ratio, n_components)`` where ``cumulative_ratio`` is the
        running sum of ``pca.explained_variance_ratio_`` (one entry per
        component) and ``n_components`` is ``pca.n_components_``.
    """
    pca = PCA()
    # Only the fitted attributes are needed, so there is no reason to also
    # compute (and then discard) the transformed data.
    pca.fit(dataframe)

    return np.cumsum(pca.explained_variance_ratio_), pca.n_components_

In [None]:
def get_principal_components(dataframe, nPC):
    """Project ``dataframe`` onto ``nPC`` principal components.

    Returns a new DataFrame that shares the index of ``dataframe`` and has one
    column per fitted component, labelled "PC0", "PC1", ... (zero-based).
    """
    reducer = PCA(n_components=nPC)
    scores = reducer.fit_transform(dataframe)
    component_labels = ["PC" + str(k) for k in range(reducer.n_components_)]
    return pd.DataFrame(scores, columns=component_labels, index=dataframe.index)

In [None]:
# Food: cumulative explained-variance ratio for 1, 2, ... components
food_cumulative_ratio = explained_variance(food)[0]
print(food_cumulative_ratio.round(3))

In [None]:
# Health: cumulative explained-variance ratio for 1, 2, ... components
health_cumulative_ratio = explained_variance(health)[0]
print(health_cumulative_ratio.round(3))

In [None]:
# Total: food and health columns placed side by side
total = pd.concat([food, health], axis=1)
total_cumulative_ratio = explained_variance(total)[0]
print(total_cumulative_ratio.round(3))

PCA nur für Food; bei Health kann man auch gleich alle Variablen benutzen, weil die kumulative erklärte Varianz nicht schon bei PC1 oder PC2 richtig reinkickt.

In [None]:
# Reduce the food perspective to 8 principal components.
pca_food = get_principal_components(food, 8)
# Reduce the combined (food + health) perspective to 9 principal components.
# Fixed copy-paste bug: this previously passed `food` again, so `pca_total`
# was a PCA of the food columns only and never saw the health columns.
pca_total = get_principal_components(total, 9)

## Clustering K-Means
### Food

In [None]:
# KMeans Elbow
# KMeans' first positional argument is n_clusters, so the data must not be
# passed to the constructor; the visualizer sets n_clusters itself for every k
# it evaluates. n_init and random_state mirror the final fit so the elbow curve
# is reproducible between runs.
model = KMeans(n_init=10, random_state=27)
visualizer = KElbowVisualizer(model, k=(1,10))

visualizer.fit(pca_food)
visualizer.show()

In [None]:
# Final K-Means on the food principal components.
number_clusters = 5
kmclust = KMeans(n_clusters=number_clusters, init="k-means++", n_init=10,
                 random_state=27, tol=1e-4)
kmclust.fit(pca_food)
km_labels_food = kmclust.labels_  # cluster label of every row of pca_food
km_labels_food

### Health

In [None]:
# KMeans Elbow
# KMeans' first positional argument is n_clusters, so the data must not be
# passed to the constructor; the visualizer sets n_clusters itself for every k
# it evaluates. n_init and random_state mirror the final fit so the elbow curve
# is reproducible between runs.
model = KMeans(n_init=10, random_state=27)
visualizer = KElbowVisualizer(model, k=(1,10))

visualizer.fit(health)
visualizer.show()

In [None]:
# Final K-Means on the health indicators.
number_clusters = 3
kmclust = KMeans(n_clusters=number_clusters, init="k-means++", n_init=10,
                 random_state=27, tol=1e-4)
kmclust.fit(health)
km_labels_health = kmclust.labels_  # cluster label of every row of health
km_labels_health

### Total

In [None]:
# KMeans Elbow
# KMeans' first positional argument is n_clusters, so the data must not be
# passed to the constructor; the visualizer sets n_clusters itself for every k
# it evaluates. n_init and random_state mirror the final fit so the elbow curve
# is reproducible between runs.
model = KMeans(n_init=10, random_state=27)
visualizer = KElbowVisualizer(model, k=(1,10))

visualizer.fit(total)
visualizer.show()

In [None]:
# Final K-Means on the combined food + health columns.
number_clusters = 4
kmclust = KMeans(n_clusters=number_clusters, init="k-means++", n_init=10,
                 random_state=27, tol=1e-4)
kmclust.fit(total)
km_labels_total = kmclust.labels_  # cluster label of every row of total
km_labels_total

### Evaluating K-Means

In [None]:
def get_ss(df):
    """Return the sum of squares of ``df``, added up over all of its columns.

    Each column contributes its sample variance multiplied by
    (number of non-null values - 1).
    """
    degrees_of_freedom = df.count() - 1
    per_column_ss = df.var() * degrees_of_freedom
    return np.sum(per_column_ss)

In [None]:
# R^2 of the food clustering: share of the total sum of squares that lies
# between the clusters (ssb / sst).
kmeans_food = pd.concat((pca_food, pd.Series(km_labels_food, name='labels')), axis=1)
food_features = kmeans_food.drop(columns=["labels"])
sst = get_ss(food_features)  # get total sum of squares
# Group only the feature columns by the label Series. Applying get_ss to groups
# that still contain the 'labels' column depends on DataFrameGroupBy.apply
# operating on the grouping column, which pandas has deprecated.
ssw_labels = food_features.groupby(kmeans_food["labels"]).apply(get_ss)  # compute ssw for each cluster label
ssb = sst - np.sum(ssw_labels)
r2 = ssb / sst
print("R^2 kmeans:", r2)

In [None]:
# R^2 of the health clustering: share of the total sum of squares that lies
# between the clusters (ssb / sst).
kmeans_health = pd.concat((health, pd.Series(km_labels_health, name='labels')), axis=1)
health_features = kmeans_health.drop(columns=["labels"])
sst = get_ss(health_features)  # get total sum of squares
# Group only the feature columns by the label Series. Applying get_ss to groups
# that still contain the 'labels' column depends on DataFrameGroupBy.apply
# operating on the grouping column, which pandas has deprecated.
ssw_labels = health_features.groupby(kmeans_health["labels"]).apply(get_ss)  # compute ssw for each cluster label
ssb = sst - np.sum(ssw_labels)
r2 = ssb / sst
print("R^2 kmeans:", r2)

In [None]:
# R^2 of the combined clustering: share of the total sum of squares that lies
# between the clusters (ssb / sst).
kmeans_total = pd.concat((total, pd.Series(km_labels_total, name='labels')), axis=1)
total_features = kmeans_total.drop(columns=["labels"])
sst = get_ss(total_features)  # get total sum of squares
# Group only the feature columns by the label Series. Applying get_ss to groups
# that still contain the 'labels' column depends on DataFrameGroupBy.apply
# operating on the grouping column, which pandas has deprecated.
ssw_labels = total_features.groupby(kmeans_total["labels"]).apply(get_ss)  # compute ssw for each cluster label
ssb = sst - np.sum(ssw_labels)
r2 = ssb / sst
print("R^2 kmeans:", r2)

## Visualization of Clusters

In [None]:
# Attach the three cluster assignments and the country names to the scaled
# data. All pieces are joined column-wise on their positional index.
extra_columns = {
    "cluster_food": km_labels_food,
    "cluster_health": km_labels_health,
    "cluster_total": km_labels_total,
    "Country": rows,
}
extra_series = [pd.Series(values, name=label) for label, values in extra_columns.items()]
clusters = pd.concat([df_scaled, *extra_series], axis=1)
clusters.head()

## Geoplot

In [None]:
# Give every GeoJSON feature a top-level "id" taken from its "gu_a3" property,
# then build a lookup from the feature's "name_long" property to that id.
for feature in european_union["features"]:
    feature["id"] = feature["properties"]["gu_a3"]
state_id_map = {
    entry["properties"]["name_long"]: entry["id"]
    for entry in european_union["features"]
}

In [None]:
# Add the GeoJSON feature id of every country; a country name that is missing
# from state_id_map raises a KeyError.
clusters["id"] = [state_id_map[country] for country in clusters["Country"]]
clusters.head()

### Food

In [None]:
# Choropleth map coloured by the food cluster assignment.
food_map_options = dict(
    locations="id",
    geojson=european_union,
    color="cluster_food",
    hover_name="Country",
    hover_data=["Life Expectancy"],
    title="clusters by food",
    # Other styles (e.g. dark mode) are possible here; even more with an API
    # token, but that would be overkill.
    mapbox_style="carto-positron",
    center=dict(lat=56.5, lon=11),
    zoom=2.5,
    opacity=0.5,  # could be made less transparent
    height=600,
    width=550,
)
fig = px.choropleth_mapbox(clusters, **food_map_options)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))

fig.show()

### Health

In [None]:
# Choropleth map coloured by the health cluster assignment.
health_map_options = dict(
    locations="id",
    geojson=european_union,
    color="cluster_health",
    hover_name="Country",
    hover_data=["Life Expectancy"],
    title="Clusters by health",
    # Other styles (e.g. dark mode) are possible here; even more with an API
    # token, but that would be overkill.
    mapbox_style="carto-positron",
    center=dict(lat=56.5, lon=11),
    zoom=2.5,
    opacity=0.5,  # could be made less transparent
    height=600,
    width=550,
)
fig = px.choropleth_mapbox(clusters, **health_map_options)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))

fig.show()

### Total

In [None]:
# Choropleth map coloured by the combined (food + health) cluster assignment.
fig = px.choropleth_mapbox(
    clusters,
    locations="id",
    geojson=european_union,
    color="cluster_total",
    hover_name="Country",
    hover_data=["Life Expectancy"],
    # Fixed copy-paste error: this map shows cluster_total, but the title was
    # still "Clusters by health".
    title="Clusters by total",
    mapbox_style="carto-positron",  # other styles (e.g. dark mode) are possible; even more with an API token, but that would be overkill
    center={"lat": 56.5, "lon": 11},
    zoom=2.5,
    opacity=0.5,  # could be made less transparent
    height=600,
    width=550
)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

fig.show()

## Cluster Analysis