# Table of contents
* [Basic EDA](#1)
* [Derive Additional Features](#2)
* [Correlation of Features](#3)
* [Principal Component Analysis](#4)
* [Clustering with k-means and Interpretation](#5)
* [Overview of Data and Export](#6)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd

# plots
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# PCA / Clustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# load the provincial data set from CSV and display the complete table
data_path = '../input/argentina-provincial-data/argentina.csv'
df = pd.read_csv(data_path)
df

<a id='1'></a>
# Basic EDA

In [None]:
# names of the feature columns taken directly from the loaded table
features_orig = [
    'gdp',
    'illiteracy',
    'poverty',
    'deficient_infra',
    'school_dropout',
    'no_healthcare',
    'birth_mortal',
    'pop',
    'movie_theatres_per_cap',
    'doctors_per_cap',
]

In [None]:
# summary statistics of the original features
summary_stats = df[features_orig].describe()
summary_stats

In [None]:
# draw one bar chart per original feature, with the provinces on the x-axis
for feature_name in features_orig:
    fig = plt.figure(figsize=(12, 4))
    sns.barplot(data=df, x='province', y=feature_name)
    plt.title(feature_name)
    plt.grid()
    plt.xticks(rotation=90)  # vertical province labels
    plt.show()

<a id='2'></a>
# Derive Additional Features

In [None]:
# GDP per capita, rounded to 3 decimals
df['gdp_per_cap'] = (df['gdp'] / df['pop']).round(3)

# share of each row's population in the overall population, in percent
sum_pop = df['pop'].sum()
print('Overall population: ',sum_pop)
df['pop_perc'] = (100 * df['pop'] / sum_pop).round(4)

# names of the derived feature columns
features_new = ['gdp_per_cap', 'pop_perc']

In [None]:
# draw one bar chart per derived feature, with the provinces on the x-axis
for feature_name in features_new:
    fig = plt.figure(figsize=(12, 4))
    sns.barplot(data=df, x='province', y=feature_name)
    plt.title(feature_name)
    plt.grid()
    plt.xticks(rotation=90)  # vertical province labels
    plt.show()

In [None]:
# complete feature list: original columns followed by the derived ones
features = [*features_orig, *features_new]

<a id='3'></a>
# Correlation of Features

In [None]:
# Spearman (rank) correlation between all features, shown as an annotated heatmap
fig = plt.figure(figsize=(12, 9))
corr_mat = df[features].corr(method='spearman')
sns.heatmap(corr_mat, cmap="RdYlGn", annot=True)
plt.title('Spearman (rank) correlation')
plt.show()

<a id='4'></a>
# Principal Component Analysis (PCA)

In [None]:
# features used for the PCA: start from all features and drop the redundant ones
features4pca = list(features)
for redundant in (
    'pop_perc',  # use population data only once
    'gdp',       # use GDP only in relative version (per capita)
):
    features4pca.remove(redundant)

print('Using the following features:')
print(features4pca)

In [None]:
# restrict the data to the features selected for the PCA
df4pca = df[features4pca]

# standardize the features before running the PCA
scaler = StandardScaler()
df4pca_std = scaler.fit_transform(df4pca)

# fit a PCA with 3 components and transform the standardized data
pc_model = PCA(n_components=3)
pc = pc_model.fit_transform(df4pca_std)

# store each principal component as a new column of the original data frame
for comp_idx, comp_col in enumerate(['pc_1', 'pc_2', 'pc_3']):
    df[comp_col] = pc[:, comp_idx]

### Now let's visualize the PCA results:

In [None]:
# interactive 3D scatter plot of the three principal components, one color per province
pc_axes = dict(x='pc_1', y='pc_2', z='pc_3')
fig = px.scatter_3d(
    df,
    color='province',
    hover_data=['province'],
    opacity=0.5,
    **pc_axes,
)
fig.update_layout(title='PCA 3D')
fig.show()

<a id='5'></a>
# Clustering with k-means and Interpretation

In [None]:
# number of clusters
n_cl = 4

# configure k-means and fit it on the scaled data
# (we simply re-use the standardized data from the PCA exercise)
kmeans = KMeans(
    n_clusters=n_cl,
    init='random',
    n_init=10,
    max_iter=300,
    random_state=99,
).fit(df4pca_std)

In [None]:
# store the k-means label of each row in the data frame, cast to object dtype
# NOTE(review): presumably the object dtype makes the plots treat the labels
# as categories rather than numbers — confirm
cluster_labels = kmeans.labels_
df['cluster'] = cluster_labels.astype(object)

### Show the clusters

In [None]:
# print the provinces that belong to each cluster;
# iterate over n_cl (instead of a hard-coded 4) so that the loop stays in sync
# with the number of clusters configured for k-means
for c in range(n_cl):
    print(f'Cluster {c}:')
    print(df[df.cluster == c].province.value_counts().index.tolist())
    print()

In [None]:
# interactive 3D scatter plot of the principal components, colored by cluster
pc_axes = dict(x='pc_1', y='pc_2', z='pc_3')
fig = px.scatter_3d(
    df,
    color='cluster',
    hover_data=['province'],
    opacity=0.5,
    **pc_axes,
)
fig.update_layout(title='Clusters')
fig.show()

### Buenos Aires seems to be in a class of its own.
### To get a better understanding of the other clusters, let's check the feature means for each cluster:

In [None]:
# columns that are averaged per cluster
mean_features = [
    'illiteracy', 'poverty', 'deficient_infra',
    'school_dropout', 'no_healthcare', 'birth_mortal',
    'movie_theatres_per_cap', 'doctors_per_cap', 'gdp_per_cap',
    'pc_1', 'pc_2', 'pc_3',
]

# mean of each of these columns within each cluster
by_cluster = df.groupby(['cluster'])
cluster_stats_mean = by_cluster[mean_features].mean()
cluster_stats_mean

### For some features it makes more sense to look at the sum:

In [None]:
# totals per cluster for GDP, population and population share
sum_columns = ['gdp', 'pop', 'pop_perc']
cluster_stats_sum = df.groupby(['cluster'])[sum_columns].sum()

# GDP per capita of a whole cluster: total GDP / total population, rounded to 2 decimals
total_gdp = cluster_stats_sum['gdp']
total_pop = cluster_stats_sum['pop']
cluster_stats_sum['gdp_per_cap'] = (total_gdp / total_pop).round(2)
cluster_stats_sum

### Another option to visualize the clusters is a parallel plot:

In [None]:
# parallel-coordinates plot of the features used for the PCA, colored by cluster
parallel_cols = features4pca + ['cluster']
fig = px.parallel_coordinates(df[parallel_cols], color='cluster')
fig.show()

### Let's try an interpretation of the clusters now:
* Cluster 0: Buenos Aires – the province around the capital, which is by far the largest city
* Cluster 1: Richer provinces
* Cluster 2: Poorer provinces
* Cluster 3: Rich provinces (high GDP per capita) with low population and very low illiteracy and school dropout rates. They also show high healthcare quality and high "cultural value" (measured by movie theatres per capita).

<a id='6'></a>
# Overview of data and export

In [None]:
# display the full table, including all columns added in this notebook
# (derived features, principal components and cluster labels)
df

In [None]:
# write the enriched data frame to a CSV file
output_file = 'df_prep.csv'
df.to_csv(output_file)