World Development Indicators - worldbank.org <br/>
Fabio Cardoso - Dec/2021 <br/><br/>

Datasets: <br/><br/>

Country Indicators: <br/>
https://datacatalog.worldbank.org/search/dataset/0037712/World-Development-Indicators <br/>
Note: dataset updated with the topics presented in https://data.worldbank.org/indicator?tab=all - Utils notebook. <br/><br/>

Country Codes ISO 3166: <br/>
https://www.iban.com/country-codes <br/><br/>

Note: <br/>
For some orphans indicators, the topic 'private-sector' was assumed - Utils notebook. <br/>
. Firms visited or required meetings with tax officials (% of firms) <br/>
. Net ODA provided to the least developed countries (% of GNI) <br/>
. Net ODA provided, to the least developed countries (current US )−𝑁𝑒𝑡𝑂𝐷𝐴𝑝𝑟𝑜𝑣𝑖𝑑𝑒𝑑,𝑡𝑜𝑡𝑎𝑙(−𝑁𝑒𝑡𝑂𝐷𝐴𝑝𝑟𝑜𝑣𝑖𝑑𝑒𝑑,𝑡𝑜𝑡𝑎𝑙(𝑐𝑜𝑛𝑠𝑡𝑎𝑛𝑡2015𝑈𝑆) <br/>
. Net ODA provided, total (current US$)

In [69]:
# Parameters

cloud = True
topic = 'general'
key_in ='WDI treated-step5-'+topic+'.sav'
key1_out = 'wdi_clusters_'+topic+'.parquet' #saving for sqlquery/cognos analytics
key2_out = 'wdi_clusters_'+topic+'.csv' #saving local for cognos embedded
clusterkm_field_name = 'cluster_km_'+topic
clusterdb_field_name = 'cluster_db_'+topic
clustergm_field_name = 'cluster_gm_'+topic
eps = 97 #EPC in DBScan algorithm influence dimensions quantity. 

In [70]:
# Imports

import pandas as pd
import numpy as np
import zipfile
import matplotlib.pyplot as plt
import zlib
import pickle
from threading import Thread
from multiprocessing import Pool
import time
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

#for clustering
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture

#for dimentionaluty reduction
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import Isomap, LocallyLinearEmbedding, LocallyLinearEmbedding
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Library for sav reading
!pip install pyreadstat

# Library used in the dataframe compatibility with parquet format
!pip install unidecode

In [None]:
# The code was removed by Watson Studio for sharing.

In [73]:
# Storage instantiation

if cloud:
    import types
    import pandas as pd
    import ibm_boto3
    from botocore.client import Config

    def __iter__(self): return 0

    cnx_fcsinsights = ibm_boto3.client(
        service_name = 's3',
        ibm_api_key_id = parm_ibm_api_key_id,
        ibm_service_instance_id = parm_ibm_service_instance_id,
        ibm_auth_endpoint = parm_ibm_auth_endpoint,
        config = Config(signature_version='oauth'),
        endpoint_url = parm_endpoint_url
    )

bkt_in = parm_bkt_in
bkt_out = parm_bkt_out

In [74]:
# Read data

if cloud:
    cnx_fcsinsights.download_file(Bucket=bkt_in, Key=key_in, Filename=key_in)
    df = pd.read_spss(key_in)
else:
    df = pd.read_spss('F:/WDI Working Files/'+key_in)

In [75]:
# Clustering Kmeans

kmeans = KMeans(n_clusters=5, random_state=0)
clusterskm = kmeans.fit_predict(df.drop(columns=['Country_Code']).values)
df[clusterkm_field_name] = clusterskm

In [76]:
# Clustering DBScan

dbscan = DBSCAN(eps=97)
clustersdb = dbscan.fit_predict(df.drop(columns=['Country_Code']).values)
df[clusterdb_field_name] = clustersdb
df.groupby(by=[clusterdb_field_name]).count()[df.columns[0]]

cluster_db_general
-1    140
 0     30
 1     24
 2     16
 3      5
Name: Country_Code, dtype: int64

In [77]:
# For GM, it is necessary to apply dimentionality reduction (memory issue).

pca = PCA(n_components = df['Country_Code'].nunique())
pc = pca.fit_transform(df.drop(columns=['Country_Code']), df['Country_Code'])

In [78]:
# Clustering Gaussian Mixture

gaussian = GaussianMixture(n_components=5)
clustersgm = gaussian.fit_predict(pc)
df[clustergm_field_name] = clustersgm

In [79]:
# Maintaining the main fields for presentation

df = df[['Country_Code', clusterkm_field_name, clusterdb_field_name, clustergm_field_name]]

In [80]:
# Aligning clusters by their size order

clusters_km_index = df.groupby(by=[clusterkm_field_name]).count()[df.columns[0]].index.values
clusters_km_sizes = df.groupby(by=[clusterkm_field_name]).count()[df.columns[0]].values
clusters_km_order = np.take_along_axis(clusters_km_index, np.argsort(clusters_km_sizes), axis=0)

clusters_db_index = df.groupby(by=[clusterdb_field_name]).count()[df.columns[0]].index.values
clusters_db_sizes = df.groupby(by=[clusterdb_field_name]).count()[df.columns[0]].values
clusters_db_order = np.take_along_axis(clusters_db_index, np.argsort(clusters_db_sizes), axis=0)

clusters_gm_index = df.groupby(by=[clustergm_field_name]).count()[df.columns[0]].index.values
clusters_gm_sizes = df.groupby(by=[clustergm_field_name]).count()[df.columns[0]].values
clusters_gm_order = np.take_along_axis(clusters_gm_index, np.argsort(clusters_gm_sizes), axis=0)

i=0
for clu in clusters_km_order:
    i +=1
    df[clusterkm_field_name] = df[clusterkm_field_name].apply(lambda x: i+1000 if x==clu else x)
df[clusterkm_field_name] = df[clusterkm_field_name].apply(lambda x: x-1000)

i=0
for clu in clusters_db_order:
    i +=1
    df[clusterdb_field_name] = df[clusterdb_field_name].apply(lambda x: i+1000 if x==clu else x)
df[clusterdb_field_name] = df[clusterdb_field_name].apply(lambda x: x-1000)

i=0
for clu in clusters_gm_order:
    i +=1
    df[clustergm_field_name] = df[clustergm_field_name].apply(lambda x: i+1000 if x==clu else x)
df[clustergm_field_name] = df[clustergm_field_name].apply(lambda x: x-1000)

In [82]:
# Remove spaces from column names

import unidecode
def remove_space_from_cols_names(df):
    rens = dict()
    for col in df.columns:
        col1 = col.lower() #lowercase
        col2 = col1.replace(" ", "_") #remove espaço
        col3 = unidecode.unidecode(col2) #remove acento
        rens.update({col:col2})
    return df.rename(columns=rens)

df = remove_space_from_cols_names(df)

In [83]:
# Save result for presentation

if cloud:
    df.to_parquet(key1_out, index=False)
    cnx_fcsinsights.upload_file(Bucket=bkt_out, Key=key1_out, Filename=key1_out)
else:
    df.to_parquet('F:/WDI Working Files/'+key1_out, index=False)

In [84]:
# The code was removed by Watson Studio for sharing.

In [None]:
# Saving local

if cloud:
    df.to_csv(key2_out, index=False)
    cnx_local.upload_file(Bucket=bucket, Key=key2_out, Filename=key2_out)

In [85]:
print('ok')

ok
