# Importación de librerías y de datos

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px

In [33]:
# Load the raw country-level dataset; the path is resolved relative to the
# kernel's working directory.
paises = pd.read_csv(filepath_or_buffer="world-data-2023.csv")

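Se puede ver que la mayoría de las columnas tienen valores NaN para algún país (por ejemplo, *Armed Forces size* solo tiene 171 valores no nulos de 195), aunque algunas, como *Country*, están completas.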

In [35]:
# Print the per-column summary (non-null counts and dtypes) to spot missing
# values and numeric columns that were parsed as text.
paises.info()
# Bare last expression: rendered as the cell's rich (truncated) table output.
paises

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 35 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Country                                    195 non-null    object 
 1   Density
(P/Km2)                            195 non-null    object 
 2   Abbreviation                               188 non-null    object 
 3   Agricultural Land( %)                      188 non-null    object 
 4   Land Area(Km2)                             194 non-null    object 
 5   Armed Forces size                          171 non-null    object 
 6   Birth Rate                                 189 non-null    float64
 7   Calling Code                               194 non-null    float64
 8   Capital/Major City                         192 non-null    object 
 9   Co2-Emissions                              188 non-null    object 
 10  CPI                       

Unnamed: 0,Country,Density\n(P/Km2),Abbreviation,Agricultural Land( %),Land Area(Km2),Armed Forces size,Birth Rate,Calling Code,Capital/Major City,Co2-Emissions,...,Out of pocket health expenditure,Physicians per thousand,Population,Population: Labor force participation (%),Tax revenue (%),Total tax rate,Unemployment rate,Urban_population,Latitude,Longitude
0,Afghanistan,60,AF,58.10%,652230,323000,32.49,93.0,Kabul,8672,...,78.40%,0.28,38041754,48.90%,9.30%,71.40%,11.12%,9797273,33.939110,67.709953
1,Albania,105,AL,43.10%,28748,9000,11.78,355.0,Tirana,4536,...,56.90%,1.20,2854191,55.70%,18.60%,36.60%,12.33%,1747593,41.153332,20.168331
2,Algeria,18,DZ,17.40%,2381741,317000,24.28,213.0,Algiers,150006,...,28.10%,1.72,43053054,41.20%,37.20%,66.10%,11.70%,31510100,28.033886,1.659626
3,Andorra,164,AD,40.00%,468,,7.20,376.0,Andorra la Vella,469,...,36.40%,3.33,77142,,,,,67873,42.506285,1.521801
4,Angola,26,AO,47.50%,1246700,117000,40.73,244.0,Luanda,34693,...,33.40%,0.21,31825295,77.50%,9.20%,49.10%,6.89%,21061025,-11.202692,17.873887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,Venezuela,32,VE,24.50%,912050,343000,17.88,58.0,Caracas,164175,...,45.80%,1.92,28515829,59.70%,,73.30%,8.80%,25162368,6.423750,-66.589730
191,Vietnam,314,VN,39.30%,331210,522000,16.75,84.0,Hanoi,192668,...,43.50%,0.82,96462106,77.40%,19.10%,37.60%,2.01%,35332140,14.058324,108.277199
192,Yemen,56,YE,44.60%,527968,40000,30.45,967.0,Sanaa,10609,...,81.00%,0.31,29161922,38.00%,,26.60%,12.91%,10869523,15.552727,48.516388
193,Zambia,25,ZM,32.10%,752618,16000,36.19,260.0,Lusaka,5141,...,27.50%,1.19,17861030,74.60%,16.20%,15.60%,11.43%,7871713,-13.133897,27.849332


# Preprocesamiento de datos

- Se eliminan todos los *datapoints* con algún valor NaN, lo que baja el total de países a 110.
- Se convierten los *string* a *float* cuando corresponde:
    - Notar que hay columnas con signos "%" y otras con signos "$".
    - Notar que hay columnas con valores numéricos pero en formato *string*, con "," como separador de miles.
    - Notar que las columnas no numéricas son de poca importancia para la agrupación de países.

In [50]:
def _limpiar_columnas_numericas(df):
    """Return a copy of ``df`` with numeric-looking text columns cast to float.

    For every non-numeric column a cleaned candidate is built by removing
    "$" signs, "," thousands separators, surrounding whitespace and trailing
    "%" signs. If every value of the candidate parses as a float the column
    is replaced by the converted values; otherwise the column is left exactly
    as it was (so real text such as names keeps its commas) and its name is
    reported on stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame without missing values in the columns to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    limpio = df.copy()
    for col in limpio.columns:
        serie = limpio[col]
        # Inspect the dtype instead of looking up row label 0, which may have
        # been removed by an earlier dropna().
        if pd.api.types.is_numeric_dtype(serie):
            continue

        candidata = (
            serie.astype(str)
            # regex=False is essential: as a regular expression "$" is the
            # end-of-string anchor and would remove nothing.
            .str.replace("$", "", regex=False)
            .str.replace(",", "", regex=False)
            .str.strip()
            .str.rstrip("%")
        )
        try:
            limpio[col] = candidata.astype("float")
        except (ValueError, TypeError):
            # Genuine text column: keep the original values untouched.
            print("Columna no numérica: " + col)
    return limpio


# Drop every country with at least one missing value, then convert the text
# columns that actually hold numbers. The helper works on an explicit copy, so
# no chained-assignment warning needs to be silenced globally.
paises_clean = _limpiar_columnas_numericas(paises.dropna())


Columna no numérica: Country
Columna no numérica: Abbreviation
Columna no numérica: Capital/Major City
Columna no numérica: Currency-Code
Columna no numérica: Largest city
Columna no numérica: Official language


- Eliminar columnas no numéricas

In [51]:
# Text columns that cannot be used as numeric features.
columnas_texto = [
    "Country",
    "Abbreviation",
    "Capital/Major City",
    "Currency-Code",
    "Largest city",
    "Official language",
]

# Keep only the numeric features; paises_clean itself is left untouched.
data = paises_clean.drop(columns=columnas_texto)
data

Unnamed: 0,Density\n(P/Km2),Agricultural Land( %),Land Area(Km2),Armed Forces size,Birth Rate,Calling Code,Co2-Emissions,CPI,CPI Change (%),Fertility Rate,...,Out of pocket health expenditure,Physicians per thousand,Population,Population: Labor force participation (%),Tax revenue (%),Total tax rate,Unemployment rate,Urban_population,Latitude,Longitude
0,60.0,58.1,652230.0,323000.0,32.49,93.0,8672.0,149.90,2.3,4.47,...,78.4,0.28,38041754.0,48.9,9.3,71.4,11.12,9797273.0,33.939110,67.709953
1,105.0,43.1,28748.0,9000.0,11.78,355.0,4536.0,119.05,1.4,1.62,...,56.9,1.20,2854191.0,55.7,18.6,36.6,12.33,1747593.0,41.153332,20.168331
2,18.0,17.4,2381741.0,317000.0,24.28,213.0,150006.0,151.36,2.0,3.02,...,28.1,1.72,43053054.0,41.2,37.2,66.1,11.70,31510100.0,28.033886,1.659626
4,26.0,47.5,1246700.0,117000.0,40.73,244.0,34693.0,261.73,17.1,5.52,...,33.4,0.21,31825295.0,77.5,9.2,49.1,6.89,21061025.0,-11.202692,17.873887
6,17.0,54.3,2780400.0,105000.0,17.02,54.0,201348.0,232.75,53.5,2.26,...,17.6,3.96,44938712.0,61.3,10.1,106.3,9.79,41339571.0,-38.416097,-63.616672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,281.0,71.7,243610.0,148000.0,11.00,44.0,379025.0,119.62,1.7,1.68,...,14.8,2.81,66834405.0,62.8,25.5,30.6,3.85,55908316.0,55.378051,-3.435973
186,36.0,44.4,9833517.0,1359000.0,11.60,1.0,5006302.0,117.24,7.5,1.73,...,11.1,2.61,328239523.0,62.0,9.6,36.6,14.70,270663028.0,37.090240,-95.712891
187,20.0,82.6,176215.0,22000.0,13.86,598.0,6766.0,202.92,7.9,1.97,...,16.2,5.05,3461734.0,64.0,20.1,41.8,8.73,3303394.0,-32.522779,-55.765835
191,314.0,39.3,331210.0,522000.0,16.75,84.0,192668.0,163.52,2.8,2.05,...,43.5,0.82,96462106.0,77.4,19.1,37.6,2.01,35332140.0,14.058324,108.277199


- Se aplica normalización Min-Max sobre todo el dataset, ya que los datos son numéricos.

In [52]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's minimum and maximum, then rescale the features with
# them (MinMaxScaler's default target range is used). `data` is rebound to the
# scaled array returned by the transformer.
scaler = MinMaxScaler()
scaler.fit(data)
data = scaler.transform(data)

# T-SNE de Scikit-Learn

- Se importa y declara t-SNE con dos componentes para graficar.
- Se entrena el modelo con los datos preprocesados.

In [53]:
from sklearn.manifold import TSNE

# Two output components so the embedding can be drawn as a scatter plot.
tsne = TSNE(
    n_components=2,
    init="pca",
    learning_rate="auto",
    random_state=42,  # fixed seed so the embedding is the same on every run
)

# One 2-D point per row of the scaled feature matrix.
datos_2d = tsne.fit_transform(X=data)

# Visualización de resultados

- Se usa la librería Plotly para generar gráficos interactivos.
- Se crea un DataFrame para visualización, uniendo *paises_clean* con los componentes extraídos usando t-SNE.
- Se usa *paises_clean* porque tiene los valores originales, antes del preprocesamiento.
- Se colorea según el logaritmo del **GDP** del país.
> Se usa el logaritmo debido a la gran diferencia entre los valores de países grandes como China y EE.UU. y el resto.

In [60]:
# 2-D embedding as a frame whose rows are in the same order as the input rows.
df_tsne = pd.DataFrame(datos_2d, columns=["t-SNE dimension 1", "t-SNE dimension 2"])

# The embedding is indexed 0..n-1, while paises_clean may still carry the
# labels left over from dropna(); rebuild a positional index so the
# column-wise concat lines rows up instead of producing NaN padding.
paises_clean = paises_clean.reset_index(drop=True)
assert len(paises_clean) == len(df_tsne), (
    "t-SNE output and paises_clean must have the same number of rows"
)

df = pd.concat([paises_clean, df_tsne], axis=1)
# Colour by log(GDP): raw GDP spans several orders of magnitude.
df["GDP"] = np.log(paises_clean["GDP"])

# Set to False to draw plain markers without the country name next to each one.
mostrar_nombres = True

fig = px.scatter(
    df,
    x="t-SNE dimension 1",
    y="t-SNE dimension 2",
    hover_data=paises_clean.columns,
    hover_name="Country",
    color="GDP",
    text="Country" if mostrar_nombres else None,
)

# Centered title.
fig.update_layout(
    title=dict(
        text="Visualización de Paises usando t-SNE <br> Coloreados por Producto Interno Bruto",
        font=dict(size=22),
        x=0.5,
        xref="paper"
    )
)

# Make sure the output folders exist before exporting into them.
for carpeta in ("images", "html"):
    Path(carpeta).mkdir(parents=True, exist_ok=True)

fig.write_image("images/paises_by_gdp_text.svg")
fig.write_html("html/paises_by_gdp_text.html")
fig.show()

## Gráfico múltiple

- Se crea gráfico interactivo con desplegable para colorear por característica deseada.

In [69]:
import plotly.graph_objects as go

# Columns offered in the dropdown: every column except the text ones and the
# calling code.
traces_cols = paises_clean.columns.drop(
    ["Country", "Abbreviation", "Capital/Major City", "Currency-Code",
     "Largest city", "Official language", "Calling Code"]
)

# For each selected column build the same scatter as before, coloured by that
# column, and collect its traces. `trace_owner[k]` stores the position (in
# traces_cols) of the column that produced traces[k], so the visibility masks
# stay correct even if a column yields more than one trace.
traces = []
trace_owner = []
for idx, col in enumerate(traces_cols):
    trace_act = list(
        px.scatter(
            df,
            x="t-SNE dimension 1",
            y="t-SNE dimension 2",
            hover_data=paises_clean.columns,
            text="Country",
            hover_name="Country",
            color=col,
        ).select_traces()
    )
    for trace in trace_act:
        # Only the first column is shown initially, matching `active=0` below;
        # otherwise every colouring is drawn on top of the others until the
        # user picks an entry.
        trace.visible = idx == 0
    traces.extend(trace_act)
    trace_owner.extend([idx] * len(trace_act))

fig = go.Figure()
fig.add_traces(traces)

titulo_base = "Visualización usando t-SNE - Paises coloreados por "

# One dropdown button per column. 'visible' takes one boolean per trace:
# True for the traces that belong to the chosen column, False for the rest.
# E.g. with 3 single-trace columns, [True, False, False] shows only the first.
buttons_list = []
for idx, col in enumerate(traces_cols):
    button_act = dict(label = col,
                  method = 'update',
                  args = [{'visible': [owner == idx for owner in trace_owner]},
                          {'title': titulo_base + col,
                           'showlegend':True}])

    buttons_list.append(button_act)

# Attach the dropdown and set the title that corresponds to the initially
# active entry.
layout_inicial = dict(
    updatemenus=[go.layout.Updatemenu(
        active=0,
        buttons=buttons_list
        )
    ])
if len(traces_cols) > 0:
    layout_inicial["title"] = titulo_base + traces_cols[0]
fig.update_layout(**layout_inicial)

# Make sure the output folder exists, then save the interactive figure as HTML.
Path("html").mkdir(parents=True, exist_ok=True)
fig.write_html("html/paises-tsne.html")
fig.show()