<p style="text-align: center; font-size: 100px; color: white; font-weight: bold;">INDEX OF ECONOMIC FREEDOM</p>
<center><img src="heritage.png"></center>
<center>[En línea: www.northafricapost.com]</center>
<p style="text-align: center; color:white; font-size:50px;">COMPLETE EDA AND MACHINE LEARNING MODEL</p>


In [18]:
# librerias básicas
import numpy as np 
import pandas as pd
from math import sqrt

# ML
import sklearn as sklearn
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, explained_variance_score, mean_absolute_percentage_error

# importamos librerias graficación 
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
pio.templates.default = 'plotly_dark'
import folium

# quitamos warnings que nos puedan aparecer en las celdas
import warnings
warnings.filterwarnings('ignore')

# importamos librerias streamlit
import streamlit as st
import streamlit.components.v1 as components
from PIL import Image
import base64
from streamlit.elements.utils import (check_callback_rules, check_session_state_rules, get_label_visibility_proto_value)
from geopy.geocoders import Nominatim

# translate 
from googletrans import Translator

<p style="color:white; font-size:40px;">EDA</p>

In [19]:
# Load the dataset used by every later cell; the path is relative to the
# notebook's working directory.
DATA_PATH = 'freedomeconomicindex.csv'
df = pd.read_csv(DATA_PATH)

<p style="color:white; font-size:20px;">Contexto y escenario del estudio </p>

In [20]:
# Build a folium world map with one marker per row of `df`, positioned by the
# 'Latitude' / 'Longitude' columns.

# folium.Marker rejects NaN coordinates with a ValueError, so rows without a
# complete coordinate pair are left off the map instead of aborting the cell.
map_points = df.dropna(subset=['Latitude', 'Longitude'])

# Centre the map on the first plottable row. The default OpenStreetMap tiles
# are used because the built-in "Stamen Terrain" tileset was removed from
# recent folium releases, where that name alone no longer resolves to a
# tile provider.
worldmap = folium.Map(
    location=[map_points.iloc[0]['Latitude'], map_points.iloc[0]['Longitude']],
    zoom_start=4,
    tiles="OpenStreetMap",
)

# One marker per point; zipping the two columns avoids building a Series per
# row the way iterrows() does.
for lat, lon in zip(map_points['Latitude'], map_points['Longitude']):
    folium.Marker(location=[lat, lon]).add_to(worldmap)

# Persist the map as a standalone HTML file.
worldmap.save("map.html")

# Display the map as the cell output.
worldmap

**Vamos a analizar 186 países dentro del índice; estos países se dividen en varias regiones según su ubicación.**

In [21]:
# Count the countries in each region. After reset_index() the frame has a
# 'Region' column and the per-region count stored under 'Country'.
grouped = df.groupby('Region')
counts = grouped['Country'].count()
result = counts.reset_index()

# Slice colours (five entries), then the pie chart itself.
colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A']
region_pie = go.Pie(
    labels=result['Region'],
    values=result['Country'],
    marker={'colors': colors},
)
fig = go.Figure(data=region_pie)
fig.update_layout(
    title_text='Count of Countries per Region',
    width=1200,
    height=800,
)
fig.show()


**Se observa que los países analizados se distribuyen principalmente entre África (35,8%), Europa (24,2%) y Asia (23,1%); el gráfico cuenta países por región, no superficie de territorio.**

In [22]:
# Aggregate macro-economic indicators per region.
# Population is additive, so it is summed. Inflation, unemployment and public
# debt are percentages: adding them up across countries mostly reflects how
# many countries a region contains, so they are averaged (unweighted mean
# across countries) instead.
# NOTE(review): the written conclusions that follow this chart were drawn from
# summed percentages; re-check them against the averaged values.
grouped_df = df.groupby(['Region'])['Population (Millions)'].sum().reset_index()
grouped_df_inflation = df.groupby(['Region'])['Inflation (%)'].mean().reset_index()
grouped_df_unemployment = df.groupby(['Region'])['Unemployment (%)'].mean().reset_index()
grouped_df_public_debt = df.groupby(['Region'])['Public Debt (% of GDP)'].mean().reset_index()

# Bar colours, applied position by position to the bars of every panel.
colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A']

# One entry per panel, in reading order (left to right, top to bottom):
# (aggregated frame, value column, subplot title).
panels = [
    (grouped_df, 'Population (Millions)', 'Population (Millions) by Region'),
    (grouped_df_inflation, 'Inflation (%)', 'Mean Inflation (%) by Region'),
    (grouped_df_unemployment, 'Unemployment (%)', 'Mean Unemployment (%) by Region'),
    (grouped_df_public_debt, 'Public Debt (% of GDP)', 'Mean Public Debt (% of GDP) by Region'),
]

# 2 x 2 grid of bar charts.
fig = make_subplots(rows=2, cols=2, subplot_titles=[panel_title for _, _, panel_title in panels])

for position, (frame, column, _) in enumerate(panels):
    # Map the flat panel index onto 1-based (row, col) grid coordinates.
    grid_row, grid_col = divmod(position, 2)
    grid_row, grid_col = grid_row + 1, grid_col + 1
    fig.add_trace(
        go.Bar(x=frame['Region'], y=frame[column], marker=dict(color=colors), name=column),
        row=grid_row, col=grid_col,
    )
    fig.update_yaxes(title_text=column, row=grid_row, col=grid_col)

# Figure-wide settings.
fig.update_layout(
    title_text='Subplots of Population and Economic Indicators per Region',
    width=2000,
    height=1000,
    showlegend=True,
)
fig.update_xaxes(tickangle=15)

# Explicit show(), like the other plotting cells, rather than relying on the
# value of the last expression being rendered.
fig.show()


- **La mayoría de la población se concentra en Asia**
- **Las regiones con mayor inflación son la zona Subsahariana y América**
- **Europa y la zona de África Subsahariana son las regiones con más desempleo**
- **Además Europa y Africa Subsahariana son las zonas con una deuda pública mayor (% del PBI)**

In [23]:
# Pearson correlation matrix of the selected columns:
# r = cov(X, Y) / (std(X) * std(Y)).
corr_columns = ['Country', 'Region', 'World Rank', 'Region Rank', '2019 Score',
                'Property Rights', 'Judical Effectiveness', 'Government Integrity',
                'Tax Burden', "Gov't Spending", 'Fiscal Health', 'Business Freedom',
                'Labor Freedom', 'Monetary Freedom', 'Trade Freedom',
                'Investment Freedom', 'Financial Freedom', 'Tariff Rate (%)',
                'Income Tax Rate (%)', 'Corporate Tax Rate (%)', 'Tax Burden % of GDP',
                "Gov't Expenditure % of GDP", 'Population (Millions)',
                'GDP (Billions, PPP)$', 'GDP Growth Rate (%)',
                '5 Year GDP Growth Rate (%)', 'GDP per Capita (PPP)$',
                'Unemployment (%)', 'Inflation (%)', 'FDI Inflow (Millions)',
                'Public Debt (% of GDP)', 'Latitude', 'Longitude']

# The selection contains text columns ('Country' is concatenated with strings
# elsewhere in this notebook). Since pandas 2.0, DataFrame.corr() no longer
# drops non-numeric columns silently and raises on them, so restrict the frame
# to numeric columns first. This yields the matrix older pandas versions
# produced implicitly.
corr = df[corr_columns].select_dtypes(include='number').corr(method='pearson')

In [24]:
# Render the correlation matrix as a heat map using the Jet colour scale.
fig = px.imshow(corr, color_continuous_scale=px.colors.sequential.Jet)
fig.update_layout(
    title='Pearson correlation matrix',
    width=2000,
    height=800,
)
fig.show()

<p style="color:white; font-size:20px;">Tras ver las variables que tienen una correlación igual o superior a 0.70, empezamos a hacer nuestro análisis exploratorio. </p>

<p style="color:white; font-size:20px;">World Rank y Region Rank </p>

In [25]:
# Two globes side by side: markers coloured by 'World Rank' in the first
# column and by 'Region Rank' in the second.
fig = make_subplots(
    rows=1,
    cols=2,
    column_widths=[0.5, 0.6],
    row_heights=[0.6],
    specs=[[{"type": "scattergeo"}, {"type": "scattergeo"}]],
    subplot_titles=("World Rank 2019 Freedom Index", "Region Rank 2019 Freedom Index"),
)

# Globe styling applied identically to both geo subplots.
globe_style = dict(
    scope='world',
    showland=True,
    landcolor='White',
    showcountries=True,
    countrycolor='Black',
    showocean=True,
    oceancolor="LightBlue",
    lakecolor="LightBlue",
    projection=dict(type="orthographic"),
    lonaxis=dict(range=[-270, 270]),
    lataxis=dict(range=[-270, 270]),
)

# (rank column, layout key of its geo subplot, subplot column)
rank_panels = [('World Rank', 'geo', 1), ('Region Rank', 'geo2', 2)]

for rank_column, geo_key, subplot_col in rank_panels:
    # Hover text in the form "<country> (<rank>)".
    hover_text = df['Country'] + ' (' + df[rank_column].astype(str) + ')'
    rank_markers = dict(
        size=20,
        sizemode='diameter',
        color=df[rank_column],
        colorscale='Plasma',
        reversescale=False,
    )
    fig.add_trace(
        go.Scattergeo(
            lon=df['Longitude'],
            lat=df['Latitude'],
            text=hover_text,
            mode='markers',
            marker=rank_markers,
            name=rank_column,
        ),
        row=1,
        col=subplot_col,
    )
    fig.update_layout(**{geo_key: globe_style})

# Figure title and size, then render.
fig.update_layout(
    title='World Rank & Region Rank 2019 Freedom Index',
    width=2000,
    height=800,
)
fig.show()

<p style="color:white; font-size:20px;">2019 score analysis </p>

In [26]:
# The 20 highest- and 20 lowest-scoring countries by '2019 Score', shown as
# two bar charts that share the y axis.
top_20_countries = df.sort_values(by='2019 Score', ascending=False).head(20)
bottom_20_countries = df.sort_values(by='2019 Score', ascending=True).head(20)

# One subplot column per ranking so the two groups can be compared.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True, subplot_titles=("Top 20 Countries by 2019 Score","Bottom 20 Countries by 2019 Score"))
colorscale = 'Viridis'
fig.add_trace(go.Bar(x=top_20_countries['Country'], y=top_20_countries['2019 Score'], name='Top 20', marker_color=top_20_countries['2019 Score'], marker_colorscale=colorscale), 1, 1)
fig.add_trace(go.Bar(x=bottom_20_countries['Country'], y=bottom_20_countries['2019 Score'], name='Bottom 20', marker_color=bottom_20_countries['2019 Score'], marker_colorscale=colorscale), 1, 2)

# Single layout update. The title describes this chart (top/bottom ranking);
# setting a second title afterwards would replace it, so it is set only once.
fig.update_layout(
    title='Top 20 Countries / Bottom 20 Countries by Score 2019',
    xaxis=dict(title='Top 20 Countries by 2019 Score'),
    xaxis2=dict(title='Bottom 20 Countries by 2019 Score'),
    yaxis=dict(title='2019 Score'),
    showlegend=True,
    width=2000,
    height=800
)
fig.show()

**El 2019 Index Score, lo lideran países como Hong Kong, Singapur y Nueva Zelanda, mientras que el Score más bajo lo tienen países como Cuba, Venezuela y en último lugar Corea del Norte**

In [27]:
# Globe with one marker per row of `df`, coloured by its '2019 Score'.
score_markers = dict(
    size=20,
    sizemode='diameter',
    color=df['2019 Score'],
    colorscale='Jet',
    showscale=True,
    reversescale=False,
)
score_trace = go.Scattergeo(
    lon=df['Longitude'],
    lat=df['Latitude'],
    # Hover text in the form "<country> (<score>)".
    text=df['Country'] + ' (' + df['2019 Score'].astype(str) + ')',
    mode='markers',
    marker=score_markers,
)
fig = go.Figure(data=score_trace)

# Orthographic (globe) projection with custom land / ocean colours.
globe_layout = dict(
    scope='world',
    showland=True,
    landcolor='White',
    showcountries=True,
    countrycolor='Black',
    showocean=True,
    oceancolor="LightBlue",
    lakecolor="LightBlue",
    projection=dict(type="orthographic"),
    lonaxis=dict(range=[-270, 270]),
    lataxis=dict(range=[-270, 270]),
)
fig.update_layout(
    title='2019 Score by Countries',
    width=1200,
    height=800,
    geo=globe_layout,
)

fig.show()


In [28]:
# Scatter of '2019 Score' (y) against each listed variable (x); every variable
# gets its own trace and marker symbol.
score_variables = [
    ('Property Rights', 'square'),
    ('Judical Effectiveness', 'triangle-up'),
    ('Government Integrity', 'circle'),
    ('Tax Burden', 'cross'),
    ('Business Freedom', 'x'),
    ('Trade Freedom', 'pentagon'),
    ('Investment Freedom', 'star'),
    ('Financial Freedom', 'triangle-down'),
]

score_traces = [
    go.Scatter(
        x=df[variable],
        y=df['2019 Score'],
        mode='markers',
        marker=dict(symbol=symbol),
        name=variable,
        text=df['Country'],
    )
    for variable, symbol in score_variables
]

score_layout = go.Layout(
    title='Relationship between the 2019 score and its dependent variables.',
    xaxis_title='Government/Finance-dependent variables',
    yaxis_title='2019 Score',
    showlegend=True,
    width=2000,
    height=800,
)

fig = go.Figure(data=score_traces, layout=score_layout)
fig.show()

In [29]:
# Two scatter panels of '2019 Score' (y): government-related variables in the
# first column, finance-related variables in the second.
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "scatter"},{"type": "scatter"}]], subplot_titles=("Government dependent variables","Finance dependent variables") )

# Subplot column -> list of (variable, marker symbol) drawn in that column.
panel_variables = {
    1: [
        ('Property Rights', 'square'),
        ('Judical Effectiveness', 'triangle-up'),
        ('Government Integrity', 'circle'),
        ('Tax Burden', 'cross'),
    ],
    2: [
        ('Business Freedom', 'x'),
        ('Trade Freedom', 'pentagon'),
        ('Investment Freedom', 'star'),
        ('Financial Freedom', 'triangle-down'),
    ],
}

for panel_col, variables in panel_variables.items():
    for variable, symbol in variables:
        fig.add_trace(
            go.Scatter(
                x=df[variable],
                y=df['2019 Score'],
                mode='markers',
                marker=dict(symbol=symbol),
                name=variable,
                text=df['Country'],
            ),
            row=1, col=panel_col,
        )

# Single layout update (a second one would simply overwrite the first).
# Each x axis is anchored to the y axis of its own panel, and both panels get
# the same y-axis title and grid settings because both plot '2019 Score'.
fig.update_layout(
    title='Relationship between the 2019 score and dependent variables',
    xaxis=dict(title='Government dependent variables', showgrid=True, zeroline=True, anchor='y'),
    xaxis2=dict(title='Finance dependent variables', showgrid=True, zeroline=True, anchor='y2'),
    yaxis=dict(title='2019 Score', showgrid=True, zeroline=True),
    yaxis2=dict(title='2019 Score', showgrid=True, zeroline=True),
    showlegend=True,
    width=2000,
    height=800
)
fig.show()


<p style="color:white; font-size:20px;">Business Freedom analysis </p>

In [30]:
# Scatter of 'Business Freedom' (y) against the 2019 score and three
# government-related variables (x), one trace per x variable.

# Marker diameter follows 'Business Freedom': each value is divided by the
# column mean, multiplied by 20 to reach a usable marker size, and offset by 5
# so that low values still produce a visible marker.
size = df['Business Freedom'] / df['Business Freedom'].mean() * 20 + 5

fig = go.Figure()
for col in ['2019 Score','Property Rights', 'Judical Effectiveness', 'Government Integrity']:
    fig.add_scatter(x=df[col], y=df['Business Freedom'], name=col, mode='markers', text=df['Country'],
                    marker=dict(size=size, sizemode='diameter'))

# Title spelling corrected ("freedom"); it is shown to the reader on the chart.
fig.update_layout(title = 'Relationship of business freedom with governmental variables and 2019 Score',xaxis_title="2019 Score, Property Rights, Judical Effectiveness, Government Integrity",
                  yaxis_title="Business Freedom" , width=2000, height=800)

fig.show()

**Se puede observar que la libertad empresarial está estrechamente relacionada con el 2019 Score, los derechos de propiedad, la eficacia judicial y la integridad del gobierno.**

<p style="color:white; font-size:20px;">Investment Freedom analysis </p>

In [49]:
# 'Financial Freedom' (x) against 'Investment Freedom' (y), one marker per row
# of `df`; marker diameter and colour both follow 'Investment Freedom'.
investment_freedom = df['Investment Freedom']

fig = go.Figure()
fig.add_scatter(
    x=df['Financial Freedom'],
    y=investment_freedom,
    mode='markers',
    text=df['Country'],
    marker=dict(
        size=investment_freedom,
        sizemode='diameter',
        color=investment_freedom,
        colorscale='Plasma',
    ),
)

fig.update_layout(
    title="Relationship of Financial Freedom and Investment Freedom",
    xaxis_title="Financial Freedom",
    yaxis_title="Investment Freedom",
    width=2000,
    height=800,
    showlegend=True,
)
fig.show()

<p style="color:white; font-size:20px;">GDP analysis </p>

In [None]:
# GDP (y) against population and FDI inflow (x): one trace per x variable,
# each drawn in its own colour.
gdp = df['GDP (Billions, PPP)$']

# Marker size is proportional to each row's GDP relative to the largest GDP.
gdp_marker_sizes = 10 * (gdp / gdp.max())

# x variable -> trace colour (the variable name is also the trace name).
trace_colors = {
    'Population (Millions)': '#19D3F3',
    'FDI Inflow (Millions)': '#FF6692',
}

fig = go.Figure()
for x_column, trace_color in trace_colors.items():
    gdp_markers = dict(
        size=gdp_marker_sizes,
        sizemode='diameter',
        sizeref=0.5,
        color=trace_color,
        reversescale=True,
        opacity=0.8,
    )
    fig.add_trace(go.Scatter(
        x=df[x_column],
        y=gdp,
        mode='markers',
        marker=gdp_markers,
        text=df['Country'],
        name=x_column,
    ))

fig.update_layout(
    width=1200,
    height=600,
    xaxis_title="'Population (Millions)'/'FDI Inflow (Millions)'",
    yaxis_title="GDP (Billions, PPP)$",
    legend_title_text='Variable',
)

fig.show()

