# Optimal Data Visualization

## Load the data

- gapminder.xlsx

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

In [14]:
df = pd.read_excel(r'/content/drive/MyDrive/PYTHON/EOI_samsung_python/Notebooks/Practica/Copy of gapminder.xlsx')

In [15]:
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,2007,43.828,31889923,974.580338,AFG,4
1,Albania,Europe,2007,76.423,3600523,5937.029526,ALB,8
2,Algeria,Africa,2007,72.301,33333216,6223.367465,DZA,12
3,Angola,Africa,2007,42.731,12420476,4797.231267,AGO,24
4,Argentina,Americas,2007,75.320,40301927,12779.379640,ARG,32
...,...,...,...,...,...,...,...,...
137,Vietnam,Asia,2007,74.249,85262356,2441.576404,VNM,704
138,West Bank and Gaza,Asia,2007,73.422,4018332,3025.349798,PSE,275
139,"Yemen, Rep.",Asia,2007,62.698,22211743,2280.769906,YEM,887
140,Zambia,Africa,2007,42.384,11746035,1271.211593,ZMB,894


In [32]:
df_sorted = df.sort_values('gdpPercap', ascending = False)

In [33]:
df_sorted

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
95,Norway,Europe,2007,80.196,4627926,49357.190170,NOR,578
71,Kuwait,Asia,2007,77.588,2505559,47306.989780,KWT,414
113,Singapore,Asia,2007,79.972,4553009,47143.179640,SGP,702
134,United States,Americas,2007,78.242,301139947,42951.653090,USA,840
62,Ireland,Europe,2007,78.885,4109086,40675.996350,IRL,372
...,...,...,...,...,...,...,...,...
52,Guinea-Bissau,Africa,2007,46.388,1472041,579.231743,GNB,624
141,Zimbabwe,Africa,2007,43.487,12311143,469.709298,ZWE,716
17,Burundi,Africa,2007,49.580,8390505,430.070692,BDI,108
74,Liberia,Africa,2007,45.678,3193942,414.507341,LBR,430


## Create data visualization

### Import plotly library

In [7]:
import plotly.express as px

### Create scatter plot

To represent the relationship between two numerical continuous variables:

1. x-axis → `gdpPercap` (Money per Person)
2. y-axis → `lifeExp` (Life Expectancy)

In [16]:
fig_1 = px.scatter(df, x = df['gdpPercap'], y = df['lifeExp'])

In [17]:
fig_1

### Change the color based on the continent

In [19]:
fig_2 = px.scatter(df, x = 'gdpPercap', y = 'lifeExp', color = 'continent')

In [20]:
fig_2

### Change the size based on the population

In [22]:
fig_3 = px.scatter(df, x = 'gdpPercap', y = 'lifeExp', color = 'continent', size = 'pop')

In [23]:
fig_3

### Show the country name as you hover the mouse over the points

> You may use the `hover_name` parameter

In [24]:
fig_4 = px.scatter(df, x = 'gdpPercap', y = 'lifeExp', color = 'continent', size = 'pop', hover_name = 'country')

In [25]:
fig_4

## Data Analysis

### Which observation represents the biggest point?

In [27]:
#because we used the pop as the size parameter, we will get the biggest point by sorting the pop column and slicing the biggest value
biggest_point = df.sort_values('pop', ascending=False).iloc[0]
biggest_point

country            China
continent           Asia
year                2007
lifeExp           72.961
pop           1318683096
gdpPercap    4959.114854
iso_alpha            CHN
iso_num              156
Name: 24, dtype: object

### Which point is the furthest in the x-axis?

In [29]:
furthest_point_x = df.sort_values('gdpPercap').iloc[0]
furthest_point_x

country      Congo, Dem. Rep.
continent              Africa
year                     2007
lifeExp                46.462
pop                  64606759
gdpPercap          277.551859
iso_alpha                 COD
iso_num                   180
Name: 27, dtype: object

### Which point is the furthest in the y-axis?

In [44]:
furthest_point_y = df.sort_values('lifeExp', ascending = False).iloc[0]
furthest_point_y

country            Japan
continent           Asia
year                2007
lifeExp           82.603
pop            127467972
gdpPercap    31656.06806
iso_alpha            JPN
iso_num              392
Name: 66, dtype: object

## Create a tree map

> Understand what a tree map is in [plotly website](https://plotly.com/python/treemaps/).

In [57]:
import plotly.express as px
import numpy as np
df = px.data.gapminder().query("year == 2007")
fig = px.treemap(df, path=[px.Constant("Sabina"), 'continent', 'country'], values='pop',
                  color='lifeExp', hover_data=['iso_alpha'],
                  color_continuous_scale='RdBu',
                  color_continuous_midpoint=np.average(df['lifeExp'], weights=df['pop']))
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()