In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px

In [2]:
!pip install -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
     |████████████████████████████████| 79.9 MB 173 kB/s            
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [3]:
# px.data.gapminder() is a built-in Plotly Express sample database
df = px.data.gapminder().query("year == 2007")
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
11,Afghanistan,Asia,2007,43.828,31889923,974.580338,AFG,4
23,Albania,Europe,2007,76.423,3600523,5937.029526,ALB,8
35,Algeria,Africa,2007,72.301,33333216,6223.367465,DZA,12
47,Angola,Africa,2007,42.731,12420476,4797.231267,AGO,24
59,Argentina,Americas,2007,75.320,40301927,12779.379640,ARG,32
...,...,...,...,...,...,...,...,...
1655,Vietnam,Asia,2007,74.249,85262356,2441.576404,VNM,704
1667,West Bank and Gaza,Asia,2007,73.422,4018332,3025.349798,PSE,275
1679,"Yemen, Rep.",Asia,2007,62.698,22211743,2280.769906,YEM,887
1691,Zambia,Africa,2007,42.384,11746035,1271.211593,ZMB,894


In [4]:
fig = px.sunburst(df, path=['continent', 'country'], values='pop',
                  color='lifeExp', hover_data=['iso_alpha'],
                  color_continuous_scale='RdBu',
                  color_continuous_midpoint=np.average(df['lifeExp'], weights=df['pop']))
# If a color argument is passed, the color of a node is computed as 
# the average of the color values of its children, weighted by their values.
fig.show()

In [5]:
# Get the country & continent database ready
df2 = df[['country','continent']].copy()

In [6]:
#change all the “Americas” to "South America" in the continent column.
df2.loc[df2["continent"] == "Americas", "continent"] = "South America"

In [7]:
# if country is Canada, Mexico, or United States change continent to "North America"
df2.loc[df2['country'] == 'Canada', "continent"] = "North America"
df2.loc[df2['country'] == 'Mexico', "continent"] = "North America"
df2.loc[df2['country'] == 'United States', "continent"] = "North America"

In [8]:
# selecting rows based on condition
df_Canada = df2.loc[df2['country'] == 'Canada']
df_Canada

Unnamed: 0,country,continent
251,Canada,North America


In [9]:
# selecting rows based on condition
df_Mexico = df2.loc[df2['country'] == 'Mexico']
df_Mexico

Unnamed: 0,country,continent
995,Mexico,North America


In [10]:
# selecting rows based on condition
df_US = df2.loc[df2['country'] == 'United States']
df_US

Unnamed: 0,country,continent
1619,United States,North America


In [11]:
dict_num_volcano = {
    "United States": 161,
    "Japan": 122,
    "Indonesia": 121,
    "Russia": 117,
    "Chile": 92,
    "Ethiopia": 53,
    "Papua New Guinea": 47,
    "Philippines": 38,
    "Mexico": 37,
    "Argentina": 36,
    "Ecuador": 35,
    "Iceland": 34,
    "New Zealand": 25,
    "Canada": 24,
    "Guatemala": 23,
    "Tonga": 21,
    "Kenya": 21,
    "El Salvador": 20,
    "France": 20,
    "Antarctica": 19
}   

In [12]:
# create Dataframe from a list of key, value pairs
df3 = pd.DataFrame(list(dict_num_volcano.items()), columns = ['country','num_volcano'])
df3

Unnamed: 0,country,num_volcano
0,United States,161
1,Japan,122
2,Indonesia,121
3,Russia,117
4,Chile,92
5,Ethiopia,53
6,Papua New Guinea,47
7,Philippines,38
8,Mexico,37
9,Argentina,36


In [13]:
# merge inner
df4 = pd.merge(left=df2, right=df3, left_on='country', right_on='country')
# Since 'country' is the only column name in both dataframes, we can skip
# the `left_on` and `right_on` arguments

# What's the size of the output data?
df4.shape
df4

Unnamed: 0,country,continent,num_volcano
0,Argentina,South America,36
1,Canada,North America,24
2,Chile,South America,92
3,Ecuador,South America,35
4,El Salvador,South America,20
5,Ethiopia,Africa,53
6,France,Europe,20
7,Guatemala,South America,23
8,Iceland,Europe,34
9,Indonesia,Asia,121


In [14]:
# df4 is missing 4 countries!  Let's list them
set(df3.country).difference(set(df4.country))

{'Antarctica', 'Papua New Guinea', 'Russia', 'Tonga'}

In [15]:
# NOTE: last_id is not len(df2.index), because some rows are missing
last_id = max(df2.index)

In [16]:
# add the missing rows to the end of the table
df2.loc[last_id + 1] = ['Antarctica', 'Antarctica'] 
df2.loc[last_id + 2] = ['Papua New Guinea', 'Oceania']
df2.loc[last_id + 3] = ['Russia', 'Europe']   # Russia actually spans Asia too.
df2.loc[last_id + 4] = ['Tonga', 'Oceania'] 

In [17]:
df2.tail(10)

Unnamed: 0,country,continent
1643,Venezuela,South America
1655,Vietnam,Asia
1667,West Bank and Gaza,Asia
1679,"Yemen, Rep.",Asia
1691,Zambia,Africa
1703,Zimbabwe,Africa
1704,Antarctica,Antarctica
1705,Papua New Guinea,Oceania
1706,Russia,Europe
1707,Tonga,Oceania


In [18]:
# merge inner
df4 = pd.merge(left=df2, right=df3, left_on='country', right_on='country')

# What's the size of the output data?
df4.shape
df4


Unnamed: 0,country,continent,num_volcano
0,Argentina,South America,36
1,Canada,North America,24
2,Chile,South America,92
3,Ecuador,South America,35
4,El Salvador,South America,20
5,Ethiopia,Africa,53
6,France,Europe,20
7,Guatemala,South America,23
8,Iceland,Europe,34
9,Indonesia,Asia,121


In [19]:
fig = px.sunburst(df4, path=['continent', 'country'], values='num_volcano',
                  title="Countries with the most volcanos 🌋(erupting in the last 12,000 years)")
# The volcano emoji will show up in the chart while in Jupyter notebooks.  
# However, it will not appear in the exported .png file

fig.update_layout(
    title_font_family="Times New Roman",
    title_font_color="red",
    title_x=0.5
)

# If a color argument is passed, the color of a node is computed as 
# the average of the color values of its children, weighted by their values.
fig.show()

In [20]:
fig.write_image("/kaggle/working/fig1.png")