# DataViz workflow

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Location of the raw CSV export (Colab upload directory)
file = '/content/List of Countries by number of Internet Users - Sheet1.csv'

# Read the table into a DataFrame, then preview its first ten rows
internet = pd.read_csv(filepath_or_buffer=file)
internet.head(n=10)

Unnamed: 0,Country or Area,Internet Users,Population,Rank,Percentage,Rank_pct
0,China,765367947,1409517397,1,0.54,116
1,India,461347554,1339180127,2,0.34,145
2,United States,244090854,324459463,3,0.75,68
3,Brazil,141206801,209288278,4,0.67,83
4,Japan,115845120,127484450,5,0.91,23
5,Russia,109446612,143989754,6,0.76,64
6,Indonesia,85242816,263991379,7,0.32,150
7,Mexico,82470752,129163276,8,0.64,92
8,Germany,69304405,82114224,9,0.84,36
9,Philippines,63003313,104918090,10,0.6,103


In [None]:
# Summarise each column's dtype and non-null count for the loaded frame
internet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Country or Area  215 non-null    object
 1   Internet Users   215 non-null    object
 2   Population       215 non-null    object
 3   Rank             215 non-null    int64 
 4   Percentage       215 non-null    object
 5   Rank_pct         215 non-null    object
dtypes: int64(1), object(5)
memory usage: 10.2+ KB


In [None]:
# Dimensions of the frame as a (rows, columns) tuple
internet.shape

(215, 6)

In [None]:
# Count rows whose country name already appeared in an earlier row
internet.duplicated(subset='Country or Area').sum()

0

In [None]:
# Render floats with thousands separators and two decimals from here on
pd.set_option('display.float_format', '{:,.2f}'.format)

# Summary statistics of the numeric columns, rounded to three decimals
internet.describe().round(decimals=3)

Unnamed: 0,Internet Users,Population,Rank,Percentage,Rank_pct
count,205.0,205.0,205.0,205.0,205.0
mean,17869447.64,36800536.98,103.08,0.55,105.44
std,66550740.43,140517057.62,59.46,0.28,60.63
min,5520.0,11192.0,1.0,0.01,1.0
25%,448260.0,1309632.0,52.0,0.3,54.0
50%,2811056.0,7364883.0,103.0,0.59,105.0
75%,9521056.0,24450561.0,154.0,0.79,158.0
max,765367947.0,1409517397.0,209.0,0.99,209.0


In [None]:
# Pairwise correlation of the numeric columns, shaded from cool (negative)
# to warm (positive).
# select_dtypes keeps the text column 'Country or Area' out of corr():
# pandas >= 2.0 no longer drops non-numeric columns silently and raises instead.
internet.select_dtypes(include='number').corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,Internet Users,Population,Rank,Percentage,Rank_pct
Internet Users,1.0,0.959814,-0.377877,0.035129,-0.049747
Population,0.959814,1.0,-0.350031,-0.055248,-0.015487
Rank,-0.377877,-0.350031,1.0,-0.127292,0.29533
Percentage,0.035129,-0.055248,-0.127292,1.0,-0.372852
Rank_pct,-0.049747,-0.015487,0.29533,-0.372852,1.0


In [None]:
# Plot box plots
# NOTE: the boxes are drawn from the describe() summary values (mean, std,
# min, quartiles, max) of each numeric column, not from the raw observations.

# One row per column, one column per statistic; 'count' is dropped because
# it is a row tally rather than a value on the column's own scale.
descr = internet.describe().T.reset_index().drop('count', axis=1)

# Reshape to long form: (index=<column name>, vals=<statistic>, value=<number>).
# value_vars gets the statistic labels explicitly (everything after 'index').
descr = pd.melt(descr,
                id_vars='index',
                value_vars=list(descr.columns[1:]),
                var_name='vals')

px.box(descr, y='index', x='value', color='index')

In [None]:
# Summary statistics of the numeric columns, without rounding
internet.describe()

Unnamed: 0,Internet Users,Population,Rank,Percentage,Rank_pct
count,205.0,205.0,205.0,205.0,205.0
mean,17869450.0,36800540.0,103.082927,0.547317,105.439024
std,66550740.0,140517100.0,59.464258,0.283598,60.633037
min,5520.0,11192.0,1.0,0.01,1.0
25%,448260.0,1309632.0,52.0,0.3,54.0
50%,2811056.0,7364883.0,103.0,0.59,105.0
75%,9521056.0,24450560.0,154.0,0.79,158.0
max,765367900.0,1409517000.0,209.0,0.99,209.0


### Missing Data

In [None]:
# Number of missing (NaN/None) entries in every column
internet.isna().sum()

Country or Area    0
Internet Users     0
Population         0
Rank               0
Percentage         0
Rank_pct           0
dtype: int64

In [None]:
import re

# Check for non-numeric values
# pd.to_numeric(errors='coerce') turns every entry that cannot be parsed as a
# number (e.g. '41.03% (2012)') into NaN, so the resulting mask is True exactly
# for the rows that would break a later cast to float. Unlike a regex on the
# leading digit, this accepts any valid number (including 1.0) and also works
# when the column already holds floats, so the cell can be re-run safely.
non_number = pd.to_numeric(internet['Percentage'], errors='coerce').isna()

# Show the offending rows
internet[non_number]

Unnamed: 0,Country or Area,Internet Users,Population,Rank,Percentage,Rank_pct
197,Jersey,38958,165314,198,41.03% (2012),-
201,Gibraltar,32494,34571,202,94.44% (2016),-
205,British Virgin Islands,14456,31196,206,37.60% (2012),141
206,Anguilla,12043,14909,207,81.57% (2016),44
209,Saint Helena,2906,4534,210,37.60% (2012),-
210,Falkland Islands,2881,2910,211,99.02% (2016),-
211,Montserrat,2833,5177,212,54.55% (2013),115
212,Wallis and Futuna,1383,11773,213,8.95% (2012),-
213,Niue,1034,1618,214,86.90% (2013),30
214,Ascension,361,806,215,41.03% (2012),-


In [None]:
# Drop those values
# Index labels of the rows flagged as non-numeric. Dropping by label with
# errors='ignore' keeps the cell re-runnable: labels that are already gone
# are skipped instead of raising.
flagged_labels = non_number[non_number].index
internet = internet.drop(index=flagged_labels, errors='ignore')

# The raw file loads 'Internet Users' and 'Population' as text (object dtype),
# but the plots further down need numbers.
# NOTE(review): presumably the text form comes from thousands separators in
# the CSV; commas are stripped before parsing -- confirm against the raw file.
for count_col in ('Internet Users', 'Population'):
    digits = internet[count_col].astype(str).str.replace(',', '', regex=False)
    internet[count_col] = pd.to_numeric(digits)

# Assign correct value type
internet = internet.astype({'Percentage': float,
                            'Rank_pct': int})

In [None]:
# Re-check dtypes and row count after dropping the non-numeric rows and casting
internet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205 entries, 0 to 208
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country or Area  205 non-null    object 
 1   Internet Users   205 non-null    int64  
 2   Population       205 non-null    int64  
 3   Rank             205 non-null    int64  
 4   Percentage       205 non-null    float64
 5   Rank_pct         205 non-null    int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 11.2+ KB


### Relation Plots

In [None]:
# Internet users (y) against population (x).
# Marker size follows population, marker colour follows the user count,
# and an ordinary-least-squares trend line is overlaid.
px.scatter(
    data_frame=internet,
    x='Population',
    y='Internet Users',
    color='Internet Users',
    size='Population',
    hover_data=['Country or Area'],
    trendline="ols",
    title="Correlation between Population Size and Internet Users",
)

In [None]:
# Pop size vs share of the population using the internet
# Marker size follows population, marker colour follows the absolute number
# of users, and an ordinary-least-squares trend line is overlaid.
px.scatter(internet,
           x='Population', y='Percentage',
           size='Population',
           trendline="ols",
           title="Population vs Percentage of it who are Internet Users",
           color='Internet Users',
           # Show the country on hover, as in the Population vs Internet Users plot
           hover_data=['Country or Area'])

In [None]:
# Import Folium for plot
import folium

# Add population in millions column for better plotting
internet['pop_millions'] = internet.Population.div(1_000_000)

# Bring lat long by country (first HTML table on the page; the code below
# relies on its `name`, `latitude` and `longitude` columns)
lat_long = pd.read_html('https://developers.google.com/public-data/docs/canonical/countries_csv')[0]
lat_long = lat_long.reset_index()

# Correct Country names to match.
# The result is assigned back to the column: Series.replace(inplace=True) on a
# column accessor is chained assignment and does not reliably update the frame.
lat_long['name'] = lat_long['name'].replace({'Myanmar [Burma]': 'Myanmar',
                                             "Côte d'Ivoire": 'Ivory Coast',
                                             "Congo [Republic]": 'Republic of the Congo',
                                             "Palestinian Territories": 'Palestinian Authority',
                                             "Macedonia [FYROM]": 'Macedonia',
                                             "Timor-Leste": 'Timor Leste',
                                             "Cape Verde": 'Cabo Verde'})

# NOTE(review): the first two entries relabel South Sudan as 'Sudan' and the
# Democratic Republic of the Congo as 'Republic of the Congo'. If 'Sudan' or
# 'Republic of the Congo' are also present in the data, two different countries
# end up sharing one label and the choropleth can show the wrong value for
# them -- confirm whether a dedicated name mapping is available instead.
internet['Country or Area'] = internet['Country or Area'].replace(
    {'South Sudan': 'Sudan',
     'Democratic Republic of the Congo': 'Republic of the Congo',
     'The Gambia': 'Gambia',
     'The Bahamas': 'Bahamas',
     'Micronesia, Federated States of': 'Micronesia'})

# json file for the world map
map_geo = '/content/globe.geo.json'
# Create the base map
m = folium.Map(location=[50, 0], zoom_start=3)

# Create Choropleth layer
folium.Choropleth(
    geo_data=map_geo,
    name='choropleth',
    data=internet,
    columns=['Country or Area', 'Percentage'],
    key_on='feature.properties.name',
    fill_color='Greens',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Internet Users(%)').add_to(m)

# Add Bubbles for the size of the country.
# Coordinates are attached with a single merge instead of querying both frames
# once per country: every row keeps its own population, and countries without
# usable coordinates are reported and skipped rather than raising IndexError.
coords = (lat_long
          .dropna(subset=['latitude', 'longitude'])
          .drop_duplicates(subset='name')
          [['name', 'latitude', 'longitude']])
bubbles = internet.merge(coords, left_on='Country or Area', right_on='name', how='inner')

skipped = sorted(set(internet['Country or Area']) - set(coords['name']))
if skipped:
    print('No coordinates found, bubble skipped for:', ', '.join(skipped))

# One circle per country; radius is the population divided by 25 million
for lat, lon, population in zip(bubbles['latitude'], bubbles['longitude'], bubbles['Population']):
    folium.CircleMarker(location=[float(lat), float(lon)],
                        radius=float(population) / 25000000,
                        color='blue',
                        fill=True,
                        fill_color='lightblue').add_to(m)

folium.LayerControl().add_to(m)

# View
m