# Visualize Internet Access for 39 Countries in the years 2005..2019
Values represent % of households in the country having internet access.

In [None]:
# packages

# standard
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# plots
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Import and data preparation

In [None]:
# read the raw CSV export into a data frame
df = pd.read_csv(
    filepath_or_buffer='../input/internet-access-oecd/DP_LIVE_25112020170111306.csv',
)

In [None]:
# first glance: summary statistics for every column
# (include='all' also covers the non-numeric columns)
df.describe(include='all')

In [None]:
# INDICATOR, SUBJECT, MEASURE and FREQUENCY each hold only one unique value,
# so they carry no information and can safely be removed
df = df.drop(columns=['INDICATOR', 'SUBJECT', 'MEASURE', 'FREQUENCY'])

In [None]:
# Flag Codes do not seem to add much value either:
# print how often each code occurs, then remove the column
print(df['Flag Codes'].value_counts())
df = df.drop(columns='Flag Codes')

### Now we have a compact and clean dataset:

In [None]:
# display the data frame as the cell output
df

### There is still one caveat: not all countries have values for every year from 2005 to 2019!

In [None]:
# a quick check: count the rows per country and plot the counts as bars
fig = plt.figure(figsize=(12, 5))
df['LOCATION'].value_counts().plot.bar()
plt.grid()
plt.show()

In [None]:
# collect the countries that have fewer than 15 rows, i.e. an incomplete history
incomplete = (
    df['LOCATION']
    .value_counts()
    .loc[lambda counts: counts < 15]
    .index
    .tolist()
)
print('Countries having incomplete history:')
print(incomplete)

In [None]:
# evaluate first/last available year and the number of rows for all countries;
# the aggregations are given by name because passing the builtins min/max
# to agg is deprecated in recent pandas versions (FutureWarning)
year_stats = df.groupby('LOCATION', as_index=False).agg(
    min_year=pd.NamedAgg(column='TIME', aggfunc='min'),
    max_year=pd.NamedAgg(column='TIME', aggfunc='max'),
    n_years=pd.NamedAgg(column='TIME', aggfunc='count'),
)

# and show only those that do not have values for all years, i.e. 2005..2019
year_stats[year_stats['LOCATION'].isin(incomplete)]

### It is interesting to note that Iceland and Turkey have data for the first year (2005) as well as for the last year (2019), but their history is nevertheless incomplete because values are missing for some of the years in between!

# Status as of 2019

In [None]:
# keep only the rows of the most recent year (2019)
df_2019 = df.loc[df['TIME'] == 2019]

In [None]:
# bar chart of the 2019 values, one bar per country
fig = plt.figure(figsize=(15, 5))
sns.barplot(data=df_2019, x='LOCATION', y='Value')
plt.title('Status 2019')
plt.grid()
plt.show()

In [None]:
# the same chart again, with the countries ordered from highest to lowest value
df_2019_sort = df.loc[df['TIME'] == 2019].sort_values('Value', ascending=False)

fig = plt.figure(figsize=(15, 5))
sns.barplot(data=df_2019_sort, x='LOCATION', y='Value')
plt.title('Status 2019 - Sorted')
plt.grid()
plt.show()

# Interactive plot of development by country

In [None]:
# interactive plotly line chart: one coloured line per country over the years
fig = px.line(
    data_frame=df,
    x='TIME',
    y='Value',
    color='LOCATION',
    title='Development for all countries',
)
fig.show()

### Please note that you can isolate an individual country by double-clicking on its entry in the legend.

In [None]:
# the same kind of chart, restricted to the countries with INCOMPLETE values
fig = px.line(
    data_frame=df.loc[df['LOCATION'].isin(incomplete)],
    x='TIME',
    y='Value',
    color='LOCATION',
    title='Development for countries having incomplete statistics',
)
fig.show()

# Compare development over 10 years

In [None]:
# rows of the year 2009, i.e. ten years before 2019
df_2009 = df.loc[df['TIME'] == 2009]

# countries that have a row in 2009 as well as in 2019
countries_2009 = set(df_2009['LOCATION'])
countries_2019 = set(df_2019['LOCATION'])
common_countries = countries_2009 & countries_2019

# restrict both years to those countries and renumber the rows from 0
df1 = df_2009.loc[df_2009['LOCATION'].isin(common_countries)].reset_index(drop=True)
df2 = df_2019.loc[df_2019['LOCATION'].isin(common_countries)].reset_index(drop=True)

In [None]:
# and combine to one nice data frame; the 2019 values are joined on LOCATION
# (not attached by row position), so every country is paired with its own
# value even if df1 and df2 list the countries in a different order
df_compare = df1.drop(columns='TIME').rename(columns={'Value': 'Value2009'})
df_compare = df_compare.merge(
    df2[['LOCATION', 'Value']].rename(columns={'Value': 'Value2019'}),
    on='LOCATION',
    how='inner',
    validate='one_to_one',  # fail loudly if a country occurs twice in one year
)
# change in percentage points between 2009 and 2019
df_compare['Diff_10y'] = df_compare['Value2019'] - df_compare['Value2009']
df_compare.head()

In [None]:
# scatter plot of the 2019 value against the 2009 value, one point per country
fig = px.scatter(
    data_frame=df_compare,
    x='Value2009',
    y='Value2019',
    color='LOCATION',
    title='2019 vs 2009',
)
fig.show()

In [None]:
# Pearson and Spearman correlation between the 2009 and the 2019 values
cor_pearson = df_compare['Value2009'].corr(df_compare['Value2019'], method='pearson')
cor_spearman = df_compare['Value2009'].corr(df_compare['Value2019'], method='spearman')

# report both coefficients rounded to four decimals
print('Correlation 2019 vs 2009 - Pearson  : ', np.round(cor_pearson, 4))
print('Correlation 2019 vs 2009 - Spearman : ', np.round(cor_spearman, 4))

In [None]:
# bar chart of the change between 2009 and 2019, one bar per country
fig = plt.figure(figsize=(15, 5))
sns.barplot(data=df_compare, x='LOCATION', y='Diff_10y')
plt.title('Ten year development')
plt.grid()
plt.show()

In [None]:
# the same chart again, with the countries ordered from largest to smallest change
df_compare_sort = df_compare.sort_values('Diff_10y', ascending=False)

fig = plt.figure(figsize=(15, 5))
sns.barplot(data=df_compare_sort, x='LOCATION', y='Diff_10y')
plt.title('Ten year development - Sorted')
plt.grid()
plt.show()

# Look at an individual country

In [None]:
# country code to inspect
sel_country = 'CRI'

# rows of the selected country, explicitly put in chronological order so that
# diff() below does not depend on the row order of the input file
df_sel = df[df.LOCATION == sel_country].sort_values('TIME').copy()
# change versus the previous available year; where the country has gaps in
# its history this difference spans more than one year
df_sel['Value_Diff'] = df_sel['Value'].diff()

In [None]:
# scatter plot of the selected country's value per year
fig = px.scatter(
    data_frame=df_sel,
    x='TIME',
    y='Value',
    title=f'Development {sel_country}',
)
fig.show()

In [None]:
# scatter plot of the selected country's change versus the previous row
fig = px.scatter(
    data_frame=df_sel,
    x='TIME',
    y='Value_Diff',
    title=f'Incremental Development {sel_country}',
)
fig.show()