In [None]:
# ancillary utility modules
import dataUtls  # source in repository
import osOps  # see github.com/romstroller/FileTools

In [None]:
import importlib

dataUtls = importlib.reload( dataUtls )
osOps = importlib.reload( osOps )

### OBJECTIVE
This report is a broad exploration of World Factbook data from the US Central 
Intelligence Agency, seeking to provide general point-of-interest insights on 
key datapoints in areas including health, gender, economy and environment, both 
directly from the sample and deriving new features through feature-maxima bar 
plots ("top tens"), and correlation scatter plots. 

The report implements data processing techniques including expression matching, 
correlation analysis, and z-score and probability density analysis.  

**Some example datapoints...**
- What countries have, at the same time, the most adults overweight and the 
most children underweight?
- 2
- 3
- 4

### DATA: acquisition, cleaning and feature engineering

In [None]:
# Authenticate against Kaggle, then download and load the Factbook dataset
# into the original (unprocessed) dataframe.
osKit = osOps.OsKit()
dfOR = osKit.getKaggleSet( 'lucafrance', 'the-world-factbook-by-cia' )

# Report the count of columns per data type in the loaded frame.
typeCounts = dataUtls.typeCount( dfOR )
print( f"Data type-count: {typeCounts}" )

#### State of information

- *[The World Factbook by CIA](https://www.kaggle.com/datasets/lucafrance/the-world-factbook-by-cia)
 (owner: Luca Franceschini, available at Kaggle.com)*

The objective concerns only the features which have a primary value that can be
included in numeric operations. The dataset is documented to have been acquired
by browser extraction, and contains a substantial amount of noise or extraneous 
information, in inconsistent format. Pattern matching is further complicated by 
the presence of historical data in the same cell, and variations of scale within 
many features.

Processing extracts the primary numeric value to float-type, and omits features
with null values beyond a minimal threshold (initially, the dataset is 74% 
null). For clarity, source code for cleaning operations is in the ancillary 
module. Sufficient processing has been undertaken only to serve observations
about the features included in the analysis. Substring segment frequencies are
analysed to derive feature unit. 

In [None]:
# Stage 1: regex-match the distinct numbers in each cell into a dictionary.
dfFbDict = dataUtls.generateMatchDct( dfOR )

# Stage 2: isolate a clean float from the number-match records.
# NOTE(review): the return value is unused, so this presumably updates
# dfFbDict in place - confirm in dataUtls.
dataUtls.isolateClean( dfFbDict )

# Stage 3: rebuild a dataframe from the cleaned feature data.
dfFloat = dataUtls.getCleanDF( dfFbDict, dfOR )

# Stage 4: keep features meeting the non-NaN threshold
# (average density + 0.5 standard deviations, rounded); convert to numeric.
dfColsClean = dataUtls.getNumericNonNan( dfFloat )

# Stage 5: compare the loaded frame against the cleaned frame.
cleanSummary = dataUtls.cleanReport( dfOR, dfColsClean, 'loaded', 'clean' )
print( cleanSummary )

In [None]:
# The manual scale analysis (confirm each feature has a valid primary number,
# mark differences of scale) was run once and its results were pickled:
# dataUtls.runScaleAnalysis(dfColsClean, cleanReman)

# Load the completed drop list and scale notes from that analysis.
dropFeats, scaleNotes = dataUtls.unPklData( 'dropFeatrs', 'cleanNotes' )

# Remove the features flagged for dropping.
dfDropped = dataUtls.omitDropped( dfColsClean, dropFeats )

# Bring scale-variant values onto one unified scale per feature.
df, cleanCountries = dataUtls.flattenScale(
    dfDropped, dfFbDict, scaleNotes, dfOR, dropFeats )

# Remove the summative 'World' observation so it cannot dominate the maxima.
df, popRs = dataUtls.popRows_byFtVal( df, 'Country', [ 'World' ] )

# Numericise the country column and obtain the country-code reference dict.
dfN, ctrDct = dataUtls.numCtry( df )

# Compare the cleaned frame against the numericised frame.
numericSummary = dataUtls.cleanReport(
    dfColsClean, dfN, 'clean', 'numericised' )
print( numericSummary )

# "df" (country names restored) serves the maxima/minima analysis, while
# dfN (numeric country codes) is retained for the statistical functions.
df = dfN.copy()
df[ 'Country' ] = df[ 'Country' ].replace( ctrDct )

In [None]:
# Re-import osOps so that edits made to the module during development are
# picked up (the returned module object is not rebound here).
importlib.reload(osOps)

# One-off generation of the unit dictionary, kept for reference; its pickled
# result is loaded in the next cell instead of being recomputed.
# identify candidate units from substring frequency in feature data
# fts = [ f for f in dfN.columns if f != 'Country' ]
# unitDct = dataUtls.generateUnitDct( fts, dfOR )
# untDPath = osOps.OsKit().storePKL( unitDct, 'unitDct',  )

In [None]:
# Load the pickled unit dictionary; the loader's result is indexed by the
# pickle's file name to reach the stored object.
unitPklName = 'unitDct.pkl'
unitDctPkl = osOps.OsKit().unPklData( unitPklName )
unitDct = unitDctPkl[ unitPklName ]

In [None]:
# Print the leading unit candidates for every feature except 'Country'.
# Each entry of unitDct[ft] is indexed as entry[0] to obtain the unit string
# (assumes a sequence of (substring, ...) records - TODO confirm in dataUtls).
fts = [ f for f in dfN.columns if f != 'Country' ]
for ft in fts:
    units = unitDct[ ft ]
    if units:
        print( ft )
        # Show at most the two leading candidates. Slicing avoids the
        # IndexError that fixed [0]/[1] indexing raises when a feature has
        # only a single candidate.
        for unit in units[ :2 ]:
            print( f"SCALE  [ {unit[0].strip()} ]" )
    else:
        # An empty/falsy entry means no string data was found for the feature.
        print( f"NONSTRING COL FOR FEAT: {ft} " )
    print()
    

In [None]:
# Inspect the unit candidates recorded for one sample feature.
unitDct['People and Society: Age structure - 55-64 years']

# TODO - planned clean-up of candidate unit strings, applied iteratively:
#   strip an opening bracket and anything following it
#   remove any leading numbers
#   

#### OUTLIER ANALYSIS
Rather than clipping or smoothing as in many machine learning purposes, this 
report is especially interested in retaining and presenting genuine outliers 
(the sample maximums and minimums for various features). Here, outlier analysis 
is used to determine that outliers are not obviously the result of errors in 
the data.

For example, here we confirm that the highly differentiated probability density
of the 'Geography: Area - total' feature (strongly skewed toward the minimum, 
with many larger countries of singular size creating a "platykurtic" or very
flattened upper tail ) corresponds to observations described in the original 
data ("outliers" Russia and Antarctica are simply big!)

In [None]:
# Local shorthands for the outlier tools in dataUtls:
#   zThresh  - show the observations above a given z-score
#   pDensity - display a feature's probability distribution with a fit
zThresh, pDensity = dataUtls.getDF_ZThresh, dataUtls.showPDens

In [None]:
areaFeat = 'Geography: Area - total'

# Visual check of the distribution against a Gaussian fit.
pDensity( dfN, areaFeat )

# List the observations beyond a z-score of 2.5, with their original values.
zThresh( dfN, areaFeat, 2.5, dfOR, ctrDct )

In [None]:
# ctrDct[1]
# ctrDct.i['Albania']
# ctrDct.i['Albania'][0]

By contrast, though still with a minimum-skewed distribution: in **Military and 
Security: Military expenditures**, North Korea's dramatic, isolated prominence 
at the "most militarized" end turns out to be the result of an extraction error. 
It does still remain the world leader for the period, at between 20-25% of GDP.

In [None]:
ft = 'Military and Security: Military expenditures'
pDensity( dfN, ft )

# Fetch the observations above the z-score threshold (ret=True returns the
# frame) and quote the original, unparsed cell text of the first one.
z = zThresh( dfN, ft, 0.228, dfOR, ctrDct, ret=True )
originalValues = [ value for _, value in z.orVal.items() ]
print( f"\"{originalValues[ 0 ]}\"" )

In [None]:
# # Random cycle though feature PD distributions
# fts = dfN.columns.to_list()
# pos = random.randint(1, len(fts) )
# pDensity( dfN, fts[pos] )
# zThresh( dfN, fts[pos], 2.5, dfOR, ctrDct )

# # zScore and g-fit with outliers excluded
# dfZ_area_e = zThresh( dfN, tArea, 2.5, dfOR, ctrDct, excl=True )
# pDensity( dfZ_area_e, 'E_VAL' )

### Exploration, Visualisation and Analysis
Reporting is broadly categorized into observations on:
1. Age, health and gender
2. Coal, Energy and Pollution
3. Economy
4. Geography and Environment

In [None]:
# dataUtls = importlib.reload( dataUtls )

# Local shorthands for the dataUtls tools used in the rest of the report.

# -- plotting
# barplot for <=10 feature-max/min countries
showMax = dataUtls.showMaxima
# scatterplot distribution for a feature pair
pltSctr = dataUtls.plotScttr

# -- look-ups
# return a country's rank by value for a feature
getRank = dataUtls.getRank
# return the value for a country for a feature
getVal = dataUtls.getVal

# -- correlations
# generate the master correlation dict over all features
getCorDct = dataUtls.getCorDct
# get the dict of correlations at a threshold
getCTDict = dataUtls.getCThreshDct
# report correlations-at-threshold
getTRep = dataUtls.getThreshReport
# report the correlation for a feature pair
repCorr = dataUtls.repCorrel
# for a correlation-threshold dict, omit NaN-only diffs
difsFld = dataUtls.showDiffsFilled
# get the frozenset key from feature names
fSet = dataUtls.fSetFromFeatures

#### Feature maxima
The highest and lowest values for features compiled by the CIA's World Factbook 
provide a convenient global view on important aspects of humanity and the 
environment. Some caution is necessary before interpreting these summaries, 
because the time periods are conflated: the CIA resource gives each country's
latest reporting year, and those years can differ by a decade or more.

#### Correlations and scatterplots
Perfect correlations (a coefficient of -1 or 1) are rarely informative, in that 
they identify effectively identical, or duplicate, features. For this reason, 
they can be useful for reducing unnecessary dimensionality in large datasets.
 
Strong correlations will tend to be more self-evident (for example, "Total 
area" being near-perfectly correlated to "Total land", with some noise caused 
by variable water-area), but still provide an empirical, observational basis 
for testing assumptions. Scatter plots are useful for identifying or 
demonstrating where a clear pattern, like a linear correlation, is present, and 
also for an alternative view on observations made with different techniques.

***

In [None]:
# Build the master correlation dictionary across all features of df.
# Later cells index it by fSet(...) keys and pop confirmed duplicates from it.
corrDct = getCorDct( df )

In [None]:
# Feature pairs correlated at exactly 1 were checked and confirmed to be
# duplicates: report each pair, then remove it from the master dict.
# (perfC and i keep their names because the "=" f-string specifier echoes
# the expression text into the printed report.)
perfC = getCTDict( 1, corrDct, df )
for i in perfC:
    print( f"{perfC[i]['corr']=}:" )
    print( f"{perfC[i]['baseName']=}" )
    print( f"{perfC[i]['compName']=}\n" )
    corrDct.pop( i )

print( 'Dropped duplicate features' )

**Some near-perfect correlations:** who'd have thought having more land
was associated with bigger countries?

In [None]:
# Near-perfect correlations are likely still duplicates or self-evident
# relationships; collect them at the 0.998 threshold and display the pairs
# whose differences are not NaN-only.
nearPerfectThreshold = 0.998
thresh_9_9_8 = getCTDict( nearPerfectThreshold, corrDct, df )
checkDifs_9_9_8 = difsFld( thresh_9_9_8, df )

In [None]:
# print a particular correlation report:
# print( f"{getTRep( thresh_9_9_9, fSet(10, 76) )}" )
# [20, 44](c-0.82): higher youth dependency, lower life expectancy at birth 

# For threshold 0.995-0.999
# thresh_995_999 = getCTDict( 0.800, corrDct, df, out_lim=0.850 )
# checkDifs_995_998 = difsFld( thresh_995_999, df )

# # T10 cycle
# start = 167
# cycleT10(  df, start, 1 )
# zThresh( df, totArea, 2.5, dfOR, ctrDct )
# start +=1

***
#### AGE, HEALTH AND GENDER

- Neat curvilinear distribution between **percent pop. over 64** and
**percent population under 15** showing a clear negative relationship.
- Likewise, in a positive trend, the strong association of **higher birthrate**
with **bigger proportion of 15-24yo's** rapidly decreases after the birthrate 
hits around 20%; after this point, that segment is relatively stable for higher 
birthrates. 

In [None]:
# Feature pairs for the two age-structure scatterplots.
elderlyShareFt = 'People and Society: Age structure - 65 years and over'
childShareFt = 'People and Society: Age structure - 0-14 years'
birthRateFt = 'People and Society: Birth rate'
youthShareFt = 'People and Society: Age structure - 15-24 years'

# Share aged 65+ against share aged 0-14.
pltSctr( df, [ elderlyShareFt, childShareFt ] )

# Birth rate against share aged 15-24.
pltSctr( df, [ birthRateFt, youthShareFt ] )

***
- Gulf countries' **sex ratio weighting** toward men is startling by the
global distribution, particularly Qatar & the United Arab Emirates.
Note we can see strong support for 
[Fisher's principle](https://en.wikipedia.org/wiki/Fisher%27s_principle): sex 
ratio is leptokurtic, closely gathered around the one-to-one ratio.

In [None]:
# Total-population sex ratio: list the observations above a z-score of 0.8
# (a deliberately low threshold, so more than just the extreme cases show),
# then display the probability distribution with its fit.
ft = 'People and Society: Sex ratio - total population'
zThresh( dfN, ft, 0.8, dfOR, ctrDct )
pDensity( dfN, ft )

At the same time, both have (by good measure) the highest proportion of
total population that is 25-54 years old.

In [None]:
# Bar plot of the countries with the largest share of population aged 25-54.
showMax( 'People and Society: Age structure - 25-54 years', df )

Interesting combination - one can imagine a dominating social discourse might 
concern the relation of older men to working-age adults.
***
Some more points on **aging populations:** 

- in Palau and North Korea, there are more than twice as many old women as 
old men.
- with older populations, the rate of urbanization decreases
- with older populations, the number of physicians, and of broadband 
subscriptions, increases 

In [None]:
# Lowest sex ratios among the 65-and-over population (asc=True shows the
# low end of the ranking).
showMax( 'People and Society: Sex ratio - 65 years and over', df, asc=True )

# Share of population aged 55-64 against the rate of urbanization.
# (pltSctr is already bound in the tools cell; the stray debug re-binding that
# used to sit here was a no-op and has been removed.)
pltSctr( df, [
    'People and Society: Age structure - 55-64 years',
    'People and Society: Urbanization - rate of urbanization' ] )

# Share aged 55-64 against broadband subscriptions, with median age against
# physician density supplied as the second feature pair (fts2).
# Plain string literals: none of these contain placeholders, so the f-prefix
# was unnecessary.
pltSctr( df, [
    'People and Society: Age structure - 55-64 years',
    'Communications: Broadband - fixed subscriptions - '
    'subscriptions per 100 inhabitants' ], fts2=[
    'People and Society: Median age - total',
    'People and Society: Physicians density' ] )

***
- Some surprise might arise from the data on **Health Expenditure** - despite the 
highest costs for *individuals* in the OECD (see 
[here](https://en.wikipedia.org/wiki/Health_care_prices_in_the_United_States) 
for example), the US Government is spending more on health than the rest of the 
world (Tuvalu excepted. Along with other island states in this top-ten, 
expenditure proportions might be considered less significant given the 
susceptibility of comparably small budgets to weighting.)

In [None]:
# Current Health Expenditure: the US appears near the top, which is
# surprising given the impression created by private health costs.
ft = 'People and Society: Current Health Expenditure'
showMax( ft, df )
# Only one country in the top-ten for expenditure is also in the top-ten for
# physician density (Cuba).

# Observations beyond a z-score of 2.5 for the same feature.
zThresh( dfN, ft, 2.5, dfOR, ctrDct )

***
- Highly interesting to see Monaco leading the top-ten for **Physician 
density** while at the same time trailing in the very bottom for **Health 
expenditure**. It could only be surmised that citizens are privately funding 
most of their healthcare. 
- There are surprisingly **zero nations from the Anglosphere** here; 
more data would be needed to probe the following question: with fewer 
physicians per individual, is there a measurable failure of preventative 
care that might have been afforded by the access and familiarity that 
community-embedded physicians might provide?

In [None]:
physicianDensityFt = 'People and Society: Physicians density'
healthSpendFt = 'People and Society: Current Health Expenditure'

# Highest physician density, then the lowest health expenditure (asc=True).
showMax( physicianDensityFt, df )
showMax( healthSpendFt, df, asc=True )

***
- Southern African nations exclusively form the t10 **percent of population 
living with HIV/AIDs**

In [None]:
# Derive HIV/AIDS prevalence as a share of total population.
# The people-to-population ratio is a fraction in [0, 1]; it is scaled by 100
# so that the stored values match the "percentage" named in the column label.
hivPercentFt = 'People living with HIV/AIDs as percentage of population'
df[ hivPercentFt ] = 100 * (
    df[ 'People and Society: HIV/AIDS - people living with HIV/AIDS' ] /
    df[ 'People and Society: Population' ])

# Bar plot of the countries with the highest prevalence.
showMax( hivPercentFt, df )


- What is the **lowest-HIV prevalence in the S.A. high-HIV/AIDS region**?

In [None]:
# Select the African countries whose geographic coordinates carry an "S"
# (southern-latitude) marker, using the original, unparsed dataframe.
isAfrican = dfOR[ "Geography: Map references" ] == "Africa"
# na=False counts a missing coordinate string as "no match" rather than
# propagating NaN into the boolean mask; regex=False matches "S" literally.
isSouthern = dfOR[ "Geography: Geographic coordinates" ].str.contains(
    "S", na=False, regex=False )
sthEquatAfrica = dfOR[ isAfrican & isSouthern ].Country

# Lowest HIV/AIDS prevalence (asc=True) within that southern-African subset.
showMax( 'People living with HIV/AIDs as percentage of population',
    df[ df[ 'Country' ].isin( sthEquatAfrica ) ], asc=True,
    sub="African Nations South of Equator" )

- For the countries that are not islands, Angola - being large, and close to 
the HIV/AIDs epicentre, appears to have some form of strongly inhibiting factor.
A look at recent history identifies a cause for the low prevalence: civil war. 

> The 27-year civil war in Angola, lasting from 1975 until 2002, kept the spread 
> of HIV to a minimum due to large parts of the country being inaccessible to 
> people infected with the virus. During the civil war, individuals from 
> neighboring countries such as Zambia, Botswana, and Zimbabwe (all countries 
> with high prevalence rates of HIV) were also not allowed to come into the 
> country, which played a significant role in controlling the spread of HIV.
> [(source: Wikipedia)](https://en.wikipedia.org/wiki/HIV/AIDS_in_Angola#History)

***
- **Gender and Tobacco usage**: nearly half the people in Nauru and Burma smoke.
However, when limited to females, European nations remain in the t10, while
the Asia-Pacific nations Burma, Kiribati, Timor Leste, PNG and Indonesia 
disappear (the men are the smokers). <br> <br> The Islands pattern is 
strikingly reversed for Nauru, where it is female smoking alone which places it 
at number one, with male smoking at 140th place!

In [None]:
tobaccoFt = 'People and Society: Tobacco use -'

# Nauru's rank for male tobacco use, followed by the top-ten for total,
# male and female tobacco use.
getRank( df, 'Nauru', f'{tobaccoFt} male' )
showMax( f'{tobaccoFt} total', df )
showMax( f'{tobaccoFt} male', df )
showMax( f'{tobaccoFt} female', df )

# No Smoking (lowest total tobacco use):
# showMax( 'People and Society: Tobacco use - total', df, asc=True )



***
- **Generational weight disparity**: where are the highest observations 
for both adult obesity prevalence and children 4 years and under who are 
underweight? <br> <br> These maxima are limited to nations reporting 
**above-mean observations** for both features (total seven).

In [None]:
uFeat = 'People and Society: Children under the age of 5 years underweight'
oFeat = 'People and Society: Obesity - adult prevalence rate'

# Combined score: percent of children underweight plus percent adults obese.
df[ 'Generational weight disparity' ] = df[ uFeat ] + df[ oFeat ]

# Countries at or above the sample mean for BOTH features.
# Each mean is computed once and compared column-wise, instead of filtering
# the frame and recomputing both means for every country. Rows with a NaN in
# either feature compare False and are excluded, as before.
isAboveBothMeans = (
    (df[ uFeat ] >= df[ uFeat ].mean()) &
    (df[ oFeat ] >= df[ oFeat ].mean()))
aboveMeans = df.loc[ isAboveBothMeans, 'Country' ].tolist()

showMax( 'Generational weight disparity',
    df[ df[ 'Country' ].isin( aboveMeans ) ],
    sub="% adults obese + % children underweight, both above-mean" )

***
- **Education disparities** - by a modest yet significant margin, Australia has 
the largest percent total completing tertiary, and the position **holds true for 
women** as much as men.

In [None]:
# Shared prefix of the three school-life-expectancy features; the
# school-completion disparity cell reuses it.
expectFeatStr = "People and Society: School life expectancy (primary to tertiary education) -"

# Top-ten for the total population, then for males and females separately.
showMax( expectFeatStr + ' total', df )
showMax( expectFeatStr + ' male', df )
showMax( expectFeatStr + ' female', df )


***
- **School-completion disparity in the sexes**: Liechtenstein stands out, at 
least in terms of supposed development, having one of the highest GDPs per 
capita in the world.

In [None]:
# Derived feature: male school life expectancy minus female.
ft = 'School completion disparity between sexes'
maleSchoolYears = df[ f'{expectFeatStr} male' ]
femaleSchoolYears = df[ f'{expectFeatStr} female' ]
df[ ft ] = maleSchoolYears - femaleSchoolYears

showMax( ft, df, sub="Average lifetime in education, male minus female" )


***
- Most **emigration**, and **populations in greatest contraction**: two very 
strong categories: either islands (esp. Pacific), or eastern Europe

In [None]:

# Lowest net migration rates and lowest population growth rates
# (asc=True shows the low end of each ranking).
showMax( 'People and Society: Net migration rate', df, asc=True )
showMax( 'People and Society: Population growth rate', df, asc=True )

***
### COAL, ENERGY AND POLLUTION
#### Not a glitch: China's appetite
Just for scale, this is what the production of the top-ten coal producers looks 
like. 

In [None]:
coalProductionFt = 'Energy: Coal - Production'
coalConsumptionFt = 'Energy: Coal - Consumption'

# Top producers, with the observations beyond a z-score of 0.3.
showMax( coalProductionFt, df )
zThresh( dfN, coalProductionFt, 0.3, dfOR, ctrDct )

# Top consumers, with the observations beyond a z-score of 0.3.
showMax( coalConsumptionFt, df )
zThresh( dfN, coalConsumptionFt, 0.3, dfOR, ctrDct )

# TODO compare: Energy: Electricity - Consumption

- For both **Coal production and consumption**, The only nation falling outside 
three standard deviations of the mean, falls outside by around ***fourteen*** 
standard deviations. More astonishingly, China remains a net importer - they 
consume this and more. 
- Between production to consumption, Australia disappears down to 199th in the 
world; quite a feat for the fourth-largest producer, whereas the rest of the 
top ten producers are in the top ten consumers (excepting Kazakhstan, who drops 
out similarly as consumer to 197th).

In [None]:
# TODO: update this to return a per-capita or per-GDP ranking.
coalUseFt = 'Energy: Coal - Consumption'
getRank( df, 'Australia', coalUseFt )
getRank( df, 'Kazakhstan', coalUseFt )

- Some figures in greater detail:

In [None]:
prodFeat = 'Energy: Coal - Production'
consFeat = 'Energy: Coal - Consumption'

# Split each feature's total into China and the rest of the sample.
isChina = df.Country == 'China'
chinaProd = df.loc[ isChina, prodFeat ].sum()
chinaCsmp = df.loc[ isChina, consFeat ].sum()

notChinaProd = df.loc[ ~isChina, prodFeat ].sum()
notChinaCsmp = df.loc[ ~isChina, consFeat ].sum()

# Width of the report field: the longest value AS IT WILL BE PRINTED.
# Measuring str(value) ignores the thousands separators and fixed two
# decimals added by the ",.2f" format, which left the columns misaligned.
pad = max( len( f"{value:,.2f}" )
    for value in ( chinaProd, notChinaProd, chinaCsmp, notChinaCsmp ) )

print( f"PRODUCTION: China's production is "
       f"[ {(chinaProd / notChinaProd):,.2f} ] times that of the rest of world\n"
       f"   [ {chinaProd:>{pad},.2f} ]: China's coal production\n"
       f"   [ {notChinaProd:>{pad},.2f} ]: rest of world combined\n" )
print( f"\nCONSUMPTION: China's consumption is "
       f"[ {(chinaCsmp / notChinaCsmp):,.2f} ] times the size\n"
       f"of the rest of world combined.\n"
       f"   [ {chinaCsmp:>{pad},.2f} ]: China's coal consumption\n"
       f"   [ {notChinaCsmp:>{pad},.2f} ]: rest of world combined\n" )

***
- We can get an image of a country's **relationship with coal** if we look at the 
combined production and imports in ratio to exports. Where this ratio is above 
one, a country has exported above the total produced and imported, meaning it 
has sold reserves. Refining further to only the countries whose coal exports 
are above the world-mean, we can see who has a strong reliance on coal exports. 
<br> <br> In order, the refinement excludes Venezuela, Belarus and Eswatini such that 
Russia, South Africa and the Philippines enter the top "sellers"

In [None]:
coalExportsFt = 'Energy: Coal - Exports'

# Exports relative to everything the country held (production + imports).
# NOTE(review): a zero denominator yields inf/NaN here; an inf row would head
# the ranking - confirm none survives the above-mean filter below.
df[ 'Coal: Exports-to-Total-Holdings ratio' ] = (
    df[ coalExportsFt ] /
    (df[ 'Energy: Coal - Production' ] +
     df[ 'Energy: Coal - Imports' ]))

# Countries exporting at or above the sample-mean export volume. The mean is
# computed once and compared column-wise, instead of being recomputed inside a
# per-country lookup. Rows with NaN exports compare False and are excluded.
isAboveMeanExports = df[ coalExportsFt ] >= df[ coalExportsFt ].mean()
aboveMeans = df.loc[ isAboveMeanExports, 'Country' ].tolist()

showMax( 'Coal: Exports-to-Total-Holdings ratio',
    df[ df[ 'Country' ].isin( aboveMeans ) ] )

***
Naturally, on the topic of coal, a look at the **top CO2 emitters**:
- The world's clustering for **CO2 emissions vs coal consumption**: China out 
in the distance
- **Dirty consumers**: highest emissions per coal consumption.

CO2 EMISSIONS

In [None]:
yFeat = 'Energy: Carbon dioxide emissions - From coal and metallurgical coke'
coalUsedFt = 'Energy: Coal - Consumption'

# The world's clustering of coal consumption against coal-related emissions.
pltSctr( df, [ coalUsedFt, yFeat ] )

# Score the "dirtiest" consumers: emissions PER unit of coal consumed.
# The ratio is emissions / consumption, as the label states; computing
# consumption / emissions instead would rank the cleanest consumers highest.
# NOTE(review): a country reporting emissions but zero consumption yields inf
# and would head the ranking - confirm none is present.
df[ 'Coal/Metalurgical CO2 emissions PER Coal consumption' ] = (
    df[ yFeat ] / df[ coalUsedFt ])

showMax( 'Coal/Metalurgical CO2 emissions PER Coal consumption', df,
    sub='Dirtiest emitters per unit consumed' )


***
- Kenya's **Geothermal energy mix** is impressive. Good geology: <br> <br>
>In places where tectonic plates – consisting of the Earth's crust, and the 
upper mantle – are being pushed together or torn apart, this heat rises closer 
to the surface. One such place is Africa's Great Rift Valley, which runs 
7,000km (4,350 miles) across the eastern side of the continent.
> *(Source: 
> [BBC](https://www.bbc.com/future/article/20210303-geothermal-the-immense-volcanic-power-beneath-our-feet))*

In [None]:
# Share of electricity generated from geothermal sources: top-ten bar plot,
# then the observations beyond a z-score of 2.5.
ft = 'Energy: Electricity generation sources - Geothermal'
showMax( ft, df )
zThresh( dfN, ft, 2.5, dfOR, ctrDct )

***
#### ECONOMY
If you tend to assume that countries usually spend close to what they make in 
revenue, take confidence from seeing how both rise together in very close 
proportion all the way from lowest to highest (correlation of 0.99704)

In [None]:
# Look up the revenue/expenditure correlation in the master dict, which is
# keyed by the frozenset that fSet builds from the two feature names.
x = 'Economy: Budget - revenues'
y = 'Economy: Budget - expenditures'
budgetCorrelation = corrDct[ fSet( df, x, y ) ]
print( f"[{budgetCorrelation:.5f}] correlation for:\n   {x}\n   {y}" )


In [None]:
# Scatterplot of budget revenues against budget expenditures, to visualise
# the near-perfect correlation reported above.
pltSctr( df, [
    'Economy: Budget - revenues',
    'Economy: Budget - expenditures' ] )

- **Services share of GDP**: Which sovereignties pay the bills almost entirely 
from desk-work?

In [None]:
# Bar plot of the countries with the largest services share of GDP.
showMax( 'Economy: GDP - composition, by sector of origin - services', df )

***
- **Inflation**: Venezuela dwarfs the world at fifteen times the standard 
deviation.

In [None]:
# Highest consumer-price inflation, then the observations beyond a very low
# z-score threshold (0.0677).
ft = 'Economy: Inflation rate (consumer prices)'
showMax( ft, df )
zThresh( dfN, ft, 0.0677, dfOR, ctrDct )

***
- **Robot Ranchers**: A surprise in the bottom ten **percent of workers in 
agriculture** is the USA: only ~0.7%, while the US **Agri-sector share of GDP** 
is still 22nd-highest in the world.
- Following on from this: who are the **most efficient farmers**? 

In [None]:
agriLabourFt = 'Economy: Labor force - by occupation - agriculture'
agriGdpFt = 'Economy: GDP - composition, by sector of origin - agriculture'
agriEfficiencyFt = 'Agriculture: GDP composition to labour'

# The scraped data for Tonga is wrong (a date was taken as the percentage),
# so Tonga is masked out of both agriculture rankings.
isNotTonga = df[ 'Country' ] != "Tonga"

# Smallest share of the labour force working in agriculture.
showMax( agriLabourFt, df[ isNotTonga ], asc=True )

# Where the United States ranks for agriculture's share of GDP.
getRank( df, 'United States', agriGdpFt )

# Derived feature: agriculture's GDP share per unit of agricultural labour.
df[ agriEfficiencyFt ] = df[ agriGdpFt ] / df[ agriLabourFt ]

showMax( agriEfficiencyFt, df[ isNotTonga ], sub="Efficient farmers" )

In [None]:
# The most-equal countries have a strong representation of East European
# countries, along with Social Democracy commentators' darling, Norway.
# Jersey and Faroe would need to be more closely vetted.
giniFt = 'Economy: Gini Index coefficient - distribution of family income'
showMax( giniFt, df, asc=True )

# GDP by imports: Maldives is the only positive value, and vastly so [...]
ft = 'Economy: GDP - composition, by end use - imports of goods and services'
showMax( ft, df )
# Assigning to "_" keeps the returned frame from being echoed by the cell.
_ = zThresh( dfN, ft, 2.5, dfOR, ctrDct )

#### GEOGRAPHY AND ENVIRONMENT

- **Most watery nations** ( total area by surface water ): what's going on
with the "British Indian Ocean Territory"?

In [None]:
# Derived feature: share of each country's total area that is water.
waterArea = df[ 'Geography: Area - water' ]
totalArea = df[ 'Geography: Area - total' ]
df[ 'Water-area ratio' ] = waterArea / totalArea

# Countries with the highest water-area share.
showMax( 'Water-area ratio', df )

The British Indian Ocean territory value here is an outlier arising from an
apparent inconsistency or ambiguity in geographic description - the territory 
is designated across a very disparate grouping of islands which include many
whole or partial atolls with large inner bodies of water. Some interesting 
reading both for observers of colonialism and for ongoing events in the 
projection of sovereign power across large sea vectors:

##### British Indian Ocean Territory
![British_Indian_Ocean_Territory](https://upload.wikimedia.org/wikipedia/commons/thumb/9/96/British_Indian_Ocean_Territory_in_United_Kingdom.svg/1466px-British_Indian_Ocean_Territory_in_United_Kingdom.svg.png)

WIKI: 
>The only inhabitants are British and U.S. military personnel and associated 
contractors, who collectively number around 3,000 (2018 figures). The 
forced removal of Chagossians from the Chagos Archipelago occurred between 
1968 and 1973. [...] Today, the exiled Chagossians are still trying 
to return, saying that the forced expulsion and dispossession was unlawful, 
but the UK government has repeatedly denied them the right of return. The 
islands are off-limits to Chagossians, casual tourists, and the media.

In [None]:
# Water-area ranking with the British Indian Ocean Territory outlier removed,
# as the surrounding text proposes. This cell used to duplicate the
# coastline/land-boundary filter, which is applied separately in the next cell.
showMax( 'Water-area ratio',
    df[ df[ 'Country' ] != 'British Indian Ocean Territory' ],
    sub="Excluding British Indian Ocean Territory" )

Excluding BIOT as an outlier should reveal a more intuitive distribution.
Better yet, to avoid catching so many islands, let's filter down to countries 
whose coastline is no longer than their land boundaries with other countries.

In [None]:
# Keep only the countries whose coastline is no longer than their land
# boundaries, which filters out most islands.
coastNotLonger = (
    df[ 'Geography: Land boundaries - total' ] >= df[ 'Geography: Coastline' ])

showMax( 'Water-area ratio', df[ coastNotLonger ],
    sub="Where Coastline =< Land Boundaries" )

##### The coastline paradox
All coastlines are infinitely long, at least if you keep decreasing the unit of 
measurement. The "longest coast" could (at least logically) change for some 
smaller unit, given that it may discover an underlying "flatness" in the 
reigning longest coastline, while discovering an underlying "texture" in the 
coastline of another territory which produces a greater overall length
(see [here](https://medium.com/@drewjosselyn/fun-with-fractals-using-fractals-to-measure-the-coastline-length-of-one-of-canadas-most-iconic-16c3266af782)).

Nonetheless, given the extent and contour of Canada's polar coast,
it seems likely to remain out in front at any unit of measurement.

In [None]:
# Bar plot of the countries with the longest reported coastline.
showMax( 'Geography: Coastline', df )

Elevation difference

In [None]:
# Derived feature: highest point minus lowest point, per country.
highestPoint = df[ 'Geography: Elevation - highest point' ]
lowestPoint = df[ 'Geography: Elevation - lowest point' ]
df[ 'Maximum elevation difference' ] = highestPoint - lowestPoint

# Countries with the greatest elevation range.
showMax( 'Maximum elevation difference', df )

Countries with the largest difference between their lowest and highest point.
While the China-Nepal border bisects Sagarmatha's (Mount Everest's) highest 
point, China itself has a lower minimum elevation than Nepal.

Flattest places in the world: no point on natural ground is at an ascent of
more than five meters from any other point. At #4 in the world, Pakistan's 
current floods are devastating partly due to this marked flatness.

Cayman Islands is easiest on the hips with a M.E.D of one meter.

In [None]:
# Smallest elevation range (asc=True): the flattest territories. Relies on
# the 'Maximum elevation difference' column derived in an earlier cell.
showMax( 'Maximum elevation difference', df, asc=True )

In [None]:
forestCoverFt = 'Environment: Land use - forest'
forestRevenueFt = 'Environment: Revenue from forest resources - forest revenues'

# A pleasant pit-stop in the forests of Suriname:
showMax( forestCoverFt, df )

# Interestingly, Suriname does very little to leverage this as a resource
#   advantage - its rank for forest-resource revenue is 174th.
getRank( df, 'Suriname', forestRevenueFt )

# TODO: correlation / scatterplot?

In [None]:
urbanPopulationFt = 'Environment: Urbanization - urban population'

# Fairly even, with city-states understandably dominating the top ten.
showMax( urbanPopulationFt, df )
showMax( urbanPopulationFt, df, asc=True )

# Of the least-urbanised, several are territories with relatively
#   unfamiliar names: Wallis and Futuna (FR), Montserrat (UK) and Tokelau (NZ).
#   To help with attribution, if not decolonization, I submit that the UK and
#   French territories swap names.

# Liechtenstein... is also there.
# "It is a testimony to the mere political expediency of the purchase that
#   the Princes of Liechtenstein did not visit their new principality for
#   almost 100 years." [citation needed]

In [None]:
# Who are the most and least trade-reliant for water? (Hi there, Middle East)
# Withdrawals (municipal, industrial and agricultural) minus renewable sources.
exposureFt = 'Water withdrawal exposure to trade'
withdrawalFt = 'Environment: Total water withdrawal -'

totalWithdrawal = (
    df[ f'{withdrawalFt} municipal' ] +
    df[ f'{withdrawalFt} industrial' ] +
    df[ f'{withdrawalFt} agricultural' ])
df[ exposureFt ] = (
    totalWithdrawal - df[ 'Environment: Total renewable water resources' ])

# Most exposed: withdrawals furthest above renewable resources.
showMax( exposureFt, df, sub="Withdrawrals minus resources" )

# Least exposed: Brazil is sitting happy there around the Amazon. Russia and
# Canada just melt vast amounts of snow.
showMax( exposureFt, df, asc=True, sub="Withdrawrals minus resources" )

In [None]:
# Derived feature: irrigated land as a share of total area.
irrigatedFt = 'Geography: Irrigated land'
totalAreaFt = 'Geography: Area - total'

df[ 'Irrigated-area ratio' ] = df[ irrigatedFt ] / df[ totalAreaFt ]
showMax( 'Irrigated-area ratio', df )

# Quote the two underlying figures for the Gaza Strip.
gazaIrrigated = getVal( df, 'Gaza Strip', irrigatedFt )
print( f"Irrigated area in Gaza Strip is {gazaIrrigated} sqkm" )

gazaTotalArea = getVal( df, 'Gaza Strip', totalAreaFt )
print( f"Total area of Gaza Strip is {gazaTotalArea} sqkm" )

Many members of this T10 may not surprise, as familiar origins of agricultural 
commodities, but the Gaza Strip may conjure a more arid image. The rank comes 
down to proportion and population: irrigation, as a [vital element of food production](https://socialsciences.mcmaster.ca/kubursi/ebooks/water.htm), 
takes up 240 of the territory's total 360 square kilometers, which is the third 
most densely populated territory in the world.

<font color='red'> In fact, population density and irrigated area ratio... [SCATTER]
</font>

In [None]:
# Bar plot of the countries with the most World Heritage Sites.
showMax( 'Government: National heritage - total World Heritage Sites', df )
# Open question (the anti-empire question): what qualifies as world heritage?
# -> Every country in the top ten is high-GDP (to be corroborated).

In [None]:
# Small island groups and micro-territories (many with colonial names) that
# keep showing up in the top/bottom tens during this study, collected here
# for a closer look later.
#
# They appear so often because, on a small sample - be it a population, an
# area total and so on - it takes less of an event in global terms to have a
# proportionately large effect.
kookyIslands = [
    "Anguilla", "Ashmore and Cartier Islands",
    "British Indian Ocean Territory", "British Virgin Islands",
    "Cape Verde", "Cayman Islands", "Cocos (Keeling) Islands", "Comoros",
    "Faroe Islands",
    "Montserrat",
    "Northern Mariana Islands",
    "Saint Barthelemy", "Saint Kitts and Nevis", "Saint Lucia",
    "Saint Pierre and Miquelon", "Saint Vincent and the Grenadines",
    "San Marino", "Sao Tome and Principe",
    "Tokelau", "Turks and Caicos Islands",
    "Wallis and Futuna",
]

In [None]:
# Things that China is in the top-ten/5/3/1 of
# (Who else are the "most top 10/5/3/1" countries?)
# combine bottom-tens somehow?

In [None]:
# FINALIZING:

# Add positional bar colour-gradient to barplots
#   https://stackoverflow.com/questions/60220089/how-to-add-color-gradients-according-to-y-value-to-a-bar-plot
# Generate unitDct which vis. fetches for feature by column key
# Group t10s and scatterplots by subject.
# move all/most defs to imports, except where important for process comm.