In [None]:
import sys

from FileTools import FileTools
# from pprint import pprint
from pathlib import Path
from zipfile import ZipFile
import pickle
#import random
import json
import time
import os

from matplotlib import pyplot as plt
#import seaborn as sns
import pandas as pd
import numpy as np
import math
import re

from kaggle.api.kaggle_api_extended import KaggleApi

In [None]:
# authenticate with API
# Reads the Kaggle token file, exports the credentials through the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables, then authenticates.

currWorkDir = os.getcwd()
userDir = Path.home()
# Join the path from components so it resolves on any OS; hand-written "\\"
# separators only work on Windows.
keyPath = str( Path( userDir, "PYC", "ADMIN", "kaggle.json" ) )

with open( keyPath, 'r' ) as f: keyDict = json.load( f )

# Look the credentials up by the field names used in a Kaggle token file
# instead of relying on the order (and number) of keys in the JSON object.
userTitle, keyTitle = 'username', 'key'
missingFields = [ k for k in ( userTitle, keyTitle ) if k not in keyDict ]
if missingFields:
    raise KeyError( f"{keyPath} is missing credential field(s): {missingFields}" )
kaggleUsername, kaggleKey = keyDict[ userTitle ], keyDict[ keyTitle ]

os.environ[ 'KAGGLE_USERNAME' ] = kaggleUsername
os.environ[ 'KAGGLE_KEY' ] = kaggleKey

api = KaggleApi()
api.authenticate()


In [None]:
# retrieve dataset
datasetOwner = 'lucafrance'
datasetName = 'the-world-factbook-by-cia'
api.dataset_download_files( f'{datasetOwner}/{datasetName}', path="." )

# await download
# Poll once per second for a file matching the dataset name, but give up after
# maxWaitSecs so a failed download cannot hang the notebook forever.
fTools = FileTools()
datasetFName = None
maxWaitSecs = 60
print( "Waiting for dataset download" )
for _ in range( maxWaitSecs ):
    time.sleep( 1 )
    sortedFs = fTools.datesortFiles( currWorkDir, datasetName )
    if len( sortedFs ) == 0: continue
    # NOTE(review): presumably datesortFiles lists the newest match first -
    # confirm against FileTools.
    datasetFName = list( sortedFs )[ 0 ]
    print( f"Latest: {datasetFName}" )
    break
else:
    raise TimeoutError(
        f"No file matching '{datasetName}' appeared in {currWorkDir} "
        f"within {maxWaitSecs} seconds" )

In [None]:
# extract and identify datafiles
# Unzips the downloaded archive into <cwd>/data_or and loads the first CSV
# (in sorted filename order) into dffBook.

origDataDir = os.path.join( currWorkDir, "data_or" )  # OS-independent join
os.makedirs( origDataDir, exist_ok=True )

if datasetFName and Path( datasetFName ).suffix == ".zip":
    with ZipFile( datasetFName, 'r' ) as zipF: zipF.extractall( origDataDir )

# os.listdir returns entries in arbitrary order; sort so the same CSV is
# picked on every run.
dataPaths = [ os.path.join( origDataDir, pth )
    for pth in sorted( os.listdir( origDataDir ) )
    if Path( pth ).suffix == ".csv" ]

# Raise instead of calling sys.exit(): an exception stops "Run All" with a
# readable message and does not try to shut the interpreter down.
if not dataPaths:
    raise FileNotFoundError( f"Failed get CSV: no .csv file in {origDataDir}" )

dffBook = pd.read_csv( dataPaths[ 0 ] )
print( "Got dffBook DF from extracted dataset at:\n", dataPaths[ 0 ] )

dffBook

In [None]:
# Analyse cell data for numbers and units

def matchNumbers( df_in, coIdex, patrn ):
    """Collect every match of ``patrn`` in each cell of one column.

    df_in:  DataFrame to read from.
    coIdex: positional index of the column to scan.
    patrn:  regular expression handed to ``Series.str.findall``.

    Returns the Series produced by ``str.findall`` for that column.
    """
    targetCol = df_in.iloc[ :, coIdex ]
    perRowMatches = targetCol.str.findall( patrn )
    return perRowMatches


def getRemainder( df_in, coIdex, pattrn ):
    """Strip every match of ``pattrn`` from each cell of one column.

    What is left over is the non-number part of the string (potential unit
    etc).

    df_in:  DataFrame to read from.
    coIdex: positional index of the column to process.
    pattrn: regular expression (string or compiled) to remove.

    Returns the column as a Series with all matches replaced by ''.
    """
    # regex=True is required: pandas >= 2.0 defaults to literal replacement
    # and rejects a compiled pattern unless regex matching is requested.
    return df_in.iloc[ :, coIdex ].str.replace( pattrn, '', regex=True )


# MATCHING NUMBERS
# REGEX:
#   capture group             (
#   zero/one                  [+-]?             possible number sign
#   non-capture subgroup      (?:               integer part, either:
#     1-3 nums                  \d{1,3}           a comma-grouped number with
#     1+ comma-groups           (?:,\d{3})+       at least one ",ddd" group
#   OR                        |
#     1+ nums                   \d+               or a plain run of digits
#   close subgroup            )
#   non-capture subgroup      (?:               then possible decimal part
#     decimal and 1+nums        \.\d+
#     zero/one time             )?
#   OR (alt. to whole seq)    |
#     sign, dec, 1+nums         [+-]?\.\d+      decimal with no leading digit
#   Close capture group       )
#   Python alternation takes the FIRST alternative that matches, not the
#   longest, so the comma-grouped form must require a comma group; otherwise
#   a plain "12345" is cut into "123" and "45".

patt = re.compile( r'([+-]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?|[+-]?\.\d+)' )

dfFbDict = { }  # to collect column data during cleaning
checkTypes = { }  # unexpected first-value type -> column position
for colDex, colName in enumerate( dffBook.columns[ 1: ], start=1 ):
    origCol = dffBook.iloc[ :, colDex ]
    # the column is classified by the type of its first value
    firstVal = origCol.iloc[ 0 ] if len( origCol ) else np.nan
    # Columns that are not text have no leftover text; keep the remainder an
    # (all-NaN) Series so every entry of dfFbDict has the same shape.
    noRemainder = pd.Series( np.nan, index=origCol.index, dtype=object )
    if isinstance( firstVal, np.floating ):
        # already numeric: store as float
        colDict = {
            'matchedNums': origCol.astype( float ), 'remainder': noRemainder }
    elif isinstance( firstVal, ( str, float ) ):
        # text column (a plain float here is a NaN in an object column)
        colDict = {
            'matchedNums': matchNumbers( dffBook, colDex, patt ),
            'remainder': getRemainder( dffBook, colDex, patt ) }
    else:
        # anything else is recorded so it can be reported below
        checkTypes[ type( firstVal ) ] = colDex
        colDict = { 'matchedNums': origCol, 'remainder': noRemainder }
    colDict[ 'origCol' ] = origCol
    dfFbDict[ colName ] = colDict

for i in checkTypes: print( f"unexpected: {i} at {checkTypes[ i ]}" )


In [None]:
# split first match item from list as float to "clean", store else to "split"
def splitFirstOther( matchList ):
    """Split per-cell match results into a clean value and the leftovers.

    matchList: iterable with one entry per row. An entry is normally the
        list of number strings matched in that cell; it may also be NaN, an
        empty list, or a number when the source column was already numeric.

    Returns three lists:
        firstVals - one float per row: the first match with thousands commas
                    removed, the number itself for numeric entries, else NaN.
        splitVals - one entry per row: the remaining matches as a list, or
                    NaN when the row produced no value.
        checkVals - entries that were neither a list, a number nor NaN, so
                    the caller can report uncategorized data.
    """
    firstVals, splitVals, checkVals = [ ], [ ], [ ]
    for el in matchList:
        if isinstance( el, list ) and len( el )>0:
            # remove any thousandcomma to support convert
            firstVals.append( float( el[ 0 ].replace( ',', '' ) ) )
            splitVals.append( list( el[ 1: ] ) )
            continue

        isNumber = ( isinstance( el, ( int, float, np.integer, np.floating ) )
                     and not isinstance( el, bool ) )
        if isNumber and not math.isnan( el ):
            # value from an already-numeric column: keep it, do not blank it
            firstVals.append( float( el ) )
            splitVals.append( [ ] )
            continue

        # Left here: NaN, an empty match list, or something unexpected.
        # (The original test required the entry to be a list and not a list
        # at once, so it could never flag anything.)
        if not isinstance( el, list ) and not isNumber:
            checkVals.append( el )
        firstVals.append( np.nan )
        splitVals.append( np.nan )
    return firstVals, splitVals, checkVals


# store the clean value, extra matches and to-check entries of every column
for colDict in dfFbDict.values():
    cleanVals, extraVals, oddVals = splitFirstOther( colDict[ 'matchedNums' ] )
    colDict[ 'clean' ] = cleanVals
    colDict[ 'splitVals' ] = extraVals
    colDict[ 'checkVals' ] = oddVals

# Raise message if got uncategorized data
for colName, colDict in dfFbDict.items():
    if colDict[ 'checkVals' ]: print( f"Got checkvals for {colName}" )

In [None]:
# dictionary columns to DF, checking is now float
# A column is kept only when more than 90% of its clean values are floats
# (NaN counts as a float here).
newCols = [ ]
keptSeries = [ dffBook.iloc[ :, 0 ] ]  # start with countries
for colName in dfFbDict:
    clean = pd.Series( dfFbDict[ colName ][ 'clean' ] )
    lenFloat = sum( 1 for i in clean if type( i ) == float )
    if lenFloat>len( clean ) * 0.90:
        newCols.append( colName )
        keptSeries.append( clean )
    else: print( "col is less than 90% float. Dropping..." )

# Concatenate once instead of growing the frame inside the loop; this also
# yields a DataFrame (not a bare Series) when no feature column is kept.
dfFloat = pd.concat( keptSeries, axis=1 )
dfFloat.columns = [ 'Country' ] + newCols
dfFloat


In [None]:
# Enforce non-nan threshold for rows and columns

def nanThreshold( notNan ):
    """Return the mean of ``notNan`` plus half its standard deviation.

    notNan: sequence of non-NaN counts.

    The result is truncated to an int.
    """
    meanCount = sum( notNan ) / len( notNan )
    halfSpread = 0.5 * np.std( notNan )
    return int( meanCount + halfSpread )


def nonNanFromDims( dfr, dim = 1 ):
    """Keep the rows or columns of ``dfr`` that hold enough non-NaN floats.

    dfr: DataFrame to filter.
    dim: 1 to examine columns, anything else (the notebook passes 0) to
        examine rows.

    Each row/column is turned into a list and its non-NaN float values are
    counted. The cut-off is ``nanThreshold`` of all the counts. A summary
    line with the number kept, the threshold and ``dim`` is printed.

    Returns the kept rows/columns as a list of value lists ([] when ``dfr``
    has nothing along ``dim``).
    """
    nonNans = [ ]
    for pos in range( dfr.shape[ dim ] ):
        # pos is a position, so rows are fetched with .iloc as well; a
        # label-based lookup fails on any frame without a 0..n-1 index
        vals = ( dfr.iloc[ :, pos ].tolist() if dim == 1
                 else dfr.iloc[ pos ].tolist() )

        # isinstance also counts NumPy floats, which subclass float
        nonNans.append( [ vals, sum( 1 for v in vals
            if isinstance( v, float ) and not math.isnan( v ) ) ] )

    if not nonNans: return [ ]  # nothing to rank; avoids dividing by zero

    _thresh = nanThreshold( [ nval for _, nval in nonNans ] )
    keepVals = [ kval for kval, nnul in nonNans if nnul>=_thresh ]
    print( f"non-nan[ {len( keepVals )} ] thr[ {_thresh} ] dim[ {dim} ]" )

    return keepVals

In [None]:
# ROWS
keptRows = nonNanFromDims( dfFloat, dim=0 )
dfRowsClean = pd.DataFrame( keptRows )

# put the column names in as a first row so kept columns can be named later:
# add them at label -1, shift every label up by one, then sort by label
dfRowsClean.loc[ -1 ] = dfFloat.columns
dfRowsClean.index += 1
dfRowsClean = dfRowsClean.sort_index()
dfRowsClean


In [None]:
# COLS
keepCols = nonNanFromDims( dfRowsClean, dim=1 )

# convert to numeric df
# each kept column is a list whose first item is the column name
dfColsClean = pd.DataFrame( { col[ 0 ]: col[ 1: ] for col in keepCols } )


def _toNumericOrKeep( col ):
    """Return ``col`` converted by ``pd.to_numeric``, or unchanged on failure."""
    # explicit replacement for the deprecated pd.to_numeric( errors='ignore' )
    try: return pd.to_numeric( col )
    except ( ValueError, TypeError ): return col


df = dfColsClean.apply( _toNumericOrKeep )

In [None]:
# final type check & add countries
dTypeDict = dfColsClean.dtypes
for itemName in list( dTypeDict.keys() ):
    # the label column is only present when this cell is run again
    if itemName == 'Country': continue
    if dTypeDict[ itemName ] != np.float64: print( f"NOT FLOAT: {itemName}" )
# insert() raises if the column already exists, so guard it to keep the cell
# safe to re-run
if 'Country' not in dfColsClean.columns:
    # first column of dfRowsClean holds the countries; its first row is the
    # column-name row, which is skipped
    dfColsClean.insert( 0, 'Country', dfRowsClean.iloc[ :, 0 ].tolist()[ 1: ] )
dfColsClean

In [None]:
# Review clean DF
# Compare shape and share of NaN cells before and after cleaning.
fbIsNa = dffBook.isna().sum().sum()
dfIsNa = dfColsClean.isna().sum().sum()
fbDim = dffBook.shape[ 0 ] * dffBook.shape[ 1 ]
dfDim = dfColsClean.shape[ 0 ] * dfColsClean.shape[ 1 ]
print( f"factbook originally shape: {dffBook.shape}" )
# ".2%" multiplies the fraction by 100 and appends the sign; the previous
# ".2f" followed by "%" printed e.g. 0.35 as "0.35%".
print( f"    NAN-density: {(fbIsNa / fbDim):.2%} "
       f"({fbIsNa} NaN in {fbDim})" )
print( f"clean dataframe has shape: {dfColsClean.shape}" )
print( f"    NAN-density: {(dfIsNa / dfDim):.2%} "
       f"({dfIsNa} NaN in {dfDim})" )

In [None]:
# Reporting to identify any differences of unit scale within features

In [None]:
# Add some further cleaning to remainder to reduce non-unit variations

cleanReman = { }  # column name -> cleaned remainder Series

cleanPatts = [
    r'(\([^)]*\))',  # remove all bracketed
    r'(.* m)$|(.* m) '  # remove all before " m[endline/space]" (mtn names)
    ]
# compiled under their own name so the number regex `patt` is not overwritten
compiledCleanPatts = [ re.compile( pattStr ) for pattStr in cleanPatts ]

for colName in list( dfColsClean.columns )[ 1: ]:
    remainder = dfFbDict[ colName ][ 'remainder' ]
    # a remainder stored as a plain list has no .str accessor
    if not isinstance( remainder, pd.Series ):
        remainder = pd.Series( remainder, dtype=object )
    for cleanPatt in compiledCleanPatts:
        # regex=True: pandas >= 2.0 defaults to literal replacement and
        # rejects compiled patterns otherwise
        remainder = remainder.str.replace( cleanPatt, '', regex=True )
    cleanReman.update( { colName: remainder } )

In [None]:
# CHECK FOR SCALE FACTORS

def runScaleAnalysis( dfr, remDict ):
    """Interactively review each feature column for unit/scale problems.

    dfr:     DataFrame whose first column is skipped; every other column is
             reviewed in order.
    remDict: mapping of column name -> iterable of remainder strings for
             that column.

    For each column a report is shown through ``input`` with the first ten
    values and up to 25 distinct remainders (strings cut to 60 characters).
    The answer is read case-insensitively:
        a - accept the column and move on
        b - stop reviewing
        c - ask for a free-text cleaning/scale note for the column
        d - mark the column to be dropped

    Returns ( dropFeatrs, cleanNotes ): the list of column names marked for
    dropping and a dict of column name -> note.
    """
    colList = list( dfr.columns )
    dropFeatrs = [ ]
    cleanNotes = { }
    validChoices = ( 'a', 'b', 'c', 'd' )

    for pos in range( 1, len( colList ) ):
        colNam = colList[ pos ]
        colSeg = dfr.iloc[ :, pos ].tolist()[ :10 ]
        remndr = set( remDict[ colNam ] )
        # only strings are truncated; NaN and any other value print as-is
        rMainPrint = "".join(
            f"{r[ :60 ] if isinstance( r, str ) else r}\n"
            for r in list( remndr )[ :25 ] )

        report = (
            f"COL [ {pos} ] {colNam}\n\n"
            f"CLEANVALS:\n{colSeg}\n\n"
            f"REMAINDER (unq in col: {len( remndr )}):\n{rMainPrint}\n")

        report_a = report + "\nACCEPT(A), BREAK(B), CLEAN/SCALE NOTE(C), DROP(D)"
        report_b = report_a + "\n\nPLEASE MAKE A SELECTION:\n\n"
        # the prompt shows upper-case letters, so normalise the answer
        usinp = input( report_a ).strip().lower()
        while usinp not in validChoices:
            usinp = input( report_b ).strip().lower()
        if usinp == 'b': break
        if usinp == 'a': continue
        if usinp == 'd': dropFeatrs.append( colNam )
        else: cleanNotes[ colNam ] = input(
            f"{report[ :250 ]}...\n\n\nCLEANING/SCALE NOTE" )

    return dropFeatrs, cleanNotes

# dropFeatrs, cleanNotes = runScaleAnalysis(dfColsClean, cleanReman)

In [None]:
# # SAVE SCALE ANALYSIS DATA
# stmp = fTools.dtStamp()
# fTools.storePKL( dropFeatrs, f'dropFeatrs_{stmp}', currWorkDir, subdir=None )
# fTools.storePKL( cleanNotes, f'cleanNotes_{stmp}', currWorkDir, subdir=None )

In [None]:
# LOAD SCALE ANALYSIS DATA
# Loads the saved 'dropFeatrs' and 'cleanNotes' pickles found in the working
# directory.
# NOTE(review): pickle.load can run arbitrary code - only load pickles that
# this notebook wrote itself.
loadedPkls = [ ]
for fNam in [ 'dropFeatrs', 'cleanNotes' ]:
    # NOTE(review): presumably datesortFiles lists the newest match first -
    # confirm against FileTools.
    foundPaths = list( fTools.datesortFiles( currWorkDir, fNam ) )
    if not foundPaths:
        raise FileNotFoundError(
            f"No saved '{fNam}' pickle found in {currWorkDir}" )
    # the with-block closes the file even if unpickling fails
    with open( foundPaths[ 0 ], 'rb' ) as fi:
        loadedPkls.append( pickle.load( fi ) )

dropFeats, scaleNotes = loadedPkls

In [None]:
# working copy of the clean frame without the features marked for dropping
df = dfColsClean.drop( columns=dropFeats )
# features that carry a scale note and were not dropped
scaleKeys = [ noted for noted in scaleNotes if noted not in dropFeats ]
df


In [None]:
# update value by matching remainder scale fragment via country reference

dct = dfFbDict.copy()

scaleDict = {
    "million": 1000000,
    "billion": 1000000000,
    "trillion": 1000000000000 }

cleanCountries = list( df[ 'Country' ] )
cleanCountrySet = set( cleanCountries )

# Always scale from the unscaled clean frame, never from df itself: reading
# df and writing back to df would multiply the values again every time this
# cell is re-run. One row per country (first occurrence), indexed by country.
baseVals = dfColsClean.drop_duplicates( subset='Country' ).set_index( 'Country' )

for colName in scaleKeys:
    colVals = [ ]

    # checking remnantcol (HAS PRE-CLEAN ENTRIES) for match
    for country, remnt in zip( dffBook[ 'Country' ], dct[ colName ][ 'remainder' ] ):
        if country not in cleanCountrySet: continue
        val = baseVals.at[ country, colName ]

        if isinstance( remnt, str ):  # NaN remainder: value kept as it is
            if remnt.startswith( "-$" ): val = 0 - val  # fix $ breaking neg float

            # apply the scale word that appears earliest in the remainder
            hits = [ ( remnt.find( scale ), scale )
                for scale in scaleDict if scale in remnt ]
            if hits:
                matchScale = min( hits )[ 1 ]
                val = val * scaleDict[ matchScale ]
        colVals.append( val )

    df[ colName ] = colVals

In [None]:
# show floats with thousands separators and two decimals
pd.set_option( 'display.float_format', '{:,.2f}'.format )
df


In [None]:
# a few top-tens

In [None]:
# need to go back and get units
def showTopTen( featName, asc=False ):
    """Bar-chart the first ten countries of the global ``df`` for one feature.

    featName: name of the column of ``df`` to rank by.
    asc:      sort ascending when True; the default (False) puts the largest
              values first.

    Prints the feature name and shows a bar chart of the ten leading rows,
    one bar per country.
    """
    print( featName )

    df10 = ( df[ [ 'Country', featName ] ]
        .sort_values( by=featName, ascending=asc )
        .head( 10 ) )

    fig = plt.figure()
    fig.suptitle( "TOP TEN:\n" + featName, fontsize=10 )

    ax = fig.add_axes( [ 0, 0, 1, 1 ] )
    ax.bar( df10.iloc[ :, 0 ], df10.iloc[ :, 1 ] )
    # Style the existing tick labels instead of re-setting their text, and
    # avoid Tick.label, which newer Matplotlib releases no longer provide.
    ax.tick_params( axis='x', labelrotation=45, labelsize=14 )
    plt.setp( ax.get_xticklabels(), ha='right' )
    ax.ticklabel_format( axis='y', useOffset=False, style='plain' )
    plt.show()


In [None]:
# Largest countries.
showTopTen( 'Geography: Area - total' )
# Some thoughts:
#   The invasion of Ukraine should not be considered an act of claustrophobia
#   Looking at sovereign territories as real-estate for future resources [..]
#   ...

# Highest share of total area that is water
df[ 'Water-area ratio' ] = df[ 'Geography: Area - water' ].div(
    df[ 'Geography: Area - total' ] )
showTopTen( 'Water-area ratio' )

# It should be safe to posit that Canada is likely at least somewhat
# well-known as a destination for those people with metal detectors you
# always see a few hundred meters away at the beach.
showTopTen( 'Geography: Coastline' )

# Greatest height from lowest point
# China may not have as high a mountain as Sagarmatha(?), but a hypothetical
# mountain from its lowest to highest elevation would dwarf it.
df[ 'Elevation difference' ] = df[ 'Geography: Elevation - highest point' ].sub(
    df[ 'Geography: Elevation - lowest point' ] )
showTopTen( 'Elevation difference' )

# Highest percent of population in the 65+ bracket (Japan by far)
showTopTen( 'People and Society: Age structure - 65 years and over' )
# Noting that, while Japan retains the lead, the difference is far less stark
# in terms of median age
showTopTen( 'People and Society: Median age - total' )

# Ratio of irrigated land to total land
df[ 'Irrigated-area ratio' ] = df[ 'Geography: Irrigated land' ].div(
    df[ 'Geography: Area - total' ] )
showTopTen( 'Irrigated-area ratio' )

# Syria's far advancement in population growth must be linked to its even
# greater leadership in net migration. What's the story there?
showTopTen( 'People and Society: Population growth rate' )
showTopTen( 'People and Society: Net migration rate' )
# Surprised to see Aus as one of the T10 countries for net mig rate

# UAE's men-to-women ratio is singularly, startlingly the most weighted to
# the former in the 25-54 years, 55-64yo and 65+ brackets, while
# all runners-up change position or drop from the top. What gives?
showTopTen( 'People and Society: Sex ratio - 25-54 years' )
showTopTen( 'People and Society: Sex ratio - 55-64 years' )
# At the same time, it has (by good measure) the highest value for
# the percentage of total population that is 25-54 years old.
showTopTen( 'People and Society: Age structure - 25-54 years' )
# Interesting combination - purely speculating, one can imagine a dominating
# social discourse might concern the relation of older men to working-age
# adults. This consideration is furnished with the dependency ratio, in which
# UAE is by very, very far the world chart-topper:
showTopTen( 'People and Society: Dependency ratios - potential support ratio' )
# (What is the DR, and how would these three figures relate?)

# People and Society: Current Health Expenditure
#   US tops; surprised after hearing how much is copped by private citizens
showTopTen( 'People and Society: Current Health Expenditure' )
# Only one country in the top ten for expenditure is also in the top ten for
# physician density (Belg)
showTopTen( 'People and Society: Physicians density' )
# Physician density strikes me as vital, as a population's relationship with
#   health-seeking, at a preventative stage, [is tied] to its familiarity and
#   access to responsive human consultation.
# South Africa, and Southern Africa in general, has an enormous HIV problem.
# showTopTen( 'People and Society: HIV/AIDS - people living with HIV/AIDS' )
# The column is labelled as a percentage, so scale the fraction by 100.
df['People living with HIV/AIDs as percentage of population'] = (
    df[ 'People and Society: HIV/AIDS - people living with HIV/AIDS' ] /
    df[ 'People and Society: Population' ] * 100)
showTopTen( 'People living with HIV/AIDs as percentage of population' )

# Nearly 50% of the people in Burma smoke
showTopTen( 'People and Society: Tobacco use - total' )
# However, while European nations remain in the top when limited to females,
# the leading Asia-Pacific nations Burma, PNG and Indonesia disappear,
# whereas for men they remain.
# People and Society: Tobacco use - female
# People and Society: Tobacco use - male

# who has the highest combined score for both obesity prevalence and 
# children 4- underweight? 
# list top ten where both scores are above mean for feature

# EDUCATION DISPARITIES:
# People and Society: Education expenditures vs 
    # People and Society: Literacy - total population
# People and Society: Literacy - male vs
    # People and Society: Literacy - female
    
# Pat on the back for Aus: by a modest yet significant margin, the longest
#   "school life expectancy" (percent completing tertiary?) in total, AND
#   the position holds for women as much as for men.

# Environment: Air pollutants - carbon dioxide emissions
#   China leads by twice its nearest competitor, the US. See coal-relation (hah)
# Similar stats for Environment: Air pollutants - methane emission.

# A pleasant pit-stop in Finnish Forests:
# Environment: Land use - forest

In [None]:
# pager state: first column position to chart, and columns charted per run
listStart, showNumber = 75, 2

In [None]:
# chart the next `showNumber` columns, then advance the pager so running
# this cell again shows the following ones
listEnd = listStart + showNumber
print(f"{listStart}:{listEnd}")
for i in list( df.columns )[ listStart:listEnd ]:
    showTopTen( i )
listStart = listEnd

# People and Society: Current Health Expenditure
#   US tops; surprised after hearing how much is copped by private citizens

In [None]:
# Things that China is in the top-ten/5/3/1 of
# (Who else are the "most top 10/5/3/1" countries?)
# combine bottom-tens somehow?

In [None]:
# Generate CORRELATION DICTIONARY where keys are correlations,
#   values are key-value pairs of baseCol : compareDict
#   CompareDict key-value is col num : correlation coefficient

# correlDict = { }
# baseCol = 1
# while True:
#     colCorrs = { }
#     for colPos in range( baseCol + 1, df.shape[ 1 ] ):
#         colCorrs[ colPos ] = df.iloc[ :, baseCol ].corr( df.iloc[ :, colPos ] )
#     correlDict[ baseCol ] = colCorrs
#     baseCol += 1
#     if baseCol == df.shape[ 1 ]: print( f"Completed correlations" ); break
# 


In [None]:
# move all most defs to import module, except where useful for process 
# communication
# Heh. mosdef.

# END