In [54]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [55]:
import pandas as pd

In [56]:
import visions
from visions import StandardSet, VisionsBaseType, String
from visions.relations import IdentityRelation

As programmers, data scientists, or engineers, we often work with data that follows a complex underlying standard. 
Think of phone numbers, email addresses, IBAN accounts and country codes.
We're often confronted with new datasets. The initial process of understanding the data is often referred to as exploratory data analysis.

In [57]:
df = pd.read_csv("https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv")

In [58]:
df.head()

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [78]:
print(df['intermediate-region'].value_counts())

Caribbean          28
Eastern Africa     22
Western Africa     17
South America      16
Middle Africa       9
Central America     8
Southern Africa     5
Channel Islands     2
Name: intermediate-region, dtype: int64


We see the country name, two and three letter codes, other unique identifiers, and region information. Looking at the (semantic) data types helps us explore the data. 

In [59]:
typeset = StandardSet()
typeset.detect_type(df)

{'name': String,
 'alpha-2': String,
 'alpha-3': String,
 'country-code': Integer,
 'iso_3166-2': String,
 'region': String,
 'sub-region': String,
 'intermediate-region': String,
 'region-code': Float,
 'sub-region-code': Float,
 'intermediate-region-code': Float}

The region-code, sub-region-code and intermediate-region-code columns seem to be integers stored as floats. This is confirmed when we use "infer_type". Under the hood, visions traverses the relations that are defined on the StandardSet, such as the one that treats floats holding only whole numbers as integers.

In [60]:
typeset.infer_type(df)

{'name': String,
 'alpha-2': String,
 'alpha-3': String,
 'country-code': Integer,
 'iso_3166-2': String,
 'region': String,
 'sub-region': String,
 'intermediate-region': String,
 'region-code': Integer,
 'sub-region-code': Integer,
 'intermediate-region-code': Integer}

"String" is a broad class for the two- and three-letter codes, and gives us little to go on. Let's extend the typeset.

In [61]:
def is_len_3(series):
    """Return whether every value in ``series`` has a string length of exactly 3.

    The series must support the ``.str`` accessor.
    """
    lengths = series.str.len()
    return lengths.eq(3).all()

In [62]:
class ThreeLetter(VisionsBaseType):
    """Visions type for series whose values all have length 3 and contain no NaNs."""

    @classmethod
    def get_relations(cls):
        """Relate this type to ``String`` through a single ``IdentityRelation``."""
        to_string = IdentityRelation(cls, String)
        return [to_string]

    @classmethod
    def contains_op(cls, series):
        """Return a truthy value when ``series`` qualifies as ``ThreeLetter``."""
        all_length_three = is_len_3(series)
        # Short-circuits exactly like the one-line form: NaNs are only checked
        # once the length test has passed.
        return all_length_three and not series.hasnans

In [63]:
typeset += ThreeLetter

What we have done is create a new subtype of String, for three-letter codes.

In [77]:
typeset.infer_type(df)

{'name': String,
 'alpha-2': TwoLetter,
 'alpha-3': ThreeLetter,
 'country-code': Integer,
 'iso_3166-2': String,
 'region': String,
 'sub-region': String,
 'intermediate-region': String,
 'region-code': Integer,
 'sub-region-code': Integer,
 'intermediate-region-code': Integer}

Let's do the same for TwoLetter codes.

In [65]:
def is_len_2(series):
    """Return whether every value in ``series`` has a string length of exactly 2.

    The series must support the ``.str`` accessor.
    """
    lengths = series.str.len()
    return lengths.eq(2).all()

In [66]:
class TwoLetter(VisionsBaseType):
    """Visions type for series whose values all have length 2 and contain no NaNs."""

    @classmethod
    def get_relations(cls):
        """Relate this type to ``String`` through a single ``IdentityRelation``."""
        to_string = IdentityRelation(cls, String)
        return [to_string]

    @classmethod
    def contains_op(cls, series):
        """Return a truthy value when ``series`` qualifies as ``TwoLetter``."""
        all_length_two = is_len_2(series)
        # Short-circuits exactly like the one-line form: NaNs are only checked
        # once the length test has passed.
        return all_length_two and not series.hasnans

In [67]:
typeset += TwoLetter

In [76]:
typeset.infer_type(df)

{'name': String,
 'alpha-2': TwoLetter,
 'alpha-3': ThreeLetter,
 'country-code': Integer,
 'iso_3166-2': String,
 'region': String,
 'sub-region': String,
 'intermediate-region': String,
 'region-code': Integer,
 'sub-region-code': Integer,
 'intermediate-region-code': Integer}

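Why didn't the code above recognize the two-letter codes? (Note: the outputs shown above were captured during a later, out-of-order run; when the notebook is run top to bottom, `alpha-2` is still inferred as `String` at this point.)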

In [69]:
df.loc[(df['alpha-2'].str.len() != 2)]

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
153,Namibia,,NAM,516,ISO 3166-2:NA,Africa,Sub-Saharan Africa,Southern Africa,2.0,202.0,18.0


Namibia has the alpha-2 code NA. Because the string "NA" is one of the default missing-value markers of pandas.read_csv, reading the file naively parsed it as np.NaN!

In [72]:
# Patch the column in place to avoid re-downloading the data. Alternatively, reload with
# pd.read_csv(url, keep_default_na=False, na_values=[""]) so that the literal "NA" stays a
# string while empty fields are still read as NaN.
df['alpha-2'] = df['alpha-2'].fillna('NA')

In [75]:
typeset.infer_type(df)

{'name': String,
 'alpha-2': TwoLetter,
 'alpha-3': ThreeLetter,
 'country-code': Integer,
 'iso_3166-2': String,
 'region': String,
 'sub-region': String,
 'intermediate-region': String,
 'region-code': Integer,
 'sub-region-code': Integer,
 'intermediate-region-code': Integer}

In [89]:
# Two-letter country codes accepted by Alpha2. Defined once here so the
# collection is not rebuilt on every contains_op call.
ISO_3166_ALPHA_2_CODES = (
    'AF', 'AX', 'AL', 'DZ', 'AS', 'AD', 'AO', 'AI', 'AQ', 'AG', 'AR',
    'AM', 'AW', 'AU', 'AT', 'AZ', 'BS', 'BH', 'BD', 'BB', 'BY', 'BE',
    'BZ', 'BJ', 'BM', 'BT', 'BO', 'BQ', 'BA', 'BW', 'BV', 'BR', 'IO',
    'BN', 'BG', 'BF', 'BI', 'CV', 'KH', 'CM', 'CA', 'KY', 'CF', 'TD',
    'CL', 'CN', 'CX', 'CC', 'CO', 'KM', 'CG', 'CD', 'CK', 'CR', 'CI',
    'HR', 'CU', 'CW', 'CY', 'CZ', 'DK', 'DJ', 'DM', 'DO', 'EC', 'EG',
    'SV', 'GQ', 'ER', 'EE', 'SZ', 'ET', 'FK', 'FO', 'FJ', 'FI', 'FR',
    'GF', 'PF', 'TF', 'GA', 'GM', 'GE', 'DE', 'GH', 'GI', 'GR', 'GL',
    'GD', 'GP', 'GU', 'GT', 'GG', 'GN', 'GW', 'GY', 'HT', 'HM', 'VA',
    'HN', 'HK', 'HU', 'IS', 'IN', 'ID', 'IR', 'IQ', 'IE', 'IM', 'IL',
    'IT', 'JM', 'JP', 'JE', 'JO', 'KZ', 'KE', 'KI', 'KP', 'KR', 'KW',
    'KG', 'LA', 'LV', 'LB', 'LS', 'LR', 'LY', 'LI', 'LT', 'LU', 'MO',
    'MG', 'MW', 'MY', 'MV', 'ML', 'MT', 'MH', 'MQ', 'MR', 'MU', 'YT',
    'MX', 'FM', 'MD', 'MC', 'MN', 'ME', 'MS', 'MA', 'MZ', 'MM', 'NA',
    'NR', 'NP', 'NL', 'NC', 'NZ', 'NI', 'NE', 'NG', 'NU', 'NF', 'MK',
    'MP', 'NO', 'OM', 'PK', 'PW', 'PS', 'PA', 'PG', 'PY', 'PE', 'PH',
    'PN', 'PL', 'PT', 'PR', 'QA', 'RE', 'RO', 'RU', 'RW', 'BL', 'SH',
    'KN', 'LC', 'MF', 'PM', 'VC', 'WS', 'SM', 'ST', 'SA', 'SN', 'RS',
    'SC', 'SL', 'SG', 'SX', 'SK', 'SI', 'SB', 'SO', 'ZA', 'GS', 'SS',
    'ES', 'LK', 'SD', 'SR', 'SJ', 'SE', 'CH', 'SY', 'TW', 'TJ', 'TZ',
    'TH', 'TL', 'TG', 'TK', 'TO', 'TT', 'TN', 'TR', 'TM', 'TC', 'TV',
    'UG', 'UA', 'AE', 'GB', 'US', 'UM', 'UY', 'UZ', 'VU', 'VE', 'VN',
    'VG', 'VI', 'WF', 'EH', 'YE', 'ZM', 'ZW',
)


class Alpha2(VisionsBaseType):
    """Visions type for series whose values are all listed two-letter country codes."""

    @classmethod
    def get_relations(cls):
        """Relate this type to ``TwoLetter`` through a single ``IdentityRelation``."""
        return [
            IdentityRelation(cls, TwoLetter)
        ]

    @classmethod
    def contains_op(cls, series):
        """Return whether every value of ``series`` is in ``ISO_3166_ALPHA_2_CODES``."""
        return series.isin(ISO_3166_ALPHA_2_CODES).all()

In [90]:
typeset += Alpha2

In [91]:
typeset.infer_type(df)

{'name': String,
 'alpha-2': ISO3166Alpha2,
 'alpha-3': ThreeLetter,
 'country-code': Integer,
 'iso_3166-2': String,
 'region': String,
 'sub-region': String,
 'intermediate-region': String,
 'region-code': Integer,
 'sub-region-code': Integer,
 'intermediate-region-code': Integer}

In [97]:
# typeset.plot_graph()

In [103]:
# Examples pulled from: https://www.xe.com/ibancalculator/sample/?ibancountry=france
# Each row holds a sample IBAN, a country name and a two-letter country code.
iban_df = pd.DataFrame(
    [
        ("LV80 BANK 0000 4351 9500 1", "Latvia", "LV"),
        ("FR14 2004 1010 0505 0001 3M02 606", "Martinique", "MQ"),
        ("FR14 2004 1010 1234 0001 3M02 606", "France", "FR"),
    ],
    columns=["iban", "country", "country_code"],
)

In [105]:
# Take the first two characters of each IBAN as its country-code prefix.
iban_df['iban_country_code'] = iban_df['iban'].str[:2]

In [107]:
iban_df

Unnamed: 0,iban,country,country_code,iban_country_code
0,LV80 BANK 0000 4351 9500 1,Latvia,LV,LV
1,FR14 2004 1010 0505 0001 3M02 606,Martinique,MQ,FR
2,FR14 2004 1010 1234 0001 3M02 606,France,FR,FR


In [106]:
typeset.infer_type(iban_df)

{'iban': String,
 'country': String,
 'country_code': ISO3166Alpha2,
 'iban_country_code': ISO3166Alpha2}

In [74]:
# https://github.com/globalcitizen/php-iban/tree/master/utils/example-ibans

In [109]:
# https://www.xe.com/ibancalculator/sample/?ibancountry=martinique
# https://en.wikipedia.org/wiki/ISO_3166-2:MQ