# generate IBANs only to mutilate them later
Pop Quiz, Invoice Suckers!

Which of the following is a valid IBAN?
```
NL59 K1NAB 6291 4688 95   |   NL29 COBA 9059T 5649 88   |   NL47 ABNA 83Q91 2006 36
NL59 KNAB 629?1 4688 95   |   N\L29 COBA 9059 5649 88   |   NL47 ABNA_ 8391 2006 36
NL59 KNAB 6291 468 95     |   NL29 COBA 90595649 88     |   NL47 ABNA 8391 2006 6
NL59  KNAB 6291 4688 95   |   N	L29 COBA 9059 5649 88|   NL47   ABNA 8391 2006 36
NL59 KNAb 6291 4688 95    |   Nl29 COBA 9059 5649 88    |   NL47 ABnA 8391 2006 36
NL59 KNAB 6291 5688 95    |   NL29 COBA 9059 5749 88    |   NL47 ABNA 8391 3006 36
```
The right answer is: Probably none of them. The IBANs above were mutilated to simulate noise in our scanned invoices. I applied six different point-mutations to the strings. 

But is it really so difficult to combine six invalid IBANs into one valid IBAN? I believe a NN could learn to solve this. With the functions below it is possible to generate a training set for that NN. 

BTW, the correct IBANs are:
```
NL59 KNAB 6291 4688 95    |   NL29 COBA 9059 5649 88    |   NL47 ABNA 8391 2006 36
```

## notes on code
```
A formatted IBAN string is called a fiban, to distinguish it from an IBAN object.
(I suggest the name ciban for a compact iban.)
All functions (that handle lists of [cf]ibans) expect and return numpy arrays.

```

## where next?
* include cibans into the mix, Jannes' generator only uses the compact format
* generate a large pandas dataframe filled with some number of different versions (X) of the same fiban. 
* Add a column with the correct fiban (y).
* train a NN to take in different versions and predict the correct fiban.

Problems I see with that:
* How do we format the data for the NN's input? Onehot encoded characters, as Jannes' baseline?


In [1]:
!pip install -q schwifty

In [2]:
from schwifty import IBAN 
import numpy as np
import time as time
import re
import pandas as pd
import uuid

# seed numpy's global RNG with the current Unix time (whole seconds)
np.random.seed(int(time.time()))

# NL bank codes taken from https://www.betaalvereniging.nl/aandachtsgebieden/giraal-betalingsverkeer/bic-sepa-transacties/
# kept as one comma-separated string (split over several lines) and turned into a list
dutch_bankcodes = (
  "ABNA,ADYB,AEGO,ANAA,ANDL,ARBN,ARSN,ASNB,ATBA,BCDM,"
  "BCIT,BICK,BINK,BKCH,BKMG,BLGW,BMEU,BNDA,BNGH,BNPA,"
  "BOFA,BOFS,BOTK,BUNQ,CHAS,CITC,CITI,COBA,DEUT,DHBN,"
  "DLBK,DNIB,EBUR,FBHL,FLOR,FRGH,FRNX,FTSB,FVLB,GILL,"
  "HAND,HHBA,HSBC,ICBK,INGB,ISBK,KABA,KASA,KNAB,KOEX,"
  "KRED,LOCY,LOYD,LPLN,MHCB,MOYO,NNBA,NWAB,PCBC,RABO,"
  "RBRB,SNSB,SOGE,TEBU,TRIO,UBSW,UGBI,VOWA,ZWLB"
).split(",")

# print(dutch_bankcodes)

In [3]:
# build one IBAN from fixed parts and print both of its string renderings
iban = IBAN.generate('NL', bank_code='ASNB', account_code='1007027382')
for rendering in (iban.formatted, iban.compact):
  print(rendering)


NL56 ASNB 1007 0273 82
NL56ASNB1007027382


In [4]:
def generate_random_fibans(num_fibans, country_code='NL'):
  """
  Generate random IBANs in formatted (space-separated) notation.

  num_fibans:   number of IBANs to generate
  country_code: handed to IBAN.generate as is; the bank code is always drawn
                from dutch_bankcodes, so presumably only 'NL' makes sense here

  Returns a numpy array with one formatted IBAN string (fiban) per entry.
  """
  fibans = []
  for _ in range(num_fibans):
    bank_code = dutch_bankcodes[ np.random.randint(len(dutch_bankcodes)) ]
    # ask for int64 explicitly: the upper bound does not fit into 32 bits, so
    # randint raises where numpy's default integer type is only 32 bits wide
    account_code = str(np.random.randint(9999, high=9999999999, dtype=np.int64))
    iban = IBAN.generate(country_code, bank_code=bank_code, account_code=account_code)
    fibans.append(iban.formatted)

  return np.array(fibans)

In [5]:
def validate_fibans(fibans, mode='all'):
  """
  Check which of the given strings can be parsed as an IBAN.

  fibans: iterable of IBAN strings
  mode:   'all' -> judgment is True when every string is valid
          'any' -> judgment is True when at least one string is valid

  Returns (judgment, validations); validations is a boolean numpy array with
  one entry per input string, usable as a mask on a numpy array of fibans.
  Raises ValueError for an unknown mode.
  """
  # reject a mistyped mode up front instead of failing on an unbound name after the loop
  if mode not in ('all', 'any'):
    raise ValueError("unknown mode {!r}, choose 'all' or 'any'".format(mode))

  validations = []
  for fiban in fibans:
    try:
      IBAN(fiban)
    except ValueError:
      validations.append(False)
    else:
      validations.append(True)

  # force a boolean dtype: an empty list would otherwise become a float array,
  # which cannot be used as an index mask
  validations = np.array(validations, dtype=bool)

  if mode=='all':
    judgment = np.all(validations)
  else:
    judgment = np.any(validations)

  return judgment, validations


In [6]:
# these functions take in a string, a position in the string and sometimes a new character
# they return a new string

def insert_char(instring, position, insert):
  """Return a copy of instring with `insert` placed in front of index `position`."""
  head = instring[:position]
  tail = instring[position:]
  return head + insert + tail

def delete_char(instring, position):
  """Return a copy of instring without the character at index `position`."""
  kept_before = instring[:position]
  kept_after = instring[position+1:]
  return kept_before + kept_after

def replace_char(instring, position, replacement):
  """Return a copy of instring with the character at index `position` swapped for `replacement`."""
  kept_before = instring[:position]
  kept_after = instring[position+1:]
  return kept_before + replacement + kept_after


In [7]:
def mutilate_strings(string_list, mode='insval', num_mutils=1):
  """
  Mutilate every string in the input list at random position(s) of the string.

  Choose a mode from this list:
  insval:   insert valid character
  insinv:   insert invalid character
  del:      delete character
  inswhite: insert whitespace
  lower:    change character to lowercase
  plus:     increase digit by 1, (9 to 0)

  num_mutils is the number of mutilations applied to each string.
  'del' stops once a string is empty; 'lower' and 'plus' leave a string alone
  when it holds no capital letter / no digit.

  Returns a numpy array with the mutilated strings, in input order.
  Raises ValueError for an unknown mode.
  """

  # the three insert modes differ only in the alphabet they draw from
  insert_alphabets = {
    'insval':   "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
    'insinv':   "!@#$%^&*()_+-=[]{}\\';:\"|?><,./`~\"'±§™",
    'inswhite': "\t ",   #  '\t\n\x0b\x0c\r '     turns out that some of these whitespaces are dangerous
  }
  caps_re = re.compile(r'([A-Z])')
  digits_re = re.compile(r'([0-9])')

  # a mistyped mode would otherwise hand back the strings untouched without any warning
  if mode not in insert_alphabets and mode not in ('del', 'lower', 'plus'):
    raise ValueError("unknown mode {!r}".format(mode))

  mutilated_list = []
  for string in string_list:

    for _ in range(num_mutils):
      if mode in insert_alphabets:
        # insert a character from the mode's alphabet anywhere in the string,
        # including in front of the first and behind the last character
        alphabet = insert_alphabets[mode]
        p = np.random.randint(len(string)+1)
        # index with the length of the alphabet actually in use, so that
        # every character of it can be drawn
        c = alphabet[ np.random.randint(len(alphabet)) ]
        string = insert_char(string, p, c)
      elif mode=='del':
        # delete a character anywhere in the string
        if not string:
          break   # nothing left to delete
        # valid indices are 0 .. len-1
        p = np.random.randint(len(string))
        string = delete_char(string, p)
      elif mode=='lower':
        # change one of the capital letters to lowercase
        caps_idx = [i.start() for i in caps_re.finditer(string) ]
        if not caps_idx:
          break   # no capital letters left
        p = caps_idx[np.random.randint(len(caps_idx))]
        string = replace_char(string, p, string[p].lower())
      else:
        # mode 'plus': increase one of the digits by 1, wrapping 9 around to 0
        digits_idx = [i.start() for i in digits_re.finditer(string) ]
        if not digits_idx:
          break   # no digits to change
        p = digits_idx[np.random.randint(len(digits_idx))]
        replacement = (int(string[p]) + 1) % 10
        string = replace_char(string, p, str(replacement))

    mutilated_list.append(string)

  return np.array(mutilated_list)


In [8]:
# show a few clean fibans on one line ...
separator = "   |   "
fibans = generate_random_fibans(3) # about 10s for 100k fibans in Colab
print(separator.join(fibans))

# ... followed by one line per mutilation mode, each applied to the clean fibans
for m in ('insval', 'insinv', 'del', 'inswhite', 'lower', 'plus'):
  fibans_mut = mutilate_strings(fibans, mode=m)
  print(separator.join(fibans_mut))


NL13 BUNQ 7086 3621 85   |   NL06 GILL 4554 2868 16   |   NL17 SOGE 9237 2276 08
NOL13 BUNQ 7086 3621 85   |   NL06 GILL O4554 2868 16   |   NL17 SOGE 9237 2276E 08
NL13 BUNQ 7086 36@21 85   |   N[L06 GILL 4554 2868 16   |   NL17 SOGE 9237 2276 0^8
NL13BUNQ 7086 3621 85   |   NL6 GILL 4554 2868 16   |   NL17 SOGE 9237 2276 0
NL13 BUNQ 70 86 3621 85   |   NL06  GILL 4554 2868 16   |   NL17 SOGE 9237 2276 0 8
nL13 BUNQ 7086 3621 85   |   NL06 GILl 4554 2868 16   |   NL17 SOgE 9237 2276 08
NL23 BUNQ 7086 3621 85   |   NL06 GILL 4554 2968 16   |   NL17 SOGE 9247 2276 08


In [9]:
# print the entries that pass validation: first of the clean fibans, then of the mutilated ones
for candidates in (fibans, fibans_mut):
  judgement, index = validate_fibans(candidates)
  print(candidates[index])


['NL13 BUNQ 7086 3621 85' 'NL06 GILL 4554 2868 16'
 'NL17 SOGE 9237 2276 08']
[]


In [10]:
# hand-picked strings to run through the validator
fibans_suspect = np.array([
  'NL57 ARBN 6235 3382 93',
  'NL09 C0BA 9088 1794 75',
  'NL5I FVLB 4249 7745 45',
  'NL73 BCIT 1427 5181 56',
  'NL92 BCIT 2066 7794 36',
  'NL67 FTSB 3188 3991 40',
  'NL82 GILL 5931 7640 18',
])
judgement, index = validate_fibans(fibans_suspect)


In [11]:
# from https://tsociety.io
cturban = 'NL54ABNA0473169371'
turban = IBAN(cturban)
fturbans = [turban.formatted]

print(fturbans)
print(validate_fibans(fturbans))

fturbans_plus = mutilate_strings(fturbans, mode='plus')
print(fturbans_plus)
print(validate_fibans(fturbans_plus))



['NL54 ABNA 0473 1693 71']
(True, array([ True]))
['NL54 ABNA 1473 1693 71']
(False, array([False]))


# plan
Generate 4 different versions of the same IBANs, keep the intact IBAN as a training target.

Store in four different files. Two columns per file: x and y, both as strings.

## Better plan
Generate as above, but save the mutilated IBANs in files that contain all 5 fields, for a future stacker to chew on. Leave the 4 other fields empty for now. Also save the true IBANs in a file like that. Make rows correspond to each other using (fake) UUIDs.
```
fake_uuid     x_name, x_kvk, x_iban, x_reference, x_total

fake_uuid     y_name, y_kvk, y_iban, y_reference, y_total
```

In [12]:
# number of IBANs to generate for the training set
PLENTY = 47000
# label used in the output file names, derived from PLENTY so the two cannot
# drift apart (whole thousands, rounded down: 47000 -> "47k")
plentyname = "{}k".format(PLENTY // 1000)


In [13]:
fibans = generate_random_fibans(PLENTY)  # 1M in 210 seconds, including the mutilations

# version 1: 'insval' x4, then 'lower' x1, then 'del' x2
fibans_v1 = mutilate_strings(fibans, mode='insval', num_mutils=4)
fibans_v1 = mutilate_strings(fibans_v1, mode='lower')
fibans_v1 = mutilate_strings(fibans_v1, mode='del', num_mutils=2)

# version 2: 'insval' x2, then 'del' x2, then 'inswhite' x6
fibans_v2 = mutilate_strings(fibans, mode='insval', num_mutils=2)
fibans_v2 = mutilate_strings(fibans_v2, mode='del', num_mutils=2)
fibans_v2 = mutilate_strings(fibans_v2, mode='inswhite', num_mutils=6)

# version 3: 'del' x4
fibans_v3 = mutilate_strings(fibans, mode='del', num_mutils=4)

# version 4: 'insinv' x4
fibans_v4 = mutilate_strings(fibans, mode='insinv', num_mutils=4)

# one random uuid and one 'NaN' placeholder string per generated fiban
fake_uuids = [str(uuid.uuid4()) for _ in fibans]
nans = ['NaN'] * len(fibans)
#


In [14]:
import os

train_dir = "./train_G_iban/"
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(train_dir, exist_ok=True)


def _five_field_frame(prefix, ibans):
  """
  Build a frame with a uuid column plus the five fields name, kvk, iban,
  reference and total, each prefixed with `prefix` ('x' or 'y').
  Only the iban field is filled from `ibans`; the other fields get the 'NaN' placeholders.
  """
  columns = ['uuid'] + ['{}_{}'.format(prefix, field)
                        for field in ('name', 'kvk', 'iban', 'reference', 'total')]
  frame = pd.DataFrame(columns=columns)
  frame['uuid'] = fake_uuids
  for column in columns[1:]:
    frame[column] = ibans if column.endswith('_iban') else nans
  return frame


# one file per mutilated version; rows line up with the truth file through the uuid column
for version, fibans_vx in enumerate([fibans_v1, fibans_v2, fibans_v3, fibans_v4], start=1):
  train = _five_field_frame('x', fibans_vx)
  train.to_csv(train_dir + 'iban_predicted_GX{}_{}.csv'.format(version, plentyname), index=False)

# save the truth in a similar file
truth = _five_field_frame('y', fibans)
truth.to_csv(train_dir + 'iban_truth_GY_{}.csv'.format(plentyname), index=False)

# keep COLUMNS defined for later cells; it ends up holding the y_* column names
COLUMNS = list(truth.columns)



In [15]:
# peek at the first rows of the truth frame
truth.head()

Unnamed: 0,uuid,y_name,y_kvk,y_iban,y_reference,y_total
0,a6ec0d75-f2d3-4965-b626-fca8fe70c286,,,NL80 COBA 0969 0380 25,,
1,9ababcab-b944-4c15-a192-a57ee63a5ebd,,,NL32 BCIT 8598 1422 38,,
2,eb0715a4-d68f-4ac4-a6ab-429afb27cb24,,,NL47 DNIB 3721 8240 06,,
3,d71c3ace-7083-4c31-a52e-d4ae0afba8f7,,,NL04 RBRB 8108 2396 61,,
4,1e310847-f7e4-40f8-9d58-2441fc585902,,,NL19 BOFA 7398 1591 18,,


In [16]:
# long listing of the output directory, sorted by modification time with the newest file last
! ls -ltr {train_dir}


total 17880
-rw-rw-r-- 1 paperspace paperspace 3669791 Mar 16 14:47 iban_predicted_GX1_47k.csv
-rw-rw-r-- 1 paperspace paperspace 3858155 Mar 16 14:47 iban_predicted_GX2_47k.csv
-rw-rw-r-- 1 paperspace paperspace 3393175 Mar 16 14:47 iban_predicted_GX3_47k.csv
-rw-rw-r-- 1 paperspace paperspace 3808344 Mar 16 14:47 iban_predicted_GX4_47k.csv
-rw-rw-r-- 1 paperspace paperspace 3572045 Mar 16 14:47 iban_truth_GY_47k.csv
