# generate IBANs only to mutilate them later
Pop Quiz, Invoice Suckers!

Which of the following is a valid IBAN?
```
NL59 K1NAB 6291 4688 95   |   NL29 COBA 9059T 5649 88   |   NL47 ABNA 83Q91 2006 36
NL59 KNAB 629?1 4688 95   |   N\L29 COBA 9059 5649 88   |   NL47 ABNA_ 8391 2006 36
NL59 KNAB 6291 468 95     |   NL29 COBA 90595649 88     |   NL47 ABNA 8391 2006 6
NL59 KNAB 6291 4688 95    |   N	L29 COBA 9059 5649 88|   NL47 ABNA 8391 2006 36
NL59 KNAb 6291 4688 95    |   Nl29 COBA 9059 5649 88    |   NL47 ABnA 8391 2006 36
NL59 KNAB 6291 5688 95    |   NL29 COBA 9059 5749 88    |   NL47 ABNA 8391 3006 36
```
The right answer is: None of them. The IBANs above were mutilated to simulate noise in our scanned invoices. I applied six different point-mutations to the strings. 

But is it really so difficult to combine six invalid IBANs into one valid IBAN? I believe a NN could learn to solve this. With the functions below it is possible to generate a training set for that NN. 

BTW, the correct IBANs are:
```
NL59 KNAB 6291 4688 95    |   NL29 COBA 9059 5649 88    |   NL47 ABNA 8391 2006 36
```

## notes on code
```
A formatted IBAN string is called a fiban, to distinguish it from an IBAN object.
(I suggest the name ciban for a compact iban.)
All functions (that handle lists of [cf]ibans) expect and return numpy arrays.

```

## where next?
* include cibans in the mix; Jannes' generator only uses the compact format
* generate a large pandas dataframe filled with some number of different versions (X) of the same fiban. 
* Add a column with the correct fiban (y).
* train a NN to take in different versions and predict the correct fiban.

Problems I see with that:
* How do we format the data for the NN's input? Onehot encoded characters, as Jannes' baseline?


In [1]:
!pip install -q schwifty

In [20]:
from schwifty import IBAN 
import numpy as np
import time as time
import re
import pandas as pd
import keras

# Seed numpy's global random generator from the current wall-clock time (whole seconds),
# so the generated IBANs and mutations differ from run to run.
np.random.seed(int(time.time()))
  
# NL bank codes taken from https://www.betaalvereniging.nl/aandachtsgebieden/giraal-betalingsverkeer/bic-sepa-transacties/
# The comma-separated string is split into a list with one bank code per entry.
dutch_bankcodes = "ABNA,ADYB,AEGO,ANAA,ANDL,ARBN,ARSN,ASNB,ATBA,BCDM,BCIT,BICK,BINK,BKCH,BKMG,BLGW,BMEU,BNDA,BNGH,BNPA,BOFA,BOFS,BOTK,BUNQ,CHAS,CITC,CITI,COBA,DEUT,DHBN,DLBK,DNIB,EBUR,FBHL,FLOR,FRGH,FRNX,FTSB,FVLB,GILL,HAND,HHBA,HSBC,ICBK,INGB,ISBK,KABA,KASA,KNAB,KOEX,KRED,LOCY,LOYD,LPLN,MHCB,MOYO,NNBA,NWAB,PCBC,RABO,RBRB,SNSB,SOGE,TEBU,TRIO,UBSW,UGBI,VOWA,ZWLB".split(",")

# print(dutch_bankcodes)

ModuleNotFoundError: No module named 'keras'

In [3]:
# Build one IBAN from a fixed bank code and account number,
# then show its formatted and its compact notation.
iban = IBAN.generate(
  'NL',
  bank_code='ASNB',
  account_code='1007027382',
)
for representation in (iban.formatted, iban.compact):
  print(representation)


NL56 ASNB 1007 0273 82
NL56ASNB1007027382


In [4]:
def generate_random_fibans(num_fibans, country_code='NL'):
  """
  Generate random IBANs and return them in formatted notation.

  num_fibans:   number of IBANs to generate
  country_code: country code handed to IBAN.generate; note that the bank code
                is always drawn from dutch_bankcodes, whatever country is passed

  Returns a numpy array with one formatted IBAN string per generated IBAN.
  """
  fibans = []
  for _ in range(num_fibans):
    bank_code = dutch_bankcodes[ np.random.randint(len(dutch_bankcodes)) ]
    # The upper bound does not fit in 32 bits; request int64 explicitly so the
    # draw also works where numpy's default integer type is 32 bits wide.
    account_code = str(np.random.randint(9999, high=9999999999, dtype=np.int64))
    iban = IBAN.generate(country_code, bank_code=bank_code, account_code=account_code)
    fibans.append(iban.formatted)

  return np.array(fibans)

In [5]:
def validate_fibans(fibans, mode='all'):
  """
  Validate IBAN strings by trying to construct an IBAN object from each one.

  fibans: iterable of IBAN strings
  mode:   'all' -> the judgment is True only when every entry is valid
          'any' -> the judgment is True when at least one entry is valid

  An entry counts as invalid when IBAN(entry) raises ValueError.

  Returns (judgment, validations): the combined verdict and a numpy boolean
  array with one entry per input string, usable as a mask on a numpy array.
  Raises ValueError when mode is neither 'all' nor 'any'.
  """
  # Reject unknown modes up front; otherwise no judgment would ever be assigned.
  if mode not in ('all', 'any'):
    raise ValueError("mode must be 'all' or 'any', got %r" % (mode,))

  validations = []
  for fiban in fibans:
    try:
      IBAN(fiban)
    except ValueError:
      validations.append(False)
    else:
      validations.append(True)

  # Force a boolean dtype so that an empty input still yields a valid mask.
  validations = np.array(validations, dtype=bool)

  if mode == 'all':
    judgment = np.all(validations)
  else:
    judgment = np.any(validations)

  return judgment, validations


In [6]:
# these functions take in a string, a position in the string and sometimes a new character
# they return a new string

def insert_char(instring, position, insert):
  """Return a new string with `insert` placed in front of index `position` of `instring`."""
  head = instring[:position]
  tail = instring[position:]
  return "".join((head, insert, tail))

def delete_char(instring, position):
  """
  Return a new string without the character at index `position`.

  Negative positions count from the end of the string, as in normal indexing.
  A position outside the string leaves the string unchanged.
  """
  # Map an in-range negative index to its positive equivalent. Without this,
  # position -1 makes the tail slice start at 0 and the string gets duplicated.
  if -len(instring) <= position < 0:
    position += len(instring)
  return instring[:position] + instring[position+1:]

def replace_char(instring, position, replacement):
  """
  Return a new string in which the character at index `position` is replaced
  by `replacement`.

  Negative positions count from the end of the string, as in normal indexing.
  """
  # Map an in-range negative index to its positive equivalent. Without this,
  # position -1 makes the tail slice start at 0 and the whole string is appended.
  if -len(instring) <= position < 0:
    position += len(instring)
  return instring[:position] + replacement + instring[position+1:]


In [7]:
def mutilate_strings(string_list, mode='insval', num_mutils=1):
  """
  Mutilate every string in the input list at random position(s) of the string.

  Choose a mode from this list:
  insval:   insert valid character
  insinv:   insert invalid character
  del:      delete character
  inswhite: insert whitespace
  lower:    change character to lowercase
  plus:     increase digit by 1, (9 to 0)

  string_list: iterable of strings; the input itself is not modified
  mode:        one of the modes listed above
  num_mutils:  number of mutations applied to each string

  A mutation that has nothing to act on is skipped: deleting from an empty
  string, 'lower' without an uppercase letter, 'plus' without a digit.

  Returns a numpy array with one mutilated string per input string.
  Raises ValueError when mode is not one of the modes listed above.
  """

  valid_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
  invalid_chars = "!@#$%^&*()_+-=[]{}\\';:\"|?><,./`~\"'±§™"
  whitespaces = '\t\n\x0b\x0c\r '
  caps_re = re.compile(r'([A-Z])')
  digits_re = re.compile(r'([0-9])')

  # The three insert modes differ only in the alphabet they draw from.
  insert_alphabets = {
    'insval': valid_chars,
    'insinv': invalid_chars,
    'inswhite': whitespaces,
  }
  if mode not in insert_alphabets and mode not in ('del', 'lower', 'plus'):
    raise ValueError("unknown mutilation mode: %r" % (mode,))

  mutilated_list = []
  for string in string_list:
    for _ in range(num_mutils):
      if mode in insert_alphabets:
        # insert a character from the mode's alphabet anywhere in the string,
        # including in front of the first and after the last character
        alphabet = insert_alphabets[mode]
        p = np.random.randint(len(string)+1)
        c = alphabet[ np.random.randint(len(alphabet)) ]
        string = insert_char(string, p, c)
      elif mode=='del':
        # delete a character anywhere in the string
        if not string:
          break  # nothing left to delete
        p = np.random.randint(len(string))
        string = delete_char(string, p)
      elif mode=='lower':
        # change an uppercase character anywhere in the string to lowercase
        caps_idx = [i.start() for i in caps_re.finditer(string) ]
        if not caps_idx:
          break  # no uppercase character left
        p = caps_idx[np.random.randint(len(caps_idx))]
        string = replace_char(string, p, string[p].lower())
      else:
        # mode 'plus': increase a digit anywhere in the string by 1, wrapping 9 to 0
        digits_idx = [i.start() for i in digits_re.finditer(string) ]
        if not digits_idx:
          break  # no digit to change
        p = digits_idx[np.random.randint(len(digits_idx))]
        replacement = (int(string[p]) + 1) % 10
        string = replace_char(string, p, str(replacement))

    mutilated_list.append(string)

  return np.array(mutilated_list)


In [8]:
# Draw a few clean fibans, then print the same fibans once per mutation mode.
separator = "   |   "
fibans = generate_random_fibans(3) # about 10s for 100k fibans in Colab
print(separator.join(fibans))

for m in ('insval', 'insinv', 'del', 'inswhite', 'lower', 'plus'):
  fibans_mut = mutilate_strings(fibans, mode=m)
  print(separator.join(fibans_mut))


NL85 LPLN 8569 5349 16   |   NL26 AEGO 2085 2103 43   |   NL31 DLBK 3955 9851 22
NL85 LPLN 8569 534R9 16   |   NL26 AEGO 2085 2103 413   |   NL31 DLBKI 3955 9851 22
NL85 LPLN 8569 53~49 16   |   NL2>6 AEGO 2085 2103 43   |   NL31' DLBK 3955 9851 22
NL85 LPLN 8569 5349 6   |   NL6 AEGO 2085 2103 43   |   NL31 DLBK 3955 981 22
NL85 LPLN 8569 5349 16   |   NL26 AEGO 2085 210
3 43   |   NL
31 DLBK 3955 9851 22
Nl85 LPLN 8569 5349 16   |   NL26 AEGo 2085 2103 43   |   Nl31 DLBK 3955 9851 22
NL85 LPLN 8579 5349 16   |   NL26 AEGO 2086 2103 43   |   NL31 DLBK 3965 9851 22


In [10]:
# Validate the clean fibans and the most recently mutilated ones; the boolean
# array returned by validate_fibans is used as a mask to print only the valid entries.
for candidates in (fibans, fibans_mut):
  judgement, index = validate_fibans(candidates)
  print(candidates[index])


['NL85 LPLN 8569 5349 16' 'NL26 AEGO 2085 2103 43' 'NL31 DLBK 3955 9851 22']
[]


In [11]:
# A hand-written batch of IBAN strings to run through the validator.
fibans_suspect = np.array([
  'NL57 ARBN 6235 3382 93',
  'NL09 C0BA 9088 1794 75',
  'NL5I FVLB 4249 7745 45',
  'NL73 BCIT 1427 5181 56',
  'NL92 BCIT 2066 7794 36',
  'NL67 FTSB 3188 3991 40',
  'NL82 GILL 5931 7640 18',
])
judgement, index = validate_fibans(fibans_suspect)


In [12]:
# from https://tsociety.io
cturban = 'NL54ABNA0473169371'
turban = IBAN(cturban)
fturbans = [turban.formatted]

print(fturbans)
print(validate_fibans(fturbans))

fturbans_plus = mutilate_strings(fturbans, mode='plus')
print(fturbans_plus)
print(validate_fibans(fturbans_plus))



['NL54 ABNA 0473 1693 71']
(True, array([ True], dtype=bool))
['NL54 ABNA 0473 2693 71']
(False, array([False], dtype=bool))


In [14]:
# Generate the clean fibans that serve as the ground truth for every noisy version below.
fibans = generate_random_fibans(100)  # 1M in 210 seconds, including the mutilations

# Version 1: insertions of valid characters, one lowercased letter, then deletions.
fibans_v1 = mutilate_strings(fibans, mode='insval', num_mutils=4)   # insert 4 extra chars
fibans_v1 = mutilate_strings(fibans_v1, mode='lower')               # change 1 UPPER to lower case
fibans_v1 = mutilate_strings(fibans_v1, mode='del', num_mutils=2)  # delete 2 chars

# Version 2: insertions of valid characters, deletions, then extra whitespace.
fibans_v2 = mutilate_strings(fibans, mode='insval', num_mutils=2)   # insert 2 extra chars
fibans_v2 = mutilate_strings(fibans_v2, mode='del', num_mutils=2)   # delete 2 chars
fibans_v2 = mutilate_strings(fibans_v2, mode='inswhite', num_mutils=6)   # insert 6 whitespaces

# Version 3: deletions only.
fibans_v3 = mutilate_strings(fibans, mode='del', num_mutils=4)      # delete 4 chars

# Version 4: insertions of invalid characters only.
fibans_v4 = mutilate_strings(fibans, mode='insinv', num_mutils=4)      # insert 4 invalid chars


In [15]:
# Collect the four noisy versions next to the clean fiban they were derived from
# (one row per IBAN) and write the table to disk.
columns = ['noise1', 'noise2', 'noise3', 'noise4', 'iban']
train = pd.DataFrame(columns=columns)

for column, values in zip(columns, (fibans_v1, fibans_v2, fibans_v3, fibans_v4, fibans)):
  train[column] = values

train.to_csv('iban_noisy_100.csv')


In [16]:
train.head()

Unnamed: 0,noise1,noise2,noise3,noise4,iban
0,NL66 RbB 65J01 C137211N3,\tN LQZ 66\r\rRBRB 6501 131 13,NL66RBB651 1371 13,NL66 RBR`B 6;501 1+371% 13,NL66 RBRB 6501 1371 13
1,oNL47 KNAB2266336 858 02,N47 K\nNAFB 2\t636 8Q586 0\r,NL47 KNAB 2636 860,"NL47 KNAB \2636 @""8~586 02",NL47 KNAB 2636 8586 02
2,nL73 NNBR 1346 2S53275 58,\rN73NNB\rA\t 1\r3462 32J\t75 5\r8,NL73 NNBA 1346327,'NL73 NNBA 1346 !;3275 5<8,NL73 NNBA 1346 3275 58
3,NL694 DHT 7609 2913 S25X,\nNL64\r DHBN 760 O2913 25,NL DHBN 7609 29325,"NL64@ DHBN! 7609] 2913 2""5",NL64 DHBN 7609 2913 25
4,InJL64 BUYNPQ592 7256 26,NL4 \tBUQ 592F1 Z7256\t 26,NL64 UNQ 921 256 6,NL)64 BUNQ 5?92{1 72[56 26,NL64 BUNQ 5921 7256 26


In [17]:
! ls -ltr


total 200
-rw-r--r--  1 rik  staff   1060 Mar 14 09:13 LICENSE
-rw-r--r--@ 1 rik  staff  31159 Mar 14 09:23 Wk6_Ortec_IBAN_generation.ipynb
-rw-r--r--@ 1 rik  staff  45008 Mar 14 09:46 Jannes_Baseline_Ortec.ipynb
-rw-r--r--  1 rik  staff    190 Mar 14 10:00 README.md
drwxr-xr-x  5 rik  staff    170 Mar 14 10:00 [34mtemplates[m[m
-rw-r--r--  1 rik  staff  12871 Mar 14 10:02 iban_noisy_100.csv


ModuleNotFoundError: No module named 'keras'

In [19]:
# Create an empty Sequential model.
model = keras.models.Sequential()

# NOTE(review): this line looks wrong and should be confirmed against the Keras docs:
#   - `model.layers` is presumably the model's own list of layers, not the
#     `keras.layers` module that provides `Dense`;
#   - `Dense` presumably needs its output size (`units`), and the input width is
#     normally given as `input_shape=(4,)` / `input_dim=4`, not `input_size`.
# The intended number of units is not visible here, so the call is left unchanged.
model.add(model.layers.Dense(input_size=4))

NameError: name 'keras' is not defined