##  Data from File 5.2 Bank Accounts

### Step 1. Load required packages

In [1]:
import pandas as pd
import pandasql as ps
from pathlib import Path

#### Step 2 - Get sample from source file

In [8]:
sourceFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/5_Suppliers_Payees_BankAccounts'
workFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/Work/5_Bank_Accounts'
fileName = '5.2. SUPPLIER_BANK_ACCOUNTS_COPPEL.csv'
inputTextFile = open(Path(sourceFolder, fileName), 'r', encoding='latin-1')
if inputTextFile:
    numTextLines = 0
    while True:
        numTextLines += 1
        textLine = inputTextFile.readline()
        print(textLine)
        if numTextLines >10:
            break
    inputTextFile.close()

'SISTEMA_LEGADO';'NUMERO_PROVEEDOR';'FEEDER_IMPORT_BATCH_ID';'TEMP_EXT_PAYEE_ID';'TEMP_EXT_BANK_ACCT_ID';'BANK_NAME';'BRANCH_NAME';'COUNTRY_CODE';'BANK_ACCOUNT_NAME';'BANK_ACCOUNT_NUM';'CURRENCY_CODE';'FOREING_PAYMENT_USE_FLAG';'START_DATE';'END_DATE';'IBAN';'CHECK_DIGITS';'BANK_ACCOUNT_NAME_ALT';'BANK_ACCOUNT_TYPE';'ACCOUNT_SUFFIX';'DESCRIPTION';'AGENCY_LOCATION_CODE';'EXCHANGE_RATE_AGREEMENT_NUM';'EXCHANGE_RATE_AGREEMENT_TYPE';'EXCHANGE_RATE';'SECONDARY_ACCOUNT_REFERENCE';'ATTRIBUTE_CATEGORY';'ATTRIBUTE1';'ATTRIBUTE2';'ATTRIBUTE3';'ATTRIBUTE4';'ATTRIBUTE5';'ATTRIBUTE6';'ATTRIBUTE7';'ATTRIBUTE8';'ATTRIBUTE9';'ATTRIBUTE10';'ATTRIBUTE11';'ATTRIBUTE12';'ATTRIBUTE13';'ATTRIBUTE14';'ATTRIBUTE15'

'OBRAS';'GAVE860528F18';'100';'9';'0009';'HSBC';'PROVEEDORES';'MX';'';'4057367294';'MXN';'N';'';'';'';'';'';'';'';'';'';'';'';'';'';'';'021741040573672946';'';'';'';'';'';'';'';'';'';'';'';'';'';''

'GASTOS';'BIO111108JC6';'100';'12';'00012';'BANORTE';'PROVEEDORES';'MX';'';'0808351359';'MXN';'N';'

### Step 3 - Load bank account data

In [11]:
df = pd.read_csv(Path(sourceFolder,fileName), sep=';', quotechar=chr(39), encoding='latin-1')
df.dtypes

SISTEMA_LEGADO                   object
NUMERO_PROVEEDOR                 object
FEEDER_IMPORT_BATCH_ID            int64
TEMP_EXT_PAYEE_ID                 int64
TEMP_EXT_BANK_ACCT_ID             int64
BANK_NAME                        object
BRANCH_NAME                      object
COUNTRY_CODE                     object
BANK_ACCOUNT_NAME               float64
BANK_ACCOUNT_NUM                float64
CURRENCY_CODE                    object
FOREING_PAYMENT_USE_FLAG         object
START_DATE                      float64
END_DATE                        float64
IBAN                            float64
CHECK_DIGITS                    float64
BANK_ACCOUNT_NAME_ALT           float64
BANK_ACCOUNT_TYPE               float64
ACCOUNT_SUFFIX                  float64
DESCRIPTION                     float64
AGENCY_LOCATION_CODE            float64
EXCHANGE_RATE_AGREEMENT_NUM     float64
EXCHANGE_RATE_AGREEMENT_TYPE    float64
EXCHANGE_RATE                   float64
SECONDARY_ACCOUNT_REFERENCE     float64


### Step 4 - Validate Bank Account & Tax ID

In [17]:
modDf = df
modDf['TAXID_VALIDATION'] = False
modDf['CLABE_VALIDATION'] = False
modDf['NUMERO_PROVEEDOR'] = modDf['NUMERO_PROVEEDOR'].astype(str) 
modDf['ATTRIBUTE1'] = modDf['ATTRIBUTE1'].astype(str)
for index, row in modDf.iterrows():
    taxID = row['NUMERO_PROVEEDOR']
    if len(taxID)==12 or len(taxID)==13:
        modDf.loc[index, 'TAXID_VALIDATION'] = True
    if len(row['ATTRIBUTE1'].replace('.0',''))==18:
        modDf.loc[index, 'CLABE_VALIDATION'] = True
print(modDf.head(10))

  SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  TEMP_EXT_PAYEE_ID  \
0          OBRAS    GAVE860528F18                     100                  9   
1         GASTOS     BIO111108JC6                     100                 12   
2          OBRAS     BIO111108JC6                     100                 13   
3         GASTOS     DSE090511EW5                     100                 18   
4          OBRAS     DSE090511EW5                     100                 19   
5          OBRAS     TVC8607234U2                     100                 23   
6         GASTOS     TVC8607234U2                     100                 24   
7         GASTOS     ABS110113195                     100                 27   
8          OBRAS     ABS110113195                     100                 28   
9          OBRAS     DUO7511286H3                     100                 37   

   TEMP_EXT_BANK_ACCT_ID   BANK_NAME  BRANCH_NAME COUNTRY_CODE  \
0                      9        HSBC  PROVEEDORES    

### Step 5 - Creates a subset with elements whose taxt id and bank account are valid

In [20]:
subSetDf = modDf[(modDf['TAXID_VALIDATION']==True) & (modDf['CLABE_VALIDATION']==True)]
print(len(modDf.index) ,' records in original dataset')
print(len(subSetDf.index), ' records with valid taxt id and bank account')

171966  records in original dataset
160814  records with valid taxt id and bank account


### Step 6 - Performs analysis for duplicate records based on taxt id

In [26]:
sql_query = '''
SELECT NUMERO_PROVEEDOR,COUNT(*) AS numObs FROM subSetDf
GROUP BY NUMERO_PROVEEDOR
ORDER BY numObs DESC
'''
dupsAnalysis = ps.sqldf(sql_query)
print(len(dupsAnalysis.index), ' taxt ids in daset')
uniqueTaxdIds = dupsAnalysis[dupsAnalysis['numObs']==1]
print(len(uniqueTaxdIds.index), ' unique taxt ids')
duplicateTaxdIds = dupsAnalysis[dupsAnalysis['numObs']>1]
print(len(duplicateTaxdIds.index), ' duplicate taxt ids')

74641  taxt ids in daset
14  unique taxt ids
74627  duplicate taxt ids


In [28]:
print('unique taxt ids: ')
print(uniqueTaxdIds.head(10))
print('duplicate taxt ids: ')
print(duplicateTaxdIds.head(100))

unique taxt ids: 
      NUMERO_PROVEEDOR  numObs
74627     AME050428RN2       1
74628     CES100929KP8       1
74629     COR500328B10       1
74630     CSB080408DB8       1
74631     DIL1107085H3       1
74632     EME030203JAA       1
74633     FFM130226AY7       1
74634     ISE0402136VA       1
74635     MTE140127HH1       1
74636     OCO131019H46       1
duplicate taxt ids: 
   NUMERO_PROVEEDOR  numObs
0      DBM000228J35      14
1      ASO0408178B2      12
2      PMC9601107JA      12
3      COA010319I36      10
4      DGC910614EV3      10
..              ...     ...
95     MJM7401023F9       4
96     MNA840101GH6       4
97     MNO750115RY4       4
98     MPM8501014I9       4
99     MPX850101FX0       4

[100 rows x 2 columns]


In [30]:
sql_query = '''
SELECT numObs,COUNT(*) AS NumCases FROM duplicateTaxdIds
GROUP BY numObs
ORDER BY numObs DESC
'''
result = ps.sqldf(sql_query)
for index, row in result.iterrows():
    print(row['numObs'], ' duplicates, ', row['NumCases'], ' cases')

14  duplicates,  1  cases
12  duplicates,  2  cases
10  duplicates,  2  cases
8  duplicates,  9  cases
6  duplicates,  33  cases
4  duplicates,  82  cases
3  duplicates,  11148  cases
2  duplicates,  63350  cases


### Step 7 - Create sets for duplicates elimination rules validation

In [31]:
dupsSet1 = duplicateTaxdIds[duplicateTaxdIds['numObs']==2]
dupsSet2 = duplicateTaxdIds[duplicateTaxdIds['numObs']==3]
dupsSet3 = duplicateTaxdIds[duplicateTaxdIds['numObs']>3]

In [None]:
example1Df = uniqueTaxdIds.merge(subSetDf, on='NUMERO_PROVEEDOR', how='left')

In [9]:
dupsSet1 = duplicateTaxdIds[duplicateTaxdIds['numObs']==2]
dupsSet2 = duplicateTaxdIds[duplicateTaxdIds['numObs']==3]
dupsSet3 = duplicateTaxdIds[duplicateTaxdIds['numObs']>3]

workFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/Work/5_Bank_Accounts'
ValidClabetDf = df.loc[df['CLABE_VALIDATION'] == True]
ValidClabetDf.to_csv(str(Path(workFolder,'ValidCabledDf.csv')), encoding='utf-8')

In [10]:
for column in list(ValidClabetDf.columns):
    print(column)

SISTEMA_LEGADO
NUMERO_PROVEEDOR
FEEDER_IMPORT_BATCH_ID
TEMP_EXT_PAYEE_ID
TEMP_EXT_BANK_ACCT_ID
BANK_NAME
BRANCH_NAME
COUNTRY_CODE
BANK_ACCOUNT_NAME
BANK_ACCOUNT_NUM
CURRENCY_CODE
FOREING_PAYMENT_USE_FLAG
START_DATE
END_DATE
IBAN
CHECK_DIGITS
BANK_ACCOUNT_NAME_ALT
BANK_ACCOUNT_TYPE
ACCOUNT_SUFFIX
DESCRIPTION
AGENCY_LOCATION_CODE
EXCHANGE_RATE_AGREEMENT_NUM
EXCHANGE_RATE_AGREEMENT_TYPE
EXCHANGE_RATE
SECONDARY_ACCOUNT_REFERENCE
ATTRIBUTE_CATEGORY
ATTRIBUTE1
ATTRIBUTE2
ATTRIBUTE3
ATTRIBUTE4
ATTRIBUTE5
ATTRIBUTE6
ATTRIBUTE7
ATTRIBUTE8
ATTRIBUTE9
ATTRIBUTE10
ATTRIBUTE11
ATTRIBUTE12
ATTRIBUTE13
ATTRIBUTE14
ATTRIBUTE15
CLABE_NUMCHAR
CLABE_VALIDATION


In [11]:
print(ValidClabetDf.head(10))

  SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  TEMP_EXT_PAYEE_ID  \
0          OBRAS    GAVE860528F18                     100                  9   
1         GASTOS     BIO111108JC6                     100                 12   
2          OBRAS     BIO111108JC6                     100                 13   
3         GASTOS     DSE090511EW5                     100                 18   
4          OBRAS     DSE090511EW5                     100                 19   
5          OBRAS     TVC8607234U2                     100                 23   
6         GASTOS     TVC8607234U2                     100                 24   
7         GASTOS     ABS110113195                     100                 27   
8          OBRAS     ABS110113195                     100                 28   
9          OBRAS     DUO7511286H3                     100                 37   

   TEMP_EXT_BANK_ACCT_ID   BANK_NAME  BRANCH_NAME COUNTRY_CODE  \
0                      9        HSBC  PROVEEDORES    

In [14]:
sql_query = '''
SELECT NUMERO_PROVEEDOR,ATTRIBUTE1,COUNT(*) AS numObs FROM ValidClabetDf
GROUP BY NUMERO_PROVEEDOR,ATTRIBUTE1
ORDER BY numObs DESC
'''
ValidClabeWithCaseNum = ps.sqldf(sql_query)
ValidClabeWithCaseNum['numCase'] = 0
numCases = 1
for index, row in ValidClabeWithCaseNum.iterrows():
    ValidClabeWithCaseNum.loc[index, 'numCase'] = numCases
    numCases = numCases+1
print(ValidClabeWithCaseNum.head(10))

  NUMERO_PROVEEDOR          ATTRIBUTE1  numObs  numCase
0            45286  044630256032451519       5        1
1            19022  002028752400228503       4        2
2            22309  012180001147036695       4        3
3            23649  014180655020198282       4        4
4            30410  021671040569200104       4        5
5            34057  062580001231008082       4        6
6            47930  002263701144418896       4        7
7    AAGE230113CD0  002700038305532064       4        8
8    AEME900920V46  014650605597572727       4        9
9     DMI041025M13  002028752400229159       4       10


In [15]:
ValidClabeWithCaseNum.to_csv(str(Path(workFolder,'ValidClabeWithCaseNum.csv')), encoding='utf-8')