##  Data from File 5.2 Bank Accounts

### Step 1. Load required packages

In [1]:
import pandas as pd
import pandasql as ps
from pathlib import Path

#### Step 2 - Get sample from source file

In [2]:
sourceFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/5_Suppliers_Payees_BankAccounts'
workFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/Work/5_Bank_Accounts'
fileName = '5.2. SUPPLIER_BANK_ACCOUNTS_COPPEL.csv'
inputTextFile = open(Path(sourceFolder, fileName), 'r', encoding='latin-1')
if inputTextFile:
    numTextLines = 0
    while True:
        numTextLines += 1
        textLine = inputTextFile.readline()
        print(textLine)
        if numTextLines >10:
            break
    inputTextFile.close()

'SISTEMA_LEGADO';'NUMERO_PROVEEDOR';'FEEDER_IMPORT_BATCH_ID';'TEMP_EXT_PAYEE_ID';'TEMP_EXT_BANK_ACCT_ID';'BANK_NAME';'BRANCH_NAME';'COUNTRY_CODE';'BANK_ACCOUNT_NAME';'BANK_ACCOUNT_NUM';'CURRENCY_CODE';'FOREING_PAYMENT_USE_FLAG';'START_DATE';'END_DATE';'IBAN';'CHECK_DIGITS';'BANK_ACCOUNT_NAME_ALT';'BANK_ACCOUNT_TYPE';'ACCOUNT_SUFFIX';'DESCRIPTION';'AGENCY_LOCATION_CODE';'EXCHANGE_RATE_AGREEMENT_NUM';'EXCHANGE_RATE_AGREEMENT_TYPE';'EXCHANGE_RATE';'SECONDARY_ACCOUNT_REFERENCE';'ATTRIBUTE_CATEGORY';'ATTRIBUTE1';'ATTRIBUTE2';'ATTRIBUTE3';'ATTRIBUTE4';'ATTRIBUTE5';'ATTRIBUTE6';'ATTRIBUTE7';'ATTRIBUTE8';'ATTRIBUTE9';'ATTRIBUTE10';'ATTRIBUTE11';'ATTRIBUTE12';'ATTRIBUTE13';'ATTRIBUTE14';'ATTRIBUTE15'

'OBRAS';'GAVE860528F18';'100';'9';'0009';'HSBC';'PROVEEDORES';'MX';'';'4057367294';'MXN';'N';'';'';'';'';'';'';'';'';'';'';'';'';'';'';'021741040573672946';'';'';'';'';'';'';'';'';'';'';'';'';'';''

'GASTOS';'BIO111108JC6';'100';'12';'00012';'BANORTE';'PROVEEDORES';'MX';'';'0808351359';'MXN';'N';'

### Step 3 - Load bank account data

In [3]:
df = pd.read_csv(Path(sourceFolder,fileName), sep=';', quotechar=chr(39), encoding='latin-1')
df.dtypes

SISTEMA_LEGADO                   object
NUMERO_PROVEEDOR                 object
FEEDER_IMPORT_BATCH_ID            int64
TEMP_EXT_PAYEE_ID                 int64
TEMP_EXT_BANK_ACCT_ID             int64
BANK_NAME                        object
BRANCH_NAME                      object
COUNTRY_CODE                     object
BANK_ACCOUNT_NAME               float64
BANK_ACCOUNT_NUM                float64
CURRENCY_CODE                    object
FOREING_PAYMENT_USE_FLAG         object
START_DATE                      float64
END_DATE                        float64
IBAN                            float64
CHECK_DIGITS                    float64
BANK_ACCOUNT_NAME_ALT           float64
BANK_ACCOUNT_TYPE               float64
ACCOUNT_SUFFIX                  float64
DESCRIPTION                     float64
AGENCY_LOCATION_CODE            float64
EXCHANGE_RATE_AGREEMENT_NUM     float64
EXCHANGE_RATE_AGREEMENT_TYPE    float64
EXCHANGE_RATE                   float64
SECONDARY_ACCOUNT_REFERENCE     float64


### Step 4 - Validate Bank Account & Tax ID

In [5]:
modDf = df
modDf['CLABE_VALIDATION'] = False
modDf['ATTRIBUTE1'] = modDf['ATTRIBUTE1'].astype(str)
for index, row in modDf.iterrows():
    if len(row['ATTRIBUTE1'].replace('.0',''))==18:
        modDf.loc[index, 'CLABE_VALIDATION'] = True
modDf = modDf[modDf['NUMERO_PROVEEDOR'].notna()]
print('the orignal dataset contains ', len(df.index), ' records')
print('the modified dataset contains ', len(modDf.index), ' records.')
print(modDf.head(10))

the orignal dataset contains  171966  records
the modified dataset contains  171966  records.
  SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  TEMP_EXT_PAYEE_ID  \
0          OBRAS    GAVE860528F18                     100                  9   
1         GASTOS     BIO111108JC6                     100                 12   
2          OBRAS     BIO111108JC6                     100                 13   
3         GASTOS     DSE090511EW5                     100                 18   
4          OBRAS     DSE090511EW5                     100                 19   
5          OBRAS     TVC8607234U2                     100                 23   
6         GASTOS     TVC8607234U2                     100                 24   
7         GASTOS     ABS110113195                     100                 27   
8          OBRAS     ABS110113195                     100                 28   
9          OBRAS     DUO7511286H3                     100                 37   

   TEMP_EXT_BANK_ACCT_ID 

### Step 5 - Creates a subset with elements whose taxt id and bank account are valid

In [7]:
subSetDf = modDf[modDf['CLABE_VALIDATION']==True]
print(len(modDf.index) ,' records in original dataset')
print(len(subSetDf.index), ' records with a valid bank account (18 digits)')

171966  records in original dataset
164836  records with a valid bank account (18 digits)


### Step 6 - Performs analysis for duplicate records based on taxt id or supplier id

In [8]:
sql_query = '''
SELECT NUMERO_PROVEEDOR,COUNT(*) AS numObs FROM subSetDf
GROUP BY NUMERO_PROVEEDOR
ORDER BY numObs DESC
'''
dupsAnalysis = ps.sqldf(sql_query)
print(len(dupsAnalysis.index), ' taxt ids in daset')
uniqueTaxdIds = dupsAnalysis[dupsAnalysis['numObs']==1]
print(len(uniqueTaxdIds.index), ' unique taxt ids')
duplicateTaxdIds = dupsAnalysis[dupsAnalysis['numObs']>1]
print(len(duplicateTaxdIds.index), ' duplicate taxt ids')
#
sql_query = '''
SELECT SISTEMA_LEGADO,COUNT(*) AS numObs FROM 
(SELECT SISTEMA_LEGADO,NUMERO_PROVEEDOR FROM subSetDf)
GROUP BY SISTEMA_LEGADO
ORDER BY numObs DESC
'''
distPerLegacy = ps.sqldf(sql_query)
for index, row in distPerLegacy.iterrows():
    print(row['SISTEMA_LEGADO'], ',', row['numObs'])

76565  taxt ids in daset
193  unique taxt ids
76372  duplicate taxt ids
GASTOS , 74841
OBRAS , 74826
MOTOR DE SEGUIMIENTO , 11147
CONSTRUNET , 1913
SIM , 1913
NEGOCIOS AFILIADOS , 107
TECNOLOGIA , 59
SIE , 11
MARKETPLACE , 10
REFACCIONARIA , 8
PORTAL SARI , 1


In [9]:
print('unique taxt ids: ')
print(uniqueTaxdIds.head(10))
print('duplicate taxt ids: ')
print(duplicateTaxdIds.head(100))

unique taxt ids: 
      NUMERO_PROVEEDOR  numObs
76372            16291       1
76373            16826       1
76374            17147       1
76375             1796       1
76376            18005       1
76377            18247       1
76378            18337       1
76379            18434       1
76380            18606       1
76381            18798       1
duplicate taxt ids: 
   NUMERO_PROVEEDOR  numObs
0             20539      27
1      DBM000228J35      14
2      ASO0408178B2      12
3      PMC9601107JA      12
4             19546      10
..              ...     ...
95            36928       5
96            37929       5
97            38728       5
98            40530       5
99            40834       5

[100 rows x 2 columns]


In [10]:
sql_query = '''
SELECT numObs,COUNT(*) AS NumCases FROM duplicateTaxdIds
GROUP BY numObs
ORDER BY numObs DESC
'''
result = ps.sqldf(sql_query)
for index, row in result.iterrows():
    print(row['numObs'], ' duplicates, ', row['NumCases'], ' cases')

27  duplicates,  1  cases
14  duplicates,  1  cases
12  duplicates,  2  cases
10  duplicates,  3  cases
9  duplicates,  2  cases
8  duplicates,  18  cases
7  duplicates,  5  cases
6  duplicates,  50  cases
5  duplicates,  24  cases
4  duplicates,  98  cases
3  duplicates,  11203  cases
2  duplicates,  64965  cases


### Step 7 - Perfoms analysis of distribution of accounts per supplier

In [12]:
sql_query = '''
SELECT NUMERO_PROVEEDOR,COUNT(*) AS numObs FROM
(SELECT NUMERO_PROVEEDOR,ATTRIBUTE1 FROM modDf)
GROUP BY NUMERO_PROVEEDOR
ORDER BY numObs DESC
'''
accountsPerSupplierDf = ps.sqldf(sql_query)
sql_query = '''
SELECT numObs,COUNT(*) AS numCases FROM accountsPerSupplierDf
GROUP BY numObs
ORDER BY numObs DESC
'''
result = ps.sqldf(sql_query)
for index, row in result.iterrows():
    print(row['numObs'], ', ', row['numCases'])

14 ,  1
27 ,  1
9 ,  2
12 ,  2
7 ,  5
10 ,  5
8 ,  22
5 ,  28
6 ,  53
4 ,  121
1 ,  179
3 ,  11207
2 ,  68440


### Step 7 - Create sets for duplicates elimination rules validation

In [31]:
dupsSet1 = duplicateTaxdIds[duplicateTaxdIds['numObs']==2]
dupsSet2 = duplicateTaxdIds[duplicateTaxdIds['numObs']==3]
dupsSet3 = duplicateTaxdIds[duplicateTaxdIds['numObs']>3]

In [43]:
example1Df = uniqueTaxdIds.merge(subSetDf, on='NUMERO_PROVEEDOR', how='left')
example1Df = example1Df[['SISTEMA_LEGADO','NUMERO_PROVEEDOR','FEEDER_IMPORT_BATCH_ID','TEMP_EXT_PAYEE_ID','TEMP_EXT_BANK_ACCT_ID',
'BANK_NAME','BRANCH_NAME','COUNTRY_CODE','BANK_ACCOUNT_NAME','BANK_ACCOUNT_NUM','CURRENCY_CODE','FOREING_PAYMENT_USE_FLAG',
'START_DATE','END_DATE','IBAN','CHECK_DIGITS','BANK_ACCOUNT_NAME_ALT','BANK_ACCOUNT_TYPE','ACCOUNT_SUFFIX','DESCRIPTION',
'AGENCY_LOCATION_CODE','EXCHANGE_RATE_AGREEMENT_NUM','EXCHANGE_RATE_AGREEMENT_TYPE','EXCHANGE_RATE','SECONDARY_ACCOUNT_REFERENCE',
'ATTRIBUTE_CATEGORY','ATTRIBUTE1','ATTRIBUTE2','ATTRIBUTE3','ATTRIBUTE4','ATTRIBUTE5','ATTRIBUTE6','ATTRIBUTE7','ATTRIBUTE8',
'ATTRIBUTE9','ATTRIBUTE10','ATTRIBUTE11','ATTRIBUTE12','ATTRIBUTE13','ATTRIBUTE14','ATTRIBUTE15']]
print(example1Df)
example1Df.to_csv(Path(workFolder, '5.2 Bank Accounts - RFCS Unicos.csv'), encoding='utf-8')

   SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  TEMP_EXT_PAYEE_ID  \
0          GASTOS     AME050428RN2                     100              16554   
1          GASTOS     CES100929KP8                     100              29713   
2          GASTOS     COR500328B10                     100              46860   
3          GASTOS     CSB080408DB8                     100              45531   
4          GASTOS     DIL1107085H3                     100              59935   
5          GASTOS     EME030203JAA                     100              63648   
6          GASTOS     FFM130226AY7                     100              72581   
7          GASTOS     ISE0402136VA                     100             108358   
8          GASTOS     MTE140127HH1                     100             149533   
9          GASTOS     OCO131019H46                     100             162312   
10         GASTOS     SAA1610043Q6                     100             199990   
11         GASTOS     SIN120

In [46]:
example2Df = dupsSet1.merge(subSetDf, on='NUMERO_PROVEEDOR', how='left')
example2Df = example2Df[['SISTEMA_LEGADO','NUMERO_PROVEEDOR','FEEDER_IMPORT_BATCH_ID','TEMP_EXT_PAYEE_ID','TEMP_EXT_BANK_ACCT_ID',
'BANK_NAME','BRANCH_NAME','COUNTRY_CODE','BANK_ACCOUNT_NAME','BANK_ACCOUNT_NUM','CURRENCY_CODE','FOREING_PAYMENT_USE_FLAG',
'START_DATE','END_DATE','IBAN','CHECK_DIGITS','BANK_ACCOUNT_NAME_ALT','BANK_ACCOUNT_TYPE','ACCOUNT_SUFFIX','DESCRIPTION',
'AGENCY_LOCATION_CODE','EXCHANGE_RATE_AGREEMENT_NUM','EXCHANGE_RATE_AGREEMENT_TYPE','EXCHANGE_RATE','SECONDARY_ACCOUNT_REFERENCE',
'ATTRIBUTE_CATEGORY','ATTRIBUTE1','ATTRIBUTE2','ATTRIBUTE3','ATTRIBUTE4','ATTRIBUTE5','ATTRIBUTE6','ATTRIBUTE7','ATTRIBUTE8',
'ATTRIBUTE9','ATTRIBUTE10','ATTRIBUTE11','ATTRIBUTE12','ATTRIBUTE13','ATTRIBUTE14','ATTRIBUTE15']]
print(example2Df.head(10))
example2Df.to_csv(Path(workFolder, '5.2 Bank Accounts - RFCS Duplicados -2.csv'), encoding='utf-8')

  SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  TEMP_EXT_PAYEE_ID  \
0         GASTOS     &JE040614N51                     100                139   
1          OBRAS     &JE040614N51                     100                140   
2         GASTOS    00AL570409DC8                     100             166443   
3          OBRAS    00AL570409DC8                     100             166445   
4          OBRAS    00BR5103152L7                     100             164158   
5         GASTOS    00BR5103152L7                     100             164159   
6          OBRAS    00CE810603TI4                     100             164715   
7         GASTOS    00CE810603TI4                     100             164716   
8         GASTOS     A&G060523IW9                     100              52878   
9          OBRAS     A&G060523IW9                     100              52879   

   TEMP_EXT_BANK_ACCT_ID      BANK_NAME  BRANCH_NAME COUNTRY_CODE  \
0                    134  BBVA BANCOMER  PROVEEDOR

In [47]:
example3Df = dupsSet2.merge(subSetDf, on='NUMERO_PROVEEDOR', how='left')
example3Df = example3Df[['SISTEMA_LEGADO','NUMERO_PROVEEDOR','FEEDER_IMPORT_BATCH_ID','TEMP_EXT_PAYEE_ID','TEMP_EXT_BANK_ACCT_ID',
'BANK_NAME','BRANCH_NAME','COUNTRY_CODE','BANK_ACCOUNT_NAME','BANK_ACCOUNT_NUM','CURRENCY_CODE','FOREING_PAYMENT_USE_FLAG',
'START_DATE','END_DATE','IBAN','CHECK_DIGITS','BANK_ACCOUNT_NAME_ALT','BANK_ACCOUNT_TYPE','ACCOUNT_SUFFIX','DESCRIPTION',
'AGENCY_LOCATION_CODE','EXCHANGE_RATE_AGREEMENT_NUM','EXCHANGE_RATE_AGREEMENT_TYPE','EXCHANGE_RATE','SECONDARY_ACCOUNT_REFERENCE',
'ATTRIBUTE_CATEGORY','ATTRIBUTE1','ATTRIBUTE2','ATTRIBUTE3','ATTRIBUTE4','ATTRIBUTE5','ATTRIBUTE6','ATTRIBUTE7','ATTRIBUTE8',
'ATTRIBUTE9','ATTRIBUTE10','ATTRIBUTE11','ATTRIBUTE12','ATTRIBUTE13','ATTRIBUTE14','ATTRIBUTE15']]
print(example3Df.head(10))
example3Df.to_csv(Path(workFolder, '5.2 Bank Accounts - RFCS Duplicados - 3.csv'), encoding='utf-8')

         SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  \
0                GASTOS    AAAA680827Q89                     100   
1                 OBRAS    AAAA680827Q89                     100   
2  MOTOR DE SEGUIMIENTO    AAAA680827Q89                     100   
3                GASTOS    AAAB950402HX6                     100   
4                 OBRAS    AAAB950402HX6                     100   
5  MOTOR DE SEGUIMIENTO    AAAB950402HX6                     100   
6                GASTOS    AAAD590706PW8                     100   
7                 OBRAS    AAAD590706PW8                     100   
8  MOTOR DE SEGUIMIENTO    AAAD590706PW8                     100   
9                GASTOS    AAAE551201CT0                     100   

   TEMP_EXT_PAYEE_ID  TEMP_EXT_BANK_ACCT_ID  BANK_NAME  BRANCH_NAME  \
0               7301                   7306  BANCOPPEL  PROVEEDORES   
1               7302                   7307  BANCOPPEL  PROVEEDORES   
2             115765                  

In [48]:
example4Df = dupsSet3.merge(subSetDf, on='NUMERO_PROVEEDOR', how='left')
example4Df = example4Df[['SISTEMA_LEGADO','NUMERO_PROVEEDOR','FEEDER_IMPORT_BATCH_ID','TEMP_EXT_PAYEE_ID','TEMP_EXT_BANK_ACCT_ID',
'BANK_NAME','BRANCH_NAME','COUNTRY_CODE','BANK_ACCOUNT_NAME','BANK_ACCOUNT_NUM','CURRENCY_CODE','FOREING_PAYMENT_USE_FLAG',
'START_DATE','END_DATE','IBAN','CHECK_DIGITS','BANK_ACCOUNT_NAME_ALT','BANK_ACCOUNT_TYPE','ACCOUNT_SUFFIX','DESCRIPTION',
'AGENCY_LOCATION_CODE','EXCHANGE_RATE_AGREEMENT_NUM','EXCHANGE_RATE_AGREEMENT_TYPE','EXCHANGE_RATE','SECONDARY_ACCOUNT_REFERENCE',
'ATTRIBUTE_CATEGORY','ATTRIBUTE1','ATTRIBUTE2','ATTRIBUTE3','ATTRIBUTE4','ATTRIBUTE5','ATTRIBUTE6','ATTRIBUTE7','ATTRIBUTE8',
'ATTRIBUTE9','ATTRIBUTE10','ATTRIBUTE11','ATTRIBUTE12','ATTRIBUTE13','ATTRIBUTE14','ATTRIBUTE15']]
print(example4Df.head(10))
example4Df.to_csv(Path(workFolder, '5.2 Bank Accounts - RFCS Duplicados > 3.csv'), encoding='utf-8')

  SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  TEMP_EXT_PAYEE_ID  \
0         GASTOS     DBM000228J35                     100              56179   
1         GASTOS     DBM000228J35                     100              56179   
2         GASTOS     DBM000228J35                     100              56179   
3         GASTOS     DBM000228J35                     100              56179   
4         GASTOS     DBM000228J35                     100              56179   
5         GASTOS     DBM000228J35                     100              56179   
6         GASTOS     DBM000228J35                     100              56179   
7          OBRAS     DBM000228J35                     100              56185   
8          OBRAS     DBM000228J35                     100              56185   
9          OBRAS     DBM000228J35                     100              56185   

   TEMP_EXT_BANK_ACCT_ID      BANK_NAME  BRANCH_NAME COUNTRY_CODE  \
0                  56702  BBVA BANCOMER  PROVEEDOR

In [51]:
#sql_query = '''
#SELECT DISTINCT SISTEMA_LEGADO,NUMERO_PROVEEDOR,FEEDER_IMPORT_BATCH_ID,TEMP_EXT_PAYEE_ID,
#TEMP_EXT_BANK_ACCT_ID,BANK_NAME,BRANCH_NAME,COUNTRY_CODE,BANK_ACCOUNT_NUM,CURRENCY_CODE,
#FOREING_PAYMENT_USE_FLAG,ATTRIBUTE1
#FROM subSetDf
#'''
dataToSave = ps.sqldf(sql_query)
datatoSave = subSetDf[['SISTEMA_LEGADO','NUMERO_PROVEEDOR','FEEDER_IMPORT_BATCH_ID','TEMP_EXT_PAYEE_ID','TEMP_EXT_BANK_ACCT_ID',
'BANK_NAME','BRANCH_NAME','COUNTRY_CODE','BANK_ACCOUNT_NAME','BANK_ACCOUNT_NUM','CURRENCY_CODE','FOREING_PAYMENT_USE_FLAG',
'START_DATE','END_DATE','IBAN','CHECK_DIGITS','BANK_ACCOUNT_NAME_ALT','BANK_ACCOUNT_TYPE','ACCOUNT_SUFFIX','DESCRIPTION',
'AGENCY_LOCATION_CODE','EXCHANGE_RATE_AGREEMENT_NUM','EXCHANGE_RATE_AGREEMENT_TYPE','EXCHANGE_RATE','SECONDARY_ACCOUNT_REFERENCE',
'ATTRIBUTE_CATEGORY','ATTRIBUTE1','ATTRIBUTE2','ATTRIBUTE3','ATTRIBUTE4','ATTRIBUTE5','ATTRIBUTE6','ATTRIBUTE7','ATTRIBUTE8',
'ATTRIBUTE9','ATTRIBUTE10','ATTRIBUTE11','ATTRIBUTE12','ATTRIBUTE13','ATTRIBUTE14','ATTRIBUTE15']]
print(len(dataToSave.index), ' records of interest.')
dataToSave.to_csv(Path(workFolder, '5_2_Bank_Accounts.csv'), encoding='utf-8')

160814  records of interest.
