##  Data from File 5.2 Bank Accounts

### Step 1. Load required packages

In [1]:
import pandas as pd
import pandasql as ps
from pathlib import Path

#### Step 2 - Get sample from source file

In [2]:
sourceFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/5_Suppliers_Payees_BankAccounts'
workFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/Work/5_Bank_Accounts'
fileName = '5.2. SUPPLIER_BANK_ACCOUNTS_COPPEL.csv'
inputTextFile = open(Path(sourceFolder, fileName), 'r', encoding='latin-1')
if inputTextFile:
    numTextLines = 0
    while True:
        numTextLines += 1
        textLine = inputTextFile.readline()
        print(textLine)
        if numTextLines >10:
            break
    inputTextFile.close()

'SISTEMA_LEGADO';'NUMERO_PROVEEDOR';'FEEDER_IMPORT_BATCH_ID';'TEMP_EXT_PAYEE_ID';'TEMP_EXT_BANK_ACCT_ID';'BANK_NAME';'BRANCH_NAME';'COUNTRY_CODE';'BANK_ACCOUNT_NAME';'BANK_ACCOUNT_NUM';'CURRENCY_CODE';'FOREING_PAYMENT_USE_FLAG';'START_DATE';'END_DATE';'IBAN';'CHECK_DIGITS';'BANK_ACCOUNT_NAME_ALT';'BANK_ACCOUNT_TYPE';'ACCOUNT_SUFFIX';'DESCRIPTION';'AGENCY_LOCATION_CODE';'EXCHANGE_RATE_AGREEMENT_NUM';'EXCHANGE_RATE_AGREEMENT_TYPE';'EXCHANGE_RATE';'SECONDARY_ACCOUNT_REFERENCE';'ATTRIBUTE_CATEGORY';'ATTRIBUTE1';'ATTRIBUTE2';'ATTRIBUTE3';'ATTRIBUTE4';'ATTRIBUTE5';'ATTRIBUTE6';'ATTRIBUTE7';'ATTRIBUTE8';'ATTRIBUTE9';'ATTRIBUTE10';'ATTRIBUTE11';'ATTRIBUTE12';'ATTRIBUTE13';'ATTRIBUTE14';'ATTRIBUTE15'

'OBRAS';'GAVE860528F18';'100';'9';'0009';'HSBC';'PROVEEDORES';'MX';'';'4057367294';'MXN';'N';'';'';'';'';'';'';'';'';'';'';'';'';'';'';'021741040573672946';'';'';'';'';'';'';'';'';'';'';'';'';'';''

'GASTOS';'BIO111108JC6';'100';'12';'00012';'BANORTE';'PROVEEDORES';'MX';'';'0808351359';'MXN';'N';'

### Step 3 - Load bank account data

In [3]:
df = pd.read_csv(Path(sourceFolder,fileName), sep=';', quotechar=chr(39), encoding='latin-1')
df.dtypes

SISTEMA_LEGADO                   object
NUMERO_PROVEEDOR                 object
FEEDER_IMPORT_BATCH_ID            int64
TEMP_EXT_PAYEE_ID                 int64
TEMP_EXT_BANK_ACCT_ID             int64
BANK_NAME                        object
BRANCH_NAME                      object
COUNTRY_CODE                     object
BANK_ACCOUNT_NAME               float64
BANK_ACCOUNT_NUM                float64
CURRENCY_CODE                    object
FOREING_PAYMENT_USE_FLAG         object
START_DATE                      float64
END_DATE                        float64
IBAN                            float64
CHECK_DIGITS                    float64
BANK_ACCOUNT_NAME_ALT           float64
BANK_ACCOUNT_TYPE               float64
ACCOUNT_SUFFIX                  float64
DESCRIPTION                     float64
AGENCY_LOCATION_CODE            float64
EXCHANGE_RATE_AGREEMENT_NUM     float64
EXCHANGE_RATE_AGREEMENT_TYPE    float64
EXCHANGE_RATE                   float64
SECONDARY_ACCOUNT_REFERENCE     float64


### Step 4 - Create a New Dataset (Modified) and Validate Bank Account

In [4]:
modDf = df
modDf['CLABE_VALIDATION'] = False
modDf['SISTEMA_LEGADO'] = modDf['SISTEMA_LEGADO'].astype(pd.StringDtype())
modDf['NUMERO_PROVEEDOR'] = modDf['NUMERO_PROVEEDOR'].astype(pd.StringDtype())
modDf['BANK_NAME'] = modDf['BANK_NAME'].astype(pd.StringDtype())
modDf['BRANCH_NAME'] = modDf['BRANCH_NAME'].astype(pd.StringDtype())
modDf['COUNTRY_CODE'] = modDf['COUNTRY_CODE'].astype(pd.StringDtype())
modDf['CURRENCY_CODE'] = modDf['CURRENCY_CODE'].astype(pd.StringDtype())
modDf['FOREING_PAYMENT_USE_FLAG'] = modDf['FOREING_PAYMENT_USE_FLAG'].astype(pd.StringDtype())
modDf['ATTRIBUTE1'] = modDf['ATTRIBUTE1'].astype(pd.StringDtype())
modDf['BANK_ACCOUNT_NUM'] = modDf['BANK_ACCOUNT_NUM'].astype(pd.StringDtype()) 
modDf['ATTRIBUTE1'] = modDf['ATTRIBUTE1'].astype(pd.StringDtype())
for index, row in modDf.iterrows():
    if len(str(row['ATTRIBUTE1']).replace('.0',''))==18:
        modDf.loc[index, 'CLABE_VALIDATION'] = True
#
modDf = modDf[modDf['NUMERO_PROVEEDOR'].notna()]
print('the orignal dataset contains ', len(df.index), ' records')
print('the modified dataset contains ', len(modDf.index), ' records.')
print(modDf.head(10))
print(modDf.dtypes)

the orignal dataset contains  171966  records
the modified dataset contains  171966  records.
  SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  TEMP_EXT_PAYEE_ID  \
0          OBRAS    GAVE860528F18                     100                  9   
1         GASTOS     BIO111108JC6                     100                 12   
2          OBRAS     BIO111108JC6                     100                 13   
3         GASTOS     DSE090511EW5                     100                 18   
4          OBRAS     DSE090511EW5                     100                 19   
5          OBRAS     TVC8607234U2                     100                 23   
6         GASTOS     TVC8607234U2                     100                 24   
7         GASTOS     ABS110113195                     100                 27   
8          OBRAS     ABS110113195                     100                 28   
9          OBRAS     DUO7511286H3                     100                 37   

   TEMP_EXT_BANK_ACCT_ID 

### Step 5 - Creates a subset with elements whose taxt id and bank account are valid

In [5]:
subSetDf = modDf[modDf['CLABE_VALIDATION']==True]
for index, row in subSetDf.iterrows():
    bankAccountStr = str(row['BANK_ACCOUNT_NUM'])
    clabeAccountStr = str(row['ATTRIBUTE1'])
    subSetDf.at[index, 'BANK_ACCOUNT_NUM'] = bankAccountStr.replace('.0','')
    subSetDf.at[index, 'ATTRIBUTE1'] = clabeAccountStr.replace('.0','')
print(len(modDf.index) ,' records in original dataset')
print(len(subSetDf.index), ' records with a valid bank account (18 digits)')
print(subSetDf.head(10))

171966  records in original dataset
164836  records with a valid bank account (18 digits)
  SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  TEMP_EXT_PAYEE_ID  \
0          OBRAS    GAVE860528F18                     100                  9   
1         GASTOS     BIO111108JC6                     100                 12   
2          OBRAS     BIO111108JC6                     100                 13   
3         GASTOS     DSE090511EW5                     100                 18   
4          OBRAS     DSE090511EW5                     100                 19   
5          OBRAS     TVC8607234U2                     100                 23   
6         GASTOS     TVC8607234U2                     100                 24   
7         GASTOS     ABS110113195                     100                 27   
8          OBRAS     ABS110113195                     100                 28   
9          OBRAS     DUO7511286H3                     100                 37   

   TEMP_EXT_BANK_ACCT_ID   BA

### Step 6 - Performs analysis for duplicate records based on taxt id or supplier id

In [6]:
sql_query = '''
SELECT NUMERO_PROVEEDOR,COUNT(*) AS numObs FROM subSetDf
GROUP BY NUMERO_PROVEEDOR
ORDER BY numObs DESC
'''
dupsAnalysis = ps.sqldf(sql_query)
print(len(dupsAnalysis.index), ' taxt ids in daset')
uniqueTaxdIds = dupsAnalysis[dupsAnalysis['numObs']==1]
print(len(uniqueTaxdIds.index), ' unique taxt ids')
duplicateTaxdIds = dupsAnalysis[dupsAnalysis['numObs']>1]
print(len(duplicateTaxdIds.index), ' duplicate taxt ids')
#
sql_query = '''
SELECT SISTEMA_LEGADO,COUNT(*) AS numObs FROM 
(SELECT SISTEMA_LEGADO,NUMERO_PROVEEDOR FROM subSetDf)
GROUP BY SISTEMA_LEGADO
ORDER BY numObs DESC
'''
distPerLegacy = ps.sqldf(sql_query)
for index, row in distPerLegacy.iterrows():
    print(row['SISTEMA_LEGADO'], ',', row['numObs'])

76565  taxt ids in daset
193  unique taxt ids
76372  duplicate taxt ids
GASTOS , 74841
OBRAS , 74826
MOTOR DE SEGUIMIENTO , 11147
CONSTRUNET , 1913
SIM , 1913
NEGOCIOS AFILIADOS , 107
TECNOLOGIA , 59
SIE , 11
MARKETPLACE , 10
REFACCIONARIA , 8
PORTAL SARI , 1


In [7]:
print('unique taxt ids: ')
print(uniqueTaxdIds.head(10))
print('duplicate taxt ids: ')
print(duplicateTaxdIds.head(100))

unique taxt ids: 
      NUMERO_PROVEEDOR  numObs
76372            16291       1
76373            16826       1
76374            17147       1
76375             1796       1
76376            18005       1
76377            18247       1
76378            18337       1
76379            18434       1
76380            18606       1
76381            18798       1
duplicate taxt ids: 
   NUMERO_PROVEEDOR  numObs
0             20539      27
1      DBM000228J35      14
2      ASO0408178B2      12
3      PMC9601107JA      12
4             19546      10
..              ...     ...
95            36928       5
96            37929       5
97            38728       5
98            40530       5
99            40834       5

[100 rows x 2 columns]


In [8]:
sql_query = '''
SELECT numObs,COUNT(*) AS NumCases FROM duplicateTaxdIds
GROUP BY numObs
ORDER BY numObs DESC
'''
result = ps.sqldf(sql_query)
for index, row in result.iterrows():
    print(row['numObs'], ' duplicates, ', row['NumCases'], ' cases')

27  duplicates,  1  cases
14  duplicates,  1  cases
12  duplicates,  2  cases
10  duplicates,  3  cases
9  duplicates,  2  cases
8  duplicates,  18  cases
7  duplicates,  5  cases
6  duplicates,  50  cases
5  duplicates,  24  cases
4  duplicates,  98  cases
3  duplicates,  11203  cases
2  duplicates,  64965  cases


### Step 7 - Perfoms analysis of distribution of accounts per supplier

In [9]:
sql_query = '''
SELECT DISTINCT NUMERO_PROVEEDOR,ATTRIBUTE1 FROM subSetDf
ORDER BY NUMERO_PROVEEDOR
'''
dataInScopeDf = ps.sqldf(sql_query)
print(len(dataInScopeDf.index), ' records in scope')
dataInScopeDf.to_csv(Path(workFolder, 'dataInScopeDfs.csv'), encoding='utf-8')
sql_query = '''
SELECT NUMERO_PROVEEDOR,COUNT(*) AS numObs FROM dataInScopeDf
GROUP BY NUMERO_PROVEEDOR
ORDER BY numObs DESC
'''
accountsPerSupplierDf = ps.sqldf(sql_query)
print(len(accountsPerSupplierDf.index), ' suppliers in dataset')
sql_query = '''
SELECT numObs,COUNT(*) AS numCases FROM accountsPerSupplierDf
GROUP BY numObs
ORDER BY numObs DESC
'''
result = ps.sqldf(sql_query)
for index, row in result.iterrows():
    print(row['numObs'], ', ', row['numCases'])
#
sql_query = '''
SELECT DISTINCT SISTEMA_LEGADO FROM subSetDf
''' 
legacySystemsInScopeDf = ps.sqldf(sql_query)
print(len(legacySystemsInScopeDf.index), ' legacy sistems in scope')
legacySystemsInScopeDf.to_csv(Path(workFolder, 'legacySystemsInScopeDfs.csv'), encoding='utf-8')

76928  records in scope
76565  suppliers in dataset
12 ,  1
7 ,  1
6 ,  1
5 ,  4
4 ,  19
3 ,  45
2 ,  178
1 ,  76316
11  legacy sistems in scope


### Step 8 - Create new reporting structure from subset

In [10]:
newReportDf = subSetDf[['SISTEMA_LEGADO','NUMERO_PROVEEDOR','FEEDER_IMPORT_BATCH_ID','TEMP_EXT_PAYEE_ID','TEMP_EXT_BANK_ACCT_ID',
'BANK_NAME','BRANCH_NAME','COUNTRY_CODE','BANK_ACCOUNT_NAME','BANK_ACCOUNT_NUM','CURRENCY_CODE','FOREING_PAYMENT_USE_FLAG',
'START_DATE','END_DATE','IBAN','CHECK_DIGITS','BANK_ACCOUNT_NAME_ALT','BANK_ACCOUNT_TYPE','ACCOUNT_SUFFIX','DESCRIPTION',
'AGENCY_LOCATION_CODE','EXCHANGE_RATE_AGREEMENT_NUM','EXCHANGE_RATE_AGREEMENT_TYPE','EXCHANGE_RATE','SECONDARY_ACCOUNT_REFERENCE',
'ATTRIBUTE_CATEGORY','ATTRIBUTE1','ATTRIBUTE2','ATTRIBUTE3','ATTRIBUTE4','ATTRIBUTE5','ATTRIBUTE6','ATTRIBUTE7','ATTRIBUTE8',
'ATTRIBUTE9','ATTRIBUTE10','ATTRIBUTE11','ATTRIBUTE12','ATTRIBUTE13','ATTRIBUTE14','ATTRIBUTE15']]
print(len(newReportDf.index), ' in new report structure')

164836  in new report structure


### Step 9 - Create new textfile

In [11]:
print(newReportDf.dtypes)

SISTEMA_LEGADO                   string
NUMERO_PROVEEDOR                 string
FEEDER_IMPORT_BATCH_ID            int64
TEMP_EXT_PAYEE_ID                 int64
TEMP_EXT_BANK_ACCT_ID             int64
BANK_NAME                        string
BRANCH_NAME                      string
COUNTRY_CODE                     string
BANK_ACCOUNT_NAME               float64
BANK_ACCOUNT_NUM                 string
CURRENCY_CODE                    string
FOREING_PAYMENT_USE_FLAG         string
START_DATE                      float64
END_DATE                        float64
IBAN                            float64
CHECK_DIGITS                    float64
BANK_ACCOUNT_NAME_ALT           float64
BANK_ACCOUNT_TYPE               float64
ACCOUNT_SUFFIX                  float64
DESCRIPTION                     float64
AGENCY_LOCATION_CODE            float64
EXCHANGE_RATE_AGREEMENT_NUM     float64
EXCHANGE_RATE_AGREEMENT_TYPE    float64
EXCHANGE_RATE                   float64
SECONDARY_ACCOUNT_REFERENCE     float64


In [13]:
newReportDf.fillna('', inplace=True)
with open(Path(workFolder,'5.2 SUPPLIER_BANK_ACCOUNTS_COPPEL(NEW).csv'), 'w', encoding='utf-8') as textFile:
    tmpVector = ['SISTEMA_LEGADO','NUMERO_PROVEEDOR','FEEDER_IMPORT_BATCH_ID','TEMP_EXT_PAYEE_ID','TEMP_EXT_BANK_ACCT_ID',
                 'BANK_NAME','BRANCH_NAME','COUNTRY_CODE','BANK_ACCOUNT_NAME','BANK_ACCOUNT_NUM','CURRENCY_CODE','FOREING_PAYMENT_USE_FLAG',
                 'START_DATE','END_DATE','IBAN','CHECK_DIGITS','BANK_ACCOUNT_NAME_ALT','BANK_ACCOUNT_TYPE','ACCOUNT_SUFFIX','DESCRIPTION',
                 'AGENCY_LOCATION_CODE','EXCHANGE_RATE_AGREEMENT_NUM','EXCHANGE_RATE_AGREEMENT_TYPE','EXCHANGE_RATE','SECONDARY_ACCOUNT_REFERENCE',
                 'ATTRIBUTE_CATEGORY','ATTRIBUTE1','ATTRIBUTE2','ATTRIBUTE3','ATTRIBUTE4','ATTRIBUTE5','ATTRIBUTE6','ATTRIBUTE7','ATTRIBUTE8',
                 'ATTRIBUTE9','ATTRIBUTE10','ATTRIBUTE11','ATTRIBUTE12','ATTRIBUTE13','ATTRIBUTE14','ATTRIBUTE15']
    textLine = chr(59).join([chr(39)+str(tmpColumn)+chr(39) for tmpColumn in tmpVector]) + '\n'
    textFile.write(textLine)
    linesWritten = 0
    for index, row in newReportDf.iterrows():
        tmpVector = row.to_list()
        textLine = chr(59).join([chr(39)+str(tmpColumn)+chr(39) for tmpColumn in tmpVector]) + '\n'
        textFile.write(textLine)
        linesWritten += 1
    textFile.close()
print(linesWritten, ' text lines written.')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
164836  text lines written.


### Step 10 - Validate the number of rows on file created

In [14]:
textFileToValidate = open(Path(workFolder,'5.2 SUPPLIER_BANK_ACCOUNTS_COPPEL(NEW).csv'), 'r', encoding='utf-8')
if textFileToValidate:
    numTextLines = 0
    while True:
        numTextLines += 1
        textLine = textFileToValidate.readline()
        print(textLine)
        if numTextLines >10: break
    inputTextFile.close()

'SISTEMA_LEGADO';'NUMERO_PROVEEDOR';'FEEDER_IMPORT_BATCH_ID';'TEMP_EXT_PAYEE_ID';'TEMP_EXT_BANK_ACCT_ID';'BANK_NAME';'BRANCH_NAME';'COUNTRY_CODE';'BANK_ACCOUNT_NAME';'BANK_ACCOUNT_NUM';'CURRENCY_CODE';'FOREING_PAYMENT_USE_FLAG';'START_DATE';'END_DATE';'IBAN';'CHECK_DIGITS';'BANK_ACCOUNT_NAME_ALT';'BANK_ACCOUNT_TYPE';'ACCOUNT_SUFFIX';'DESCRIPTION';'AGENCY_LOCATION_CODE';'EXCHANGE_RATE_AGREEMENT_NUM';'EXCHANGE_RATE_AGREEMENT_TYPE';'EXCHANGE_RATE';'SECONDARY_ACCOUNT_REFERENCE';'ATTRIBUTE_CATEGORY';'ATTRIBUTE1';'ATTRIBUTE2';'ATTRIBUTE3';'ATTRIBUTE4';'ATTRIBUTE5';'ATTRIBUTE6';'ATTRIBUTE7';'ATTRIBUTE8';'ATTRIBUTE9';'ATTRIBUTE10';'ATTRIBUTE11';'ATTRIBUTE12';'ATTRIBUTE13';'ATTRIBUTE14';'ATTRIBUTE15'

'OBRAS';'GAVE860528F18';'100';'9';'9';'HSBC';'PROVEEDORES';'MX';'';'4057367294';'MXN';'N';'';'';'';'';'';'';'';'';'';'';'';'';'';'';'021741040573672946';'';'';'';'';'';'';'';'';'';'';'';'';'';''

'GASTOS';'BIO111108JC6';'100';'12';'12';'BANORTE';'PROVEEDORES';'MX';'';'808351359';'MXN';'N';'';'';''

In [15]:
validationDf = pd.read_csv(Path(workFolder,'5.2 SUPPLIER_BANK_ACCOUNTS_COPPEL(NEW).csv'), sep=';', quotechar=chr(39), encoding='utf-8')
validationDf.dtypes
print(len(validationDf.index), ' records found.')

164836  records found.
