## Data from File 5.1 Suppliers Payees

### Step 1. Load required package

In [1]:
import pandas as pd
import pandasql as ps
from pathlib import Path

### Step 2 - Get sample from source file

In [2]:
sourceFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/5_Suppliers_Payees_BankAccounts'
workFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/Work/5_Bank_Accounts'
fileName = '5.1. SUPPLIER_PAYEES_COPPEL.csv'
inputTextFile = open(Path(sourceFolder, fileName), 'r', encoding='latin-1')
if inputTextFile:
    numTextLines = 0
    while True:
        numTextLines += 1
        textLine = inputTextFile.readline()
        print(textLine)
        if numTextLines >10:
            break
    inputTextFile.close()

'SISTEMA_LEGADO';'NUMERO_PROVEEDOR';'FEEDER_IMPORT_BATCH_ID';'TEMP_EXT_PAYEE_ID';'BUSINESS_UNIT';'VENDOR_NUM';'VENDOR_SITE_CODE';'EXCLUSIVE_PAYMENT_FLAG';'DEFAULT_PAYMENT_METHOD_CODE';'DELIVERY_CHANNEL_CODE';'SETTLEMENT_PRIORITY';'REMIT_ADVICE_DELIVERY_METHOD';'REMIT_ADVICE_EMAIL';'REMIT_ADVICE_FAX';'BANK_INSTRUCTION1_CODE';'BANK_INSTRUCTION2_CODE';'BANK_INSTRUCTION_DETAILS';'PAYMENT_REASON_CODE';'PAYMENT_REASON_COMMENTS';'PAYMENT_TEXT_MESSAGE1';'PAYMENT_TEXT_MESSAGE2';'PAYMENT_TEXT_MESSAGE3';'BANK_CHARGE_BEARER'

'SAJ';'3';'100';'7';'BU_CSACV';'';'SJ3MXN004';'N';'Transferencia';'';'';'';'';'';'';'';'';'';'';'';'';'';''

'OBRAS';'GAVE860528F18';'100';'9';'BU_CSACV';'0 VERGARA EFRAIN';'SOGAVE860528F18';'N';'Transferencia';'';'';'';'';'';'';'';'';'';'';'';'';'';''

'OBRAS';'CST050404CH3';'100';'11';'BU_CSACV';'04 ST SC';'SOCST050404CH3';'N';'Transferencia';'';'';'';'';'';'';'';'';'';'';'';'';'';''

'GASTOS';'BIO111108JC6';'100';'12';'BU_CSACV';'100% BIORESPONSABLES SA DE CV';'GTBIO111108

### Step 3 - Load supplier payee data

In [3]:
df = pd.read_csv(Path(sourceFolder,fileName), sep=';', quotechar=chr(39), encoding='latin-1')
print(df.head(10))
print(len(df.index), ' records found. ')

  SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  TEMP_EXT_PAYEE_ID  \
0            SAJ                3                     100                  7   
1          OBRAS    GAVE860528F18                     100                  9   
2          OBRAS     CST050404CH3                     100                 11   
3         GASTOS     BIO111108JC6                     100                 12   
4          OBRAS     BIO111108JC6                     100                 13   
5          OBRAS     CDC150325SZ0                     100                 16   
6         GASTOS     DSE090511EW5                     100                 18   
7          OBRAS     DSE090511EW5                     100                 19   
8         GASTOS     TVC8607234U2                     100                 24   
9          OBRAS     TVC8607234U2                     100                 23   

  BUSINESS_UNIT                        VENDOR_NUM VENDOR_SITE_CODE  \
0      BU_CSACV                               NaN

In [4]:
df.dtypes

SISTEMA_LEGADO                   object
NUMERO_PROVEEDOR                 object
FEEDER_IMPORT_BATCH_ID            int64
TEMP_EXT_PAYEE_ID                 int64
BUSINESS_UNIT                    object
VENDOR_NUM                       object
VENDOR_SITE_CODE                 object
EXCLUSIVE_PAYMENT_FLAG           object
DEFAULT_PAYMENT_METHOD_CODE      object
DELIVERY_CHANNEL_CODE           float64
SETTLEMENT_PRIORITY             float64
REMIT_ADVICE_DELIVERY_METHOD    float64
REMIT_ADVICE_EMAIL              float64
REMIT_ADVICE_FAX                float64
BANK_INSTRUCTION1_CODE          float64
BANK_INSTRUCTION2_CODE          float64
BANK_INSTRUCTION_DETAILS        float64
PAYMENT_REASON_CODE             float64
PAYMENT_REASON_COMMENTS         float64
PAYMENT_TEXT_MESSAGE1           float64
PAYMENT_TEXT_MESSAGE2           float64
PAYMENT_TEXT_MESSAGE3           float64
BANK_CHARGE_BEARER              float64
dtype: object

### Step 4 - Creates a modified dataset with key field as strings

In [5]:
modDf = df
modDf['TAXID_VALIDATION'] = False
modDf['SISTEMA_LEGADO']=modDf['SISTEMA_LEGADO'].astype(pd.StringDtype())
modDf['NUMERO_PROVEEDOR']=modDf['NUMERO_PROVEEDOR'].astype(pd.StringDtype())
modDf['BUSINESS_UNIT']=modDf['BUSINESS_UNIT'].astype(pd.StringDtype())
modDf['VENDOR_NUM']=modDf['VENDOR_NUM'].astype(pd.StringDtype())
modDf['VENDOR_SITE_CODE']=modDf['VENDOR_SITE_CODE'].astype(pd.StringDtype())
modDf['EXCLUSIVE_PAYMENT_FLAG']=modDf['EXCLUSIVE_PAYMENT_FLAG'].astype(pd.StringDtype())
modDf['DEFAULT_PAYMENT_METHOD_CODE']=modDf['DEFAULT_PAYMENT_METHOD_CODE'].astype(pd.StringDtype())
print(modDf.head(10))
print(modDf.dtypes)

  SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  TEMP_EXT_PAYEE_ID  \
0            SAJ                3                     100                  7   
1          OBRAS    GAVE860528F18                     100                  9   
2          OBRAS     CST050404CH3                     100                 11   
3         GASTOS     BIO111108JC6                     100                 12   
4          OBRAS     BIO111108JC6                     100                 13   
5          OBRAS     CDC150325SZ0                     100                 16   
6         GASTOS     DSE090511EW5                     100                 18   
7          OBRAS     DSE090511EW5                     100                 19   
8         GASTOS     TVC8607234U2                     100                 24   
9          OBRAS     TVC8607234U2                     100                 23   

  BUSINESS_UNIT                        VENDOR_NUM VENDOR_SITE_CODE  \
0      BU_CSACV                              <NA>

### Step 5 - Reads Data in Scope

In [8]:
dataInScopeDf = pd.read_csv(Path(workFolder, 'dataInScopeDfs.csv'), encoding='utf-8')
print(dataInScopeDf.head(10))

   Unnamed: 0 NUMERO_PROVEEDOR          ATTRIBUTE1
0           0     &JE040614N51   12276001452967570
1           1    00AL570409DC8   12650014247123077
2           2    00BR5103152L7  137542104238981574
3           3    00CE810603TI4  137282103200730959
4           4             1482    2052093730305436
5           5             1482   12320001445245786
6           6            15020   21910040156221600
7           7            15025   12180001719871518
8           8            15026   30730232441602010
9           9            15039   21743040145993103


### Step 6 - Get Tax Id's or Supplier Id's in scope

In [13]:
sql_query = '''
SELECT NUMERO_PROVEEDOR FROM
(SELECT DISTINCT NUMERO_PROVEEDOR FROM dataInScopeDf)
ORDER BY NUMERO_PROVEEDOR ASC
'''
SuppliersInScope = ps.sqldf(sql_query)
print(SuppliersInScope.dtypes)
print(len(SuppliersInScope.index), ' suppliers in scope.')
print(SuppliersInScope.head(10))

NUMERO_PROVEEDOR    object
dtype: object
76565  suppliers in scope.
  NUMERO_PROVEEDOR
0     &JE040614N51
1    00AL570409DC8
2    00BR5103152L7
3    00CE810603TI4
4             1482
5            15020
6            15025
7            15026
8            15039
9            15043


### Step 5 - Merge Suppliers in Scope with 5.1 Table data

In [14]:
subSetDf = SuppliersInScope.merge(modDf, on='NUMERO_PROVEEDOR', how='left')
subSetDf = subSetDf[subSetDf['VENDOR_SITE_CODE'].notna()]
print(len(subSetDf.index), ' records obtained.')
print(subSetDf.head(10))

164488  records obtained.
  NUMERO_PROVEEDOR      SISTEMA_LEGADO  FEEDER_IMPORT_BATCH_ID  \
0     &JE040614N51              GASTOS                     100   
1     &JE040614N51               OBRAS                     100   
2    00AL570409DC8              GASTOS                     100   
3    00AL570409DC8               OBRAS                     100   
4    00BR5103152L7               OBRAS                     100   
5    00BR5103152L7              GASTOS                     100   
6    00CE810603TI4               OBRAS                     100   
7    00CE810603TI4              GASTOS                     100   
8             1482         PORTAL SARI                     100   
9             1482  NEGOCIOS AFILIADOS                     100   

   TEMP_EXT_PAYEE_ID BUSINESS_UNIT                    VENDOR_NUM  \
0                139      BU_CSACV  A & J EXPORTACIONES SA DE CV   
1                140      BU_CSACV  A & J EXPORTACIONES SA DE CV   
2             166443      BU_CSACV      OSO

### Steps6 - Validate Legacy Systems in Scope

In [17]:
legacySystemsFound = subSetDf.SISTEMA_LEGADO.unique()
print(len(legacySystemsFound), ' records found.')
print(list(legacySystemsFound))

14  records found.
['GASTOS', 'OBRAS', 'PORTAL SARI', 'NEGOCIOS AFILIADOS', 'SIM', 'CONSTRUNET', 'REFACCIONARIA', 'RXMENU', 'MXMENU', 'MUEBLES', 'TECNOLOGIA', 'SIE', 'MARKETPLACE', 'MOTOR DE SEGUIMIENTO']


### Steps 7 - check distribution of records per legacy system

In [22]:
sql_query = '''
SELECT SISTEMA_LEGADO,COUNT(*) AS numObs FROM subSetDf
GROUP BY SISTEMA_LEGADO
ORDER BY numObs DESC
'''
distPerLegacySystemDf = ps.sqldf(sql_query)
for index, row in distPerLegacySystemDf.iterrows():
    print(row['SISTEMA_LEGADO'], ',', row['numObs'])

GASTOS , 74690
OBRAS , 74643
MOTOR DE SEGUIMIENTO , 11226
CONSTRUNET , 1922
SIM , 1786
NEGOCIOS AFILIADOS , 107
TECNOLOGIA , 77
SIE , 11
MARKETPLACE , 10
REFACCIONARIA , 8
MXMENU , 4
RXMENU , 2
MUEBLES , 1
PORTAL SARI , 1


In [25]:
sql_query = '''
SELECT DISTINCT NUMERO_PROVEEDOR,BUSINESS_UNIT,VENDOR_SITE_CODE FROM subSetDf
'''
SupplierBUAndVSTDf = ps.sqldf(sql_query)
print(len(SupplierBUAndVSTDf.index), ' records in scope')
SupplierBUAndVSTDf.to_csv(Path(workFolder, 'SupplierBUAndVSTDf.csv'), encoding='utf-8')
sql_query = '''
SELECT NUMERO_PROVEEDOR,COUNT(BUSINESS_UNIT) AS numBUs,
       COUNT(VENDOR_SITE_CODE) AS numVSCs FROM SupplierBUAndVSTDf
GROUP BY NUMERO_PROVEEDOR
ORDER BY numBUs,numVSCs DESC
'''
result = ps.sqldf(sql_query)
print(result.head(10))
sql_query = '''
SELECT numBUs,numVSCs,COUNT(*) AS numObs FROM result
GROUP BY numBUs,numVSCs
ORDER BY numObs DESC
'''
result2 = ps.sqldf(sql_query)
print(result2.head())

164357  records in scope
  NUMERO_PROVEEDOR  numBUs  numVSCs
0            16291       1        1
1            16826       1        1
2            17147       1        1
3             1796       1        1
4            18005       1        1
5            18247       1        1
6            18337       1        1
7            18434       1        1
8            18606       1        1
9            18798       1        1
   numBUs  numVSCs  numObs
0       2        2   65118
1       3        3   11223
2       1        1     160
3       4        4      43
4       5        5      16


In [21]:
dataToSaveDf = subSetValidTaxIdDf[['SISTEMA_LEGADO','NUMERO_PROVEEDOR','FEEDER_IMPORT_BATCH_ID','TEMP_EXT_PAYEE_ID','BUSINESS_UNIT','VENDOR_NUM','VENDOR_SITE_CODE','EXCLUSIVE_PAYMENT_FLAG','DEFAULT_PAYMENT_METHOD_CODE']]
dataToSaveDf.to_csv(Path(workFolder, '5_1_Suppliers_Payees.csv'), encoding='utf-8')
print(len(dataToSaveDf.index), ' records saved.')

160559  records saved.


In [22]:
dataToSaveDf.dtypes

SISTEMA_LEGADO                 object
NUMERO_PROVEEDOR               object
FEEDER_IMPORT_BATCH_ID          int64
TEMP_EXT_PAYEE_ID               int64
BUSINESS_UNIT                  object
VENDOR_NUM                     object
VENDOR_SITE_CODE               object
EXCLUSIVE_PAYMENT_FLAG         object
DEFAULT_PAYMENT_METHOD_CODE    object
dtype: object

In [27]:
exceptions=dataToSaveDf['NUMERO_PROVEEDOR'].notnull()
print(len(exceptions.index), ' records not null')

160559  records not null
