## Data from File 5.3 Suppliers Bank Account Assignments

### Step 1 - Load required packages

In [1]:
import pandas as pd
import pandasql as ps
from pathlib import Path

### Step 2 - Get sample from source file

In [3]:
# Folder locations on the mounted shared drive and the name of the source extract.
sourceFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/5_Suppliers_Payees_BankAccounts'
workFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/Work/5_Bank_Accounts'
fileName = '5.3. SUPPLIER_BANK_ACCOUNT_ASSIGNMENTS_COPPEL.csv'

# Peek at the raw text (header line plus the first data lines) before parsing
# it with pandas. The context manager closes the file even if reading or
# printing raises, and iterating the handle stops at end-of-file instead of
# printing empty strings when the file is shorter than the sample size.
numTextLines = 0
with open(Path(sourceFolder, fileName), 'r', encoding='latin-1') as inputTextFile:
    for textLine in inputTextFile:
        numTextLines += 1
        print(textLine)
        # Stop once 11 lines have been shown.
        if numTextLines > 10:
            break

SISTEMA_LEGADO,NUMERO_PROVEEDOR,FEEDER_IMPORT_BATCH_ID,TEMP_EXT_PAYEE_ID,TEMP_EXT_BANK_ACCT_ID,TEMP_PMT_INSTR_USE_ID,PRIMARY_FLAG,START_DATE,END_DATE

OBRAS,GAVE860528F18,100,9,9,9,Y,,

GASTOS,BIO111108JC6,100,12,12,12,Y,,

OBRAS,BIO111108JC6,100,13,13,13,Y,,

GASTOS,DSE090511EW5,100,18,18,18,Y,,

OBRAS,DSE090511EW5,100,19,19,19,Y,,

OBRAS,TVC8607234U2,100,23,23,23,Y,,

GASTOS,TVC8607234U2,100,24,24,24,Y,,

GASTOS,ABS110113195,100,27,27,27,Y,,

OBRAS,ABS110113195,100,28,28,28,Y,,

OBRAS,DUO7511286H3,100,37,37,37,Y,,



### Step 3 - Load supplier bank assignments data

In [7]:
# Parse the comma-separated, Latin-1 encoded extract into a DataFrame.
df = pd.read_csv(Path(sourceFolder) / fileName, sep=',', encoding='latin-1')

# Preview the first ten rows, then report how many rows were loaded.
print(df.head(10))
print(df.shape[0], ' records found. ')

  SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  TEMP_EXT_PAYEE_ID  \
0          OBRAS    GAVE860528F18                     100                  9   
1         GASTOS     BIO111108JC6                     100                 12   
2          OBRAS     BIO111108JC6                     100                 13   
3         GASTOS     DSE090511EW5                     100                 18   
4          OBRAS     DSE090511EW5                     100                 19   
5          OBRAS     TVC8607234U2                     100                 23   
6         GASTOS     TVC8607234U2                     100                 24   
7         GASTOS     ABS110113195                     100                 27   
8          OBRAS     ABS110113195                     100                 28   
9          OBRAS     DUO7511286H3                     100                 37   

   TEMP_EXT_BANK_ACCT_ID  TEMP_PMT_INSTR_USE_ID PRIMARY_FLAG  START_DATE  \
0                      9                   

In [8]:
# Display the dtype pandas inferred for each column of the loaded frame.
df.dtypes

SISTEMA_LEGADO             object
NUMERO_PROVEEDOR           object
FEEDER_IMPORT_BATCH_ID      int64
TEMP_EXT_PAYEE_ID           int64
TEMP_EXT_BANK_ACCT_ID       int64
TEMP_PMT_INSTR_USE_ID       int64
PRIMARY_FLAG               object
START_DATE                float64
END_DATE                  float64
dtype: object

### Step 4 - Create a modified dataset with key fields as strings

In [9]:
# Build the modified dataset as a NEW frame. A plain `modDf = df` only binds a
# second name to the same object, so casting columns through `modDf` would also
# rewrite `df` in place and the frame loaded in Step 3 would stop being the raw
# data. `astype` returns a new DataFrame, which leaves `df` untouched and keeps
# this cell safe to re-run.
modDf = df.astype({
    'SISTEMA_LEGADO': pd.StringDtype(),
    'NUMERO_PROVEEDOR': pd.StringDtype(),
    'PRIMARY_FLAG': pd.StringDtype(),
})

# Preview the converted frame and confirm the new column dtypes.
print(modDf.head(10))
print(modDf.dtypes)

  SISTEMA_LEGADO NUMERO_PROVEEDOR  FEEDER_IMPORT_BATCH_ID  TEMP_EXT_PAYEE_ID  \
0          OBRAS    GAVE860528F18                     100                  9   
1         GASTOS     BIO111108JC6                     100                 12   
2          OBRAS     BIO111108JC6                     100                 13   
3         GASTOS     DSE090511EW5                     100                 18   
4          OBRAS     DSE090511EW5                     100                 19   
5          OBRAS     TVC8607234U2                     100                 23   
6         GASTOS     TVC8607234U2                     100                 24   
7         GASTOS     ABS110113195                     100                 27   
8          OBRAS     ABS110113195                     100                 28   
9          OBRAS     DUO7511286H3                     100                 37   

   TEMP_EXT_BANK_ACCT_ID  TEMP_PMT_INSTR_USE_ID PRIMARY_FLAG  START_DATE  \
0                      9                   

### Step 5 - Read data in scope

In [10]:
# Load the "data in scope" CSV (UTF-8) from the work folder.
dataInScopeDf = pd.read_csv(Path(workFolder) / 'dataInScopeDfs.csv', encoding='utf-8')

# Preview the first ten rows.
print(dataInScopeDf.head(10))

   Unnamed: 0 NUMERO_PROVEEDOR          ATTRIBUTE1
0           0     &JE040614N51   12276001452967570
1           1    00AL570409DC8   12650014247123077
2           2    00BR5103152L7  137542104238981574
3           3    00CE810603TI4  137282103200730959
4           4             1482    2052093730305436
5           5             1482   12320001445245786
6           6            15020   21910040156221600
7           7            15025   12180001719871518
8           8            15026   30730232441602010
9           9            15039   21743040145993103


### Step 6 - Get Tax IDs or Supplier IDs in scope

In [11]:
# Distinct NUMERO_PROVEEDOR values from dataInScopeDf, sorted ascending.
# pandasql resolves the table name `dataInScopeDf` from the notebook variables.
sql_query = '''
SELECT NUMERO_PROVEEDOR FROM
(SELECT DISTINCT NUMERO_PROVEEDOR FROM dataInScopeDf)
ORDER BY NUMERO_PROVEEDOR ASC
'''
SuppliersInScope = ps.sqldf(sql_query)

# Report the resulting dtypes, the number of distinct suppliers, and a preview.
print(SuppliersInScope.dtypes)
print(SuppliersInScope.shape[0], ' suppliers in scope.')
print(SuppliersInScope.head(10))

NUMERO_PROVEEDOR    object
dtype: object
76565  suppliers in scope.
  NUMERO_PROVEEDOR
0     &JE040614N51
1    00AL570409DC8
2    00BR5103152L7
3    00CE810603TI4
4             1482
5            15020
6            15025
7            15026
8            15039
9            15043


### Step 7 - Merge Suppliers in Scope with 5.3 Table data

In [12]:
# Left-join the in-scope suppliers to the 5.3 records on NUMERO_PROVEEDOR, then
# drop every row whose SISTEMA_LEGADO is missing; after a left join that
# includes each supplier that has no matching 5.3 record.
subSetDf = (
    SuppliersInScope
    .merge(modDf, how='left', on='NUMERO_PROVEEDOR')
    .dropna(subset=['SISTEMA_LEGADO'])
)

# Report the number of rows kept and preview the first ten.
print(subSetDf.shape[0], ' records obtained.')
print(subSetDf.head(10))

164946  records obtained.
  NUMERO_PROVEEDOR      SISTEMA_LEGADO  FEEDER_IMPORT_BATCH_ID  \
0     &JE040614N51              GASTOS                     100   
1     &JE040614N51               OBRAS                     100   
2    00AL570409DC8              GASTOS                     100   
3    00AL570409DC8               OBRAS                     100   
4    00BR5103152L7               OBRAS                     100   
5    00BR5103152L7              GASTOS                     100   
6    00CE810603TI4               OBRAS                     100   
7    00CE810603TI4              GASTOS                     100   
8             1482         PORTAL SARI                     100   
9             1482  NEGOCIOS AFILIADOS                     100   

   TEMP_EXT_PAYEE_ID  TEMP_EXT_BANK_ACCT_ID  TEMP_PMT_INSTR_USE_ID  \
0                139                    134                    134   
1                140                    135                    135   
2             166443                 

### Step 8 - Validate Legacy Systems in Scope

In [13]:
# Collect the distinct SISTEMA_LEGADO values present in the merged subset.
legacySystemsFound = subSetDf['SISTEMA_LEGADO'].unique()

# Print how many distinct values there are, followed by the values themselves.
print(len(legacySystemsFound), ' records found.')
print([*legacySystemsFound])

11  records found.
['GASTOS', 'OBRAS', 'PORTAL SARI', 'NEGOCIOS AFILIADOS', 'CONSTRUNET', 'SIM', 'REFACCIONARIA', 'TECNOLOGIA', 'SIE', 'MARKETPLACE', 'MOTOR DE SEGUIMIENTO']


### Step 9 - Check distribution of records per legacy system

In [14]:
# Count the rows of subSetDf per SISTEMA_LEGADO, largest group first.
sql_query = '''
SELECT SISTEMA_LEGADO,COUNT(*) AS numObs FROM subSetDf
GROUP BY SISTEMA_LEGADO
ORDER BY numObs DESC
'''
distPerLegacySystemDf = ps.sqldf(sql_query)

# Print one "<system> , <count>" line per legacy system, in query order.
for legacySystem, numObs in zip(distPerLegacySystemDf['SISTEMA_LEGADO'],
                                distPerLegacySystemDf['numObs']):
    print(legacySystem, ',', numObs)

GASTOS , 74871
OBRAS , 74871
MOTOR DE SEGUIMIENTO , 11147
CONSTRUNET , 1922
SIM , 1921
NEGOCIOS AFILIADOS , 107
TECNOLOGIA , 77
SIE , 11
MARKETPLACE , 10
REFACCIONARIA , 8
PORTAL SARI , 1
