### Step 1. Load required packages

In [1]:
import pandas as pd
import pandasql as ps

### Step 2 - Loads suppliers interface data

In [2]:

from custom_procedures import newestFile
sourceFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/1_Suppliers_Interface'
workFolder = '/content/drive/Shareddrives/Esperanto/Supports/Suppliers/Work/1_Suppliers_Interface'
sourceFile = newestFile(sourceFolder)
if sourceFile:
    print(sourceFile)
    df = pd.read_csv(sourceFile, sep=';', quotechar=chr(39), encoding='latin-1')
    print('=> Columns loaded')
    for columnName in df.columns: 
        print(columnName)
print(len(df.index),' records loaded')

/content/drive/Shareddrives/Esperanto/Supports/Suppliers/1_Suppliers_Interface/1. SUPPLIERS_INTERFACE_COPPEL.csv
=> Columns loaded
SISTEMA_LEGADO
NUMERO_PROVEEDOR
BATCH_ID
IMPORT_ACTION
VENDOR_NAME
VENDOR_NAME_NEW
SEGMENT1
VENDOR_NAME_ALT
ORGANIZATION_TYPE_LOOKUP_CODE
VENDOR_TYPE_LOOKUP_CODE
END_DATE_ACTIVE
BUSINESS_RELATIONSHIP
PARENT_SUPPLIER_NAME
ALIAS
DUNS_NUMBER
ONE_TIME_FLAG
CUSTOMER_NUM
STANDARD_INDUSTRY_CLASS
NI_NUMBER
CORPORATE_WEBSITE
CHIEF_EXECUTIVE_TITLE
CHIEF_EXECUTIVE_NAME
BUS_CLASS_NOT_APPLICABLE
TAX_COUNTRY_CODE
NUM_1099
FEDERAL_REPORTABLE_FLAG
TYPE_1099
STATE_REPORTABLE_FLAG
TAX_REPORTING_NAME
NAME_CONTROL
TAX_VERIFICATION_DATE
ALLOW_AWT_FLAG
AWT_GROUP_NAME
VAT_CODE
VAT_REGISTRATION_NUM
AUTO_TAX_CALC_OVERRIDE
PAYMENT_METHOD_LOOKUP_CODE
DELIVERY_CHANNEL
BRANK_INSTRUCTION1
BRANK_INSTRUCTION2
BRANK_INSTRUCTION_DETAILS
SETTLEMENT_PRIORITY
PAYMENT_TEXT_MESSAGE_1
PAYMENT_TEXT_MESSAGE_2
PAYMENT_TEXT_MESSAGE_3
IBY_BANK_CHARGE_BEARER
PAYMENT_REASON_CODE
PAYMENT_REASON_COMMENTS


### Step 3 - Find duplicates by NUM_1099

In [3]:
sql_query = '''
SELECT NUM_1099,numObs FROM
(SELECT NUM_1099,COUNT(*) AS numObs FROM df
GROUP BY NUM_1099)
ORDER BY numObs DESC
'''
dupsAnalysisDf = ps.sqldf(sql_query)
print('Total number of records, ', len(dupsAnalysisDf.index))
#
sql_query = '''
SELECT * FROM dupsAnalysisDf
WHERE numObs=1
ORDER BY NUM_1099 ASC
'''
uniqueRecordsDf = ps.sqldf(sql_query)
print('Total number unique records, ', len(uniqueRecordsDf.index))
#
sql_query = '''
SELECT * FROM dupsAnalysisDf
WHERE numObs>1
ORDER BY numObs DESC
'''
duplicateRecordsDf = ps.sqldf(sql_query)
print('Total number unique records, ', len(duplicateRecordsDf.index))

Total number of records,  78584
Total number unique records,  257
Total number unique records,  78327


### Step 4 - Asigns a case number to each duplicate record

In [4]:
duplicateRecordsDf['caseNum'] = 0
duplicateRecordsDf['caseNum'].astype(int)
print(duplicateRecordsDf.columns)
caseNum = 1
for index, row in duplicateRecordsDf.iterrows():
    duplicateRecordsDf.loc[index, 'caseNum'] = caseNum
    caseNum +=1
print(caseNum, ' Number of cases to be inspected.')
print(duplicateRecordsDf)

Index(['NUM_1099', 'numObs', 'caseNum'], dtype='object')
78328  Number of cases to be inspected.
            NUM_1099  numObs  caseNum
0      SACE650728LK6      10        1
1       DBM000228J35       8        2
2      MOEC6102155J0       7        3
3       AMC950330JM3       6        4
4       GSI910222KY9       6        5
...              ...     ...      ...
78322  ZUVJ9804029R8       2    78323
78323  ZUVY870930DQ5       2    78324
78324  ZUZD751014A10       2    78325
78325  ZUZM690116CK1       2    78326
78326  ZUZP751127FU3       2    78327

[78327 rows x 3 columns]


### Step 5 - Validate the number of duplicater per cases matches the total number of records

In [5]:
numRecordsVal = 0
for index, row in duplicateRecordsDf.iterrows():
    numRecordsVal = numRecordsVal + int(row['numObs'])
print(numRecordsVal, ' records contained in ', caseNum, ' cases.')

171292  records contained in  78328  cases.


### Step 6 - Save dataframes to csv files

In [19]:
print(dupsAnalysisDf)

            NUM_1099  numObs  caseNum
0      SACE650728LK6      10        1
1       DBM000228J35       8        2
2      MOEC6102155J0       7        3
3       AMC950330JM3       6        4
4       GSI910222KY9       6        5
...              ...     ...      ...
78322  ZUVJ9804029R8       2    78323
78323  ZUVY870930DQ5       2    78324
78324  ZUZD751014A10       2    78325
78325  ZUZM690116CK1       2    78326
78326  ZUZP751127FU3       2    78327

[78327 rows x 3 columns]
