-
Notifications
You must be signed in to change notification settings - Fork 0
/
invoice_filter.py
33 lines (26 loc) · 1.18 KB
/
invoice_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import sys

import pandas as pd


def find_duplicates(data):
    """Return every row of *data* whose key-field combination occurs more than once.

    The key is the concatenation of the four fields duplicates have in
    common (WRBTR, BUKRS, BLDAT, XBLNR); it is added to a copy of *data*
    as a ``unique_id`` column so it also appears in the saved CSV, while
    the caller's frame is left untouched.

    Returns an empty DataFrame when there are no duplicates (the previous
    implementation crashed on ``pd.concat([])`` in that case). Rows are
    returned in their original order rather than grouped by id.
    """
    data = data.copy()
    # NOTE(review): BLDAT and XBLNR are concatenated without astype, so they
    # are presumably string columns in the source CSV — confirm.
    data["unique_id"] = (
        data["WRBTR"].astype("str")
        + data["BUKRS"].astype("str")
        + data["BLDAT"]
        + data["XBLNR"]
    )
    # keep=False marks *all* occurrences of a duplicated id, exactly once
    # each. The old two-pass list scan was O(n^2) and, for an id occurring
    # k > 2 times, emitted that id's rows k-1 extra times in the output.
    return data.loc[data["unique_id"].duplicated(keep=False)]


def main():
    """Read the joined dataset, extract duplicate invoices, save them to CSV."""
    try:
        data = pd.read_csv("../data/joined_dataset_final.csv")
    except OSError:
        # Narrowed from a bare `except:` (which also swallowed bugs and
        # KeyboardInterrupt); exit nonzero so callers can detect the failure.
        print("Error opening file. Please check the filename and try again. Exiting...")
        sys.exit(1)
    find_duplicates(data).to_csv("../data/duplicates.csv")


if __name__ == "__main__":
    main()