In [1]:
from pathlib import Path
import pandas as pd
import time
import os
import tqdm

In [2]:
# Folder locations, both resolved against the current working directory.
input_fp = Path.cwd().joinpath('tidy_data')  # folder holding the input files
output_fp = Path.cwd().joinpath('unique_data')  # folder receiving the generated files

# Make sure the output folder exists; an already existing folder is left as is.
output_fp.mkdir(exist_ok=True)

In [3]:
def find_matching_rows(df):
    """Return the index labels of rows that share their sale key with another row.

    Rows are grouped on the combination of ``date_of_sale``,
    ``purchase_amount`` and ``zip_code``. Every row belonging to a group of
    two or more rows is reported; rows that are alone in their group are not.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three key columns listed above.

    Returns
    -------
    list
        Index labels of all rows in multi-row groups, emitted group by group
        in the order the groupby yields the groups.

    Notes
    -----
    ``groupby`` is used with its defaults, so rows with a missing value in any
    key column are not placed in a group and are never reported.
    """
    key_columns = ['date_of_sale', 'purchase_amount', 'zip_code']
    return [
        label
        for _, rows in df.groupby(key_columns)
        if len(rows) > 1
        for label in rows.index.tolist()
    ]

In [4]:
# For every tidy sales CSV in input_fp, drop all rows flagged by
# find_matching_rows and write the remaining rows to output_fp under the same
# name with the leading 'tidy_' swapped for 'unique_'.
start_time = time.perf_counter()  # monotonic clock: immune to system clock adjustments

# Iterate through the input files in name order
for filename in sorted(os.listdir(input_fp)):  # Alternative: for filename in tqdm.tqdm(sorted(os.listdir(input_fp))):
    if not (filename.startswith('tidy_sales_1992_2022_') and filename.endswith('.csv')):
        continue  # not one of the tidy sales files

    df = pd.read_csv(input_fp / filename, low_memory=False)  # read the CSV file into a DataFrame

    # Boolean mask of flagged rows, built once and reused for both the
    # filtering and the reported count.
    is_matching = df.index.isin(find_matching_rows(df))

    # Keep only the rows that were not flagged
    new_df = df[~is_matching]

    # Swap only the leading 'tidy_' prefix (count=1) so that a later 'tidy_'
    # inside the name is left alone, and join the path with pathlib like the
    # input path above.
    output_filename = filename.replace('tidy_', 'unique_', 1)
    new_df.to_csv(output_fp / output_filename, index=False)  # saved without index numbers

    print(f'Number of rows in {filename}: {len(df)}')
    print(f'Number of matching rows: {int(is_matching.sum())}')
    print(f'Number of unique rows: {len(new_df)}')
    print()

elapsed_time = time.perf_counter() - start_time
print(f'Runtime: {elapsed_time}')

Number of rows in tidy_sales_1992_2022_101.csv: 297624
Number of matching rows: 73051
Number of unique rows: 224573

Number of rows in tidy_sales_1992_2022_147.csv: 56327
Number of matching rows: 11937
Number of unique rows: 44390

Number of rows in tidy_sales_1992_2022_151.csv: 18191
Number of matching rows: 5928
Number of unique rows: 12263

Number of rows in tidy_sales_1992_2022_153.csv: 9457
Number of matching rows: 1785
Number of unique rows: 7672

Number of rows in tidy_sales_1992_2022_155.csv: 8115
Number of matching rows: 640
Number of unique rows: 7475

Number of rows in tidy_sales_1992_2022_157.csv: 56459
Number of matching rows: 12842
Number of unique rows: 43617

Number of rows in tidy_sales_1992_2022_159.csv: 30578
Number of matching rows: 4269
Number of unique rows: 26309

Number of rows in tidy_sales_1992_2022_161.csv: 9207
Number of matching rows: 1365
Number of unique rows: 7842

Number of rows in tidy_sales_1992_2022_163.csv: 7311
Number of matching rows: 457
Number o