In [None]:
from pathlib import Path
import pandas as pd
import time
import os
import tqdm

In [None]:
# Folder locations, both resolved against the current working directory.
input_fp = Path.cwd().joinpath('tidy_data')  # folder the input CSV files are read from
output_fp = Path.cwd().joinpath('unique_data')  # folder the generated files are written to

# Make sure the output folder exists; an already existing folder is left as is.
output_fp.mkdir(exist_ok=True)

In [None]:
def find_matching_rows(df):
    """Return the index labels of rows that share their sale key with another row.

    Rows are grouped on ``date_of_sale``, ``purchase_amount`` and ``zip_code``;
    every group that holds more than one row contributes all of its index
    labels to the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three key columns named above.

    Returns
    -------
    list
        Index labels of all rows belonging to a multi-row group, emitted group
        by group in the order ``groupby`` yields the groups.
    """
    key_columns = ['date_of_sale', 'purchase_amount', 'zip_code']
    # NOTE(review): with pandas' default groupby settings, rows with a missing
    # value in a key column are presumably not placed in any group and so are
    # never reported here -- confirm that this is intended.
    return [
        label
        for _, rows in df.groupby(key_columns)
        if len(rows) > 1
        for label in rows.index.tolist()
    ]

In [None]:
# Time the whole batch. perf_counter() is a monotonic clock, so the reported
# runtime cannot be skewed by system clock adjustments during the run.
start_time = time.perf_counter()

# Process every tidy sales file in the input folder, in sorted filename order.
for filename in sorted(os.listdir(input_fp)):  # Alternative: for filename in tqdm.tqdm(sorted(os.listdir(input_fp))):
    # Skip anything that is not one of the expected tidy sales CSV files.
    if not (filename.startswith('tidy_sales_1992_2022_') and filename.endswith('.csv')):
        continue

    df = pd.read_csv(input_fp / filename, low_memory=False)  # read the CSV file into a DataFrame

    # Index labels of rows whose (date_of_sale, purchase_amount, zip_code)
    # combination occurs more than once in this file.
    matching_indices = find_matching_rows(df)

    # Build the membership mask once and use it for both halves of the split.
    is_matching = df.index.isin(matching_indices)
    matching_df = df[is_matching]
    # Every row of a repeated combination is excluded, not just the extra copies.
    new_df = df[~is_matching]

    # Swap only the leading 'tidy_' prefix for 'unique_'; a later 'tidy_' in the
    # rest of the filename is left untouched.
    output_filename = filename.replace('tidy_', 'unique_', 1)
    # Save without the index column, joining the path the same way as the input.
    new_df.to_csv(output_fp / output_filename, index=False)

    print(f'Number of rows in {filename}: {len(df)}')
    print(f'Number of matching rows: {len(matching_df)}')
    print(f'Number of unique rows: {len(new_df)}')
    print()

elapsed_time = time.perf_counter() - start_time
print(f'Runtime: {elapsed_time}')