In [1]:
from pathlib import Path
import pandas as pd
import time
import os
import tqdm

In [2]:
# Replace 'your_file.csv' with the actual path to your CSV file
input_fp = Path.cwd() / 'tidy_data' # path of files to be found

output_fp = Path.cwd() / 'unique_data' # output path of files generated
# Use the Path object to actually create the subfolder
Path.mkdir(output_fp, exist_ok=True)

In [3]:
def find_matching_rows(df):
    grouped = df.groupby(['date_of_sale', 'purchase_amount'])
    matching_indices = []

    for _, group in grouped:
        if len(group) > 1:
            matching_indices.extend(group.index.tolist())

    return matching_indices

In [4]:
start_time = time.time()

# Iterate through log files in the folder
for filename in tqdm.tqdm(sorted(os.listdir(input_fp))):
    if filename.startswith('tidy_sales_1992_2022_') and filename.endswith('.csv'):
        df = pd.read_csv(input_fp/filename, low_memory=False) # read the CSV file into a DataFrame
        print(df.shape)
        matching_indices = find_matching_rows(df)
        matching_df = df[df.index.isin(matching_indices)]
        unique_matching_indices = list(set(matching_indices))

        # List of index numbers to exclude
        index_numbers_to_exclude = unique_matching_indices
        
        # Create a new DataFrame excluding rows with specified index numbers
        new_df = df[~df.index.isin(index_numbers_to_exclude)]
        
        # Rename and save the new DataFrame to a CSV file without index numbers
        output_filename = filename.replace('tidy_', 'unique_')  # Replace 'tidy_' with 'unique_' in filename
        new_df.to_csv(f'{output_fp}/{output_filename}', index=False)

        print(f'Number of rows in {filename}: {len(df)}')
        print(f'Number of matching rows: {len(matching_df)}')
        print(f'Number of unique rows: {len(new_df)}')
        print()

elapsed_time = time.time() - start_time
print(f'Runtime: {elapsed_time}')

  0%|                                                                        | 0/98 [00:00<?, ?it/s]

(297624, 13)


  1%|▋                                                               | 1/98 [00:04<06:57,  4.31s/it]

Number of rows in tidy_sales_1992_2022_101.csv: 297624
Number of matching rows: 82788
Number of unique rows: 214836

(56327, 13)


  2%|█▎                                                              | 2/98 [00:05<03:40,  2.30s/it]

Number of rows in tidy_sales_1992_2022_147.csv: 56327
Number of matching rows: 12199
Number of unique rows: 44128

(18191, 13)


  4%|██▌                                                             | 4/98 [00:05<01:22,  1.14it/s]

Number of rows in tidy_sales_1992_2022_151.csv: 18191
Number of matching rows: 5970
Number of unique rows: 12221

(9457, 13)
Number of rows in tidy_sales_1992_2022_153.csv: 9457
Number of matching rows: 1791
Number of unique rows: 7666

(8115, 13)


  5%|███▎                                                            | 5/98 [00:05<00:56,  1.63it/s]

Number of rows in tidy_sales_1992_2022_155.csv: 8115
Number of matching rows: 640
Number of unique rows: 7475

(56459, 13)


  6%|███▉                                                            | 6/98 [00:06<01:04,  1.43it/s]

Number of rows in tidy_sales_1992_2022_157.csv: 56459
Number of matching rows: 13149
Number of unique rows: 43310

(30578, 13)


  8%|█████▏                                                          | 8/98 [00:07<00:44,  2.03it/s]

Number of rows in tidy_sales_1992_2022_159.csv: 30578
Number of matching rows: 4413
Number of unique rows: 26165

(9207, 13)
Number of rows in tidy_sales_1992_2022_161.csv: 9207
Number of matching rows: 1365
Number of unique rows: 7842

(7311, 13)


 10%|██████▍                                                        | 10/98 [00:07<00:26,  3.36it/s]

Number of rows in tidy_sales_1992_2022_163.csv: 7311
Number of matching rows: 457
Number of unique rows: 6854

(7613, 13)
Number of rows in tidy_sales_1992_2022_165.csv: 7613
Number of matching rows: 287
Number of unique rows: 7326

(19979, 13)


 11%|███████                                                        | 11/98 [00:07<00:26,  3.25it/s]

Number of rows in tidy_sales_1992_2022_167.csv: 19979
Number of matching rows: 1734
Number of unique rows: 18245

(28437, 13)


 12%|███████▋                                                       | 12/98 [00:08<00:32,  2.65it/s]

Number of rows in tidy_sales_1992_2022_169.csv: 28437
Number of matching rows: 2820
Number of unique rows: 25617

(27690, 13)


 13%|████████▎                                                      | 13/98 [00:08<00:35,  2.38it/s]

Number of rows in tidy_sales_1992_2022_173.csv: 27690
Number of matching rows: 3131
Number of unique rows: 24559

(18852, 13)


 15%|█████████▋                                                     | 15/98 [00:09<00:28,  2.94it/s]

Number of rows in tidy_sales_1992_2022_175.csv: 18852
Number of matching rows: 3267
Number of unique rows: 15585

(8968, 13)
Number of rows in tidy_sales_1992_2022_183.csv: 8968
Number of matching rows: 208
Number of unique rows: 8760

(17920, 13)


 17%|██████████▉                                                    | 17/98 [00:09<00:22,  3.57it/s]

Number of rows in tidy_sales_1992_2022_185.csv: 17920
Number of matching rows: 951
Number of unique rows: 16969

(10933, 13)
Number of rows in tidy_sales_1992_2022_187.csv: 10933
Number of matching rows: 1584
Number of unique rows: 9349

(18055, 13)


 19%|████████████▏                                                  | 19/98 [00:10<00:21,  3.73it/s]

Number of rows in tidy_sales_1992_2022_190.csv: 18055
Number of matching rows: 909
Number of unique rows: 17146

(12332, 13)
Number of rows in tidy_sales_1992_2022_201.csv: 12332
Number of matching rows: 746
Number of unique rows: 11586

(22542, 13)


 20%|████████████▊                                                  | 20/98 [00:10<00:24,  3.13it/s]

Number of rows in tidy_sales_1992_2022_210.csv: 22542
Number of matching rows: 1419
Number of unique rows: 21123

(36523, 13)


 21%|█████████████▌                                                 | 21/98 [00:11<00:30,  2.54it/s]

Number of rows in tidy_sales_1992_2022_217.csv: 36523
Number of matching rows: 5704
Number of unique rows: 30819

(27576, 13)


 22%|██████████████▏                                                | 22/98 [00:11<00:32,  2.37it/s]

Number of rows in tidy_sales_1992_2022_219.csv: 27576
Number of matching rows: 4844
Number of unique rows: 22732

(15326, 13)


 23%|██████████████▊                                                | 23/98 [00:12<00:27,  2.72it/s]

Number of rows in tidy_sales_1992_2022_223.csv: 15326
Number of matching rows: 1565
Number of unique rows: 13761

(32915, 13)


 24%|███████████████▍                                               | 24/98 [00:12<00:31,  2.34it/s]

Number of rows in tidy_sales_1992_2022_230.csv: 32915
Number of matching rows: 2634
Number of unique rows: 30281

(23045, 13)


 26%|████████████████                                               | 25/98 [00:13<00:30,  2.37it/s]

Number of rows in tidy_sales_1992_2022_240.csv: 23045
Number of matching rows: 1603
Number of unique rows: 21442

(23577, 13)


 27%|████████████████▋                                              | 26/98 [00:13<00:29,  2.41it/s]

Number of rows in tidy_sales_1992_2022_250.csv: 23577
Number of matching rows: 3348
Number of unique rows: 20229

(25561, 13)


 28%|█████████████████▎                                             | 27/98 [00:14<00:30,  2.31it/s]

Number of rows in tidy_sales_1992_2022_253.csv: 25561
Number of matching rows: 3355
Number of unique rows: 22206

(29250, 13)


 29%|██████████████████                                             | 28/98 [00:14<00:31,  2.21it/s]

Number of rows in tidy_sales_1992_2022_259.csv: 29250
Number of matching rows: 3272
Number of unique rows: 25978

(16817, 13)


 30%|██████████████████▋                                            | 29/98 [00:14<00:27,  2.48it/s]

Number of rows in tidy_sales_1992_2022_260.csv: 16817
Number of matching rows: 2070
Number of unique rows: 14747

(47446, 13)


 31%|███████████████████▎                                           | 30/98 [00:15<00:35,  1.92it/s]

Number of rows in tidy_sales_1992_2022_265.csv: 47446
Number of matching rows: 7480
Number of unique rows: 39966

(14714, 13)


 32%|███████████████████▉                                           | 31/98 [00:15<00:29,  2.26it/s]

Number of rows in tidy_sales_1992_2022_269.csv: 14714
Number of matching rows: 1489
Number of unique rows: 13225

(22753, 13)


 33%|████████████████████▌                                          | 32/98 [00:16<00:28,  2.35it/s]

Number of rows in tidy_sales_1992_2022_270.csv: 22753
Number of matching rows: 3116
Number of unique rows: 19637

(20082, 13)


 34%|█████████████████████▏                                         | 33/98 [00:16<00:26,  2.48it/s]

Number of rows in tidy_sales_1992_2022_306.csv: 20082
Number of matching rows: 3572
Number of unique rows: 16510

(40005, 13)


 35%|█████████████████████▊                                         | 34/98 [00:17<00:31,  2.06it/s]

Number of rows in tidy_sales_1992_2022_316.csv: 40005
Number of matching rows: 8413
Number of unique rows: 31592

(22207, 13)


 36%|██████████████████████▌                                        | 35/98 [00:17<00:28,  2.24it/s]

Number of rows in tidy_sales_1992_2022_320.csv: 22207
Number of matching rows: 3765
Number of unique rows: 18442

(29496, 13)


 37%|███████████████████████▏                                       | 36/98 [00:18<00:28,  2.14it/s]

Number of rows in tidy_sales_1992_2022_326.csv: 29496
Number of matching rows: 5075
Number of unique rows: 24421

(19263, 13)


 38%|███████████████████████▊                                       | 37/98 [00:18<00:25,  2.38it/s]

Number of rows in tidy_sales_1992_2022_329.csv: 19263
Number of matching rows: 3670
Number of unique rows: 15593

(49930, 13)


 39%|████████████████████████▍                                      | 38/98 [00:19<00:30,  1.94it/s]

Number of rows in tidy_sales_1992_2022_330.csv: 49930
Number of matching rows: 14891
Number of unique rows: 35039

(14017, 13)


 40%|█████████████████████████                                      | 39/98 [00:19<00:25,  2.29it/s]

Number of rows in tidy_sales_1992_2022_336.csv: 14017
Number of matching rows: 1994
Number of unique rows: 12023

(19098, 13)


 41%|█████████████████████████▋                                     | 40/98 [00:19<00:22,  2.55it/s]

Number of rows in tidy_sales_1992_2022_340.csv: 19098
Number of matching rows: 4311
Number of unique rows: 14787

(13663, 13)


 42%|██████████████████████████▎                                    | 41/98 [00:19<00:19,  2.89it/s]

Number of rows in tidy_sales_1992_2022_350.csv: 13663
Number of matching rows: 1233
Number of unique rows: 12430

(30435, 13)


 43%|███████████████████████████                                    | 42/98 [00:20<00:21,  2.55it/s]

Number of rows in tidy_sales_1992_2022_360.csv: 30435
Number of matching rows: 4373
Number of unique rows: 26062

(48189, 13)


 44%|███████████████████████████▋                                   | 43/98 [00:21<00:28,  1.92it/s]

Number of rows in tidy_sales_1992_2022_370.csv: 48189
Number of matching rows: 7390
Number of unique rows: 40799

(41517, 13)


 45%|████████████████████████████▎                                  | 44/98 [00:21<00:30,  1.77it/s]

Number of rows in tidy_sales_1992_2022_376.csv: 41517
Number of matching rows: 6831
Number of unique rows: 34686

(31615, 13)


 46%|████████████████████████████▉                                  | 45/98 [00:22<00:28,  1.83it/s]

Number of rows in tidy_sales_1992_2022_390.csv: 31615
Number of matching rows: 6356
Number of unique rows: 25259

(25644, 13)


 47%|█████████████████████████████▌                                 | 46/98 [00:22<00:26,  1.95it/s]

Number of rows in tidy_sales_1992_2022_400.csv: 25644
Number of matching rows: 2557
Number of unique rows: 23087

(24561, 13)


 48%|██████████████████████████████▏                                | 47/98 [00:23<00:24,  2.11it/s]

Number of rows in tidy_sales_1992_2022_410.csv: 24561
Number of matching rows: 4967
Number of unique rows: 19594

(25835, 13)


 49%|██████████████████████████████▊                                | 48/98 [00:23<00:23,  2.11it/s]

Number of rows in tidy_sales_1992_2022_420.csv: 25835
Number of matching rows: 5045
Number of unique rows: 20790

(30803, 13)


 50%|███████████████████████████████▌                               | 49/98 [00:24<00:23,  2.07it/s]

Number of rows in tidy_sales_1992_2022_430.csv: 30803
Number of matching rows: 5393
Number of unique rows: 25410

(12541, 13)


 51%|████████████████████████████████▏                              | 50/98 [00:24<00:19,  2.48it/s]

Number of rows in tidy_sales_1992_2022_440.csv: 12541
Number of matching rows: 2650
Number of unique rows: 9891

(18398, 13)


 52%|████████████████████████████████▊                              | 51/98 [00:24<00:16,  2.78it/s]

Number of rows in tidy_sales_1992_2022_450.csv: 18398
Number of matching rows: 3654
Number of unique rows: 14744

(113911, 13)


 53%|█████████████████████████████████▍                             | 52/98 [00:26<00:32,  1.40it/s]

Number of rows in tidy_sales_1992_2022_461.csv: 113911
Number of matching rows: 37011
Number of unique rows: 76900

(35586, 13)


 54%|██████████████████████████████████                             | 53/98 [00:26<00:29,  1.50it/s]

Number of rows in tidy_sales_1992_2022_479.csv: 35586
Number of matching rows: 8347
Number of unique rows: 27239

(17412, 13)


 56%|███████████████████████████████████▎                           | 55/98 [00:27<00:18,  2.27it/s]

Number of rows in tidy_sales_1992_2022_480.csv: 17412
Number of matching rows: 2506
Number of unique rows: 14906

(11302, 13)
Number of rows in tidy_sales_1992_2022_482.csv: 11302
Number of matching rows: 1738
Number of unique rows: 9564

(6389, 13)


 57%|████████████████████████████████████                           | 56/98 [00:27<00:14,  2.94it/s]

Number of rows in tidy_sales_1992_2022_492.csv: 6389
Number of matching rows: 699
Number of unique rows: 5690

(33784, 13)


 58%|████████████████████████████████████▋                          | 57/98 [00:27<00:15,  2.62it/s]

Number of rows in tidy_sales_1992_2022_510.csv: 33784
Number of matching rows: 10556
Number of unique rows: 23228

(15569, 13)


 59%|█████████████████████████████████████▎                         | 58/98 [00:28<00:13,  2.89it/s]

Number of rows in tidy_sales_1992_2022_530.csv: 15569
Number of matching rows: 2652
Number of unique rows: 12917

(38795, 13)


 60%|█████████████████████████████████████▉                         | 59/98 [00:28<00:16,  2.37it/s]

Number of rows in tidy_sales_1992_2022_540.csv: 38795
Number of matching rows: 8739
Number of unique rows: 30056

(23490, 13)


 61%|██████████████████████████████████████▌                        | 60/98 [00:29<00:15,  2.48it/s]

Number of rows in tidy_sales_1992_2022_550.csv: 23490
Number of matching rows: 5440
Number of unique rows: 18050

(64520, 13)


 62%|███████████████████████████████████████▏                       | 61/98 [00:30<00:20,  1.77it/s]

Number of rows in tidy_sales_1992_2022_561.csv: 64520
Number of matching rows: 15025
Number of unique rows: 49495

(2286, 13)
Number of rows in tidy_sales_1992_2022_563.csv: 2286
Number of matching rows: 253
Number of unique rows: 2033

(27225, 13)


 64%|████████████████████████████████████████▌                      | 63/98 [00:30<00:14,  2.34it/s]

Number of rows in tidy_sales_1992_2022_573.csv: 27225
Number of matching rows: 3945
Number of unique rows: 23280

(24550, 13)


 65%|█████████████████████████████████████████▏                     | 64/98 [00:31<00:14,  2.31it/s]

Number of rows in tidy_sales_1992_2022_575.csv: 24550
Number of matching rows: 4084
Number of unique rows: 20466

(33994, 13)


 66%|█████████████████████████████████████████▊                     | 65/98 [00:31<00:15,  2.19it/s]

Number of rows in tidy_sales_1992_2022_580.csv: 33994
Number of matching rows: 7656
Number of unique rows: 26338

(30956, 13)


 67%|██████████████████████████████████████████▍                    | 66/98 [00:32<00:15,  2.09it/s]

Number of rows in tidy_sales_1992_2022_607.csv: 30956
Number of matching rows: 9545
Number of unique rows: 21411

(57306, 13)


 68%|███████████████████████████████████████████                    | 67/98 [00:32<00:18,  1.72it/s]

Number of rows in tidy_sales_1992_2022_615.csv: 57306
Number of matching rows: 15857
Number of unique rows: 41449

(55384, 13)


 69%|███████████████████████████████████████████▋                   | 68/98 [00:33<00:19,  1.54it/s]

Number of rows in tidy_sales_1992_2022_621.csv: 55384
Number of matching rows: 14846
Number of unique rows: 40538

(69843, 13)


 70%|████████████████████████████████████████████▎                  | 69/98 [00:34<00:21,  1.34it/s]

Number of rows in tidy_sales_1992_2022_630.csv: 69843
Number of matching rows: 20444
Number of unique rows: 49399

(50205, 13)


 71%|█████████████████████████████████████████████                  | 70/98 [00:35<00:21,  1.33it/s]

Number of rows in tidy_sales_1992_2022_657.csv: 50205
Number of matching rows: 11318
Number of unique rows: 38887

(34380, 13)


 72%|█████████████████████████████████████████████▋                 | 71/98 [00:36<00:18,  1.42it/s]

Number of rows in tidy_sales_1992_2022_661.csv: 34380
Number of matching rows: 5869
Number of unique rows: 28511

(12568, 13)


 74%|██████████████████████████████████████████████▉                | 73/98 [00:36<00:11,  2.23it/s]

Number of rows in tidy_sales_1992_2022_665.csv: 12568
Number of matching rows: 2090
Number of unique rows: 10478

(12693, 13)
Number of rows in tidy_sales_1992_2022_671.csv: 12693
Number of matching rows: 2354
Number of unique rows: 10339

(25833, 13)


 76%|███████████████████████████████████████████████▌               | 74/98 [00:36<00:10,  2.21it/s]

Number of rows in tidy_sales_1992_2022_706.csv: 25833
Number of matching rows: 3864
Number of unique rows: 21969

(22835, 13)


 77%|████████████████████████████████████████████████▏              | 75/98 [00:37<00:09,  2.33it/s]

Number of rows in tidy_sales_1992_2022_707.csv: 22835
Number of matching rows: 4197
Number of unique rows: 18638

(27025, 13)


 79%|█████████████████████████████████████████████████▌             | 77/98 [00:38<00:07,  2.69it/s]

Number of rows in tidy_sales_1992_2022_710.csv: 27025
Number of matching rows: 3343
Number of unique rows: 23682

(13506, 13)
Number of rows in tidy_sales_1992_2022_727.csv: 13506
Number of matching rows: 2013
Number of unique rows: 11493

(64035, 13)


 80%|██████████████████████████████████████████████████▏            | 78/98 [00:39<00:11,  1.78it/s]

Number of rows in tidy_sales_1992_2022_730.csv: 64035
Number of matching rows: 15682
Number of unique rows: 48353

(51936, 13)


 81%|██████████████████████████████████████████████████▊            | 79/98 [00:39<00:12,  1.58it/s]

Number of rows in tidy_sales_1992_2022_740.csv: 51936
Number of matching rows: 10461
Number of unique rows: 41475

(4333, 13)
Number of rows in tidy_sales_1992_2022_741.csv: 4333
Number of matching rows: 802
Number of unique rows: 3531

(35112, 13)


 83%|████████████████████████████████████████████████████           | 81/98 [00:40<00:08,  2.04it/s]

Number of rows in tidy_sales_1992_2022_746.csv: 35112
Number of matching rows: 4239
Number of unique rows: 30873

(193113, 13)


 84%|████████████████████████████████████████████████████▋          | 82/98 [00:43<00:16,  1.01s/it]

Number of rows in tidy_sales_1992_2022_751.csv: 193113
Number of matching rows: 55769
Number of unique rows: 137344

(25118, 13)


 85%|█████████████████████████████████████████████████████▎         | 83/98 [00:43<00:12,  1.18it/s]

Number of rows in tidy_sales_1992_2022_756.csv: 25118
Number of matching rows: 4843
Number of unique rows: 20275

(32556, 13)


 86%|██████████████████████████████████████████████████████         | 84/98 [00:44<00:10,  1.32it/s]

Number of rows in tidy_sales_1992_2022_760.csv: 32556
Number of matching rows: 4786
Number of unique rows: 27770

(29934, 13)


 87%|██████████████████████████████████████████████████████▋        | 85/98 [00:44<00:09,  1.44it/s]

Number of rows in tidy_sales_1992_2022_766.csv: 29934
Number of matching rows: 4410
Number of unique rows: 25524

(13567, 13)


 88%|███████████████████████████████████████████████████████▎       | 86/98 [00:44<00:06,  1.80it/s]

Number of rows in tidy_sales_1992_2022_773.csv: 13567
Number of matching rows: 2341
Number of unique rows: 11226

(26152, 13)


 89%|███████████████████████████████████████████████████████▉       | 87/98 [00:45<00:05,  1.91it/s]

Number of rows in tidy_sales_1992_2022_779.csv: 26152
Number of matching rows: 5719
Number of unique rows: 20433

(28038, 13)


 90%|████████████████████████████████████████████████████████▌      | 88/98 [00:45<00:05,  1.95it/s]

Number of rows in tidy_sales_1992_2022_787.csv: 28038
Number of matching rows: 4543
Number of unique rows: 23495

(55286, 13)


 91%|█████████████████████████████████████████████████████████▏     | 89/98 [00:46<00:05,  1.67it/s]

Number of rows in tidy_sales_1992_2022_791.csv: 55286
Number of matching rows: 11068
Number of unique rows: 44218

(22423, 13)


 92%|█████████████████████████████████████████████████████████▊     | 90/98 [00:46<00:04,  1.82it/s]

Number of rows in tidy_sales_1992_2022_810.csv: 22423
Number of matching rows: 4969
Number of unique rows: 17454

(37590, 13)


 93%|██████████████████████████████████████████████████████████▌    | 91/98 [00:47<00:03,  1.75it/s]

Number of rows in tidy_sales_1992_2022_813.csv: 37590
Number of matching rows: 7821
Number of unique rows: 29769

(23307, 13)


 94%|███████████████████████████████████████████████████████████▏   | 92/98 [00:47<00:03,  1.95it/s]

Number of rows in tidy_sales_1992_2022_820.csv: 23307
Number of matching rows: 5346
Number of unique rows: 17961

(1722, 13)
Number of rows in tidy_sales_1992_2022_825.csv: 1722
Number of matching rows: 235
Number of unique rows: 1487

(17164, 13)


 96%|████████████████████████████████████████████████████████████▍  | 94/98 [00:48<00:01,  2.82it/s]

Number of rows in tidy_sales_1992_2022_840.csv: 17164
Number of matching rows: 2024
Number of unique rows: 15140

(27197, 13)


 97%|█████████████████████████████████████████████████████████████  | 95/98 [00:48<00:01,  2.73it/s]

Number of rows in tidy_sales_1992_2022_846.csv: 27197
Number of matching rows: 6883
Number of unique rows: 20314

(25821, 13)


 98%|█████████████████████████████████████████████████████████████▋ | 96/98 [00:49<00:00,  2.57it/s]

Number of rows in tidy_sales_1992_2022_849.csv: 25821
Number of matching rows: 3948
Number of unique rows: 21873

(131115, 13)


 99%|██████████████████████████████████████████████████████████████▎| 97/98 [00:50<00:00,  1.29it/s]

Number of rows in tidy_sales_1992_2022_851.csv: 131115
Number of matching rows: 36905
Number of unique rows: 94210

(41978, 13)


100%|███████████████████████████████████████████████████████████████| 98/98 [00:51<00:00,  1.90it/s]

Number of rows in tidy_sales_1992_2022_860.csv: 41978
Number of matching rows: 9402
Number of unique rows: 32576

Runtime: 51.593843936920166



