# active / inactive

This notebook generates the daily active / inactive tables for the fp (Foodpanda) shop list (`shopLst`).  
There should be two functions:

1. create active / inactive
Given a start date and an end date, this function should fetch all data in that time interval, concatenate it,
and build an active table and an inactive table.  

2. update to target
Given a target date, this function should update the active table day by day up to the target date. 

## lib

In [9]:
import pandas as pd
import numpy as np 
import os
import traceback
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from typing import List, Optional, Dict

## func

In [27]:
from pathlib import Path
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import traceback
from typing import List, Optional, Dict

def deep_find_files(file_format: str, directory: str, *keywords: str) -> List[str]:
    """
    Recursively collect paths under `directory` whose name ends in `.{file_format}`.

    Args:
        file_format: Extension to look for, without the leading dot (e.g. 'csv').
        directory: Root folder; it and every subfolder are searched via `Path.rglob`.
        *keywords: Optional substrings; a path is kept only if its file name
            (not the full path) contains every one of them.

    Returns:
        The matching paths as strings, in the order `rglob` yields them.
    """
    pattern = f'*.{file_format}'
    matches: List[str] = []
    for candidate in Path(directory).rglob(pattern):
        filename = candidate.name
        # Reject as soon as one keyword is missing from the file name.
        if any(keyword not in filename for keyword in keywords):
            continue
        matches.append(str(candidate))
    return matches

def concat_folder(folder_path, keywords: List[str], max_workers=8):
    """
    Read every CSV under `folder_path` (and subfolders) in parallel and
    return them stacked into one DataFrame.

    Parameters
    ----------
    folder_path : str or path-like
        Folder to scan recursively. Its last path component is written
        verbatim into a `scrape_batch_date` column on every row, so the
        folder is expected to be named YYYY-MM-DD.
    keywords : list of str
        Substrings that must all occur in a file's name for it to be read
        (an empty list reads every CSV).
    max_workers : int, default 8
        Number of threads used for the parallel `pd.read_csv` calls.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all non-empty files with a fresh integer index, or
        an empty DataFrame when nothing was found, nothing could be read, or
        an unexpected error occurred (the traceback is printed, not raised).
    """
    try:
        # Find all CSV files in the folder (and its subfolders)
        csv_files = deep_find_files('csv', folder_path, *keywords)
        print(f"Found {len(csv_files)} CSV files in {folder_path}")
        if not csv_files:
            return pd.DataFrame()

        # Assume folder_path ends with YYYY-MM-DD
        date_str = os.path.basename(os.path.normpath(folder_path))

        dfs = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all reads in parallel; results are consumed in submission
            # order so the row order of the output is deterministic.
            futures = {executor.submit(pd.read_csv, fp): fp for fp in csv_files}
            for fut, fp in futures.items():
                try:
                    df = fut.result()
                    # Header-only files contribute no rows, and passing empty
                    # frames to pd.concat is deprecated (it changes how result
                    # dtypes are inferred), so leave them out.
                    if df.empty:
                        continue
                    # Add the scrape_batch_date column
                    df['scrape_batch_date'] = date_str
                    dfs.append(df)
                except Exception as e:
                    # Best effort: one unreadable file must not lose the folder.
                    print(f"⚠️ failed to read {fp}: {e}")

        # Concatenate or return empty DataFrame
        return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

    except Exception:
        print(f"Error in concat_folder({folder_path}):")
        traceback.print_exc()
        return pd.DataFrame()

In [28]:
def _read_existing_table(csv_path):
    """Return the table stored at `csv_path`, or an empty DataFrame if the file is missing or unreadable."""
    import os

    if not os.path.exists(csv_path):
        return pd.DataFrame()
    try:
        table = pd.read_csv(csv_path)
        print(f"Loaded {len(table)} shops from {csv_path}")
        return table
    except Exception as e:
        print(f"Error reading {csv_path}: {e}")
        return pd.DataFrame()


def update_survival_dfs(base_folder, 
                        start_date, end_date,
                        keywords,
                        max_workers=8,
                        active_file="active_shops.csv", 
                        inactive_file="inactive_shops.csv",
                        inactive_days=60):
    """
    Create or update the active / inactive shop tables.

    1) Reads the existing active & inactive CSVs if they exist
    2) Loads the daily folders `base_folder/YYYY-MM-DD` for every date from
       `start_date` to `end_date` (inclusive) via `concat_folder`
    3) Combines stored and new rows, keeps one row per shop (its most recent
       record) and attaches `first_seen` / `last_seen`
    4) Splits on the number of days between a shop's `last_seen` and the most
       recent `last_seen` of any shop, and overwrites both CSVs

    Parameters
    ----------
    base_folder : str
        Folder containing one subfolder per scrape date.
    start_date, end_date : anything accepted by `pd.date_range`
        Inclusive range of dates to load.
    keywords : list of str
        Forwarded to `concat_folder` to filter CSV file names.
    max_workers : int, default 8
        Thread count for the per-folder pool; also forwarded to
        `concat_folder` for its per-file pool.
    active_file, inactive_file : str
        CSV paths that are read as the previous state and then overwritten.
    inactive_days : int, default 60
        A shop is inactive when its `last_seen` is at least this many days
        before the reference date.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        `(active_df, inactive_df)`. If no new data is found, or any step
        fails (the traceback is printed), the previously stored tables are
        returned unchanged and nothing is written; with no stored tables
        either, two empty DataFrames are returned.
    """
    # Read existing files if they exist
    existing_active = _read_existing_table(active_file)
    existing_inactive = _read_existing_table(inactive_file)
    has_existing = not (existing_active.empty and existing_inactive.empty)

    # Process new data
    try:
        # build list of date-folders
        dates = pd.date_range(start=start_date, end=end_date)
        folders = [f"{base_folder}/{d.strftime('%Y-%m-%d')}" for d in dates]

        # parallel load with progress bar
        dfs = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for df in tqdm(
                executor.map(lambda fp: concat_folder(fp, keywords, max_workers), folders),
                total=len(folders),
                desc="Processing new dates",
            ):
                if not df.empty:
                    dfs.append(df)

        if not dfs:
            print(f"No new data found in any folder from {start_date} to {end_date}")
            # If we have existing data, just return it
            if has_existing:
                return existing_active, existing_inactive
            return pd.DataFrame(), pd.DataFrame()

        # merge all daily data from new period
        new_date_shoplst_df = pd.concat(dfs, ignore_index=True)

        # ensure datetime
        new_date_shoplst_df['scrape_batch_date'] = pd.to_datetime(
            new_date_shoplst_df['scrape_batch_date'],
            format='%Y-%m-%d'
        )

        # first_seen per shop as recorded by earlier runs (None on a first run)
        known_first_seen = None
        if has_existing:
            # Convert date columns to datetime in existing data
            for df in [existing_active, existing_inactive]:
                if not df.empty:
                    for col in ['scrape_batch_date', 'first_seen', 'last_seen']:
                        if col in df.columns:
                            df[col] = pd.to_datetime(df[col])

            # Combine existing active and inactive; an empty table is left out
            # because concatenating empty frames is deprecated in pandas.
            existing_combined = pd.concat(
                [table for table in (existing_active, existing_inactive) if not table.empty],
                ignore_index=True,
            )

            # The stored tables hold only each shop's LAST record, so its
            # scrape_batch_date says nothing about when the shop first
            # appeared. Remember the stored first_seen before it is dropped.
            if {'shopCode', 'first_seen'} <= set(existing_combined.columns):
                known_first_seen = existing_combined.groupby('shopCode')['first_seen'].min()

            # Extract just the raw data columns (excluding first_seen, last_seen)
            cols_to_keep = [col for col in existing_combined.columns 
                           if col not in ['first_seen', 'last_seen']]

            existing_raw = existing_combined[cols_to_keep].copy()

            # Combine with new data
            date_shoplst_df = pd.concat([existing_raw, new_date_shoplst_df], ignore_index=True)
        else:
            date_shoplst_df = new_date_shoplst_df

        # Remove potential duplicates (same shop on same day); the newly
        # loaded row wins over the stored one because it comes last.
        date_shoplst_df = date_shoplst_df.drop_duplicates(
            subset=['shopCode', 'scrape_batch_date'], 
            keep='last'
        )

        # compute first & last seen per shop
        shop_dates = (
            date_shoplst_df
            .groupby('shopCode')['scrape_batch_date']
            .agg(first_seen='min', last_seen='max')
            .reset_index()
        )

        if known_first_seen is not None:
            # Keep the earlier of the stored and the recomputed first_seen;
            # shops without a stored value (NaT after the map) keep the
            # recomputed one.
            carried = shop_dates['shopCode'].map(known_first_seen)
            keep_recomputed = carried.isna() | (shop_dates['first_seen'] <= carried)
            shop_dates['first_seen'] = shop_dates['first_seen'].where(keep_recomputed, carried)

        # grab each shop's full row at its last_seen date
        last_record_df = (
            date_shoplst_df
            .sort_values(['shopCode','scrape_batch_date'], ascending=[True, False])
            .drop_duplicates(subset='shopCode', keep='first')
        )

        # merge to bring in first_seen/last_seen
        merged = last_record_df.merge(shop_dates, on='shopCode', how='left')

        # reference = latest last_seen
        reference_date = shop_dates['last_seen'].max()
        merged['days_since_last'] = (reference_date - merged['last_seen']).dt.days

        # split
        active_df = merged[merged['days_since_last'] < inactive_days].drop(columns='days_since_last')
        inactive_df = merged[merged['days_since_last'] >= inactive_days].drop(columns='days_since_last')

        # Save updated files
        active_df.to_csv(active_file, encoding="utf-8-sig", index=False)
        print(f"Saved {len(active_df)} active shops to {active_file}")

        inactive_df.to_csv(inactive_file, encoding="utf-8-sig", index=False)
        print(f"Saved {len(inactive_df)} inactive shops to {inactive_file}")

        return active_df, inactive_df

    except Exception:
        # Best effort: report the failure and hand back the previous state
        # instead of raising.
        print("Error in update_survival_dfs:")
        traceback.print_exc()
        if has_existing:
            print("Returning existing data without updates")
            return existing_active, existing_inactive
        return pd.DataFrame(), pd.DataFrame()


## implement on test data

In [30]:
# Test run: process every daily folder of 2024 under data/survive_test_data.
# Output goes to the test_* CSVs; if those files already exist they are read
# first and merged with the newly loaded days.
active, inactive = update_survival_dfs(
    "data/survive_test_data",  # base_folder
    "2024-01-01",              # start_date (inclusive)
    "2024-12-31",              # end_date (inclusive)
    keywords=[],               # no file-name filter: read every CSV
    max_workers=8,
    active_file="test_active_shops.csv",
    inactive_file="test_inactive_shops.csv",
)

Found 2 CSV files in data/survive_test_data/2024-01-03Found 2 CSV files in data/survive_test_data/2024-01-01

Found 2 CSV files in data/survive_test_data/2024-01-02
Found 2 CSV files in data/survive_test_data/2024-01-04
Found 0 CSV files in data/survive_test_data/2024-01-07
Found 0 CSV files in data/survive_test_data/2024-01-05
Found 0 CSV files in data/survive_test_data/2024-01-06
Found 0 CSV files in data/survive_test_data/2024-01-08
Found 0 CSV files in data/survive_test_data/2024-01-09
Found 2 CSV files in data/survive_test_data/2024-01-12
Found 2 CSV files in data/survive_test_data/2024-01-10
Found 2 CSV files in data/survive_test_data/2024-01-11
Found 2 CSV files in data/survive_test_data/2024-01-13


Processing new dates:   0%|          | 0/366 [00:00<?, ?it/s]

Found 2 CSV files in data/survive_test_data/2024-01-14
Found 2 CSV files in data/survive_test_data/2024-01-15
Found 2 CSV files in data/survive_test_data/2024-01-16
Found 2 CSV files in data/survive_test_data/2024-01-18
Found 2 CSV files in data/survive_test_data/2024-01-17
Found 2 CSV files in data/survive_test_data/2024-01-19
Found 2 CSV files in data/survive_test_data/2024-01-20


Processing new dates:   4%|▍         | 16/366 [00:00<00:02, 154.75it/s]

Found 2 CSV files in data/survive_test_data/2024-01-21
Found 2 CSV files in data/survive_test_data/2024-01-23
Found 2 CSV files in data/survive_test_data/2024-01-22
Found 2 CSV files in data/survive_test_data/2024-01-24
Found 2 CSV files in data/survive_test_data/2024-01-25
Found 2 CSV files in data/survive_test_data/2024-01-26
Found 2 CSV files in data/survive_test_data/2024-01-27
Found 2 CSV files in data/survive_test_data/2024-01-28
Found 2 CSV files in data/survive_test_data/2024-01-29
Found 2 CSV files in data/survive_test_data/2024-01-30
Found 2 CSV files in data/survive_test_data/2024-01-31
Found 2 CSV files in data/survive_test_data/2024-02-02Found 2 CSV files in data/survive_test_data/2024-02-03

Found 2 CSV files in data/survive_test_data/2024-02-04
Found 2 CSV files in data/survive_test_data/2024-02-01
Found 2 CSV files in data/survive_test_data/2024-02-05
Found 2 CSV files in data/survive_test_data/2024-02-06
Found 2 CSV files in data/survive_test_data/2024-02-07
Found 2 CS

Processing new dates:   9%|▊         | 32/366 [00:00<00:02, 122.72it/s]

Found 2 CSV files in data/survive_test_data/2024-02-09
Found 2 CSV files in data/survive_test_data/2024-02-10
Found 2 CSV files in data/survive_test_data/2024-02-12
Found 2 CSV files in data/survive_test_data/2024-02-11
Found 2 CSV files in data/survive_test_data/2024-02-13
Found 2 CSV files in data/survive_test_data/2024-02-15
Found 2 CSV files in data/survive_test_data/2024-02-14


Processing new dates:  12%|█▏        | 45/366 [00:00<00:03, 90.87it/s] 

Found 2 CSV files in data/survive_test_data/2024-02-16
Found 2 CSV files in data/survive_test_data/2024-02-17
Found 2 CSV files in data/survive_test_data/2024-02-18
Found 2 CSV files in data/survive_test_data/2024-02-19
Found 0 CSV files in data/survive_test_data/2024-02-20
Found 2 CSV files in data/survive_test_data/2024-02-21
Found 0 CSV files in data/survive_test_data/2024-02-22
Found 0 CSV files in data/survive_test_data/2024-02-24
Found 0 CSV files in data/survive_test_data/2024-02-23
Found 2 CSV files in data/survive_test_data/2024-02-26
Found 2 CSV files in data/survive_test_data/2024-02-25


Processing new dates:  17%|█▋        | 61/366 [00:00<00:02, 110.70it/s]

Found 2 CSV files in data/survive_test_data/2024-02-27
Found 2 CSV files in data/survive_test_data/2024-02-28
Found 2 CSV files in data/survive_test_data/2024-03-01
Found 2 CSV files in data/survive_test_data/2024-03-02
Found 2 CSV files in data/survive_test_data/2024-02-29
Found 2 CSV files in data/survive_test_data/2024-03-03
Found 2 CSV files in data/survive_test_data/2024-03-04
Found 0 CSV files in data/survive_test_data/2024-03-06
Found 2 CSV files in data/survive_test_data/2024-03-05
Found 0 CSV files in data/survive_test_data/2024-03-07
Found 2 CSV files in data/survive_test_data/2024-03-09
Found 2 CSV files in data/survive_test_data/2024-03-08
Found 2 CSV files in data/survive_test_data/2024-03-12
Found 2 CSV files in data/survive_test_data/2024-03-10
Found 2 CSV files in data/survive_test_data/2024-03-13
Found 2 CSV files in data/survive_test_data/2024-03-14
Found 1 CSV files in data/survive_test_data/2024-03-11
Found 0 CSV files in data/survive_test_data/2024-03-15
Found 2 CS

Processing new dates:  28%|██▊       | 102/366 [00:00<00:02, 131.90it/s]

Found 2 CSV files in data/survive_test_data/2024-03-19
Found 2 CSV files in data/survive_test_data/2024-03-20
Found 2 CSV files in data/survive_test_data/2024-03-21
Found 2 CSV files in data/survive_test_data/2024-03-22
Found 2 CSV files in data/survive_test_data/2024-03-25
Found 2 CSV files in data/survive_test_data/2024-03-23
Found 2 CSV files in data/survive_test_data/2024-03-26
Found 2 CSV files in data/survive_test_data/2024-03-24
Found 2 CSV files in data/survive_test_data/2024-03-28
Found 2 CSV files in data/survive_test_data/2024-03-27
Found 0 CSV files in data/survive_test_data/2024-03-30
Found 0 CSV files in data/survive_test_data/2024-04-02
Found 0 CSV files in data/survive_test_data/2024-03-29
Found 0 CSV files in data/survive_test_data/2024-04-01
Found 0 CSV files in data/survive_test_data/2024-04-03
Found 0 CSV files in data/survive_test_data/2024-04-05
Found 0 CSV files in data/survive_test_data/2024-04-04
Found 2 CSV files in data/survive_test_data/2024-03-31
Found 0 CS

Processing new dates:  37%|███▋      | 137/366 [00:01<00:01, 134.61it/s]

Found 2 CSV files in data/survive_test_data/2024-05-04
Found 2 CSV files in data/survive_test_data/2024-05-05
Found 2 CSV files in data/survive_test_data/2024-05-06
Found 2 CSV files in data/survive_test_data/2024-05-08
Found 2 CSV files in data/survive_test_data/2024-05-07
Found 2 CSV files in data/survive_test_data/2024-05-10
Found 2 CSV files in data/survive_test_data/2024-05-09
Found 2 CSV files in data/survive_test_data/2024-05-11
Found 2 CSV files in data/survive_test_data/2024-05-12
Found 2 CSV files in data/survive_test_data/2024-05-15
Found 2 CSV files in data/survive_test_data/2024-05-14
Found 2 CSV files in data/survive_test_data/2024-05-13
Found 2 CSV files in data/survive_test_data/2024-05-16
Found 2 CSV files in data/survive_test_data/2024-05-18
Found 2 CSV files in data/survive_test_data/2024-05-17
Found 2 CSV files in data/survive_test_data/2024-05-21
Found 2 CSV files in data/survive_test_data/2024-05-20
Found 2 CSV files in data/survive_test_data/2024-05-19
Found 2 CS

Processing new dates:  46%|████▌     | 169/366 [00:01<00:01, 145.07it/s]

Found 2 CSV files in data/survive_test_data/2024-06-01
Found 2 CSV files in data/survive_test_data/2024-06-02
Found 2 CSV files in data/survive_test_data/2024-06-03
Found 2 CSV files in data/survive_test_data/2024-06-04
Found 2 CSV files in data/survive_test_data/2024-06-05
Found 2 CSV files in data/survive_test_data/2024-06-06
Found 2 CSV files in data/survive_test_data/2024-06-07
Found 2 CSV files in data/survive_test_data/2024-06-08
Found 2 CSV files in data/survive_test_data/2024-06-09
Found 2 CSV files in data/survive_test_data/2024-06-10
Found 2 CSV files in data/survive_test_data/2024-06-12
Found 2 CSV files in data/survive_test_data/2024-06-11
Found 2 CSV files in data/survive_test_data/2024-06-13
Found 2 CSV files in data/survive_test_data/2024-06-14
Found 2 CSV files in data/survive_test_data/2024-06-15
Found 2 CSV files in data/survive_test_data/2024-06-16
Found 2 CSV files in data/survive_test_data/2024-06-17
Found 2 CSV files in data/survive_test_data/2024-06-18
Found 2 CS

Processing new dates:  58%|█████▊    | 211/366 [00:01<00:00, 168.19it/s]

Found 2 CSV files in data/survive_test_data/2024-07-12
Found 2 CSV files in data/survive_test_data/2024-07-13
Found 2 CSV files in data/survive_test_data/2024-07-14
Found 2 CSV files in data/survive_test_data/2024-07-15
Found 2 CSV files in data/survive_test_data/2024-07-16
Found 2 CSV files in data/survive_test_data/2024-07-17
Found 2 CSV files in data/survive_test_data/2024-07-18
Found 2 CSV files in data/survive_test_data/2024-07-19
Found 2 CSV files in data/survive_test_data/2024-07-20
Found 2 CSV files in data/survive_test_data/2024-07-21
Found 2 CSV files in data/survive_test_data/2024-07-22
Found 2 CSV files in data/survive_test_data/2024-07-23
Found 2 CSV files in data/survive_test_data/2024-07-24
Found 2 CSV files in data/survive_test_data/2024-07-25
Found 2 CSV files in data/survive_test_data/2024-07-26
Found 2 CSV files in data/survive_test_data/2024-07-27
Found 2 CSV files in data/survive_test_data/2024-07-28
Found 2 CSV files in data/survive_test_data/2024-07-29
Found 2 CS

Processing new dates:  63%|██████▎   | 229/366 [00:01<00:00, 157.85it/s]

Found 2 CSV files in data/survive_test_data/2024-08-16
Found 2 CSV files in data/survive_test_data/2024-08-15
Found 2 CSV files in data/survive_test_data/2024-08-17
Found 2 CSV files in data/survive_test_data/2024-08-18
Found 2 CSV files in data/survive_test_data/2024-08-19
Found 2 CSV files in data/survive_test_data/2024-08-20
Found 2 CSV files in data/survive_test_data/2024-08-21
Found 2 CSV files in data/survive_test_data/2024-08-22
Found 2 CSV files in data/survive_test_data/2024-08-23
Found 2 CSV files in data/survive_test_data/2024-08-24
Found 2 CSV files in data/survive_test_data/2024-08-25
Found 2 CSV files in data/survive_test_data/2024-08-26
Found 2 CSV files in data/survive_test_data/2024-08-27
Found 2 CSV files in data/survive_test_data/2024-08-29
Found 2 CSV files in data/survive_test_data/2024-08-28
Found 2 CSV files in data/survive_test_data/2024-08-30
Found 2 CSV files in data/survive_test_data/2024-08-31
Found 2 CSV files in data/survive_test_data/2024-09-02
Found 2 CS

Processing new dates:  71%|███████▏  | 261/366 [00:01<00:00, 134.15it/s]

Found 2 CSV files in data/survive_test_data/2024-09-04
Found 2 CSV files in data/survive_test_data/2024-09-05
Found 2 CSV files in data/survive_test_data/2024-09-06
Found 2 CSV files in data/survive_test_data/2024-09-08
Found 2 CSV files in data/survive_test_data/2024-09-07
Found 2 CSV files in data/survive_test_data/2024-09-09
Found 2 CSV files in data/survive_test_data/2024-09-10
Found 2 CSV files in data/survive_test_data/2024-09-11
Found 2 CSV files in data/survive_test_data/2024-09-12
Found 2 CSV files in data/survive_test_data/2024-09-13
Found 2 CSV files in data/survive_test_data/2024-09-14
Found 2 CSV files in data/survive_test_data/2024-09-15
Found 2 CSV files in data/survive_test_data/2024-09-16
Found 2 CSV files in data/survive_test_data/2024-09-17
Found 2 CSV files in data/survive_test_data/2024-09-18
Found 2 CSV files in data/survive_test_data/2024-09-19
Found 2 CSV files in data/survive_test_data/2024-09-20
Found 2 CSV files in data/survive_test_data/2024-09-21
Found 2 CS

Processing new dates:  79%|███████▉  | 289/366 [00:02<00:00, 131.99it/s]

Found 2 CSV files in data/survive_test_data/2024-10-05
Found 2 CSV files in data/survive_test_data/2024-10-06
Found 2 CSV files in data/survive_test_data/2024-10-07
Found 2 CSV files in data/survive_test_data/2024-10-08
Found 2 CSV files in data/survive_test_data/2024-10-10
Found 2 CSV files in data/survive_test_data/2024-10-09
Found 2 CSV files in data/survive_test_data/2024-10-11
Found 2 CSV files in data/survive_test_data/2024-10-12
Found 2 CSV files in data/survive_test_data/2024-10-13
Found 2 CSV files in data/survive_test_data/2024-10-14
Found 2 CSV files in data/survive_test_data/2024-10-15
Found 2 CSV files in data/survive_test_data/2024-10-16
Found 2 CSV files in data/survive_test_data/2024-10-17
Found 2 CSV files in data/survive_test_data/2024-10-18
Found 2 CSV files in data/survive_test_data/2024-10-19
Found 2 CSV files in data/survive_test_data/2024-10-20
Found 2 CSV files in data/survive_test_data/2024-10-21
Found 2 CSV files in data/survive_test_data/2024-10-22
Found 2 CS

Processing new dates:  87%|████████▋ | 319/366 [00:02<00:00, 136.85it/s]

Found 2 CSV files in data/survive_test_data/2024-11-02
Found 2 CSV files in data/survive_test_data/2024-11-03
Found 2 CSV files in data/survive_test_data/2024-11-04
Found 2 CSV files in data/survive_test_data/2024-11-06
Found 2 CSV files in data/survive_test_data/2024-11-05
Found 2 CSV files in data/survive_test_data/2024-11-07
Found 2 CSV files in data/survive_test_data/2024-11-09
Found 2 CSV files in data/survive_test_data/2024-11-08
Found 2 CSV files in data/survive_test_data/2024-11-10
Found 2 CSV files in data/survive_test_data/2024-11-11
Found 2 CSV files in data/survive_test_data/2024-11-13
Found 2 CSV files in data/survive_test_data/2024-11-12
Found 2 CSV files in data/survive_test_data/2024-11-14
Found 2 CSV files in data/survive_test_data/2024-11-15
Found 2 CSV files in data/survive_test_data/2024-11-16
Found 0 CSV files in data/survive_test_data/2024-11-18
Found 2 CSV files in data/survive_test_data/2024-11-17
Found 0 CSV files in data/survive_test_data/2024-11-19
Found 2 CS

Processing new dates:  95%|█████████▌| 349/366 [00:02<00:00, 135.05it/s]

Found 2 CSV files in data/survive_test_data/2024-12-02
Found 2 CSV files in data/survive_test_data/2024-12-03
Found 2 CSV files in data/survive_test_data/2024-12-04
Found 2 CSV files in data/survive_test_data/2024-12-06
Found 2 CSV files in data/survive_test_data/2024-12-05
Found 2 CSV files in data/survive_test_data/2024-12-07
Found 2 CSV files in data/survive_test_data/2024-12-08
Found 2 CSV files in data/survive_test_data/2024-12-09
Found 2 CSV files in data/survive_test_data/2024-12-10
Found 2 CSV files in data/survive_test_data/2024-12-11
Found 2 CSV files in data/survive_test_data/2024-12-12
Found 2 CSV files in data/survive_test_data/2024-12-14
Found 2 CSV files in data/survive_test_data/2024-12-13
Found 4 CSV files in data/survive_test_data/2024-12-15
Found 4 CSV files in data/survive_test_data/2024-12-16
Found 4 CSV files in data/survive_test_data/2024-12-17
Found 4 CSV files in data/survive_test_data/2024-12-18
Found 4 CSV files in data/survive_test_data/2024-12-19
Found 4 CS

Processing new dates: 100%|██████████| 366/366 [00:02<00:00, 129.97it/s]

Found 4 CSV files in data/survive_test_data/2024-12-24
Found 4 CSV files in data/survive_test_data/2024-12-25
Found 4 CSV files in data/survive_test_data/2024-12-26
Found 4 CSV files in data/survive_test_data/2024-12-27
Found 4 CSV files in data/survive_test_data/2024-12-28
Found 4 CSV files in data/survive_test_data/2024-12-29
Found 4 CSV files in data/survive_test_data/2024-12-30
Found 4 CSV files in data/survive_test_data/2024-12-31





Saved 1942 active shops to test_active_shops.csv
Saved 142 inactive shops to test_inactive_shops.csv


In [32]:
# Incremental run: load only the single day 2025-01-01 and merge it into
# whatever is already stored in the test_* CSVs.
active, inactive = update_survival_dfs(
    "data/survive_test_data",  # base_folder
    "2025-01-01",              # start_date (inclusive)
    "2025-01-01",              # end_date (inclusive)
    keywords=[],               # no file-name filter: read every CSV
    max_workers=8,
    active_file="test_active_shops.csv",
    inactive_file="test_inactive_shops.csv",
)

Loaded 1942 shops from test_active_shops.csv
Loaded 142 shops from test_inactive_shops.csv


Processing new dates: 100%|██████████| 1/1 [00:00<00:00, 200.89it/s]

Found 1 CSV files in data/survive_test_data/2025-01-01
Saved 1942 active shops to test_active_shops.csv
Saved 142 inactive shops to test_inactive_shops.csv





## implement on files in google drive

In [29]:
# Run on the real shopLst folders synced from Google Drive.
# The folder location is machine-specific, so it can be overridden with the
# FP_SHOPLST_DIR environment variable; when the variable is not set the
# original local path is used.
active, inactive = update_survival_dfs(
    base_folder=os.environ.get(
        "FP_SHOPLST_DIR",
        "/Users/yun/Library/CloudStorage/GoogleDrive-racoffee33@gmail.com/.shortcut-targets-by-id/1ouHUfwV9gjb5MilbqRsuT55HCukAcivErdpeotDNOF0iimcyQ3_uQTwHwmAdJB_CMVcuQmNG/FoodpandaUbereat/foodpanda_crawler_beta_2/shopLst",
    ),
    start_date="2023-07-14",  # first day to load (inclusive)
    end_date="2023-07-15",    # last day to load (inclusive)
    keywords=['shopLst'],     # only read CSVs whose file name contains 'shopLst'
    max_workers=8,
    active_file="active_shops.csv",
    inactive_file="inactive_shops.csv"
)

Processing new dates:   0%|          | 0/2 [00:00<?, ?it/s]

Found 543 CSV files in /Users/yun/Library/CloudStorage/GoogleDrive-racoffee33@gmail.com/.shortcut-targets-by-id/1ouHUfwV9gjb5MilbqRsuT55HCukAcivErdpeotDNOF0iimcyQ3_uQTwHwmAdJB_CMVcuQmNG/FoodpandaUbereat/foodpanda_crawler_beta_2/shopLst/2023-07-14
Found 543 CSV files in /Users/yun/Library/CloudStorage/GoogleDrive-racoffee33@gmail.com/.shortcut-targets-by-id/1ouHUfwV9gjb5MilbqRsuT55HCukAcivErdpeotDNOF0iimcyQ3_uQTwHwmAdJB_CMVcuQmNG/FoodpandaUbereat/foodpanda_crawler_beta_2/shopLst/2023-07-15


  return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
Processing new dates: 100%|██████████| 2/2 [00:04<00:00,  2.46s/it]


Saved 59498 active shops to active_shops.csv
Saved 0 inactive shops to inactive_shops.csv
