In [None]:
import pandas as pd
import os

# Data directory and the full paths of the three CSV files derived from it.
DATA_DIR = r"e:\BSK-SER\BSK-SER\data"
CITIZEN_FILE, PROVISION_FILE, FINAL_DF_FILE = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("ml_citizen_master.csv", "ml_provision.csv", "final_df.csv")
)

# Size ceiling, given in megabytes and converted to bytes (1 MB = 2**20 bytes).
TARGET_SIZE_MB = 45
BYTES_PER_MB = 1 << 20
TARGET_SIZE_BYTES = BYTES_PER_MB * TARGET_SIZE_MB

def truncate_file(file_path, target_size_bytes=None):
    """Shrink a CSV file in place by keeping only its leading rows.

    The first rows of the file are sampled to estimate the average size of a
    data row, the number of rows that fit in the size budget is derived from
    that estimate, and the file is rewritten with only those rows. Progress
    and errors are reported with ``print``; nothing is raised to the caller.

    Args:
        file_path: Path of the CSV file to truncate. The file is read and
            written with ``latin-1`` encoding.
        target_size_bytes: Maximum size of the rewritten file in bytes.
            Defaults to the module-level ``TARGET_SIZE_BYTES``.

    Returns:
        None. A missing file, or a file already within the limit, is left
        untouched.
    """
    if target_size_bytes is None:
        # Resolved at call time so the module constant stays the default.
        target_size_bytes = TARGET_SIZE_BYTES

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    file_size = os.path.getsize(file_path)
    file_size_mb = file_size / BYTES_PER_MB
    print(f"Processing {os.path.basename(file_path)}...")
    print(f"  Current Size: {file_size_mb:.2f} MB")

    if file_size <= target_size_bytes:
        print("  Size is already under limit. Skipping.")
        return

    # The truncated data is written next to the original and swapped in only
    # once it is complete, so a failed write cannot destroy the source file.
    temp_path = f"{file_path}.tmp"
    try:
        # Sample up to 1000 rows to get the average bytes per data row.
        sample_df = pd.read_csv(file_path, nrows=1000, encoding='latin-1')
        sample_rows = len(sample_df)
        if sample_rows == 0:
            print("  No data rows found. Skipping.")
            return

        # latin-1 is one byte per character, so string length == byte count.
        header_size = len(sample_df.iloc[:0].to_csv(index=False))
        sample_size = len(sample_df.to_csv(index=False))

        # Divide by the rows actually read (the file may hold fewer than
        # 1000) and keep the header out of the per-row average.
        avg_bytes_per_row = (sample_size - header_size) / sample_rows
        row_budget = target_size_bytes - header_size
        target_rows = max(int(row_budget / avg_bytes_per_row), 0)
        print(f"  Estimated target rows: {target_rows}")

        # Read only the target number of rows
        # NOTE(review): values are re-serialised by pandas, so their textual
        # form may differ from the source file — confirm this is acceptable.
        df = pd.read_csv(file_path, nrows=target_rows, encoding='latin-1')

        # The estimate comes from the first rows only; if later rows are
        # larger the output can still overshoot, so shrink until it fits.
        while True:
            df.to_csv(temp_path, index=False, encoding='latin-1')
            new_size = os.path.getsize(temp_path)
            if new_size <= target_size_bytes or df.empty:
                break
            # Scale proportionally with a 1% margin; always drop >= 1 row so
            # the loop is guaranteed to terminate.
            keep_rows = int(len(df) * target_size_bytes / new_size * 0.99)
            df = df.iloc[:min(keep_rows, len(df) - 1)]

        # Swap the finished file into place in a single step.
        os.replace(temp_path, file_path)

        new_size_mb = new_size / BYTES_PER_MB
        print(f"  Truncated Size: {new_size_mb:.2f} MB")

    except Exception as e:
        # Best-effort tool: report the problem and move on to the next file.
        print(f"  Error processing file: {e}")
    finally:
        # Remove a leftover partial file if the swap never happened.
        if os.path.exists(temp_path):
            os.remove(temp_path)

# Apply the size cap to each data file in turn.
for csv_path in (CITIZEN_FILE, PROVISION_FILE, FINAL_DF_FILE):
    truncate_file(csv_path)
   