In [1]:
import os
import pandas as pd

# =========================
# SETTINGS
# =========================
# Folder scanned for source CSVs, and folder that receives the tagged copies.
INPUT_DIR = r"C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\original\geotrace\security"
OUTPUT_DIR = r"C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\original\geotrace\privacy"
NUM_COPIES = 5     # how many copies are written per input CSV
COPY_COL = "copy"  # column holding the 1-based copy number

# The output folder may not exist yet; creating it is a no-op when it does.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Only files whose name ends in ".csv" (case-insensitive) are augmented.
csv_names = [name for name in os.listdir(INPUT_DIR) if name.lower().endswith(".csv")]

for filename in csv_names:
    df = pd.read_csv(os.path.join(INPUT_DIR, filename))
    base_name, _extension = os.path.splitext(filename)

    # Write <base>_copy1.csv ... <base>_copyN.csv; each copy is the source
    # frame plus COPY_COL set to the copy number.
    for i in range(1, NUM_COPIES + 1):
        output_path = os.path.join(OUTPUT_DIR, f"{base_name}_copy{i}.csv")
        df.assign(**{COPY_COL: i}).to_csv(output_path, index=False)

print(f"Augmentation complete. Generated {NUM_COPIES} copies per CSV.")


Augmentation complete. Generated 5 copies per CSV.


In [4]:
import os
import glob
import pandas as pd

# ============================================================
# USER INPUT
# ============================================================
# Root folder. Each entry of MECHS is looked up as a direct subfolder of it,
# and each entry of EPS_FOLDERS as a subfolder of that mechanism folder.
BASE_DIR = r"C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\distorted_only_privacy\geotrace\delta_3_Noisy_threshold_5\perturbed_1"

MECHS = ["plm", "psm", "psmi"]          # mechanism subfolders of BASE_DIR
EPS_FOLDERS = ["eps01", "eps05"]       # eps subfolders to process (add more if needed); missing ones are skipped with a message

# SORT_FILES controls the order in which merge_csvs_in_folder reads the CSVs:
# - True: paths are sorted lexicographically before reading. This matches numeric
#   order only when numbers are zero-padded (e.g., user_001.csv < user_010.csv);
#   unpadded names sort as text, so user_10.csv comes before user_2.csv.
# - False: paths are read in whatever order glob.glob returns them.
SORT_FILES = True


def merge_csvs_in_folder(folder_path: str) -> pd.DataFrame:
    """Stack every ``*.csv`` found directly inside ``folder_path`` into one frame.

    Files are read in lexicographic path order when the module-level
    ``SORT_FILES`` flag is true, otherwise in the order ``glob`` returns them.
    Rows are concatenated top to bottom and the index is renumbered from 0.

    Returns an empty ``DataFrame`` when the folder contains no CSV files.
    """
    pattern = os.path.join(folder_path, "*.csv")
    matches = glob.glob(pattern)
    if len(matches) == 0:
        return pd.DataFrame()

    ordered_paths = sorted(matches) if SORT_FILES else matches
    frames = [pd.read_csv(path) for path in ordered_paths]
    return pd.concat(frames, axis=0, ignore_index=True)


# Merge the CSVs of every <mechanism>/<eps> folder into a single CSV that is
# written to a dedicated "merged_..." subfolder of the mechanism folder.
for mech in MECHS:
    mech_dir = os.path.join(BASE_DIR, mech)

    for eps_folder in EPS_FOLDERS:
        eps_dir = os.path.join(mech_dir, eps_folder)

        if os.path.isdir(eps_dir):
            merged_df = merge_csvs_in_folder(eps_dir)

            if merged_df.empty:
                # Nothing to write for this combination.
                print(f"[SKIP] No CSV files in: {eps_dir}")
            else:
                # Folder and file share one name, e.g.
                # ...\plm\merged_geotrace_plm_delta_3_Noisy_threshold_5_eps01\
                #         merged_geotrace_plm_delta_3_Noisy_threshold_5_eps01.csv
                out_folder_name = f"merged_geotrace_{mech}_delta_3_Noisy_threshold_5_{eps_folder}"
                out_dir = os.path.join(mech_dir, out_folder_name)
                os.makedirs(out_dir, exist_ok=True)

                out_csv_path = os.path.join(out_dir, f"{out_folder_name}.csv")
                merged_df.to_csv(out_csv_path, index=False)
                print(f"[OK] Merged {mech}/{eps_folder}: {len(merged_df)} rows -> {out_csv_path}")
        else:
            print(f"[SKIP] Missing folder: {eps_dir}")

print("Done.")


[OK] Merged plm/eps01: 63771 rows -> C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\distorted_only_privacy\geotrace\delta_3_Noisy_threshold_5\perturbed_1\plm\merged_geotrace_plm_delta_3_Noisy_threshold_5_eps01\merged_geotrace_plm_delta_3_Noisy_threshold_5_eps01.csv
[SKIP] Missing folder: C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\distorted_only_privacy\geotrace\delta_3_Noisy_threshold_5\perturbed_1\plm\eps05
[OK] Merged psm/eps01: 63771 rows -> C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\distorted_only_privacy\geotrace\delta_3_Noisy_threshold_5\perturbed_1\psm\merged_geotrace_psm_delta_3_Noisy_threshold_5_eps01\merged_geotrace_psm_delta_3_Noisy_threshold_5_eps01.csv
[SKIP] Missing folder: C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\distorted_only_privacy\geotrace\delta_3_Noisy_threshold_5\perturbed_1\psm\eps05
[OK] Merged psmi/eps01: 63771 rows -> C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\distorted_only_privacy\geotrace\delta_3_Noisy_threshold_5\perturbed_1\psmi\merged_geotra

In [5]:
import os
import glob
import re
import pandas as pd

# ============================================================
# USER INPUT
# ============================================================
# Root folder. Each entry of MECHS is looked up as a direct subfolder of it,
# and each entry of EPS_FOLDERS as a subfolder of that mechanism folder.
BASE_DIR = r"C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\distorted_only_privacy\geotrace\delta_3_Noisy_threshold_5\perturbed_1"

# Mechanism subfolders to process; a missing one is skipped with a message.
MECHS = ["plm", "psm", "psmi"]
EPS_FOLDERS = ["eps01", "eps05"]  # add more if needed; missing eps folders are skipped with a message

# True: order the CSVs with natural_key before merging.
# False: fall back to plain lexicographic sorting of the paths.
USE_NATURAL_SORT = True


def natural_key(path: str):
    """Build a sort key that orders file names "naturally" (``u2`` before ``u10``).

    Only the final path component is considered. It is split into alternating
    text and digit runs; text runs are lower-cased (case-insensitive ordering)
    and digit runs are converted to ``int`` so they compare numerically.

    Args:
        path: File path or bare file name.

    Returns:
        A list alternating ``str`` and ``int`` items, always starting with a
        (possibly empty) ``str``. Because every key has the same type at the
        same position, two keys can always be compared.
    """
    name = os.path.basename(path)
    # With a capturing group, re.split returns [text, digits, text, digits, ...]:
    # odd positions are exactly the r"\d+" matches. Classifying by position
    # instead of str.isdigit() avoids int() failing on characters such as "²",
    # which isdigit() accepts but neither r"\d" nor int() does.
    pieces = re.split(r"(\d+)", name)
    return [
        int(piece) if position % 2 else piece.lower()
        for position, piece in enumerate(pieces)
    ]


def merge_csvs_in_folder(folder_path: str) -> pd.DataFrame:
    """Concatenate all ``*.csv`` files located directly in ``folder_path``.

    The read order is decided by the module-level ``USE_NATURAL_SORT`` flag:
    ``natural_key`` ordering when it is true, plain lexicographic ordering of
    the paths otherwise. The result has a fresh 0-based index.

    Returns an empty ``DataFrame`` when no CSV file is found.
    """
    found = glob.glob(os.path.join(folder_path, "*.csv"))
    if len(found) == 0:
        return pd.DataFrame()

    # key=None makes sorted() compare the path strings themselves.
    sort_key = natural_key if USE_NATURAL_SORT else None
    frames = []
    for csv_path in sorted(found, key=sort_key):
        frames.append(pd.read_csv(csv_path))
    return pd.concat(frames, axis=0, ignore_index=True)


# Merge the CSVs of every <mechanism>/<eps> folder and write the result as a
# single file placed directly in the mechanism folder (no extra subfolder).
for mech in MECHS:
    mech_dir = os.path.join(BASE_DIR, mech)

    if os.path.isdir(mech_dir):
        for eps_folder in EPS_FOLDERS:
            eps_dir = os.path.join(mech_dir, eps_folder)

            if os.path.isdir(eps_dir):
                merged_df = merge_csvs_in_folder(eps_dir)

                if merged_df.empty:
                    # Nothing to write for this combination.
                    print(f"[SKIP] No CSV files in: {eps_dir}")
                else:
                    out_name = f"merged_geotrace_{mech}_delta_3_Noisy_threshold_5_{eps_folder}.csv"
                    out_csv_path = os.path.join(mech_dir, out_name)

                    merged_df.to_csv(out_csv_path, index=False)
                    print(f"[OK] {mech}/{eps_folder}: {len(merged_df)} rows -> {out_csv_path}")
            else:
                print(f"[SKIP] Missing eps folder: {eps_dir}")
    else:
        print(f"[SKIP] Missing mech folder: {mech_dir}")

print("Done.")


[OK] plm/eps01: 63771 rows -> C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\distorted_only_privacy\geotrace\delta_3_Noisy_threshold_5\perturbed_1\plm\merged_geotrace_plm_delta_3_Noisy_threshold_5_eps01.csv
[SKIP] Missing eps folder: C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\distorted_only_privacy\geotrace\delta_3_Noisy_threshold_5\perturbed_1\plm\eps05
[OK] psm/eps01: 63771 rows -> C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\distorted_only_privacy\geotrace\delta_3_Noisy_threshold_5\perturbed_1\psm\merged_geotrace_psm_delta_3_Noisy_threshold_5_eps01.csv
[SKIP] Missing eps folder: C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\distorted_only_privacy\geotrace\delta_3_Noisy_threshold_5\perturbed_1\psm\eps05
[OK] psmi/eps01: 63771 rows -> C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\distorted_only_privacy\geotrace\delta_3_Noisy_threshold_5\perturbed_1\psmi\merged_geotrace_psmi_delta_3_Noisy_threshold_5_eps01.csv
[SKIP] Missing eps folder: C:\Users\ss6365\Desktop\PrivAR_PSM_PSM_I\data\

In [None]:
Now I have this merged CSV file and I want to do some machine learning on it. The CSV file has many columns, but the ones to discretize are latitude and longitude.

Basically, latitude and longitude are perturbed into perturbed_latitude and perturbed_longitude.

So I want to discretize (latitude, longitude) into a location_id, i.e. from 2D to 1D. My machine learning task is then to map location_id to the perturbed latitude and longitude.

To do this, discretize latitude and longitude into a location ID and add that ID as a new column. Note the trade-off: if the gridding is too coarse, many different location pairs fall into the same location ID; if it is too fine, the opposite happens.

def lat_lon_to_grid_pos(lat, lon, min_lat, max_lat, min_lon, max_lon, grid_size):
    """Map one (lat, lon) point to a single cell id on a grid_size x grid_size grid.

    The bounding box [min_lat, max_lat] x [min_lon, max_lon] is divided into
    ``grid_size`` equal-width steps along each axis. The returned id is
    zero-based and row-major: ``lat_pos * grid_size + lon_pos``, so it lies in
    ``0 .. grid_size**2 - 1``.

    Args:
        lat, lon: Coordinates of the point to encode.
        min_lat, max_lat: Latitude bounds of the grid (expects min <= max).
        min_lon, max_lon: Longitude bounds of the grid (expects min <= max).
        grid_size: Number of cells along each axis; must be >= 1.

    Returns:
        int: The cell id. Points outside the bounding box are clamped to the
        nearest edge cell; an axis whose bounds coincide maps to index 0.

    Raises:
        ValueError: If ``grid_size`` is smaller than 1.
    """
    if grid_size < 1:
        raise ValueError("grid_size must be a positive integer")

    def _axis_index(value, lower, upper):
        # Cell index of `value` along one axis, clamped to [0, grid_size - 1].
        span = upper - lower
        if span == 0:
            # Zero-width axis: every point shares the single cell 0
            # (dividing by the step would raise ZeroDivisionError).
            return 0
        step = span / grid_size
        index = int((value - lower) / step)
        # Upper clamp: value == upper would otherwise land in cell grid_size.
        # Lower clamp: value < lower would otherwise give a negative index and
        # therefore a negative or colliding cell id.
        return max(0, min(index, grid_size - 1))

    lat_pos = _axis_index(lat, min_lat, max_lat)
    lon_pos = _axis_index(lon, min_lon, max_lon)

    # Convert the 2D grid position to a single integer (row-major).
    return lat_pos * grid_size + lon_pos

def encode_locations(df, num_lat_bins, num_lon_bins):
    """
    Discretize ``latitude``/``longitude`` into a 1-D ``location_id`` on a fixed grid.

    The grid spans the observed min/max of each coordinate and is split into
    ``num_lat_bins`` x ``num_lon_bins`` equal-width cells. Three columns are
    written to ``df`` IN PLACE (the same object is also returned):

    * ``lat_bin``: zero-based bin index along latitude
    * ``lon_bin``: zero-based bin index along longitude
    * ``location_id``: ``lat_bin * num_lon_bins + lon_bin + 1``, i.e. a
      1-based, row-major cell number in ``1 .. num_lat_bins * num_lon_bins``

    Fewer bins give a coarser grid (more distinct coordinate pairs share one
    id); more bins give a finer grid.

    Args:
        df: DataFrame with numeric ``latitude`` and ``longitude`` columns.
        num_lat_bins: Number of bins along latitude; must be >= 1.
        num_lon_bins: Number of bins along longitude; must be >= 1.

    Returns:
        The same ``df`` with the three columns added. Rows with a missing
        coordinate get NaN in the affected columns (which makes those columns
        float).

    Raises:
        ValueError: If either bin count is smaller than 1.
        KeyError: If ``latitude`` or ``longitude`` is missing from ``df``.
    """
    if num_lat_bins < 1 or num_lon_bins < 1:
        raise ValueError("num_lat_bins and num_lon_bins must both be >= 1")

    df['lat_bin'] = _equal_width_bin_index(df['latitude'], num_lat_bins)
    df['lon_bin'] = _equal_width_bin_index(df['longitude'], num_lon_bins)

    # Row-major flattening of (lat_bin, lon_bin); the +1 makes ids start at 1.
    df['location_id'] = df['lat_bin'] * num_lon_bins + df['lon_bin'] + 1

    return df


def _equal_width_bin_index(values, num_bins):
    """Return the zero-based equal-width bin index of each value over its min..max range."""
    # Local import: numpy is not imported by the notebook cells visible above.
    import numpy as np

    low, high = values.min(), values.max()
    if not high > low:
        # Constant column (or no valid values at all): the bin edges would all
        # coincide and pd.cut would reject them, so put every valid value in
        # bin 0 and keep missing values missing.
        zero_bins = pd.Series(0, index=values.index, dtype="int64")
        present = values.notna()
        return zero_bins if present.all() else zero_bins.where(present)

    edges = np.linspace(low, high, num_bins + 1)
    # Bins are right-closed; include_lowest keeps the minimum value in bin 0.
    return pd.cut(values, bins=edges, labels=False, include_lowest=True)

This is some code that I have, which may be wrong; your task is to give me the accurate code.