In [20]:
import pandas as pd
import numpy as np
from tqdm import tqdm  # For progress bar
import os

# ---------------------- Configuration ----------------------

# Paths to your CSV files
DATABASE_PATH = '5_alteast4_movielens_data.csv'
AUX_DATA_PATH = '5_percent_mldataset_intersected_adversary_data_for.csv'
WEIGHTS_PATH = '5_movie_weights.csv'

# Output directories for results
OUTPUT_DIR_UNIFORM = 'deanonymization_results_uniform'
OUTPUT_DIR_NO_TOP_100 = 'deanonymization_results_no_top_100'
OUTPUT_DIR_NO_TOP_500 = 'deanonymization_results_no_top_500'
# Ensure output directories exist
os.makedirs(OUTPUT_DIR_UNIFORM, exist_ok=True)
os.makedirs(OUTPUT_DIR_NO_TOP_100, exist_ok=True)
os.makedirs(OUTPUT_DIR_NO_TOP_500, exist_ok=True)



# Batch size for processing ml users
BATCH_SIZE = 1000

# ---------------------- Load Datasets ----------------------

print("Loading datasets...")
database = pd.read_csv(DATABASE_PATH)
aux_data = pd.read_csv(AUX_DATA_PATH)
weights_df = pd.read_csv(WEIGHTS_PATH)



# Get unique 'imdbId' values from both dataframes
unique_aux_ids = set(aux_data['imdbId'].unique())
unique_ml_ids = set(database['imdbId'].unique())

# Check if the difference is zero
if unique_aux_ids - unique_ml_ids == set():
    print("All GOOD,no extra movied id found")
else:
    print("adv is not a subset of data")
    
# Drop rows with empty 'rating_value' in aux_data and reset index
aux_data = aux_data.dropna(subset=['rating_value']).reset_index(drop=True)

# Half the ratings as MovieLens scale is 0-5
aux_data['rating_value'] = aux_data['rating_value'] / 2

# ---------------------- Preprocessing ----------------------

# Convert 'imdbId' to strings to ensure consistency
database['imdbId'] = database['imdbId'].astype(str)
aux_data['imdbId'] = aux_data['imdbId'].astype(str)

# Rename 'userId' columns to avoid confusion
database = database.rename(columns={'userId': 'ml_userId'})
aux_data = aux_data.rename(columns={'userId': 'aux_userId'})

# Convert weights to a dictionary for fast lookup
movie_weights = dict(zip(weights_df['imdbId'].astype(str), weights_df['weight']))


# ---------------------- Define Filtering Function ----------------------

def filter_top_n_movies(database, aux_data, n=500):
    """
    Filters out the top N most rated movies from both MovieLens and auxiliary datasets.

    Parameters:
    - database (DataFrame): MovieLens dataset.
    - aux_data (DataFrame): Auxiliary adversary dataset.
    - n (int): Number of top movies to exclude.

    Returns:
    - Tuple of filtered (database, aux_data, top_movies)
    """
    print(f"Excluding top {n} most popular movies...")

    # Identify top N movies based on MovieLens dataset
    top_movies = database['imdbId'].value_counts().head(n).index.tolist()
    # print(f"Top {n} movies to exclude: {top_movies}")

    # Filter out top N movies from both datasets
    filtered_database = database[~database['imdbId'].isin(top_movies)].reset_index(drop=True)
    filtered_aux_data = aux_data[~aux_data['imdbId'].isin(top_movies)].reset_index(drop=True)

    print(f"Database size after filtering: {filtered_database.shape}")
    print(f"Auxiliary data size after filtering: {filtered_aux_data.shape}")

    return filtered_database, filtered_aux_data, top_movies

# ---------------------- Apply Filtering ----------------------

# Choose which output directory to use based on exclusion
# Here, we're focusing on excluding top 500 movies
EXCLUDE_TOP_N = 500
database, aux_data, top_movies = filter_top_n_movies(database, aux_data, n=EXCLUDE_TOP_N)



# ---------------------- Create imdbId to aux_userIds Mapping ----------------------

print("Creating imdbId to aux_userIds mapping...")
# Group aux_data by 'imdbId' and aggregate 'aux_userId' into sets for fast lookup
imdb_to_aux_users = aux_data.groupby('imdbId')['aux_userId'].apply(set).to_dict()
print(f"Total unique movies in aux data: {len(imdb_to_aux_users)}")

# ---------------------- Get Unique ml_userIds ----------------------

ml_user_ids = database['ml_userId'].unique()
total_ml_users = len(ml_user_ids)
print(f"Total unique MovieLens users: {total_ml_users}")

# ---------------------- Processing Function ----------------------

# def process_ml_users_in_batches(database, ml_user_ids, output_dir, exclude_top_n=0):
#     """
#     Process MovieLens users in batches, finding auxiliary users who have rated at least
#     one common movie, and save the results to CSV files.
#     """
#     # Optionally filter out the top N most popular movies
#     if exclude_top_n > 0:
#         print(f"Excluding top {exclude_top_n} most popular movies...")
#         top_movies = database['imdbId'].value_counts().head(exclude_top_n).index
#         database = database[~database['imdbId'].isin(top_movies)]

#     # Iterate over batches of MovieLens users
#     for i in range(0, len(ml_user_ids), BATCH_SIZE):
#         batch_ml_user_ids = ml_user_ids[i:i + BATCH_SIZE]
#         batch_start = i
#         batch_end = i + len(batch_ml_user_ids) - 1
#         print(f"\nProcessing MovieLens users {batch_start} to {batch_end} (Batch size: {len(batch_ml_user_ids)})")

#         # Filter database for the current batch of ml users
#         ml_batch = database[database['ml_userId'].isin(batch_ml_user_ids)]

#         # Group by 'ml_userId' and aggregate 'imdbId' into sets
#         ml_user_imdbs = ml_batch.groupby('ml_userId')['imdbId'].apply(set).to_dict()

#         # Initialize results list
#         results = []

#         # Iterate over each ml user in the batch
#         for ml_user_id, ml_imdb_set in tqdm(ml_user_imdbs.items(), desc="Processing ml users in batch"):
#             # Initialize a set to collect aux_userIds who have rated at least one common movie
#             aux_users_set = set()

#             # Iterate over each imdbId rated by the ml user
#             for imdb_id in ml_imdb_set:
#                 # Get aux_userIds who have rated this imdbId
#                 aux_user_ids = imdb_to_aux_users.get(imdb_id, set())
#                 aux_users_set.update(aux_user_ids)

#             # Convert aux_users_set to a sorted list for consistency
#             aux_users_list = sorted(aux_users_set)

#             # Store the result
#             results.append({
#                 'ml_user_id': ml_user_id,
#                 'aux_users': ','.join(map(str, aux_users_list))  # Convert list to comma-separated string
#             })

#         # Create a DataFrame from the results
#         batch_results_df = pd.DataFrame(results)

#         # Define the output filename
#         output_filename = f'batch_{batch_start}_to_{batch_end}_results.csv'
#         output_filepath = os.path.join(output_dir, output_filename)

#         # Save the batch results to CSV
#         batch_results_df.to_csv(output_filepath, index=False)
#         print(f"Saved results for MovieLens users {batch_start} to {batch_end} to '{output_filepath}'")

#         # Clear results to free memory
#         del results, batch_results_df, ml_batch, ml_user_imdbs




# ---------------------- Processing Function ----------------------

def process_ml_users_in_batches(database, ml_user_ids, output_dir, exclude_top_n=0):
    """
    Process MovieLens users in batches, finding auxiliary users who have rated at least
    one common movie, and save the results to CSV files.

    Parameters:
    - database (DataFrame): Filtered MovieLens dataset.
    - ml_user_ids (array-like): Unique MovieLens user IDs.
    - output_dir (str): Directory to save batch results.
    - exclude_top_n (int): Number of top movies to exclude (already handled).
    """
    # Since top N movies are already excluded before calling this function, no need to exclude again.
    # This parameter can be retained for flexibility or removed if unnecessary.

    # Iterate over batches of MovieLens users
    for i in range(0, len(ml_user_ids), BATCH_SIZE):
        batch_ml_user_ids = ml_user_ids[i:i + BATCH_SIZE]
        batch_start = i
        batch_end = i + len(batch_ml_user_ids) - 1
        print(f"\nProcessing MovieLens users {batch_start} to {batch_end} (Batch size: {len(batch_ml_user_ids)})")

        # Filter database for the current batch of ml users
        ml_batch = database[database['ml_userId'].isin(batch_ml_user_ids)]

        # Group by 'ml_userId' and aggregate 'imdbId' into sets
        ml_user_imdbs = ml_batch.groupby('ml_userId')['imdbId'].apply(set).to_dict()

        # Initialize results list
        results = []

        # Iterate over each ml user in the batch
        for ml_user_id, ml_imdb_set in tqdm(ml_user_imdbs.items(), desc="Processing ml users in batch"):
            # Initialize a set to collect aux_userIds who have rated at least one common movie
            aux_users_set = set()

            # Iterate over each imdbId rated by the ml user
            for imdb_id in ml_imdb_set:
                # Get aux_userIds who have rated this imdbId
                aux_user_ids = imdb_to_aux_users.get(imdb_id, set())
                aux_users_set.update(aux_user_ids)

            # Convert aux_users_set to a sorted list for consistency
            aux_users_list = sorted(aux_users_set)

            # Store the result
            results.append({
                'ml_user_id': ml_user_id,
                'aux_users': ','.join(map(str, aux_users_list))  # Convert list to comma-separated string
            })

        # Create a DataFrame from the results
        batch_results_df = pd.DataFrame(results)

        # Define the output filename
        output_filename = f'batch_{batch_start}_to_{batch_end}_results.csv'
        output_filepath = os.path.join(output_dir, output_filename)

        # Save the batch results to CSV
        batch_results_df.to_csv(output_filepath, index=False)
        print(f"Saved results for MovieLens users {batch_start} to {batch_end} to '{output_filepath}'")

        # Clear results to free memory
        del results, batch_results_df, ml_batch, ml_user_imdbs



# ---------------------- Main Execution ----------------------

# print("Starting batch processing of MovieLens users (Uniform)...")
# process_ml_users_in_batches(database, ml_user_ids, OUTPUT_DIR_UNIFORM)

# print("Starting batch processing of MovieLens users (Without Top 100 Movies)...")
# process_ml_users_in_batches(database, ml_user_ids, OUTPUT_DIR_NO_TOP_100, exclude_top_n=100)

print("Starting batch processing of MovieLens users (Without Top 500 Movies)...")
process_ml_users_in_batches(database, ml_user_ids, OUTPUT_DIR_NO_TOP_500, exclude_top_n=500)

print("\nBatch processing completed successfully.")


Loading datasets...
All GOOD,no extra movied id found
Excluding top 500 most popular movies...
Top 500 movies to exclude: ['111161', '109830', '110912', '133093', '102926', '76759', '108052', '137523', '107290', '120737', '112573', '167260', '80684', '167261', '114814', '114709', '103064', '68646', '86190', '1375666', '468569', '82971', '169547', '114369', '106977', '112384', '116282', '88763', '116629', '120815', '172495', '114746', '209144', '167404', '96895', '103639', '99348', '110357', '126029', '111503', '119217', '325980', '111257', '109040', '211915', '93779', '816692', '198781', '73486', '71562', '338013', '266543', '101414', '120338', '266697', '117060', '83658', '100405', '71853', '910970', '97576', '88247', '112864', '372784', '1049413', '110413', '95016', '99685', '111282', '317705', '105236', '268978', '120586', '110475', '107048', '119654', '78748', '109444', '107614', '109686', '112431', '112462', '117998', '371746', '499549', '113189', '1853728', '120689', '120382', '1

Processing ml users in batch: 100%|██████████| 1000/1000 [00:02<00:00, 409.90it/s]


Saved results for MovieLens users 0 to 999 to 'deanonymization_results_no_top_500\batch_0_to_999_results.csv'

Processing MovieLens users 1000 to 1999 (Batch size: 1000)


Processing ml users in batch: 100%|██████████| 1000/1000 [00:02<00:00, 470.47it/s]


Saved results for MovieLens users 1000 to 1999 to 'deanonymization_results_no_top_500\batch_1000_to_1999_results.csv'

Processing MovieLens users 2000 to 2999 (Batch size: 1000)


Processing ml users in batch: 100%|██████████| 1000/1000 [00:02<00:00, 404.56it/s]


Saved results for MovieLens users 2000 to 2999 to 'deanonymization_results_no_top_500\batch_2000_to_2999_results.csv'

Processing MovieLens users 3000 to 3999 (Batch size: 1000)


Processing ml users in batch: 100%|██████████| 1000/1000 [00:02<00:00, 437.78it/s]


Saved results for MovieLens users 3000 to 3999 to 'deanonymization_results_no_top_500\batch_3000_to_3999_results.csv'

Processing MovieLens users 4000 to 4999 (Batch size: 1000)


Processing ml users in batch: 100%|██████████| 1000/1000 [00:02<00:00, 466.87it/s]


Saved results for MovieLens users 4000 to 4999 to 'deanonymization_results_no_top_500\batch_4000_to_4999_results.csv'

Processing MovieLens users 5000 to 5999 (Batch size: 1000)


Processing ml users in batch: 100%|██████████| 1000/1000 [00:02<00:00, 431.73it/s]


Saved results for MovieLens users 5000 to 5999 to 'deanonymization_results_no_top_500\batch_5000_to_5999_results.csv'

Processing MovieLens users 6000 to 6999 (Batch size: 1000)


Processing ml users in batch: 100%|██████████| 1000/1000 [00:02<00:00, 458.80it/s]


Saved results for MovieLens users 6000 to 6999 to 'deanonymization_results_no_top_500\batch_6000_to_6999_results.csv'

Processing MovieLens users 7000 to 7518 (Batch size: 519)


Processing ml users in batch: 100%|██████████| 519/519 [00:01<00:00, 470.08it/s]


Saved results for MovieLens users 7000 to 7518 to 'deanonymization_results_no_top_500\batch_7000_to_7518_results.csv'

Batch processing completed successfully.


In [21]:
# Step 1: Create a Unified Set of imdbId Values
all_imdbIds = pd.unique(pd.concat([database['imdbId'], aux_data['imdbId']]))
all_imdbIds = sorted(all_imdbIds)  # Ensure consistent ordering
num_movies = len(all_imdbIds)
print(f"Total unique movies across both datasets: {num_movies}")


Total unique movies across both datasets: 5916


In [22]:
# Step 2: Map imdbId to Column Indice
imdbId_to_col_idx = {imdbId: idx for idx, imdbId in enumerate(all_imdbIds)}


In [23]:
# Step 3a: Map Users to Row Indices
ml_userIds = sorted(database['ml_userId'].unique())
ml_userId_to_row_idx = {userId: idx for idx, userId in enumerate(ml_userIds)}
num_ml_users = len(ml_userIds)
print(f"Total unique MovieLens users: {num_ml_users}")

Total unique MovieLens users: 7519


In [24]:
# Step 3b: Map Users to Row Indices

aux_userIds = sorted(aux_data['aux_userId'].unique())
aux_userId_to_row_idx = {userId: idx for idx, userId in enumerate(aux_userIds)}
num_aux_users = len(aux_userIds)
print(f"Total unique auxiliary users: {num_aux_users}")


Total unique auxiliary users: 706585


In [25]:
from scipy.sparse import coo_matrix

# Prepare data for the ratings matrix
database_rows = []
database_cols = []
database_ratings_data = []
database_timestamps_data = []

for idx, row in tqdm(database.iterrows(), total=database.shape[0], desc='Processing database ratings'):
    userId = row['ml_userId']
    imdbId = row['imdbId']
    rating = row['rating']
    timestamp = row['timestamp']
    
    row_idx = ml_userId_to_row_idx[userId]
    col_idx = imdbId_to_col_idx[imdbId]
    
    database_rows.append(row_idx)
    database_cols.append(col_idx)
    database_ratings_data.append(rating)
    database_timestamps_data.append(timestamp)

database_ratings_matrix = coo_matrix((database_ratings_data, (database_rows, database_cols)),
                                     shape=(num_ml_users, num_movies)).tocsr()

database_timestamps_matrix = coo_matrix((database_timestamps_data, (database_rows, database_cols)),
                                        shape=(num_ml_users, num_movies)).tocsr()

Processing database ratings: 100%|██████████| 217713/217713 [00:07<00:00, 28988.45it/s]


In [26]:
# Convert 'rating_value' to numeric, setting errors to NaN
aux_data['rating_value'] = pd.to_numeric(aux_data['rating_value'], errors='coerce')



In [27]:
aux_rows = []
aux_cols = []
aux_ratings_data = []
aux_timestamps_data = []
for idx, row in tqdm(aux_data.iterrows(), total=aux_data.shape[0], desc='Processing aux ratings'):
    userId = row['aux_userId']
    imdbId = row['imdbId']
    rating = row['rating_value']
    timestamp = row['review_date_epoch']
    
    if pd.isna(rating):
        continue  # Skip entries without valid ratings
    
    row_idx = aux_userId_to_row_idx[userId]
    col_idx = imdbId_to_col_idx.get(imdbId)
    if col_idx is None:
        continue  # Should not happen if we've combined all imdbIds
    
    aux_rows.append(row_idx)
    aux_cols.append(col_idx)
    aux_ratings_data.append(rating)
    aux_timestamps_data.append(timestamp)

aux_ratings_matrix = coo_matrix((aux_ratings_data, (aux_rows, aux_cols)),
                                shape=(num_aux_users, num_movies)).tocsr()


Processing aux ratings: 100%|██████████| 1484750/1484750 [00:52<00:00, 28022.11it/s]


In [28]:
aux_timestamps_matrix = coo_matrix((aux_timestamps_data, (aux_rows, aux_cols)),
                                   shape=(num_aux_users, num_movies)).tocsr()


In [29]:
# Step 5: Prepare Weights Array
weights_df['imdbId'] = weights_df['imdbId'].astype(str)
weights_df['col_idx'] = weights_df['imdbId'].map(imdbId_to_col_idx)
weights_df = weights_df.dropna(subset=['col_idx'])
weights = np.zeros(num_movies)

weights[weights_df['col_idx'].astype(int)] = weights_df['weight'].values


In [30]:
def compute_rating_similarity(ml_ratings, candidate_ratings, rating_threshold=1):
    # Compute absolute difference
    rating_diff = np.abs(ml_ratings - candidate_ratings)
    # Valid ratings are those where both users have rated the movie
    valid_mask = (ml_ratings > 0) & (candidate_ratings > 0)
    # Ratings are similar if the difference is within the threshold
    rating_sim = np.zeros_like(ml_ratings)
    rating_sim[valid_mask] = (rating_diff[valid_mask] <= rating_threshold).astype(float)
    return rating_sim

def compute_timestamp_similarity(ml_timestamps, candidate_timestamps, time_threshold=14 * 24 * 3600):
    # Compute absolute difference in timestamps
    time_diff = np.abs(ml_timestamps - candidate_timestamps)
    # Valid timestamps are those where both users have timestamps
    valid_mask = (ml_timestamps > 0) & (candidate_timestamps > 0)
    # Timestamps are similar if the difference is within the threshold
    timestamp_sim = np.zeros_like(ml_timestamps)
    timestamp_sim[valid_mask] = (time_diff[valid_mask] <= time_threshold).astype(float)
    return timestamp_sim


In [31]:
def compute_rating_presence_similarity(ml_ratings, candidate_ratings):
    # Create a mask where both users have rated the movie (presence check)
    valid_mask = (ml_ratings > 0) & (candidate_ratings > 0)
    # Set similarity to 1.0 if both users have rated the movie, 0.0 otherwise
    rating_sim = np.zeros_like(ml_ratings)
    rating_sim[valid_mask] = 1.0
    return rating_sim
def compute_timestamp_presence_similarity(ml_timestamps, candidate_timestamps):
    # Create a mask where both users have timestamps (presence check)
    valid_mask = (ml_timestamps > 0) & (candidate_timestamps > 0)
    # Set similarity to 1.0 if both users have timestamps, 0.0 otherwise
    timestamp_sim = np.zeros_like(ml_timestamps)
    timestamp_sim[valid_mask] = 1.0
    return timestamp_sim



In [32]:
def compute_scores_for_ml_user(ml_user_id, candidate_aux_user_ids, phi=1.5):
    """
    Compute scores between one ml_user_id and all candidate auxiliary users.
    """
    # Get ml_user_idx
    ml_user_idx = ml_userId_to_row_idx[ml_user_id]

    # Get the indices of the movies rated by the ml_user_id
    ml_rated_movies = database_ratings_matrix.getrow(ml_user_idx).indices
    ml_ratings = database_ratings_matrix.getrow(ml_user_idx).data
    ml_timestamps = database_timestamps_matrix.getrow(ml_user_idx).data
    ml_weights = weights[ml_rated_movies]

    num_movies = len(ml_rated_movies)

    if num_movies == 0:
        # ml_user_id has not rated any movies
        return None

    # Map candidate_aux_user_ids to row indices
    candidate_aux_user_indices = [
        aux_userId_to_row_idx[aux_user_id]
        for aux_user_id in candidate_aux_user_ids
        if aux_user_id in aux_userId_to_row_idx
    ]

    num_candidates = len(candidate_aux_user_indices)

    if num_candidates == 0:
        return None  # No candidates found

    # Get candidate ratings and timestamps matrices (num_candidates x num_movies)
    candidate_ratings = aux_ratings_matrix[candidate_aux_user_indices][:, ml_rated_movies].toarray()
    candidate_timestamps = aux_timestamps_matrix[candidate_aux_user_indices][:, ml_rated_movies].toarray()

    # Broadcast ml_ratings and ml_timestamps to match candidate matrices
    ml_ratings_matrix = np.tile(ml_ratings, (num_candidates, 1))
    ml_timestamps_matrix = np.tile(ml_timestamps, (num_candidates, 1))

    # Compute similarities
    rating_sim = compute_rating_similarity(ml_ratings_matrix, candidate_ratings)
    timestamp_sim = compute_timestamp_similarity(ml_timestamps_matrix, candidate_timestamps)

    sim = rating_sim * timestamp_sim

    # Multiply similarities by weights and sum over movies
    weighted_sim = sim * ml_weights
    scores = np.sum(weighted_sim, axis=1)

    # Compute eccentricity to check for a unique match
    max_score_idx = np.argmax(scores)
    max_score = scores[max_score_idx]
    sorted_scores = np.sort(scores)[::-1]
    max2_score = sorted_scores[1] if len(sorted_scores) > 1 else 0
    sigma = np.std(scores)
    eccentricity = (max_score - max2_score) / sigma if sigma > 0 else np.inf

    if eccentricity < phi:
        return None  # No unique match found
    else:
        # Return the matched auxiliary user and score
        matched_aux_user_idx = candidate_aux_user_indices[max_score_idx]
        matched_aux_userId = aux_userIds[matched_aux_user_idx]
        return matched_aux_userId, max_score


In [None]:
# phi = 1.5  # Eccentricity threshold
# matches = []

# # Assuming you have a list of batch files from your previous processing
# batch_files = [os.path.join(OUTPUT_DIR_NO_TOP_500, f) for f in os.listdir(OUTPUT_DIR_NO_TOP_500) if f.endswith('.csv')]

# for batch_file in tqdm(batch_files, desc='Processing batches'):
#     # Load the batch_results_df
#     batch_results_df = pd.read_csv(batch_file)

#     for idx, row in batch_results_df.iterrows():
#         ml_user_id = row['ml_user_id']
#         aux_users_str = row['aux_users']
#         if not aux_users_str:
#             continue  # No candidates

#         candidate_aux_user_ids = [int(aux_user_id) for aux_user_id in aux_users_str.split(',') if aux_user_id]

#         result = compute_scores_for_ml_user(ml_user_id, candidate_aux_user_ids, phi=phi)

#         if result is not None:
#             matched_aux_userId, score = result
#             matches.append({'ml_user_id': ml_user_id, 'aux_user_id': matched_aux_userId, 'score': score})
#             print(f"MovieLens user {ml_user_id} matched with auxiliary user {matched_aux_userId} (Score: {score})")
#         else:
#             print(f"No unique match for MovieLens user {ml_user_id}")


In [None]:
# import gc

# phi = 1.5  # Eccentricity threshold
# matches = []

# # Assuming you have a list of batch files from your previous processing
# batch_files = [os.path.join(OUTPUT_DIR_NO_TOP_500, f) for f in os.listdir(OUTPUT_DIR_NO_TOP_500) if f.endswith('.csv')]

# for batch_file in tqdm(batch_files, desc='Processing batches'):
#     # Load the batch_results_df
#     batch_results_df = pd.read_csv(batch_file)

#     for idx, row in batch_results_df.iterrows():
#         ml_user_id = row['ml_user_id']
#         aux_users_str = row['aux_users']
#         if not aux_users_str:
#             continue  # No candidates

#         # Convert aux_users_str to a list of candidate IDs
#         candidate_aux_user_ids = [int(aux_user_id) for aux_user_id in aux_users_str.split(',') if aux_user_id]

#         # Compute scores for the current ml_user_id
#         result = compute_scores_for_ml_user(ml_user_id, candidate_aux_user_ids, phi=phi)

#         if result is not None:
#             matched_aux_userId, score = result
#             matches.append({'ml_user_id': ml_user_id, 'aux_user_id': matched_aux_userId, 'score': score})
#             print(f"MovieLens user {ml_user_id} matched with auxiliary user {matched_aux_userId} (Score: {score})")
#         else:
#             print(f"No unique match for MovieLens user {ml_user_id}")

#         # Explicitly free memory for variables no longer needed
#         del candidate_aux_user_ids, result
#         gc.collect()  # Force garbage collection

#     # Free memory for the DataFrame after processing each batch file
#     del batch_results_df
#     gc.collect()

# # Consider saving matches periodically instead of storing all in memory



In [33]:
import numpy as np
from tqdm import tqdm

def compute_scores_for_ml_user(ml_user_id, candidate_aux_user_ids, phi=1.5, sub_batch_size=10000):
    """
    Compute scores between one ml_user_id and all candidate auxiliary users in sub-batches to manage memory usage.

    Parameters:
    - ml_user_id (int): The MovieLens user ID to de-anonymize.
    - candidate_aux_user_ids (list of int): List of auxiliary user IDs who have at least one movie rating in common.
    - phi (float): Eccentricity threshold to determine a unique match.
    - sub_batch_size (int): Number of auxiliary users to process in each sub-batch.

    Returns:
    - tuple or None: Returns (matched_aux_userId, max_score) if a unique match is found, else None.
    """

    # Get ml_user_idx
    ml_user_idx = ml_userId_to_row_idx.get(ml_user_id)
    if ml_user_idx is None:
        return None  # ML user ID not found

    # Get the indices of the movies rated by the ml_user_id
    ml_rated_movies = database_ratings_matrix.getrow(ml_user_idx).indices
    ml_ratings = database_ratings_matrix.getrow(ml_user_idx).data
    ml_timestamps = database_timestamps_matrix.getrow(ml_user_idx).data
    ml_weights = weights[ml_rated_movies]

    num_movies = len(ml_rated_movies)
    
    if num_movies == 0:
        # ml_user_id has not rated any movies
        return None

    # Map candidate_aux_user_ids to row indices, ensuring they exist
    candidate_aux_user_indices = [
        aux_userId_to_row_idx.get(aux_user_id)
        for aux_user_id in candidate_aux_user_ids
    ]
    # Remove None values (aux_user_ids not found)
    candidate_aux_user_indices = [idx for idx in candidate_aux_user_indices if idx is not None]

    num_candidates = len(candidate_aux_user_indices)
    
    if num_candidates == 0:
        return None  # No candidates found

    # Initialize an array to accumulate scores
    scores = np.zeros(num_candidates)

    # Process auxiliary users in sub-batches
    for start in range(0, num_candidates, sub_batch_size):
        end = min(start + sub_batch_size, num_candidates)
        sub_batch_indices = candidate_aux_user_indices[start:end]

        # Get candidate ratings and timestamps matrices (sub_batch_size x num_movies)
        candidate_ratings = aux_ratings_matrix[sub_batch_indices][:, ml_rated_movies].toarray()
        candidate_timestamps = aux_timestamps_matrix[sub_batch_indices][:, ml_rated_movies].toarray()

        # Broadcast ml_ratings and ml_timestamps to match sub-batch size
        # Shape: (sub_batch_size, num_movies)
        ml_ratings_matrix = np.tile(ml_ratings, (len(sub_batch_indices), 1))
        ml_timestamps_matrix = np.tile(ml_timestamps, (len(sub_batch_indices), 1))

        # Compute similarities
        rating_sim = compute_rating_similarity(ml_ratings_matrix, candidate_ratings)
        timestamp_sim = compute_timestamp_similarity(ml_timestamps_matrix, candidate_timestamps)

        sim = rating_sim * timestamp_sim

        # Multiply similarities by weights and sum over movies
        # Each movie's weight is applied to the corresponding column
        # Shape of sim: (sub_batch_size, num_movies)
        # Shape of ml_weights: (num_movies,)
        # Broadcasting ml_weights across rows
        weighted_sim = sim * ml_weights

        # Sum across movies to get total score for each auxiliary user in the sub-batch
        sub_batch_scores = np.sum(weighted_sim, axis=1)

        # Accumulate the scores into the main scores array
        scores[start:end] += sub_batch_scores

        # Free memory for this sub-batch
        del candidate_ratings, candidate_timestamps, ml_ratings_matrix, ml_timestamps_matrix, rating_sim, timestamp_sim, sim, weighted_sim, sub_batch_scores
        gc.collect()

    # After processing all sub-batches, determine if there's a unique match
    max_score_idx = np.argmax(scores)
    max_score = scores[max_score_idx]
    sorted_scores = np.sort(scores)[::-1]
    max2_score = sorted_scores[1] if len(sorted_scores) > 1 else 0
    sigma = np.std(scores)
    eccentricity = (max_score - max2_score) / sigma if sigma > 0 else np.inf
    
    min_score_threshold = 0.8  # Define a minimum score required for a valid match
    
    if eccentricity < phi or max_score < min_score_threshold:
        return None  # No unique match found
    else:
        # Return the matched auxiliary user and score
        matched_aux_user_idx = candidate_aux_user_indices[max_score_idx]
        matched_aux_userId = aux_userIds[matched_aux_user_idx]
        return matched_aux_userId, max_score


In [34]:
import gc
from tqdm import tqdm

phi = 1.5  # Eccentricity threshold
matches = []

# List of batch files
batch_files = [os.path.join(OUTPUT_DIR_NO_TOP_500, f) for f in os.listdir(OUTPUT_DIR_NO_TOP_500) if f.endswith('.csv')]

for batch_file in tqdm(batch_files, desc='Processing batches'):
    # Load the batch_results_df
    batch_results_df = pd.read_csv(batch_file)

    for idx, row in batch_results_df.iterrows():
        ml_user_id = row['ml_user_id']
        aux_users_str = row['aux_users']
        if not aux_users_str:
            continue  # No candidates

        # Convert aux_users_str to a list of candidate IDs
        candidate_aux_user_ids = [int(aux_user_id) for aux_user_id in aux_users_str.split(',') if aux_user_id]

        # Compute scores for the current ml_user_id using sub-batching
        result = compute_scores_for_ml_user(ml_user_id, candidate_aux_user_ids, phi=phi, sub_batch_size=1000)

        if result is not None:
            matched_aux_userId, score = result
            matches.append({'ml_user_id': ml_user_id, 'aux_user_id': matched_aux_userId, 'score': score})
            print(f"MovieLens user {ml_user_id} matched with auxiliary user {matched_aux_userId} (Score: {score})")
        else:
            print(f"No unique match for MovieLens user {ml_user_id}")

        # Explicitly free memory for variables no longer needed
        del candidate_aux_user_ids, result
        gc.collect()  # Force garbage collection

    # Free memory for the DataFrame after processing each batch file
    del batch_results_df
    gc.collect()

# After all batches are processed, save the matches
matches_df = pd.DataFrame(matches)
matches_df.to_csv('deanonymization_matches.csv', index=False)
print(f"Total matches found: {len(matches_df)}")


Processing batches:   0%|          | 0/8 [00:00<?, ?it/s]

No unique match for MovieLens user 112
No unique match for MovieLens user 177
No unique match for MovieLens user 253
No unique match for MovieLens user 267
No unique match for MovieLens user 280
MovieLens user 342 matched with auxiliary user 740366 (Score: 1.000258116249563)
No unique match for MovieLens user 381
No unique match for MovieLens user 392
No unique match for MovieLens user 413
No unique match for MovieLens user 446
No unique match for MovieLens user 470
MovieLens user 487 matched with auxiliary user 48758 (Score: 1.1374569846391913)
No unique match for MovieLens user 509
MovieLens user 531 matched with auxiliary user 308978 (Score: 1.3325822072772802)
No unique match for MovieLens user 557
No unique match for MovieLens user 594
No unique match for MovieLens user 605
No unique match for MovieLens user 623
No unique match for MovieLens user 647
MovieLens user 696 matched with auxiliary user 22823 (Score: 0.8366443781408373)
No unique match for MovieLens user 726
No unique ma

Processing batches:  12%|█▎        | 1/8 [20:00<2:20:05, 1200.85s/it]

No unique match for MovieLens user 28053
No unique match for MovieLens user 28115
No unique match for MovieLens user 28173
No unique match for MovieLens user 28176
No unique match for MovieLens user 28187
No unique match for MovieLens user 28204
No unique match for MovieLens user 28269
No unique match for MovieLens user 28364
No unique match for MovieLens user 28370
No unique match for MovieLens user 28383
No unique match for MovieLens user 28408
No unique match for MovieLens user 28432
No unique match for MovieLens user 28449
No unique match for MovieLens user 28537
No unique match for MovieLens user 28558
No unique match for MovieLens user 28563
No unique match for MovieLens user 28577
No unique match for MovieLens user 28586
No unique match for MovieLens user 28617
No unique match for MovieLens user 28627
MovieLens user 28649 matched with auxiliary user 18784 (Score: 1.711940492636988)
No unique match for MovieLens user 28655
No unique match for MovieLens user 28697
No unique match 

Processing batches:  12%|█▎        | 1/8 [22:32<2:37:48, 1352.65s/it]

No unique match for MovieLens user 32725





KeyboardInterrupt: 