Imports

In [None]:
# !pip install pandas numpy
import pandas as pd
import numpy as np
from IPython.display import display

VARS

In [None]:
# Load the CSV file
csv_file_path = 'input.csv'
df = pd.read_csv(csv_file_path)

# Placeholder written into every cell pandas parsed as missing (NaN).
# Only appropriate when NaN really represents a blank cell in this data set.
BLANK_PLACEHOLDER = "<BLANK>"

# Fill the whole frame in one call and rebind the result.
# A per-column `df[column].fillna(..., inplace=True)` operates on a temporary
# selection: it emits a chained-assignment FutureWarning in pandas 2.x and
# leaves `df` untouched once Copy-on-Write is active.
df = df.fillna(BLANK_PLACEHOLDER)

In [None]:
# # df.head()
# # df.info()


# # Duplicates ----------
# print("Number of duplicate rows:", len(df[df.duplicated('account_url', keep=False)]))
# df[df.duplicated('account_url', keep=False)].sort_values(['account_url'])

---

In [None]:
import pandas as pd
import numpy as np


def consolidate_rows(rows):
    """Merge a group of duplicate rows into one row, asking the user to settle conflicts.

    For every column the distinct non-NaN values are collected:

    * no value      -> the merged cell is NaN;
    * one value     -> that value is used;
    * several values -> the options are shown and the user is asked to pick one.
      The prompt repeats until a valid choice number or 'exit' is entered.

    Args:
        rows: DataFrame holding the rows to merge (must contain at least one row).

    Returns:
        A Series (copied from the first row, so it keeps that row's index label
        as its name) holding the merged values, or ``None`` if the user typed
        'exit' at a prompt.

    Note:
        The conflict path calls ``display``, which is expected to be in scope
        (this file imports it from ``IPython.display`` in an earlier cell).
    """
    consolidated = rows.iloc[0].copy()  # First row is the base; its name keeps the original index label

    for column in rows.columns:
        # NaN means "blank", so only non-NaN values take part in conflict detection
        candidates = rows[column].dropna().unique()

        if len(candidates) == 0:
            # Every row is blank in this column
            consolidated[column] = np.nan
            continue

        if len(candidates) == 1:
            consolidated[column] = candidates[0]
            continue

        print(f"\nConflict in '{column}' column with unique values: {candidates}")
        # One row per distinct value, in first-seen order, for the user to choose from
        conflict_df = rows[[column]].dropna().drop_duplicates().reset_index(drop=True)
        display_df = conflict_df.reset_index().rename(columns={"index": "Choice"})
        display_df['Choice'] += 1  # Show 1-based choice numbers
        display(display_df)

        option_count = len(conflict_df)
        prompt = f"Select the row number to keep for {column} (1-{option_count}), or type 'exit' to stop: "

        # Keep asking until the answer is usable: a typo must not abort the whole
        # run, and an out-of-range number must not silently keep the first row's value.
        while True:
            answer = input(prompt).strip()
            if answer.lower() == 'exit':
                print("Operation stopped by user. Saving progress...")
                return None  # Caller treats None as "stop here"

            try:
                chosen_index = int(answer) - 1  # Convert to zero-based position
            except ValueError:
                print("Invalid input. Please enter a valid row number or 'exit'.")
                continue

            if 0 <= chosen_index < option_count:
                # .iloc[row, 0] yields the scalar cell rather than a one-element Series
                consolidated[column] = conflict_df.iloc[chosen_index, 0]
                break

            print(f"Invalid row number. Please enter a number between 1 and {option_count}, or 'exit'.")

    return consolidated




def has_conflict(group):
    """Return True if any column of *group* holds more than one distinct non-NaN value."""
    return any(
        len(group[name].dropna().unique()) > 1  # NaN cells never count as a competing value
        for name in group.columns
    )

def main(csv_file_path):
    """Consolidate rows sharing an 'account_url' and write the result to 'result.csv'.

    Rows are grouped by 'account_url'. Each duplicate group is merged into one
    row by ``consolidate_rows``; groups whose rows disagree in some column are
    shown to the user first, groups without disagreement are merged silently.
    The merged row takes the position of the group's first row.

    Only groups that were actually merged are replaced. Groups not reached
    because the user stopped early, and rows whose 'account_url' is missing
    (``groupby`` skips NaN keys), are written out unchanged.

    Args:
        csv_file_path: Path of the CSV file to read. It must contain an
            'account_url' column.

    Returns:
        The resulting DataFrame (also saved to 'result.csv' in the current
        working directory).
    """
    # Read everything as strings so pandas does not reinterpret cell values;
    # blank cells still arrive as NaN.
    df = pd.read_csv(csv_file_path, dtype=str)

    duplicates = df[df.duplicated('account_url', keep=False)]

    if duplicates.empty:
        print("No duplicates found.")
        df.to_csv('result.csv', index=False)
        return df

    consolidated_list = []
    replaced_labels = []  # Index labels of the original rows superseded by a merged row
    for account_url, group in duplicates.groupby('account_url'):
        if has_conflict(group):
            # Only conflicting groups need the user's attention
            print(f"Processing next duplicate group for account_url: {account_url}")
            display(group)

        consolidated_row = consolidate_rows(group)
        if consolidated_row is None:
            print("Operation stopped by user. Saving progress...")
            break

        consolidated_list.append(consolidated_row)
        replaced_labels.extend(group.index)

    if consolidated_list:
        # Each merged Series is named after its group's first row, so the new
        # frame's index places every merged row where that first row used to be.
        consolidated_df = pd.DataFrame(consolidated_list, columns=df.columns)
        # Remove exactly the rows that were merged. A blanket
        # drop_duplicates('account_url') would also discard rows of groups
        # that were never processed and rows with a missing account_url.
        df = pd.concat([df.drop(index=replaced_labels), consolidated_df], ignore_index=False)

    df = df.sort_index()  # Restore the original row order
    df.to_csv('result.csv', index=False)
    print("Consolidation complete. Results saved to 'result.csv'.")
    return df

# Replace 'input.csv' with your actual CSV file path
csv_file_path = 'input.csv'
# Runs the consolidation as soon as this cell/module executes: main() reads the
# CSV, may prompt for input when duplicate rows conflict, and writes 'result.csv'.
# The returned DataFrame is kept in df_processed for further inspection.
df_processed = main(csv_file_path)
