# Outlook Mail De-duplication

I had a bunch of duplicate emails in my Outlook folder. This script prints all such duplicates as tab-delimited rows.

In [None]:
import win32com.client
from collections import defaultdict
import pandas as pd
import pywintypes

# Path to the PST file; find_duplicates() adds it to the Outlook profile
# before looking for the folder below.
pst_file_path = r"C:\Anand\Mail\Gramener-archive.pst"
# Name of the top-level Outlook folder to search in.
pst_folder_name = "Gramener-Archive"
# Name of the folder directly under it whose mail items are checked.
pst_subfolder_name = "Archive"


def _tsv_field(value):
    """Return *value* as text that fits in a single tab-delimited cell."""
    # A tab or line break inside a field would spill it into the next
    # column or row, so each one is replaced by a space.
    return str(value).replace("\t", " ").replace("\r", " ").replace("\n", " ")


def process_folder(folder, email_dict):
    """Record the mail items of *folder* and print every duplicate found.

    Two mail items count as duplicates when their sender name, subject,
    received time and the first 50 characters of their body are all equal.

    Args:
        folder: Object whose ``Items`` attribute iterates over the folder's
            items (an Outlook folder in normal use).
        email_dict: ``defaultdict(list)`` mapping the duplicate key to the
            ``(EntryID, Size)`` pairs seen so far. It is updated in place, so
            the same dict can be shared across several folders.

    For the second and later item with the same key, one tab-delimited row is
    printed: duplicate count (1 for the first duplicate), entry ID, sender,
    subject, received time, body excerpt, size.
    """
    for item in folder.Items:
        if item.Class != 43:  # 43 is the MailItem class; skip anything else
            continue
        try:
            # Read each property once; every access is a separate COM call.
            body = (item.Body or "")[:50]
            sender = item.SenderName
            subject = item.Subject
            received = item.ReceivedTime
            entry_id = item.EntryID
            size = item.Size
        except pywintypes.com_error:
            # Best effort: an item whose properties cannot be read is skipped
            # so that it does not abort the whole scan.
            continue

        key = (sender, subject, received, body)
        matches = email_dict[key]
        matches.append((entry_id, size))
        if len(matches) > 1:
            row = (len(matches) - 1, entry_id, sender, subject, received, body, size)
            # str.join() only accepts strings, so every field is converted
            # (the count, time and size are not strings).
            print("\t".join(_tsv_field(field) for field in row))


def find_duplicates():
    """Find the configured archive folder and print its duplicate emails.

    The folder is located through the module-level settings
    ``pst_file_path``, ``pst_folder_name`` and ``pst_subfolder_name``.
    Duplicates are reported by ``process_folder``. If the folder cannot be
    found, a message is printed and nothing else happens.
    """
    outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

    # Add the PST file to the current Outlook profile if it's not already
    # loaded. This is best effort: a COM failure here (for example the PST
    # already being loaded) is ignored, and the lookup below decides whether
    # the folder is actually available.
    try:
        outlook.AddStore(pst_file_path)
    except pywintypes.com_error:
        pass

    # Access the archive folder. Only the children of the matching top-level
    # folder are enumerated; listing every other folder's children would just
    # cost extra COM calls.
    archive_folder = None
    for folder in outlook.Folders:
        if folder.Name != pst_folder_name:
            continue
        for subfolder in folder.Folders:
            if subfolder.Name == pst_subfolder_name:
                archive_folder = subfolder
                break
        if archive_folder is not None:
            break

    if archive_folder is None:
        # Name both levels: either of them may be the one that is missing.
        print(f"Folder '{pst_folder_name}/{pst_subfolder_name}' not found.")
        return

    email_dict = defaultdict(list)

    # Process the items directly inside the archive folder
    # (process_folder does not descend into sub-folders).
    process_folder(archive_folder, email_dict)

# Run the scan; rows for duplicate emails are printed to standard output.
find_duplicates()

Then I manually edited the output and saved it as a CSV file (`duplicates.csv`) with these columns:

- `count` -- the duplicate count, i.e. 0 for the first entry, 1 for the second, and so on
- `id` -- the Entry ID
- ... any other fields.

Then I ran the following to delete the duplicates, i.e. every row whose `count` is greater than 0.

In [None]:
import pandas as pd
import pywintypes
import win32com.client

# Read the CSV file. The "id" column is forced to text so that pandas never
# reinterprets an Entry ID made only of digits as a number.
df = pd.read_csv('duplicates.csv', dtype={'id': str})

# Filter rows where the "count" column is greater than 0, i.e. every copy
# after the first one of each email.
filtered_df = df[df['count'] > 0]

# Initialize Outlook application
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

# Iterate through filtered rows and delete the corresponding emails.
for index, entry_id in filtered_df['id'].items():
    try:
        outlook.GetItemFromID(entry_id).Delete()
    except pywintypes.com_error as error:
        # An ID that no longer resolves (for example because a previous run
        # already deleted it) must not abort the batch; report it and carry
        # on so the script can be re-run safely.
        print(index, f"Could not delete email with ID: {entry_id} ({error})")
        continue
    print(index, f"Deleted email with ID: {entry_id}")
