<a href="https://colab.research.google.com/github/iamkish0re/Gdrive-Duplicates-Remover/blob/enhance-1.0/GDrive_duplicates_remover.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#   Google Drive duplicates remover (using colab)

> This notebook finds duplicate files (files with identical content) in the specified Google Drive folder and all of its subfolders, keeps the first copy it encounters, and deletes the other copies (moves them to trash).





In [2]:
from google.colab import drive

# Directory inside the Colab VM under which the Drive contents are exposed.
MOUNT_POINT = '/content/gdrive/'

# force_remount=True is passed so the mount is refreshed when this cell is re-run.
drive.mount(MOUNT_POINT, force_remount=True)

Mounted at /content/gdrive/


Usage:
Specify the path of the directory that contains the folder in which you want to find and remove duplicates.

Example: if the folder's path is `root/user/company` and `company` is the folder you want to scan for duplicates, specify `root/user` below.

In [3]:
# %cd gdrive/Shareddrives/Munnar
%cd gdrive/Othercomputers

/content/gdrive/Othercomputers


In [4]:
import glob, os, shutil, logging, time
from hashlib import sha256

In [5]:
def configure_logger(name='gdrive-duplicate-remover', level=logging.DEBUG):
    """Return the notebook's logger with exactly one stream handler attached.

    Safe to call repeatedly (e.g. when this cell is re-run): handlers left on
    the logger by a previous call are removed first, so each message is
    emitted once instead of once per past execution of the cell.

    Args:
        name: Name of the logger to configure.
        level: Level applied to the root logger; the named logger has no level
            of its own and therefore inherits this one.

    Returns:
        The configured ``logging.Logger``.
    """
    # force=True resets any previous root configuration.
    logging.basicConfig(level=level, force=True)

    log = logging.getLogger(name)
    log.propagate = False  # do not pass logs to the default (root) logger

    # Drop handlers added by earlier runs of this cell to avoid duplicate lines.
    for stale_handler in list(log.handlers):
        log.removeHandler(stale_handler)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%d-%b-%y %H:%M:%S',
        )
    )
    log.addHandler(stream_handler)
    return log


# Module-level names kept for the cells below.
logger = configure_logger()
handler = logger.handlers[0]

In [6]:
class GdriveCleaner:
    """Delete files whose content duplicates a file already seen in a folder tree.

    Files are compared by the SHA-256 digest of their content. The first file
    seen with a given digest is kept; every later file with the same digest is
    removed with ``os.remove``.

    Attributes:
        HOME_DIRECTORY: Working directory at construction time (logged only).
        FILE_HASHES_UNIQUE: Maps each digest seen so far to the real path of
            the copy that is kept. The kept copy is never deleted, even when
            the same tree is scanned again with this instance.
        MEMORY_DELETED_BYTE: Total size in bytes of the files deleted so far.
        CHUNK_SIZE: Number of bytes read per iteration while hashing.
        DEL_COUNT: Number of files deleted so far.
    """

    # Same logger object the notebook's logging cell configures; it is looked
    # up by name so the class does not rely on a cell-level global.
    log = logging.getLogger('gdrive-duplicate-remover')

    BYTES_PER_MB = 1024 * 1024

    def __init__(self):
        self.HOME_DIRECTORY      = os.getcwd()
        self.FILE_HASHES_UNIQUE  = {}
        self.MEMORY_DELETED_BYTE = 0
        self.CHUNK_SIZE          = 65536
        self.DEL_COUNT           = 0
        self.log.debug("HOME DIRECTORY : " + str(self.HOME_DIRECTORY))
        self.log.debug("DEFAULT BLOCK SIZE : " + str(self.CHUNK_SIZE))

    def calculate_filehash(self, file: str) -> "str | bool":
        """Return the SHA-256 hex digest of ``file``, or ``False`` if it cannot be read.

        The file is read in ``CHUNK_SIZE`` pieces so large files are never
        loaded into memory at once.
        """
        digest = sha256()
        try:
            with open(file, 'rb') as stream:
                # iter() with a sentinel stops at the empty read that marks EOF.
                for chunk in iter(lambda: stream.read(self.CHUNK_SIZE), b''):
                    digest.update(chunk)
        except OSError as exc:
            # Only I/O failures are swallowed; KeyboardInterrupt still
            # interrupts a long run.
            self.log.warning("Could not hash %s : %s", file, exc)
            return False
        return digest.hexdigest()

    def check_and_delete_duplicates(self, folder_name) -> None:
        """Walk ``folder_name`` recursively and delete every duplicate file found.

        A relative ``folder_name`` is resolved against the current working
        directory, which this method never changes. Unreadable files are
        skipped and never deleted. A failed deletion is logged and the scan
        continues.
        """
        self.log.debug("-----------------------Traversing paths in " + folder_name)

        # The directory list is collected up front, before any file is removed.
        all_dirs_list = [dirpath for dirpath, _dirs, _files in os.walk(folder_name + '/')]

        self.log.debug("-----------------------Traversing Completed!")

        for path in all_dirs_list:
            self.log.info("-----------------------Checking directory : " + path)
            try:
                entries = os.listdir(path)
            except OSError as exc:
                self.log.warning("Could not list %s : %s", path, exc)
                continue

            for file in entries:
                full_path = os.path.join(path, file)
                if not os.path.isfile(full_path):
                    continue

                filehash = self.calculate_filehash(full_path)
                if not filehash:
                    continue  # could not be read, so it cannot be proven a duplicate

                real_path = os.path.realpath(full_path)
                kept_path = self.FILE_HASHES_UNIQUE.get(filehash)
                if kept_path is None:
                    # First file with this content: remember it as the copy to keep.
                    self.FILE_HASHES_UNIQUE[filehash] = real_path
                    continue
                if real_path == kept_path:
                    # The kept copy itself (repeat scan, or a link to it): never delete.
                    continue

                self.log.debug("Deleting : " + file)
                try:
                    del_file_size = os.path.getsize(full_path)
                    os.remove(full_path)
                except OSError as exc:
                    self.log.warning("Could not delete %s : %s", full_path, exc)
                    continue
                self.DEL_COUNT += 1
                self.MEMORY_DELETED_BYTE += del_file_size

    def main(self, folder_name) -> None:
        """Run the clean-up on ``folder_name`` and log a summary of what was removed."""
        self.log.info("-----------------------Starting Clean ...")
        self.check_and_delete_duplicates(folder_name)
        self.log.info("-----------------------Clean Completed! ...")

        memory_deleted_mb = round(self.MEMORY_DELETED_BYTE / self.BYTES_PER_MB, 2)

        self.log.info('-----------------------Duplicate removal done!')
        self.log.debug('-----------------------File cleaned  : ' + str(self.DEL_COUNT))
        self.log.debug('-----------------------Total Space saved : ' + str(memory_deleted_mb) + 'MB')


def gdrive_duplicate_remover(folder_name) -> str:
    """Clean ``folder_name`` with a fresh ``GdriveCleaner`` and report completion.

    Args:
        folder_name: Folder handed to ``GdriveCleaner.main``.

    Returns:
        The string ``'COMPLETED!'`` once ``main`` has returned.
    """
    # A new cleaner per call, so no hashes or counters carry over between runs.
    cleaner = GdriveCleaner()
    cleaner.main(folder_name)
    status = 'COMPLETED!'
    return status

In [None]:
# Folder to scan, given relative to the current working directory.
target_folder = 'My Laptop (1)/'
gdrive_duplicate_remover(target_folder)

29-Jan-23 15:05:06 - gdrive-duplicate-remover - DEBUG - HOME DIRECTORY : /content/gdrive/Othercomputers
29-Jan-23 15:05:06 - gdrive-duplicate-remover - DEBUG - DEFAULT BLOCK SIZE : 65536
29-Jan-23 15:05:06 - gdrive-duplicate-remover - INFO - -----------------------Starting Clean ...
29-Jan-23 15:05:06 - gdrive-duplicate-remover - DEBUG - -----------------------Traversing paths in My Laptop (1)/
