<a href="https://colab.research.google.com/github/iamkish0re/Gdrive-Duplicates-Remover/blob/gdrive-test/GDrive_duplicates_remover.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#   Google Drive duplicates remover (using colab)

> This notebook finds duplicate files in the specified Google Drive folder and all of its subfolders, keeps the first copy of each file, and deletes the remaining copies (they are moved to the Drive trash).





In [3]:
# Mount Google Drive into the Colab filesystem at /content/gdrive/.
# force_remount=True is passed so the cell can be re-run; presumably it remounts
# even when the drive is already mounted (confirm against the Colab docs).
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


Specify the path of the folder in which you want to find and remove duplicates.

Example: if the folder path is `root/user/company` and `company` is the folder you want to scan for duplicates, specify `root/user` below.

In [15]:
# Absolute path: works regardless of the kernel's current directory and stays
# valid when this cell is re-run (a relative path would no longer resolve).
%cd /content/gdrive/Othercomputers/My Laptop/Phone backup/Test_dups

/content/gdrive/Othercomputers/My Laptop/Phone backup/Test_dups


In [7]:
import glob, os, shutil, logging, time
from hashlib import sha256
from typing import Union

In [8]:
# Create a custom logger
logger = logging.getLogger('gdrive-duplicate-remover')
logging.basicConfig(
    level=logging.DEBUG,
    force=True,  # Resets any previous configuration
)

# Create the handler and the formatter it uses
handler = logging.StreamHandler()
log_formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%d-%b-%y %H:%M:%S',
)
handler.setFormatter(log_formatter)

# getLogger() returns the same logger object every time this cell runs, so
# detach handlers left over from earlier runs; otherwise each message would be
# printed once per run of this cell.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

# Add the handler to the logger
logger.addHandler(handler)
logger.propagate = False  # do not pass logs to the default (root) logger

In [9]:
# Smoke test: emits one DEBUG record through the logger configured above.
logger.debug('Test Logger')

29-Jan-23 08:44:57 - gdrive-duplicate-remover - DEBUG - Test Logger


In [10]:
class GdriveCleaner:
    """Delete duplicate files, identified by SHA-256 content hash, in a folder tree.

    The first file encountered with a given content hash is kept; every later
    file with the same hash is removed with ``os.remove``. Note that all empty
    files share one hash, so only the first empty file found is kept.
    """

    def __init__(self):
        # Working directory at construction time (recorded and logged only).
        self.home_dir = os.getcwd()
        # Hashes of the files that were kept, in the order they were first seen.
        self.File_hashes = []
        # Not used by any method of this class; kept for backward compatibility.
        self.File_hashes_folder = []
        self.Cleaned_dirs = []
        # Running totals reported by cleaning_summary().
        self.Total_bytes_saved = 0
        self.count_cleaned = 0
        # Number of bytes read per chunk while hashing a file.
        self.block_size = 65536
        logger.debug("HOME DIRECTORY : " + str(self.home_dir))
        logger.debug("DEFAULT BLOCK SIZE : " + str(self.block_size))

    def generate_hash(self, Filename: str) -> Union[str, bool]:
        """Return the SHA-256 hex digest of a file's contents.

        The file is read in chunks of ``self.block_size`` bytes so large files
        are never loaded into memory at once.

        Args:
            Filename: Path of the file to hash.

        Returns:
            The hex digest string, or ``False`` when the file cannot be opened
            or read (the failure is logged as a warning).
        """
        digest = sha256()
        try:
            with open(Filename, 'rb') as stream:
                for chunk in iter(lambda: stream.read(self.block_size), b''):
                    digest.update(chunk)
        except OSError as err:
            # Only I/O failures are treated as "unreadable"; KeyboardInterrupt
            # and programming errors propagate instead of being swallowed.
            logger.warning("Could not hash " + str(Filename) + " : " + str(err))
            return False
        return digest.hexdigest()

    def clean(self, folder_name: str) -> None:
        """Walk ``folder_name`` recursively and delete duplicate files.

        Paths are joined explicitly instead of changing the process working
        directory, so the kernel's current directory is never modified, even
        if an error interrupts the run.

        Args:
            folder_name: Folder to scan; a relative path is resolved against
                the current working directory.
        """
        if not os.path.isdir(folder_name):
            logger.warning("Folder not found : " + folder_name)
        logger.debug("Traversing paths in " + folder_name)
        all_dirs = [path[0] for path in os.walk(folder_name + '/')]
        logger.debug("Traversing Completed!")

        # Set for O(1) membership checks; File_hashes stays an ordered list.
        seen_hashes = set(self.File_hashes)
        for path in all_dirs:
            logger.debug("Current directory : " + path)
            try:
                entries = os.listdir(path)
            except OSError as err:
                logger.warning("Skipping unreadable directory " + path + " : " + str(err))
                continue
            for name in entries:
                file_path = os.path.join(path, name)
                if not os.path.isfile(file_path):
                    continue
                filehash = self.generate_hash(file_path)
                if not filehash:
                    # Unreadable file: never treat it as a duplicate.
                    continue
                if filehash not in seen_hashes:
                    seen_hashes.add(filehash)
                    self.File_hashes.append(filehash)
                    continue
                try:
                    byte_saved = os.path.getsize(file_path)
                    os.remove(file_path)
                except OSError as err:
                    logger.warning("Could not delete " + file_path + " : " + str(err))
                    continue
                # Update the totals only after the file was actually removed.
                self.count_cleaned += 1
                self.Total_bytes_saved += byte_saved
                logger.info("Deleting : " + name)

    def cleaning_summary(self) -> None:
        """Log the number of files deleted and the space reclaimed in MB."""
        mb_saved = round(self.Total_bytes_saved / (1024 * 1024), 2)
        logger.info('Duplicate removal done!')
        logger.debug('File cleaned  : ' + str(self.count_cleaned))
        logger.debug('Total Space saved : ' + str(mb_saved) + 'MB')

    def main(self, folder_name: str) -> None:
        """Run the duplicate removal on ``folder_name`` and log a summary."""
        logger.info("Starting Clean ...")
        self.clean(folder_name)
        logger.info("Clean Completed! ...")
        self.cleaning_summary()


def gdrive_duplicate_remover(folder_name):
    """Run a fresh GdriveCleaner over ``folder_name``.

    Args:
        folder_name: Folder handed to ``GdriveCleaner.main``.

    Returns:
        Always ``True`` once the cleaner has finished.
    """
    GdriveCleaner().main(folder_name)
    return True

In [19]:
# 'Phone backup' is a relative path, so it is resolved against the current
# working directory at the time this cell runs.
gdrive_duplicate_remover('Phone backup')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
29-Jan-23 09:10:27 - gdrive-duplicate-remover - INFO - Cleaned : Screenshot_20171105-115818.png
29-Jan-23 09:10:27 - gdrive-duplicate-remover - DEBUG - Current directory : Phone backup/J7Prime_Backup_10_01_2018/Phone backup/Images/DCIM/Camera
29-Jan-23 09:10:33 - gdrive-duplicate-remover - INFO - Cleaned : 20171016_171852.jpg
29-Jan-23 09:10:33 - gdrive-duplicate-remover - INFO - Cleaned : 20171018_095722.jpg
29-Jan-23 09:10:33 - gdrive-duplicate-remover - INFO - Cleaned : 20171018_095759.jpg
29-Jan-23 09:10:33 - gdrive-duplicate-remover - INFO - Cleaned : 20171018_095701(0).jpg
29-Jan-23 09:10:33 - gdrive-duplicate-remover - INFO - Cleaned : 20171018_095702.jpg
29-Jan-23 09:10:33 - gdrive-duplicate-remover - INFO - Cleaned : 20171018_095659.jpg
29-Jan-23 09:10:33 - gdrive-duplicate-remover - INFO - Cleaned : 20171018_095703.jpg
29-Jan-23 09:10:33 - gdrive-duplicate-remover - INFO - Cleaned : 20171018_095758.jpg
29-Jan-23

True