In [1]:
from minio import Minio
from minio.error import S3Error
from hashlib import sha256
import io
import math
import collections
import os

In [2]:
#Connection to my Minio Server which I installed in GCP.
#The endpoint and credentials can be overridden through the MINIO_ENDPOINT,
#MINIO_ACCESS_KEY and MINIO_SECRET_KEY environment variables so that real
#secrets never have to be committed with the notebook. The fallbacks are the
#values this notebook was originally written against.
client = Minio(
    os.environ.get("MINIO_ENDPOINT", "35.202.171.168:9000"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"),
    secure=False
)

# Make 'loomings' bucket if not exist.
found = client.bucket_exists("loomings")
if found:
    print("Bucket 'loomings' already exists")
else:
    client.make_bucket("loomings")
    print("Successfully created bucket 'loomings'")

#Push the loomings.txt file to the bucket
client.fput_object("loomings", "loomings.txt", 'loomings.txt')
print("'loomings.txt' is successfully uploaded as object 'loomings.txt' to bucket 'loomings'.")

Bucket 'loomings' already exists
'loomings.txt' is successfully uploaded as object 'loomings.txt' to bucket 'loomings'.


In [3]:
#Read loomings.txt line by line and upload every non-blank line as its own
#object named "File-<n>" (n = 1-based index among the non-blank lines).
#The SHA-256 digest of the stripped, UTF-8 encoded line is attached as metadata.
fileCount = 0
with open('loomings.txt', 'r', encoding='utf-8') as loomings:
    for line in loomings:
        payload = line.strip().encode('utf-8')
        if not payload:
            #Blank lines are neither uploaded nor counted
            continue

        fileCount += 1
        filename = "File-" + str(fileCount)
        #Named 'digest' rather than 'hash' so the builtin hash() stays usable
        #in the cells that run after this one.
        digest = sha256(payload).hexdigest()
        metadata = {"Content-hash": digest}

        #put_object uploads from a stream, so wrap the bytes in BytesIO
        #and pass the exact byte length alongside it.
        client.put_object(
            "loomings", filename, io.BytesIO(payload), len(payload), metadata=metadata,
        )
        print(filename, " is successfully uploaded to bucket 'loomings' with metadata ", metadata)


#Printing the number of non blank lines using the counter variable fileCount
print("\nTotal number of Non blank lines : ", str(fileCount))

File-1  is successfully uploaded to bucket 'loomings' with metadata  {'Content-hash': '08a2152f1443355617cb16b6b84b206771cdc1a6cecfae86b15bed1715c0dc18'}
File-2  is successfully uploaded to bucket 'loomings' with metadata  {'Content-hash': '550f91de362c86e7ab08d3e55b6c2ec6316c85e34285d8b37bb97528f8a709a0'}
File-3  is successfully uploaded to bucket 'loomings' with metadata  {'Content-hash': 'a00e0126c2b8b458e7d375c6d8c1c75b1ba1b48f5082cc7f39ad6e57ba736fdd'}
File-4  is successfully uploaded to bucket 'loomings' with metadata  {'Content-hash': '555164242460f0b53b95cd82195aaa111dd639fa2ee640b4ce3bb944fb3ba60d'}
File-5  is successfully uploaded to bucket 'loomings' with metadata  {'Content-hash': '3878fc5938524787dcc77e5c5d6e6c6d7914389be3dc725ebc633551a2405058'}
File-6  is successfully uploaded to bucket 'loomings' with metadata  {'Content-hash': '6c6c8ea443c8b57a28d3fc43bb1a493ae1dcdbda0f851218f621b353186c7a74'}
File-7  is successfully uploaded to bucket 'loomings' with metadata  {'Conte

In [4]:
#Collect a [name, size] pair for every per-line object reported by list_objects
fileSizeMap = [
    [entry.object_name, entry.size]
    for entry in client.list_objects('loomings')
    if 'File-' in entry.object_name
]

#Show the objects from the smallest to the largest; sorted() is stable, so
#objects of equal size keep the order in which they were listed.
print("List of files and their sizes in ascending order:")
fileSizeMap = sorted(fileSizeMap, key=lambda pair: pair[1])
for name, size in fileSizeMap:
    print("FileName : ", name.rjust(8), ",    File Size : ", str(size).rjust(4), sep="")

List of files and their sizes in ascending order:
FileName :  File-12,    File Size :  139
FileName :  File-16,    File Size :  336
FileName :   File-2,    File Size :  389
FileName :   File-4,    File Size :  598
FileName :   File-5,    File Size :  628
FileName :  File-13,    File Size :  659
FileName :   File-3,    File Size :  667
FileName :  File-10,    File Size :  709
FileName :   File-9,    File Size :  783
FileName :  File-14,    File Size :  785
FileName :  File-15,    File Size :  831
FileName :   File-8,    File Size :  831
FileName :   File-1,    File Size : 1115
FileName :  File-11,    File Size : 1184
FileName :   File-7,    File Size : 1452
FileName :   File-6,    File Size : 1956


In [5]:
#Method to get file content using the fget_object
def getFileContent(fileName):
    """Return the first line of object `fileName` in the 'loomings' bucket.

    The object is downloaded with fget_object to a local file of the same name
    in the current working directory, its first line is read (decoded as UTF-8
    and stripped of surrounding whitespace), and the local copy is deleted.

    Args:
        fileName: Name of the object in the 'loomings' bucket; also used as the
            temporary local file name.

    Returns:
        The stripped first line of the object's content.
    """
    try:
        client.fget_object('loomings', fileName, fileName)
        with open(fileName, 'r', encoding='utf-8') as file:
            return file.readline().strip()
    finally:
        #Always clean up the local copy, even when reading fails. The existence
        #check keeps a failed download from being masked by FileNotFoundError.
        if os.path.exists(fileName):
            os.remove(fileName)
                
#Prefix shared by all per-line objects; the rest of the name is the 1-based
#index of the line among the non-blank lines of loomings.txt.
FILE_PREFIX = 'File-'

def lineNumberOf(objectName):
    """Return the non-blank line number encoded in a 'File-<n>' object name."""
    return int(objectName[len(FILE_PREFIX):])

#Creating a hashmap with hash as key and list of files as value.
#Only the metadata is needed here, so stat_object is used instead of
#downloading each object to disk and deleting it again.
hashFilesMap = collections.defaultdict(list)
for file in client.list_objects('loomings'):
    fileName = file.object_name
    if fileName.startswith(FILE_PREFIX):
        fileInfo = client.stat_object('loomings', fileName)
        contentHash = fileInfo.metadata['x-amz-meta-content-hash']
        hashFilesMap[contentHash].append(fileName)

#The listing is not in numeric order (e.g. File-15 is listed before File-8),
#so order each group by line number to make the first entry the earliest line.
for files in hashFilesMap.values():
    files.sort(key=lineNumberOf)

#If there are multiple files with the same hash, then they are duplicates. So just keep one of them.
#deleteLines stores the line numbers (as strings, for printing) of the duplicates to drop.
deleteLines = list()
for files in hashFilesMap.values():
    if len(files) > 1:
        print(', '.join(files), "have the same hash digest.")
        print("\nThe original text line of "+files[0]+": ", getFileContent(files[0]))
        deleteLines.extend(x[len(FILE_PREFIX):] for x in files[1:])

print("\nLines", ','.join(deleteLines), "will be removed from 'loomings.txt' as they are duplicates.")

#The counter below is an int, so compare against ints; testing it against the
#strings in deleteLines would never match and no duplicate would be removed.
duplicateLineNumbers = {int(number) for number in deleteLines}

#Create a new file with all the lines from loomings.txt except for the duplicates
with open('loomings.txt', 'r', encoding='utf-8') as loomings:
    lines = loomings.readlines()

#Opening with 'w' already empties the file, so no explicit truncate is needed.
with open('loomings-clean.txt', 'w', encoding='utf-8') as clean:
    lastIndex = len(lines) - 1
    nonBlankCount = 0
    for i, line in enumerate(lines):
        if not line.strip():
            #Blank lines are not copied and do not advance the line number
            continue
        nonBlankCount += 1
        if nonBlankCount in duplicateLineNumbers:
            continue
        clean.write(line)
        #Every kept line except the very last line of the input is followed
        #by one extra newline (i.e. a blank separator line in the output).
        if i != lastIndex:
            clean.write("\n")
print("'loomings-clean.txt' has been generated with unique statements")

File-8, File-15 have the same hash digest.

The original text line of File-8:  No, when I go to sea, I go as a simple sailor, right before the mast, plumb down into the forecastle, aloft there to the royal mast-head. True, they rather order me about some, and make me jump from spar to spar, like a grasshopper in a May meadow. And at first, this sort of thing is unpleasant enough. It touches one’s sense of honor, particularly if you come of an old established family in the land, the Van Rensselaers, or Randolphs, or Hardicanutes. And more than all, if just previous to putting your hand into the tar-pot, you have been lording it as a country schoolmaster, making the tallest boys stand in awe of you. The transition is a keen one, I assure you, from a schoolmaster to a sailor, and requires a strong decoction of Seneca and the Stoics to enable you to grin and bear it. But even this wears off in time.

Lines 15 will be removed from 'loomings.txt' as they are duplicates.
'loomings-clean.txt' 