# In [127]:  (Jupyter notebook cell prompt)
import os, hashlib, datetime, time, json, numpy
from pathlib import Path

class FilePro:
    """Index the files below a working directory and summarise them.

    The index is a JSON list stored in a file called ``file_info`` inside the
    working directory; the optional summary is stored next to it in a file
    called ``generatedReport``.  Both names have no extension, so the indexer
    (which skips extension-less files) never indexes its own output.

    Attributes:
        currentWorkingDir: Directory that is indexed and reported on.
    """

    _INFO_FILE = "file_info"
    _REPORT_FILE = "generatedReport"
    # Folder Jupyter creates next to a notebook; it is left out of the folder
    # ranking because it belongs to the notebook, not to the analysed data.
    _NOTEBOOK_DIR = ".ipynb_checkpoints"

    def __init__(self, currentWorkingDir):
        """Remember the working directory and make sure the index exists.

        Args:
            currentWorkingDir: Path of the directory to analyse.
        """
        self.currentWorkingDir = currentWorkingDir
        self._refresh_info()

    def get_info(self):
        """Walk the working directory and write the ``file_info`` index.

        Every file whose extension is neither empty nor ``.ipynb`` contributes
        one dictionary with the keys ``filePath`` (relative, prefixed with
        ``.``), ``fileName`` (stem), ``fileExtension``, ``fileSize`` (bytes),
        ``fileLastModified`` (ISO date of the last modification) and
        ``fileHashValue`` (SHA-1 hex digest of the content).
        """
        base = self.currentWorkingDir
        records = []
        for root, _dirs, files in os.walk(base, topdown=True):
            for name in files:
                full_path = os.path.join(root, name)
                extension = os.path.splitext(full_path)[1]
                if extension in ("", ".ipynb"):
                    continue
                # getmtime, not getctime: ctime is the creation or
                # metadata-change time depending on the platform.
                modified = datetime.date.fromtimestamp(os.path.getmtime(full_path))
                records.append({
                    # Same "./sub/name.ext" shape that walking "." produces.
                    "filePath": os.path.join(".", os.path.relpath(full_path, base)),
                    "fileName": Path(full_path).stem,
                    "fileExtension": extension,
                    "fileSize": os.path.getsize(full_path),
                    "fileLastModified": str(modified),
                    "fileHashValue": self._sha1_of(full_path),
                })

        with open(os.path.join(base, self._INFO_FILE), "w") as info_file:
            json.dump(records, info_file)

    def generate_report(self, currentWorkingDir, save=None):
        """Summarise the indexed files and optionally save the summary.

        The summary holds the distinct file types, the mean and median file
        size per type and overall, and the names of the sub-folders holding
        the most files and the most bytes (direct children only).  Values that
        cannot be computed (no indexed files, no sub-folders) are ``None``.

        Args:
            currentWorkingDir: Directory to report on; replaces the directory
                stored on the instance.
            save: ``True`` writes ``generatedReport`` without asking,
                ``False`` never writes it, ``None`` (default) asks on stdin
                and saves only when the answer is exactly ``yes``.
        """
        self.currentWorkingDir = currentWorkingDir
        self._refresh_info()

        with open(os.path.join(currentWorkingDir, self._INFO_FILE), "r") as info_file:
            records = json.load(info_file)

        # Group sizes by extension; dict order keeps first-seen extension order.
        sizes_by_type = {}
        for record in records:
            sizes_by_type.setdefault(record["fileExtension"], []).append(record["fileSize"])
        all_sizes = [record["fileSize"] for record in records]

        folder_stats = self._folder_stats()
        if folder_stats:
            # max() returns the first maximal entry, i.e. ties go to the
            # folder that os.walk listed first.
            most_files = max(folder_stats, key=lambda stat: stat[1])[0]
            biggest = max(folder_stats, key=lambda stat: stat[2])[0]
        else:
            most_files = biggest = None

        filesData = {
            "File Types": list(sizes_by_type),
            "Mean by file type": {
                ext: float(numpy.mean(sizes)) for ext, sizes in sizes_by_type.items()
            },
            "Median by file type": {
                ext: float(numpy.median(sizes)) for ext, sizes in sizes_by_type.items()
            },
            # numpy yields NaN (and a warning) for an empty list; use None instead.
            "Mean of all file types": float(numpy.mean(all_sizes)) if all_sizes else None,
            "Median of all file types": float(numpy.median(all_sizes)) if all_sizes else None,
            "Folder with highest number of files": most_files,
            "Folder with biggest size": biggest,
        }

        if save is None:
            save = input("Save report? ") == "yes"
        if save:
            with open(os.path.join(currentWorkingDir, self._REPORT_FILE), "w") as report_file:
                json.dump(filesData, report_file)
            print("Report saved with name generatedReport")

    def _refresh_info(self):
        """Rebuild the index when it is missing or not newer than today."""
        info_path = os.path.join(self.currentWorkingDir, self._INFO_FILE)
        # NOTE(review): a file's date is never later than today, so this
        # condition holds on every call and the index is always rebuilt.
        # Tighten "<=" to "<" if same-day reuse of the index is wanted.
        if (not os.path.isfile(info_path)
                or datetime.date.fromtimestamp(os.path.getmtime(info_path))
                <= datetime.date.today()):
            self.get_info()

    def _folder_stats(self):
        """Return ``(name, file_count, total_bytes)`` for every sub-folder."""
        stats = []
        for root, dirs, _files in os.walk(self.currentWorkingDir, topdown=True):
            # Pruning in place also stops os.walk from descending into it.
            dirs[:] = [name for name in dirs if name != self._NOTEBOOK_DIR]
            for name in dirs:
                count = 0
                size = 0
                with os.scandir(os.path.join(root, name)) as entries:
                    for entry in entries:
                        if entry.is_file():
                            count += 1
                            size += entry.stat().st_size
                stats.append((name, count, size))
        return stats

    @staticmethod
    def _sha1_of(file_path, chunk_size=65536):
        """Return the SHA-1 hex digest of the file, read in chunks."""
        digest = hashlib.sha1()
        with open(file_path, "rb") as handle:
            for chunk in iter(lambda: handle.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()

# Run the analysis only when executed as a script or notebook cell, so that
# importing this module does not scan the disk or prompt for input.
if __name__ == "__main__":
    currentWorkingDir = os.getcwd()                         # retrieving the current working directory
    fileObj = FilePro(currentWorkingDir)                    # creating the FilePro object, which triggers __init__
    fileObj.generate_report(currentWorkingDir)              # function call to generate the report
    
    


# Captured cell output: Save report? no
