In [171]:
import os
import sys
import random
import time
import math
import psutil
import datetime

In [193]:
class ShuffleBigFile(object):
    """ Shuffle the lines of a text file that doesn't fit into the memory (RAM).

    The line numbers of the input file are shuffled once, up front. The file
    is then re-read once per chunk of `buffer_size` line numbers; the lines
    belonging to the chunk are collected, shuffled again among themselves and
    appended to the output file.

    Creating an instance runs the whole (interactive) process: metadata is
    collected and printed, the user is prompted via `input()`, and the
    shuffled file is written.
    """

    def __init__(self, input_file, output_file, buffer_size=1_000_000, seed=50):
        """ Class initializers

            Arguments
            =========
                `input_file`  : (str), The large file whose content needs to be shuffled.
                `output_file` : (str), The generated shuffled file. Must not exist yet.
                `buffer_size` : (int:1_000_000), Maximum size (number of lines) that can be fitted into the memory (RAM).
                `seed`        : (int:50), To reproduce the same shuffle file everytime.

            Raises
            ======
                `AssertionError` : if an argument has the wrong type, `buffer_size` is not
                                   positive, the input file is missing or the output file
                                   already exists.
                `SystemExit`     : if the user declines to continue at the in-memory prompt.
        """

        self.input_file = input_file
        self.output_file = output_file
        self.buffer_size = buffer_size
        self.seed = seed

        # Validate before touching any other state.
        assert type(self.input_file).__name__ == "str", "Input file should be of type str"
        assert type(self.output_file).__name__ == "str", "Output file should be of type str"
        assert type(self.buffer_size).__name__ == "int", "Buffer size should be of type int"
        assert type(self.seed).__name__ == "int", "Seed should be of type int"
        # A non-positive buffer would divide by zero / yield no reads at all.
        assert self.buffer_size > 0, "Buffer size should be a positive integer"

        assert os.path.exists(self.input_file), "Input file should exists!"
        assert not os.path.exists(self.output_file), "Please delete the output file!"

        # Dedicated generator: seeding it yields the same sequence as seeding
        # the module-level one, but without resetting the process-wide `random`
        # state for every other user of the module.
        self.__rng = random.Random(self.seed)

        self.__frac_available_memory = 0.8

        self.__start_time = time.time()

        print("Collecting metadata...")
        self.collect_metadata()
        print("Metadata collected!")

        print("Following are the metadata collected:")
        print("\n\n","="*50)
        print("Number of lines in the input file: {}".format(self.nb_lines))
        print("Number of reads needed for the enitre process: {}".format(self.nb_reads))
        print("Used system memory (RAM): {} GB".format(round(self.__used_memory, 2)))
        print("Available system memory ~ approximated for the use for this process (RAM): {} GB".format(round(self.__available_memory, 2)))
        print("Input file size: {} {}".format(round(self.__input_file_size, 2), self.__file_size_str))
        if self.__available_memory >= self.__input_file_size_gb:
            print("Available memory is more than the file size, you can shuffle the file in-memory.")
            input_val = input("Press 1 to continue, else abort: ")
            if input_val != "1":
                print("Aborting!")
                sys.exit(1)
        print("Suggested maximum buffer size: {}".format(self.__max_buffer_can_be_used))
        input_val2 = input("Press 1 to use the suggested buffer size, else continue with the mentioned buffersize: ")
        if input_val2 == "1":
            # `shuffle_file` recomputes `nb_reads` from the buffer size in
            # effect, so switching buffers here cannot drop lines.
            self.buffer_size = self.__max_buffer_can_be_used
        print("="*50, "\n\n")

        print("File shuffling started...")
        self.shuffle_file()
        print("File shuffled!")

        self.__end_time = time.time()
        total_seconds = self.__end_time - self.__start_time
        formated_total_seconds = str(datetime.timedelta(seconds=total_seconds))
        print("Total time taken: {}".format(formated_total_seconds))

    def collect_metadata(self):
        """ This method collects below mentioned file metadata.
                - Number of lines
                - Number of reads
                - Shuffled index
                - Available Memory (RAM) ~ approximated for the use for this process
                - Used Memory (RAM)
                - Input file size in GB/MB/KB/B
                - Max buffer size
        """
        # Counting with `sum` also works for an empty file (0 lines).
        with open(self.input_file) as in_obj:
            self.nb_lines = sum(1 for _ in in_obj)
        self.nb_reads = math.ceil(self.nb_lines / self.buffer_size)

        # One entry per line number; this list is what actually gets shuffled.
        self.__shuffled_idx = list(range(self.nb_lines))
        self.__rng.shuffle(self.__shuffled_idx)

        bytes_per_gb = 1024 * 1024 * 1024
        # Sample the memory statistics once so both figures are consistent.
        memory = psutil.virtual_memory()
        self.__available_memory = (memory.available / bytes_per_gb) * self.__frac_available_memory
        self.__used_memory = memory.used / bytes_per_gb

        size_in_bytes = os.path.getsize(self.input_file)
        self.__input_file_size_gb = size_in_bytes / bytes_per_gb

        # Pick the largest unit in which the size is greater than 1. The loop
        # is bounded by the unit list so tiny (or empty) files end up in bytes
        # instead of running off the end of the list.
        units = ["GB", "MB", "KB", "B"]
        unit_idx = 0
        divisor = bytes_per_gb
        self.__input_file_size = size_in_bytes / divisor
        while self.__input_file_size <= 1 and unit_idx < len(units) - 1:
            divisor //= 1024
            unit_idx += 1
            self.__input_file_size = size_in_bytes / divisor
        self.__file_size_str = units[unit_idx]

        # Scale the line count by the memory-to-file-size ratio. An empty file
        # has no size to divide by, and the suggestion must never be 0 because
        # it may become the buffer size.
        if self.__input_file_size_gb > 0:
            suggested = int((self.__available_memory / self.__input_file_size_gb) * self.nb_lines)
        else:
            suggested = self.nb_lines
        self.__max_buffer_can_be_used = max(1, suggested)

    def shuffle_file(self):
        """ Write the shuffled lines of the input file to the output file.

        The shuffled line numbers are consumed in chunks of `buffer_size`.
        For every chunk the input file is scanned once, the matching lines are
        collected, shuffled among themselves and appended to the output file.
        """
        # The buffer size may have changed since the metadata was collected,
        # so derive the number of passes from the value actually in effect.
        self.nb_reads = math.ceil(self.nb_lines / self.buffer_size)
        with open(self.output_file, "a") as out_obj:
            for i in range(self.nb_reads):
                print("Reading number: {}".format(i+1))
                chunk_start = i * self.buffer_size
                chunk_stop = chunk_start + self.buffer_size
                # Sorted so the wanted lines can be picked up in a single
                # forward scan of the input file.
                wanted = sorted(self.__shuffled_idx[chunk_start:chunk_stop])
                chunk_lines = self._read_lines(wanted)
                # Undo the file order imposed by the sorted scan.
                self.__rng.shuffle(chunk_lines)
                out_obj.writelines(chunk_lines)

    def _read_lines(self, sorted_indices):
        """ Return the input-file lines whose 0-based line numbers are listed
        in `sorted_indices` (ascending, unique), in file order. Every returned
        line ends with a newline.
        """
        picked = []
        if not sorted_indices:
            return picked
        remaining = iter(sorted_indices)
        target = next(remaining)
        with open(self.input_file) as in_obj:
            for in_idx, line in enumerate(in_obj):
                if in_idx != target:
                    continue
                # Only the final line of a file can lack a terminator; add it
                # so the line doesn't fuse with the one written after it.
                if not line.endswith("\n"):
                    line += "\n"
                picked.append(line)
                target = next(remaining, None)
                if target is None:
                    # Everything requested was found; skip the rest of the file.
                    break
        return picked


In [196]:
# Source CSV to shuffle and the destination for the shuffled copy. The
# destination must not exist yet, otherwise the constructor's assertion fails.
input_file, output_file = "reviews_rating.csv", "reviews_rating_shuffled.csv"

# Instantiating the class runs the whole interactive shuffle.
SBF = ShuffleBigFile(input_file=input_file, output_file=output_file)

Collecting metadata...
Metadata collected!
Following are the metadata collected:


Number of lines in the input file: 5166497
Number of reads needed for the enitre process: 6
Used system memory (RAM): 8.38 GB
Available system memory ~ approximated for the use for this process (RAM): 4.11 GB
Input file size: 4.36 GB
Suggested maximum buffer size: 4878766
Press 1 to use the suggested buffer size, else continue with the mentioned buffersize: 2


File shuffling started...
Reading number: 1
Reading number: 2
Reading number: 3
Reading number: 4
Reading number: 5
Reading number: 6
File shuffled!
Total time taken: 0:01:28.441474
