In [1]:
# Import the required modules.
import gzip
import os
import shutil

import requests

# URL for the file download.
gzip_file_url = "https://snap.stanford.edu/data/finefoods.txt.gz"

# Paths for the data directory, text file, and compressed file.
data_directory = os.path.join('.', 'data')
if not os.path.isdir(data_directory):
    print("Data directory ('{}') does not exist - creating it".format(data_directory))
    # makedirs(exist_ok=True) tolerates the directory appearing between the check above
    # and this call, and also creates missing parents if the path is ever made deeper.
    # If a regular file is in the way it raises FileExistsError here, instead of letting
    # the later file writes fail with a less obvious error.
    os.makedirs(data_directory, exist_ok=True)

# The compressed file lives next to the text file and shares its name plus '.gz'.
text_file_path = os.path.join(data_directory, 'finefoods.txt')
gzip_file_path = text_file_path + '.gz'

Data directory ('.\data') does not exist - creating it


In [4]:
# Function to download a file from a website.
#
# Assumes that the file is large and should be downloaded as a stream.
# The bytes are first written to a temporary '<save_file_path>.part' file, which is
# renamed to save_file_path only after the whole download succeeded, so a failed or
# interrupted download never leaves a truncated file at save_file_path.
#
# url - URL to the file
# save_file_path - local path where the file should be saved
# chunk_size - size of the chunks to use when saving the streamed bytes to a file; default = 8KiB
#
# returns - None
# raises - requests.HTTPError if the server answers with a 4xx/5xx status
def download_data_file(url, save_file_path, chunk_size=8192):
    partial_path = '{}.part'.format(save_file_path)

    # Request the file as a stream; the context manager releases the connection
    # even when the download fails part-way through.
    with requests.get(url, stream=True) as response:
        # Fail loudly on HTTP errors instead of saving the error page as the data file.
        response.raise_for_status()

        try:
            # Save the data to the temporary file, one chunk at a time.
            with open(partial_path, 'wb') as fd:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    fd.write(chunk)
        except BaseException:
            # Do not leave a half-written file behind (also on Ctrl-C / kernel interrupt).
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

        # Atomically publish the completed download under its final name.
        os.replace(partial_path, save_file_path)

    return None


# Function to decompress a gzipped data file.
#
# The data is streamed from the archive to disk in chunks, so the decompressed content
# is never held in memory as a whole. Output goes to a temporary '<text_file>.part'
# file that is renamed to text_file only after decompression succeeded, so a corrupt
# or truncated archive never leaves an empty/partial text_file behind.
#
# text_file - path to the decompressed file; will be created
# gzip_file - path to the gzipped file; must already exist
#
# returns - None
# raises - OSError (e.g. FileNotFoundError, gzip.BadGzipFile) or EOFError if the
#          archive is missing, not valid gzip data, or truncated
def decompress_data_file(text_file, gzip_file):
    partial_path = '{}.part'.format(text_file)

    try:
        with gzip.open(gzip_file, 'rb') as infile, open(partial_path, 'wb') as outfile:
            shutil.copyfileobj(infile, outfile)
    except BaseException:
        # Both files are closed by now; remove the incomplete output before re-raising.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    # Atomically publish the fully decompressed file under its final name.
    os.replace(partial_path, text_file)

    return None


# Function to get the decompressed data file.
#
# Does nothing if the data file already exists.
# If the data file doesn't exist, but the compressed data file does, then it decompresses the compressed file.
# If the compressed data file doesn't exist, then it downloads and decompresses the data file.
#
# text_file - path to the decompressed file; will be created if it doesn't already exist
# gzip_file - path to the gzipped file; will be downloaded if it doesn't already exist
# url - URL to download the gzipped file from; default = None, which means the
#       module-level gzip_file_url is used
#
# returns - None
def get_data_file(text_file, gzip_file, url=None):
    # Nothing to do when the decompressed file is already available.
    if os.path.exists(text_file):
        print("Decompressed file ('{}') already exists".format(text_file))
        return None

    print("Decompressed file ('{}') does not exist".format(text_file))

    # To get the decompressed file, we need the gzipped file.
    if not os.path.exists(gzip_file):
        print("Compressed data file ('{}') does not exist, downloading...".format(gzip_file))
        # The global is only looked up when no explicit URL was given.
        download_url = gzip_file_url if url is None else url
        download_data_file(download_url, gzip_file)
        print("...download finished")

    print("Compressed data file ('{}') exists, decompressing".format(gzip_file))
    decompress_data_file(text_file, gzip_file)

    return None

In [5]:
# Make sure the decompressed dataset is available locally: this is a no-op when the
# text file already exists; otherwise the archive is decompressed (and downloaded
# first if it is missing too).
get_data_file(text_file_path, gzip_file_path)

Decompressed file ('.\data\finefoods.txt') already exists
