Skip to content

Commit

Permalink
get rid of big data files (#15)
Browse files Browse the repository at this point in the history
replace large test files with random data generator
  • Loading branch information
kolomenkin authored and siddhantgoel committed May 20, 2018
1 parent 1d5637e commit 0fed805
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 45 deletions.
10 changes: 0 additions & 10 deletions tests/data/file.txt

This file was deleted.

Binary file removed tests/data/image-2560x1600.png
Binary file not shown.
Binary file removed tests/data/image-500k.png
Binary file not shown.
Binary file removed tests/data/image-600x400.png
Binary file not shown.
Binary file removed tests/data/image-high-res.jpg
Binary file not shown.
75 changes: 44 additions & 31 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os.path
from io import BytesIO
from numpy import random
from unittest import TestCase

from requests_toolbelt import MultipartEncoder
Expand All @@ -7,19 +8,31 @@
from streaming_form_data.targets import ValueTarget


DATA_DIR = 'tests/data'
def get_random_bytes(size, seed):
    """Return *size* deterministic pseudo-random bytes for *seed*.

    Uses a dedicated ``RandomState`` instance instead of seeding the
    global numpy generator, so calling this helper no longer mutates
    shared RNG state as a side effect.  The byte stream produced is
    identical to ``random.seed(seed); random.bytes(size)``.
    """
    return random.RandomState(seed).bytes(size)


def data_file_path(filename):
return os.path.join(DATA_DIR, filename)
def open_dataset(filename):
    """Return an in-memory binary stream holding deterministic test data.

    Known dataset names map to either a fixed text payload or
    seeded pseudo-random bytes; an unknown name raises an Exception.
    """
    # Lazy factories: only the requested payload is generated.
    factories = {
        'file.txt': lambda: b'this is a txt file\r\n' * 10,
        'image-600x400.png': lambda: get_random_bytes(1780, 600),
        'image-2560x1600.png': lambda: get_random_bytes(11742, 2560),
        'image-500k.png': lambda: get_random_bytes(437814, 500),
        'image-high-res.jpg': lambda: get_random_bytes(9450866, 945),
    }
    if filename not in factories:
        raise Exception('Unknown file name: ' + filename)
    return BytesIO(factories[filename]())


def load_file(path):
_, filename = os.path.split(path)

with open(path, 'rb') as file_:
def encoded_dataset(filename):
with open_dataset(filename) as dataset_:
fields = {
filename: (filename, file_, 'text/plain')
filename: (filename, dataset_, 'text/plain')
}

encoder = MultipartEncoder(fields=fields)
Expand Down Expand Up @@ -204,10 +217,10 @@ def test_file_content_single(self):
filenames = ('file.txt', 'image-600x400.png', 'image-2560x1600.png')

for filename in filenames:
with open(data_file_path(filename), 'rb') as file_:
expected_value = file_.read()
with open_dataset(filename) as dataset_:
expected_value = dataset_.read()

content_type, body = load_file(data_file_path(filename))
content_type, body = encoded_dataset(filename)

value = ValueTarget()

Expand All @@ -220,10 +233,10 @@ def test_file_content_single(self):
self.assertEqual(value.value, expected_value)

def test_file_content_multiple(self):
with open(data_file_path('file.txt'), 'rb') as file_:
expected_value = file_.read()
with open_dataset('file.txt') as dataset_:
expected_value = dataset_.read()

content_type, body = load_file(data_file_path('file.txt'))
content_type, body = encoded_dataset('file.txt')

txt = ValueTarget()

Expand All @@ -244,10 +257,10 @@ def test_file_content_multiple(self):
self.assertEqual(txt.value, expected_value)

def test_file_content_varying_chunk_size(self):
with open(data_file_path('file.txt'), 'rb') as file_:
expected_value = file_.read()
with open_dataset('file.txt') as dataset_:
expected_value = dataset_.read()

content_type, body = load_file(data_file_path('file.txt'))
content_type, body = encoded_dataset('file.txt')

for index in range(len(body)):
txt = ValueTarget()
Expand All @@ -262,14 +275,14 @@ def test_file_content_varying_chunk_size(self):
self.assertEqual(txt.value, expected_value)

def test_mixed_content_varying_chunk_size(self):
with open(data_file_path('file.txt'), 'rb') as file_:
expected_value = file_.read()
with open_dataset('file.txt') as dataset_:
expected_value = dataset_.read()

with open(data_file_path('file.txt'), 'rb') as file_:
with open_dataset('file.txt') as dataset_:
fields = {
'name': 'hello world',
'age': '10',
'cv.txt': ('file.txt', file_, 'text/plain')
'cv.txt': ('file.txt', dataset_, 'text/plain')
}

encoder = MultipartEncoder(fields=fields)
Expand Down Expand Up @@ -338,17 +351,17 @@ def test_multiple_files(self):
txt_filename = 'file.txt'
png_filename = 'image-600x400.png'

with open(data_file_path(txt_filename), 'rb') as file_:
expected_txt = file_.read()
with open_dataset(txt_filename) as dataset_:
expected_txt = dataset_.read()

with open(data_file_path(png_filename), 'rb') as file_:
expected_png = file_.read()
with open_dataset(png_filename) as dataset_:
expected_png = dataset_.read()

txt_target = ValueTarget()
png_target = ValueTarget()

with open(data_file_path(txt_filename), 'rb') as txt_file, \
open(data_file_path(png_filename), 'rb') as png_file:
with open_dataset(txt_filename) as txt_file, \
open_dataset(png_filename) as png_file:
encoder = MultipartEncoder(fields={
txt_filename: (txt_filename, txt_file,
'application/plain'),
Expand All @@ -369,10 +382,10 @@ def test_multiple_files(self):
def test_large_file(self):
for filename in ['image-500k.png', 'image-2560x1600.png',
'image-600x400.png', 'image-high-res.jpg']:
with open(data_file_path(filename), 'rb') as file_:
expected_value = file_.read()
with open_dataset(filename) as dataset_:
expected_value = dataset_.read()

content_type, body = load_file(data_file_path(filename))
content_type, body = encoded_dataset(filename)

value = ValueTarget()

Expand Down
27 changes: 23 additions & 4 deletions utils/benchmark.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from argparse import ArgumentParser
from functools import wraps
from io import StringIO
import cProfile
from functools import wraps
from io import StringIO, BytesIO
from numpy import random
import pstats

from requests_toolbelt import MultipartEncoder
Expand Down Expand Up @@ -64,16 +65,34 @@ def parse_args():
parser = ArgumentParser()
parser.add_argument('-c', '--content-type', type=str, required=True,
help='Content Type of the input file')
parser.add_argument('-f', '--filename', type=str, required=True,
parser.add_argument('-f', '--filename', type=str, required=False,
help='File to be uploaded')
parser.add_argument('--data-size', metavar='SIZE',
type=int, required=False,
help='Size of generated data' +
' to be used instead of real file')
return parser.parse_args()


def get_random_bytes(size, seed):
    """Return *size* deterministic pseudo-random bytes for *seed*.

    Uses a dedicated ``RandomState`` instance instead of seeding the
    global numpy generator, so calling this helper no longer mutates
    shared RNG state as a side effect.  The byte stream produced is
    identical to ``random.seed(seed); random.bytes(size)``.
    """
    return random.RandomState(seed).bytes(size)


def open_data(args):
    """Return a readable binary stream selected by the parsed CLI args.

    Prefers ``--filename`` (a real file opened in binary mode); falls
    back to ``--data-size`` (seeded random bytes in memory).  Raises an
    Exception when neither option was supplied.
    """
    if args.filename is not None:
        return open(args.filename, 'rb')
    if args.data_size is not None:
        return BytesIO(get_random_bytes(args.data_size, 42))
    # Bug fix: message previously said '--data_size', but the option
    # registered with argparse is spelled '--data-size'.
    raise Exception('Not enough arguments passed: ' +
                    'please specify --filename or --data-size argument')


@c_profile()
def main():
args = parse_args()

with open(args.filename, 'rb') as fd:
with open_data(args) as fd:
encoder = MultipartEncoder(fields={
'file': ('file', fd, args.content_type)
})
Expand Down

0 comments on commit 0fed805

Please sign in to comment.