Skip to content
Permalink
Browse files

improve the behavior of download_data #24

  • Loading branch information...
sdpython committed May 24, 2018
1 parent c8fd0ac commit 03bf43ceae1c3741e6860161b723cc27600eb1c9
@@ -1,5 +1,5 @@
"""
@brief test log(time=2s)
@brief test log(time=3s)
You should indicate a time in seconds. The program ``run_unittests.py``
will sort all test files by increasing time and run them.
@@ -9,7 +9,7 @@
import sys
import os
import unittest
from pyquickhelper.loghelper import fLOG
from pyquickhelper.pycode import ExtTestCase, get_temp_folder


try:
@@ -25,47 +25,27 @@
sys.path.append(path)
import src

from src.pyensae.datasource.http_retrieve import download_data
from src.pyensae.datasource.http_retrieve import download_data, DownloadDataException


class TestResources (unittest.TestCase):
class TestResources(ExtTestCase):

def test_import_one(self):
fLOG(
__file__,
self._testMethodName,
OutputPrint=__name__ == "__main__")
fold = os.path.join(
os.path.abspath(
os.path.split(__file__)[0]),
"temp_http")
if not os.path.exists(fold):
os.mkdir(fold)
def test_download_data(self):
fold = get_temp_folder(__file__, "temp_download_data")
exp = ["VOEUX01.txt", "voeux.zip"]
for f in exp:
g = os.path.join(fold, f)
if os.path.exists(g):
os.remove(g)
one = "voeux.zip"
res = download_data(one, website="xd", whereTo=fold, fLOG=fLOG)
fLOG(len(res), res)
assert len(res) == 14
assert "VOEUX01.txt" in res[0]
res = download_data(one, website="xd", whereTo=fold, timeout=2)
self.assertEqual(len(res), 14)
self.assertIn("VOEUX01.txt", res[0])
for f in exp:
g = os.path.join(fold, f)
assert os.path.exists(g)
self.assertExists(g)

def test_import_all(self):
fLOG(
__file__,
self._testMethodName,
OutputPrint=__name__ == "__main__")
all = [" "]
if __name__ == "__main__":
# we only test all the resources if this file is the main file
# otherwise it takes too much time
for a in all:
pass
def test_download_data_failures(self):
fold = get_temp_folder(__file__, "temp_download_data_failures")
one = "voeux2.zip"
self.assertRaise(lambda: download_data(one, website="xd", whereTo=fold, timeout=2),
DownloadDataException)


if __name__ == "__main__":
@@ -1,16 +1,12 @@
"""
@brief test log(time=2s)
You should indicate a time in seconds. The program ``run_unittests.py``
will sort all test files by increasing time and run them.
"""


import sys
import os
import unittest
from pyquickhelper.loghelper import fLOG
from pyquickhelper.pycode import get_temp_folder
from pyquickhelper.pycode import get_temp_folder, ExtTestCase


try:
@@ -29,21 +25,16 @@
from src.pyensae.datasource.http_retrieve import download_data


class TestResourcesBug(unittest.TestCase):
class TestResourcesBug(ExtTestCase):

def test_damir(self):
fLOG(
__file__,
self._testMethodName,
OutputPrint=__name__ == "__main__")

temp = get_temp_folder(__file__, "temp_damir")
res = download_data("A201612_small.csv.gz", whereTo=temp)
self.assertEqual(len(res), 1)
checks = [os.path.join(temp, _)
for _ in ["A201612_small.csv", "A201612_small.csv.gz"]]
self.assertTrue(os.path.exists(checks[0]))
self.assertTrue(os.path.exists(checks[1]))
self.assertExists(checks[0])
self.assertExists(checks[1])


if __name__ == "__main__":
@@ -1,15 +1,12 @@
"""
@brief test log(time=2s)
You should indicate a time in seconds. The program ``run_unittests.py``
will sort all test files by increasing time and run them.
"""


import sys
import os
import unittest
from pyquickhelper.loghelper import fLOG
from pyquickhelper.pycode import ExtTestCase, get_temp_folder


try:
@@ -28,52 +25,20 @@
from src.pyensae.datasource.http_retrieve import download_data


class TestResourcesStanford (unittest.TestCase):
class TestResourcesStanford(ExtTestCase):

def test_tar_gz(self):
fLOG(
__file__,
self._testMethodName,
OutputPrint=__name__ == "__main__")
fold = os.path.join(
os.path.abspath(
os.path.split(__file__)[0]),
"temp_stand")
if not os.path.exists(fold):
os.mkdir(fold)
for f in os.listdir(fold):
if os.path.isfile(f):
os.remove(os.path.join(fold, f))
files = download_data(
"facebook.tar.gz",
website="xd",
fLOG=fLOG,
whereTo=fold)
fLOG(files)
fold = get_temp_folder(__file__, "temp_tar_gz")
files = download_data("facebook.tar.gz", website="xd", whereTo=fold)
sh = [g for g in files if g.endswith("3980.egofeat")]
assert len(files) > 0
assert len(sh) == 1
self.assertNotEmpty(files)
self.assertEqual(len(sh), 1)

def test_gz(self):
fLOG(
__file__,
self._testMethodName,
OutputPrint=__name__ == "__main__")
fold = os.path.join(
os.path.abspath(
os.path.split(__file__)[0]),
"temp_stand")
if not os.path.exists(fold):
os.mkdir(fold)
for f in os.listdir(fold):
if os.path.isfile(f):
os.remove(os.path.join(fold, f))
files = download_data(
"facebook_combined.txt.gz",
website="xd",
fLOG=fLOG,
whereTo=fold)
fLOG(files)
fold = get_temp_folder(__file__, "temp_gz")
files = download_data("facebook_combined.txt.gz",
website="xd", whereTo=fold)
self.assertNotEmpty(files)


if __name__ == "__main__":
@@ -18,6 +18,13 @@ class DownloadDataException(Exception):
pass


class RetrieveDataException(Exception):
    """
    Raised when data cannot be downloaded.

    NOTE(review): this docstring is identical to the one on
    ``DownloadDataException`` above — confirm the intended distinction
    between the two exception types (retrieval vs. download failure).
    """
    pass


def remove_empty_line(file):
"""
Removes empty line in an imported file.
@@ -71,8 +78,8 @@ def download_data(name, moduleName=None, url=None, glo=None,
look for it on http://www.xavierdupre.fr/... (website),
the file is copied at this file location and uncompressed if it is a zip file (or a tar.gz file).
@param name (str) name of the module
@param moduleName (str|None) like import name as moduleName, None for name
@param name (str) name of the file to download
@param moduleName (str|None) like import name as moduleName if *name* is a module
@param url (str|list|None) link to the website to use (or the websites if list)
@param glo (dict|None) if None, it will be replaced ``globals()``
@param loc (dict|None) if None, it will be replaced ``locals()``
@@ -114,6 +121,20 @@ def download_data(name, moduleName=None, url=None, glo=None,
tries the first one which contains the file.
"""
if isinstance(url, list):
outfiles = []
for i, u in enumerate(url):
n, e = os.path.splitext(name)
n2 = "{0}-{1}{2}".format(n, i, e)
res = download_data(n2, moduleName=moduleName, url=u, glo=glo,
loc=loc, whereTo=whereTo, website=website, timeout=timeout,
retry=retry, silent=silent, fLOG=fLOG)
if isinstance(res, list):
outfiles.extend(res)
else:
outfiles.append(res)
return outfiles

from ..file_helper.decompress_helper import decompress_zip, decompress_targz, decompress_gz, decompress_bz2

if glo is None:
@@ -143,77 +164,61 @@ def transform_url(w):
if name in sys.modules:
return sys.modules[name]
elif "." not in name:
fLOG(" unable to find module ", name)
fLOG("[download_data] unable to find module ", name)

file = name if "." in name else "%s.py" % name
outfile = file if whereTo == "." else os.path.join(whereTo, file)

if not os.path.exists(outfile):
path = "../../../../complements_site_web"
f2 = os.path.join(path, file)
if os.path.exists(f2):
fLOG("[download_data] adding file", f2)
u = open(f2, "r")
alls = u.read()
u.close()
else:
if not isinstance(url, list):
urls = [url]
else:
urls = url
excs = []
success = False
for url in urls:
if success:
if url is not None and not os.path.exists(outfile):
excs = []
success = False
alls = None
url += file
fLOG("[download_data] download '{0}' to '{1}'".format(
url, outfile))
while retry > 0:
try:
u = urllib.request.urlopen(
url) if timeout is None else urllib.request.urlopen(url, timeout=timeout)
alls = u.read()
u.close()
success = True
break
except ConnectionResetError as ee:
if retry <= 0:
exc = DownloadDataException(
"Unable (1) to retrieve data from '{0}'. Error: {1}".format(url, ee))
excs.append(exc)
excs.append(ee)
break
url += file
fLOG("[download_data] download '{0}' to '{1}'".format(
url, outfile))
while retry > 0:
try:
u = urllib.request.urlopen(
url) if timeout is None else urllib.request.urlopen(url, timeout=timeout)
alls = u.read()
u.close()
success = True
break
except ConnectionResetError as ee:
if retry <= 0:
exc = DownloadDataException(
"Unable (1) to retrieve data from '{0}'. Error: {1}".format(url, ee))
excs.append(exc)
excs.append(ee)
break
else:
fLOG("[download_data] (1) fail and retry to download '{0}' to '{1}'".format(
url, outfile))
# We wait for 2 seconds.
time.sleep(2)
except Exception as e:
if retry <= 1:
exc = DownloadDataException(
"Unable (2) to retrieve data from '{0}'. Error: {1}".format(url, e))
excs.append(exc)
excs.append(e)
break
else:
fLOG("[download_data] (2) fail and retry to download '{0}' to '{1}'".format(
url, outfile))
# We wait for 2 seconds.
time.sleep(2)
retry -= 1
if not success:
if len(excs) > 0:
raise excs[0]
else:
raise DownloadDataException(
"Unable (3) to retrieve data from '{0}'. Error: {1}".format(url, e))
fLOG("[download_data] (1) fail and retry to download '{0}' to '{1}'".format(
url, outfile))
# We wait for 2 seconds.
time.sleep(2)
except Exception as e:
if retry <= 1:
exc = DownloadDataException(
"Unable (2) to retrieve data from '{0}'. Error: {1}".format(url, e))
excs.append(exc)
excs.append(e)
break
else:
fLOG("[download_data] (2) fail and retry to download '{0}' to '{1}'".format(
url, outfile))
# We wait for 2 seconds.
time.sleep(2)
retry -= 1

if success and alls is not None:
u = open(outfile, "wb")
u.write(alls)
u.close()
else:
if name.endswith(".tar.gz") and os.stat(outfile).st_size > 2 ** 20:
return [outfile]
elif len(excs) > 0:
raise excs[0]
else:
raise DownloadDataException(
"Unable to retrieve data from '{0}'".format(url))

if name.endswith(".zip"):
return decompress_zip(outfile, whereTo, fLOG)
@@ -286,7 +291,7 @@ def transform_url(w):
fLOG("[download_data] sys.path ", sys.path)
for _ in sys.path:
fLOG("[download_data] path ", _)
fLOG("sys.modules.keys()", list(sys.modules.keys()))
fLOG("[download_data] sys.modules.keys()", list(sys.modules.keys()))
for _ in sorted(sys.modules):
fLOG("[download_data] modules ", _)
raise e

0 comments on commit 03bf43c

Please sign in to comment.
You can't perform that action at this time.