From c8a912f3e4e3bdfb1e8e177c5bd7b961d74039cb Mon Sep 17 00:00:00 2001 From: dupre Date: Tue, 24 Nov 2015 23:20:46 +0100 Subject: [PATCH] function and documentation --- _doc/sphinxdoc/source/cheat_sheets.rst | 2 + _unittests/ut_data/test_data_helper.py | 7 ++- .../test_notebook_hackathon.py | 8 ++-- src/ensae_projects/data/__init__.py | 2 +- src/ensae_projects/data/data_exception.py | 2 +- src/ensae_projects/data/data_helper.py | 46 +++++++++++-------- 6 files changed, 40 insertions(+), 27 deletions(-) diff --git a/_doc/sphinxdoc/source/cheat_sheets.rst b/_doc/sphinxdoc/source/cheat_sheets.rst index ff96ec6..b8a88b8 100644 --- a/_doc/sphinxdoc/source/cheat_sheets.rst +++ b/_doc/sphinxdoc/source/cheat_sheets.rst @@ -5,6 +5,8 @@ Cheat sheets .. toctree:: + :maxdepth: 2 ../notebooks/chsh_graphs + ../notebooks/chsh_files diff --git a/_unittests/ut_data/test_data_helper.py b/_unittests/ut_data/test_data_helper.py index 52e2d10..91d9abb 100644 --- a/_unittests/ut_data/test_data_helper.py +++ b/_unittests/ut_data/test_data_helper.py @@ -65,10 +65,13 @@ def test_meaning_table(self): self._testMethodName, OutputPrint=__name__ == "__main__") - filename = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data", "data_qutoes.txt") + filename = os.path.join(os.path.abspath( + os.path.dirname(__file__)), "data", "data_qutoes.txt") + def clean_column_name(s): return s.replace("_0", "") - l = list(enumerate_text_lines(filename, encoding="utf-8", quotes_as_str=True, clean_column_name=clean_column_name)) + l = list(enumerate_text_lines(filename, encoding="utf-8", + quotes_as_str=True, clean_column_name=clean_column_name)) fLOG(l) assert len(l) == 1 diff --git a/_unittests/ut_documentation/test_notebook_hackathon.py b/_unittests/ut_documentation/test_notebook_hackathon.py index 0db2cba..78ab2ab 100644 --- a/_unittests/ut_documentation/test_notebook_hackathon.py +++ b/_unittests/ut_documentation/test_notebook_hackathon.py @@ -70,10 +70,10 @@ def test_notebook_hackathon(self): keepnote = ls_notebooks("hackathon_2015") assert len(keepnote) > 0 keepnote = [ - _ for _ in keepnote if "upload" not in _ \ - and "schemas" not in _ \ - and "download" not in _\ - and "times_series" not in _] + _ for _ in keepnote if "upload" not in _ + and "schemas" not in _ + and "download" not in _ + and "times_series" not in _] if len(keepnote) > 0: res = execute_notebooks(temp, keepnote, lambda i, n: "deviner" not in n, diff --git a/src/ensae_projects/data/__init__.py b/src/ensae_projects/data/__init__.py index 071c524..4dc6240 100644 --- a/src/ensae_projects/data/__init__.py +++ b/src/ensae_projects/data/__init__.py @@ -4,4 +4,4 @@ """ from .data_exception import PasswordException -from .data_helper import change_encoding, enumerate_text_lines \ No newline at end of file +from .data_helper import change_encoding, enumerate_text_lines diff --git a/src/ensae_projects/data/data_exception.py b/src/ensae_projects/data/data_exception.py index a806a99..f49dbd1 100644 --- a/src/ensae_projects/data/data_exception.py +++ b/src/ensae_projects/data/data_exception.py @@ -22,4 +22,4 @@ class FileFormatException(Exception): """ raised when unable to parse a file """ - pass \ No newline at end of file + pass diff --git a/src/ensae_projects/data/data_helper.py b/src/ensae_projects/data/data_helper.py index 7427257..159c2e0 100644 --- a/src/ensae_projects/data/data_helper.py +++ b/src/ensae_projects/data/data_helper.py @@ -9,7 +9,7 @@ def change_encoding(infile, outfile, enc1, enc2="utf-8", process=None, fLOG=noLOG): """ change the encoding of a text file - + @param infile input file @param outfile output file @param enc1 encoding of the input file @@ -25,27 +25,31 @@ def process_line(s): with open(infile, "r", encoding=enc1) as f: with open(outfile, "w", encoding=enc2) as g: for i, line in enumerate(f): - if (i+1) % 1000000 == 0: + if (i + 1) % 1000000 == 0: fLOG(infile, "-", i, "lines") g.write(process(line)) return i - -def enumerate_text_lines(filename, sep="\t", encoding="utf-8", - quotes_as_str=False, header=True, + +def enumerate_text_lines(filename, sep="\t", + encoding="utf-8", + quotes_as_str=False, + header=True, clean_column_name=None, + convert_float=False, skip=0, take=-1, fLOG=noLOG): """ enumerate all lines from a text file, considers it as column - + @param filename filename @param sep column separator @param header first row is header @param encoding encoding @param clean_column_name function to clean column name + @param convert_float convert number into float wherever possible @param skip number of rows to skip @param take number of rows to consider (-1 for all) @param fLOG logging function @@ -59,19 +63,22 @@ def get_schema(row, header, clean_column_name): return sch else: return ["c%00d" % i for i in range(len(row))] - - def convert(s, quotes_as_str): + + def convert(s, convert_float): + if convert_float: + try: + return float(s) + except ValueError: + return s + else: + return s + + def clean_quotes(s, quotes_as_str): if quotes_as_str: if s and len(s) > 1 and s[0] == s[-1] == '"': return s[1:-1] - else: - try: - return float(s) - except ValueError: - return s - else: - return s - + return s + with open(filename, "r", encoding=encoding) as f: d = 0 nb = 0 @@ -91,10 +98,11 @@ def convert(s, quotes_as_str): # probably the last file continue else: - raise FileFormatException("different number of columns: schema {0} != {1} for line {2}".format(len(schema), len(spl), i+1)) - val = { k:convert(v, quotes_as_str) for k,v in zip(schema, spl) } + raise FileFormatException("different number of columns: schema {0} != {1} for line {2}".format( + len(schema), len(spl), i + 1)) + val = {k: convert(clean_quotes(v, quotes_as_str), convert_float) + for k, v in zip(schema, spl)} yield val nb += 1 if nb % 100000 == 0: fLOG(filename, "-", nb, "lines") - \ No newline at end of file