From c8a912f3e4e3bdfb1e8e177c5bd7b961d74039cb Mon Sep 17 00:00:00 2001
From: dupre <xavier.dupre@ensae.fr>
Date: Tue, 24 Nov 2015 23:20:46 +0100
Subject: [PATCH] add convert_float option to enumerate_text_lines, update documentation

---
 _doc/sphinxdoc/source/cheat_sheets.rst        |  2 +
 _unittests/ut_data/test_data_helper.py        |  7 ++-
 .../test_notebook_hackathon.py                |  8 ++--
 src/ensae_projects/data/__init__.py           |  2 +-
 src/ensae_projects/data/data_exception.py     |  2 +-
 src/ensae_projects/data/data_helper.py        | 46 +++++++++++--------
 6 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/_doc/sphinxdoc/source/cheat_sheets.rst b/_doc/sphinxdoc/source/cheat_sheets.rst
index ff96ec6..b8a88b8 100644
--- a/_doc/sphinxdoc/source/cheat_sheets.rst
+++ b/_doc/sphinxdoc/source/cheat_sheets.rst
@@ -5,6 +5,8 @@ Cheat sheets
 
 
 .. toctree::
+    :maxdepth: 2
 
     ../notebooks/chsh_graphs
+    ../notebooks/chsh_files
 
diff --git a/_unittests/ut_data/test_data_helper.py b/_unittests/ut_data/test_data_helper.py
index 52e2d10..91d9abb 100644
--- a/_unittests/ut_data/test_data_helper.py
+++ b/_unittests/ut_data/test_data_helper.py
@@ -65,10 +65,13 @@ def test_meaning_table(self):
             self._testMethodName,
             OutputPrint=__name__ == "__main__")
 
-        filename = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data", "data_qutoes.txt")
+        filename = os.path.join(os.path.abspath(
+            os.path.dirname(__file__)), "data", "data_qutoes.txt")
+
         def clean_column_name(s):
             return s.replace("_0", "")
-        l = list(enumerate_text_lines(filename, encoding="utf-8", quotes_as_str=True, clean_column_name=clean_column_name))
+        l = list(enumerate_text_lines(filename, encoding="utf-8",
+                                      quotes_as_str=True, clean_column_name=clean_column_name))
         fLOG(l)
         assert len(l) == 1
 
diff --git a/_unittests/ut_documentation/test_notebook_hackathon.py b/_unittests/ut_documentation/test_notebook_hackathon.py
index 0db2cba..78ab2ab 100644
--- a/_unittests/ut_documentation/test_notebook_hackathon.py
+++ b/_unittests/ut_documentation/test_notebook_hackathon.py
@@ -70,10 +70,10 @@ def test_notebook_hackathon(self):
         keepnote = ls_notebooks("hackathon_2015")
         assert len(keepnote) > 0
         keepnote = [
-            _ for _ in keepnote if "upload" not in _ \
-                                and "schemas" not in _ \
-                                and "download" not in _\
-                                and "times_series" not in _]
+            _ for _ in keepnote if "upload" not in _
+            and "schemas" not in _
+            and "download" not in _
+            and "times_series" not in _]
         if len(keepnote) > 0:
             res = execute_notebooks(temp, keepnote,
                                     lambda i, n: "deviner" not in n,
diff --git a/src/ensae_projects/data/__init__.py b/src/ensae_projects/data/__init__.py
index 071c524..4dc6240 100644
--- a/src/ensae_projects/data/__init__.py
+++ b/src/ensae_projects/data/__init__.py
@@ -4,4 +4,4 @@
 """
 
 from .data_exception import PasswordException
-from .data_helper import change_encoding, enumerate_text_lines
\ No newline at end of file
+from .data_helper import change_encoding, enumerate_text_lines
diff --git a/src/ensae_projects/data/data_exception.py b/src/ensae_projects/data/data_exception.py
index a806a99..f49dbd1 100644
--- a/src/ensae_projects/data/data_exception.py
+++ b/src/ensae_projects/data/data_exception.py
@@ -22,4 +22,4 @@ class FileFormatException(Exception):
     """
     raised when unable to parse a file
     """
-    pass
\ No newline at end of file
+    pass
diff --git a/src/ensae_projects/data/data_helper.py b/src/ensae_projects/data/data_helper.py
index 7427257..159c2e0 100644
--- a/src/ensae_projects/data/data_helper.py
+++ b/src/ensae_projects/data/data_helper.py
@@ -9,7 +9,7 @@
 def change_encoding(infile, outfile, enc1, enc2="utf-8", process=None, fLOG=noLOG):
     """
     change the encoding of a text file
-    
+
     @param      infile      input file
     @param      outfile     output file
     @param      enc1        encoding of the input file
@@ -25,27 +25,31 @@ def process_line(s):
     with open(infile, "r", encoding=enc1) as f:
         with open(outfile, "w", encoding=enc2) as g:
             for i, line in enumerate(f):
-                if (i+1) % 1000000 == 0:
+                if (i + 1) % 1000000 == 0:
                     fLOG(infile, "-", i, "lines")
                 g.write(process(line))
             return i
-            
 
-def enumerate_text_lines(filename, sep="\t", encoding="utf-8", 
-                         quotes_as_str=False, header=True,
+
+def enumerate_text_lines(filename, sep="\t",
+                         encoding="utf-8",
+                         quotes_as_str=False,
+                         header=True,
                          clean_column_name=None,
+                         convert_float=False,
                          skip=0,
                          take=-1,
                          fLOG=noLOG):
     """
     enumerate all lines from a text file,
     considers it as column
-    
+
     @param          filename            filename
     @param          sep                 column separator
     @param          header              first row is header
     @param          encoding            encoding
     @param          clean_column_name   function to clean column name
+    @param          convert_float       convert values into float wherever possible, values which cannot be converted are left unchanged
     @param          skip                number of rows to skip
     @param          take                number of rows to consider (-1 for all)
     @param          fLOG                logging function
@@ -59,19 +63,22 @@ def get_schema(row, header, clean_column_name):
             return sch
         else:
             return ["c%00d" % i for i in range(len(row))]
-            
-    def convert(s, quotes_as_str):
+
+    def convert(s, convert_float):
+        if convert_float:
+            try:
+                return float(s)
+            except ValueError:
+                return s
+        else:
+            return s
+
+    def clean_quotes(s, quotes_as_str):
         if quotes_as_str:
             if s and len(s) > 1 and s[0] == s[-1] == '"':
                 return s[1:-1]
-            else:
-                try:
-                    return float(s)
-                except ValueError:
-                    return s
-        else:
-            return s
-    
+        return s
+
     with open(filename, "r", encoding=encoding) as f:
         d = 0
         nb = 0
@@ -91,10 +98,11 @@ def convert(s, quotes_as_str):
                     # probably the last file
                     continue
                 else:
-                    raise FileFormatException("different number of columns: schema {0} != {1} for line {2}".format(len(schema), len(spl), i+1))
-            val = { k:convert(v, quotes_as_str) for k,v in zip(schema, spl) }
+                    raise FileFormatException("different number of columns: schema {0} != {1} for line {2}".format(
+                        len(schema), len(spl), i + 1))
+            val = {k: convert(clean_quotes(v, quotes_as_str), convert_float)
+                   for k, v in zip(schema, spl)}
             yield val
             nb += 1
             if nb % 100000 == 0:
                 fLOG(filename, "-", nb, "lines")
-        
\ No newline at end of file