Skip to content

Commit

Permalink
function and documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
sdpython committed Nov 24, 2015
1 parent c68f07d commit c8a912f
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 27 deletions.
2 changes: 2 additions & 0 deletions _doc/sphinxdoc/source/cheat_sheets.rst
Expand Up @@ -5,6 +5,8 @@ Cheat sheets


.. toctree::
:maxdepth: 2

../notebooks/chsh_graphs
../notebooks/chsh_files

7 changes: 5 additions & 2 deletions _unittests/ut_data/test_data_helper.py
Expand Up @@ -65,10 +65,13 @@ def test_meaning_table(self):
self._testMethodName,
OutputPrint=__name__ == "__main__")

filename = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data", "data_qutoes.txt")
filename = os.path.join(os.path.abspath(
os.path.dirname(__file__)), "data", "data_qutoes.txt")

def clean_column_name(s):
return s.replace("_0", "")
l = list(enumerate_text_lines(filename, encoding="utf-8", quotes_as_str=True, clean_column_name=clean_column_name))
l = list(enumerate_text_lines(filename, encoding="utf-8",
quotes_as_str=True, clean_column_name=clean_column_name))
fLOG(l)
assert len(l) == 1

Expand Down
8 changes: 4 additions & 4 deletions _unittests/ut_documentation/test_notebook_hackathon.py
Expand Up @@ -70,10 +70,10 @@ def test_notebook_hackathon(self):
keepnote = ls_notebooks("hackathon_2015")
assert len(keepnote) > 0
keepnote = [
_ for _ in keepnote if "upload" not in _ \
and "schemas" not in _ \
and "download" not in _\
and "times_series" not in _]
_ for _ in keepnote if "upload" not in _
and "schemas" not in _
and "download" not in _
and "times_series" not in _]
if len(keepnote) > 0:
res = execute_notebooks(temp, keepnote,
lambda i, n: "deviner" not in n,
Expand Down
2 changes: 1 addition & 1 deletion src/ensae_projects/data/__init__.py
Expand Up @@ -4,4 +4,4 @@
"""

from .data_exception import PasswordException
from .data_helper import change_encoding, enumerate_text_lines
from .data_helper import change_encoding, enumerate_text_lines
2 changes: 1 addition & 1 deletion src/ensae_projects/data/data_exception.py
Expand Up @@ -22,4 +22,4 @@ class FileFormatException(Exception):
"""
raised when unable to parse a file
"""
pass
pass
46 changes: 27 additions & 19 deletions src/ensae_projects/data/data_helper.py
Expand Up @@ -9,7 +9,7 @@
def change_encoding(infile, outfile, enc1, enc2="utf-8", process=None, fLOG=noLOG):
"""
change the encoding of a text file
@param infile input file
@param outfile output file
@param enc1 encoding of the input file
Expand All @@ -25,27 +25,31 @@ def process_line(s):
with open(infile, "r", encoding=enc1) as f:
with open(outfile, "w", encoding=enc2) as g:
for i, line in enumerate(f):
if (i+1) % 1000000 == 0:
if (i + 1) % 1000000 == 0:
fLOG(infile, "-", i, "lines")
g.write(process(line))
return i


def enumerate_text_lines(filename, sep="\t", encoding="utf-8",
quotes_as_str=False, header=True,

def enumerate_text_lines(filename, sep="\t",
encoding="utf-8",
quotes_as_str=False,
header=True,
clean_column_name=None,
convert_float=False,
skip=0,
take=-1,
fLOG=noLOG):
"""
enumerate all lines from a text file,
considers each line as a row of columns split on the separator
@param filename filename
@param sep column separator
@param header first row is header
@param encoding encoding
@param quotes_as_str if True, removes the double quotes surrounding a value
@param clean_column_name function to clean column name
@param convert_float convert number into float wherever possible
@param skip number of rows to skip
@param take number of rows to consider (-1 for all)
@param fLOG logging function
Expand All @@ -59,19 +63,22 @@ def get_schema(row, header, clean_column_name):
return sch
else:
return ["c%00d" % i for i in range(len(row))]

def convert(s, quotes_as_str):

def convert(s, convert_float):
if convert_float:
try:
return float(s)
except ValueError:
return s
else:
return s

def clean_quotes(s, quotes_as_str):
if quotes_as_str:
if s and len(s) > 1 and s[0] == s[-1] == '"':
return s[1:-1]
else:
try:
return float(s)
except ValueError:
return s
else:
return s

return s

with open(filename, "r", encoding=encoding) as f:
d = 0
nb = 0
Expand All @@ -91,10 +98,11 @@ def convert(s, quotes_as_str):
# probably the last line
continue
else:
raise FileFormatException("different number of columns: schema {0} != {1} for line {2}".format(len(schema), len(spl), i+1))
val = { k:convert(v, quotes_as_str) for k,v in zip(schema, spl) }
raise FileFormatException("different number of columns: schema {0} != {1} for line {2}".format(
len(schema), len(spl), i + 1))
val = {k: convert(clean_quotes(v, quotes_as_str), convert_float)
for k, v in zip(schema, spl)}
yield val
nb += 1
if nb % 100000 == 0:
fLOG(filename, "-", nb, "lines")

0 comments on commit c8a912f

Please sign in to comment.