Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 14 additions & 51 deletions src/_arraykit.c
Original file line number Diff line number Diff line change
Expand Up @@ -714,8 +714,10 @@ AK_TP_resolve_field(AK_TypeParser* tp,
if (tp->count_digit == 0) return TPS_STRING;
// int
if (tp->count_j == 0 &&
tp->count_e == 0 &&
tp->count_sign <= 1 &&
tp->last_sign_pos <= 0 &&
tp->count_decimal == 0 &&
tp->count_e == 0 &&
tp->count_paren_close == 0 &&
tp->count_paren_open == 0 &&
tp->count_nan == 0 &&
Expand Down Expand Up @@ -1283,9 +1285,6 @@ AK_CPL_Free(AK_CodePointLine* cpl)
{
PyMem_Free(cpl->buffer);
PyMem_Free(cpl->offsets);
// if (cpl->field) {
// PyMem_Free(cpl->field);
// }
if (cpl->type_parser) { // can exclude the check
PyMem_Free(cpl->type_parser);
}
Expand Down Expand Up @@ -1469,32 +1468,6 @@ AK_CPL_CurrentAdvance(AK_CodePointLine* cpl)
}

//------------------------------------------------------------------------------
// Set the CPL field to the characters accumulated in the CPL's buffer. This is only used for field converters that need a char* as an input argument. This has to be dynamically allocated and cleaned up appropriately.
// static inline char*
// AK_CPL_current_to_field(AK_CodePointLine* cpl)
// {
// // NOTE: we assume this is only called after offset_max is complete, and that this is only called once per CPL; we set it to the maximum size on first usage and then overwrite context on each subsequent usage.
// if (cpl->field == NULL) {
// // create a NULL-terminated string; need one more for string terminator
// cpl->field = (char*)PyMem_Malloc(sizeof(char) * (cpl->offset_max + 1));
// if (cpl->field == NULL) return (char*)PyErr_NoMemory();
// }
// Py_UCS4 *p = cpl->buffer_current_ptr;
// Py_UCS4 *end = p + cpl->offsets[cpl->offsets_current_index];

// // get pointer to field buffer to write to
// char *t = cpl->field;
// while (p < end) {
// if (AK_is_space(*p)) {
// ++p;
// continue;
// }
// *t++ = (char)*p++;
// }
// *t = '\0'; // must be NULL-terminated string
// return cpl->field;
// }

// This will take any case of "TRUE" as True, while marking everything else as False; this is the same approach taken with genfromtxt when the dtype is given as bool. This will not fail for invalid true or false strings.
static inline bool
AK_CPL_current_to_bool(AK_CodePointLine* cpl) {
Expand Down Expand Up @@ -2065,7 +2038,7 @@ AK_line_select_keep(
}

//------------------------------------------------------------------------------
// CodePointGrid Type, New, Destrctor
// CodePointGrid Type, New, Destructor

typedef struct AK_CodePointGrid {
Py_ssize_t lines_count; // accumulated number of lines
Expand Down Expand Up @@ -2464,11 +2437,11 @@ typedef struct AK_DelimitedReader{
AK_Dialect *dialect;
AK_DelimitedReaderState state;
Py_ssize_t field_len;
Py_ssize_t record_number;
Py_ssize_t record_iter_number;
Py_ssize_t field_number;
Py_ssize_t record_number; // total records loaded
Py_ssize_t record_iter_number; // records iterated (counting exclusion)
Py_ssize_t field_number; // field in current record, reset for each record
int axis;
Py_ssize_t *axis_pos;
Py_ssize_t *axis_pos; // points to either record_number or field_number
} AK_DelimitedReader;

// Called once at the close of each field in a line. Returns 0 on success, -1 on failure
Expand Down Expand Up @@ -2687,7 +2660,7 @@ AK_DR_ProcessRecord(AK_DelimitedReader *dr,
return -1;
case 0:
Py_DECREF(record);
return 1; // skip, process more lines
return 1; // skip, process more records
}
// NOTE: record_number should reflect the processed line count, and exclude any skipped lines. The value is initialized to -1 such that the first line is number 0
++dr->record_number;
Expand Down Expand Up @@ -2721,9 +2694,10 @@ AK_DR_ProcessRecord(AK_DelimitedReader *dr,
// Release everything owned by a delimited reader, then the reader itself.
// Safe to call on a partially constructed reader: AK_DR_New invokes this
// while dr->dialect is still NULL when dialect creation fails.
static void
AK_DR_Free(AK_DelimitedReader *dr)
{
    // only free the dialect if it was actually created
    if (dr->dialect) {
        AK_Dialect_Free(dr->dialect);
    }
    Py_XDECREF(dr->input_iter); // might already be NULL
    PyMem_Free(dr);
}

Expand Down Expand Up @@ -2755,6 +2729,7 @@ AK_DR_New(PyObject *iterable,

dr->record_number = -1;
dr->record_iter_number = -1;
dr->dialect = NULL; // init in case input_iter fails to init

dr->input_iter = PyObject_GetIter(iterable); // new ref, decref in free
if (dr->input_iter == NULL) {
Expand All @@ -2770,7 +2745,6 @@ AK_DR_New(PyObject *iterable,
quoting,
skipinitialspace,
strict);

if (dr->dialect == NULL) {
AK_DR_Free(dr);
return NULL;
Expand Down Expand Up @@ -2870,7 +2844,6 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
PyErr_SetString(PyExc_TypeError, "line_select must be a callable or None");
return NULL;
}
Py_XINCREF(line_select);

if ((axis < 0) || (axis > 1)) {
PyErr_SetString(PyExc_ValueError, "axis must be 0 or 1");
Expand All @@ -2886,7 +2859,6 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
skipinitialspace,
strict);
if (dr == NULL) { // can happen due to validation of dialect parameters
Py_XDECREF(line_select);
return NULL;
}

Expand All @@ -2896,7 +2868,6 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
&tsep,
thousandschar,
'\0')) {
Py_XDECREF(line_select);
AK_DR_Free(dr);
return NULL; // default is off (skips evaluation)
}
Expand All @@ -2906,15 +2877,13 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
&decc,
decimalchar,
'.')) {
Py_XDECREF(line_select);
AK_DR_Free(dr);
return NULL;
}

// dtypes inc / dec ref bound within CPG life
AK_CodePointGrid* cpg = AK_CPG_New(dtypes, tsep, decc);
if (cpg == NULL) { // error will be set
Py_XDECREF(line_select);
AK_DR_Free(dr);
return NULL;
}
Expand All @@ -2929,7 +2898,6 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
break;
}
else if (status == -1) {
Py_XDECREF(line_select);
AK_DR_Free(dr);
AK_CPG_Free(cpg);
return NULL;
Expand All @@ -2938,17 +2906,12 @@ delimited_to_arrays(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
}
AK_DR_Free(dr);


PyObject* arrays = AK_CPG_ToArrayList(cpg, axis, line_select, tsep, decc);
// NOTE: do not need to check if arrays is NULL as we will return NULL anyway

Py_XDECREF(line_select);
AK_CPG_Free(cpg); // will free reference to dtypes

return arrays; // could be NULL
}


static char *iterable_str_to_array_1d_kwarg_names[] = {
"iterable",
"dtype",
Expand Down
63 changes: 63 additions & 0 deletions test/test_delimited_to_arrays.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ def test_iterable_str_to_array_1d_int_13(self) -> None:
with self.assertRaises(TypeError):
a1 = iterable_str_to_array_1d(['3.000', '4.000', '1.000'], dtype=int, thousandschar=',')


#---------------------------------------------------------------------------

def test_iterable_str_to_array_1d_uint_1(self) -> None:
Expand Down Expand Up @@ -703,6 +704,42 @@ def test_delimited_to_arrays_parse_i(self) -> None:
post2 = delimited_to_arrays(msg, axis=1, skipinitialspace=True)
self.assertEqual([a.tolist() for a in post2], [['a', 'b'], [10, 20], ['foo', 'c']])

def test_delimited_to_arrays_parse_j(self) -> None:
    # Full ISO dates (year-month-day) in the middle column are expected to
    # come back as strings; the outer, purely numeric columns as integers.
    lines = [
        '2021,2021-04-01,4',
        '2022,2022-05-01,3',
    ]
    expected = [[2021, 2022], ['2021-04-01', '2022-05-01'], [4, 3]]
    arrays = delimited_to_arrays(lines, axis=1, skipinitialspace=False)
    self.assertEqual([a.tolist() for a in arrays], expected)


def test_delimited_to_arrays_parse_k(self) -> None:
    # Year-month values contain an interior '-' and are expected to come back
    # as strings; the outer, purely numeric columns as integers.
    lines = [
        '2021,2021-04,4',
        '2022,2022-05,3',
    ]
    expected = [[2021, 2022], ['2021-04', '2022-05'], [4, 3]]
    arrays = delimited_to_arrays(lines, axis=1, skipinitialspace=False)
    self.assertEqual([a.tolist() for a in arrays], expected)


def test_delimited_to_arrays_parse_l(self) -> None:
    # A trailing or interior '-' makes the whole column string-typed; a
    # leading '-' is a sign, so the last column is expected as integers.
    lines = [
        '1,2,3',
        '2-,2-0,-3',
    ]
    expected = [['1', '2-'], ['2', '2-0'], [3, -3]]
    arrays = delimited_to_arrays(lines, axis=1, skipinitialspace=False)
    self.assertEqual([a.tolist() for a in arrays], expected)

def test_delimited_to_arrays_parse_m(self) -> None:
    # Same sign-position cases as the previous test, but with leading spaces
    # and skipinitialspace disabled: string columns are expected to keep
    # their leading space, the last column is still expected as integers.
    lines = [
        ' 1, 2,3',
        ' 2-, 2-0, -3',
    ]
    expected = [[' 1', ' 2-'], [' 2', ' 2-0'], [3, -3]]
    arrays = delimited_to_arrays(lines, axis=1, skipinitialspace=False)
    self.assertEqual([a.tolist() for a in arrays], expected)


# import ipdb; ipdb.set_trace()

#---------------------------------------------------------------------------
def test_delimited_to_arrays_float_a(self) -> None:
Expand Down Expand Up @@ -1008,6 +1045,32 @@ def test_delimited_to_arrays_decimalchar_b(self) -> None:
[[1000, 2000, 4000], [4.0, 5.055, 6000.155]])


#---------------------------------------------------------------------------
def test_delimited_to_arrays_file_like_a(self) -> None:
    # The generator *function* is passed, not the generator it would return,
    # so the argument is not iterable and a TypeError is expected.
    def records():
        yield from [
            '1000;4',
            '2000;5055',
        ]

    with self.assertRaises(TypeError):
        _ = delimited_to_arrays(records, axis=1, delimiter=';')

def test_delimited_to_arrays_file_like_b(self) -> None:
    # An integer is not iterable; a TypeError is expected even when a dtypes
    # callable is supplied alongside it.
    options = dict(
        axis=1,
        delimiter=';',
        dtypes=lambda x: int,
    )
    with self.assertRaises(TypeError):
        _ = delimited_to_arrays(3, **options)



#---------------------------------------------------------------------------
def test_delimited_to_arrays_compare_int_a(self) -> None:
# genfromtxt might translate an empty field to -1 or 0
Expand Down