from __future__ import annotations

import contextlib
from datetime import date, datetime, time, timedelta
from functools import singledispatch
from itertools import islice, zip_longest
from operator import itemgetter
from typing import (
TYPE_CHECKING,
Any,
Callable,
Generator,
Iterable,
Mapping,
MutableMapping,
Sequence,
)

import polars._reexport as pl
import polars._utils.construction as plc
from polars import functions as F
from polars._utils.construction.utils import (
contains_nested,
is_namedtuple,
is_pydantic_model,
nt_unpack,
try_get_type_hints,
)
from polars._utils.various import (
_is_generator,
arrlen,
parse_version,
)
from polars._utils.wrap import wrap_df, wrap_s
from polars.datatypes import (
N_INFER_DEFAULT,
Categorical,
Enum,
String,
Struct,
Unknown,
is_polars_dtype,
py_type_to_dtype,
)
from polars.dependencies import (
_NUMPY_AVAILABLE,
_check_for_numpy,
_check_for_pandas,
dataclasses,
)
from polars.dependencies import numpy as np
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
from polars.exceptions import ShapeError
from polars.meta import thread_pool_size

with contextlib.suppress(ImportError):  # Module not available when building docs
from polars.polars import PyDataFrame

if TYPE_CHECKING:
from polars import DataFrame, Series
from polars.polars import PySeries
from polars.type_aliases import (
Orientation,
PolarsDataType,
SchemaDefinition,
SchemaDict,
)


def dict_to_pydf(
data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
schema: SchemaDefinition | None = None,
*,
schema_overrides: SchemaDict | None = None,
strict: bool = True,
nan_to_null: bool = False,
) -> PyDataFrame:
"""Construct a PyDataFrame from a dictionary of sequences."""
if isinstance(schema, Mapping) and data:
if not all((col in schema) for col in data):
msg = "the given column-schema names do not match the data dictionary"
raise ValueError(msg)
data = {col: data[col] for col in schema}
column_names, schema_overrides = _unpack_schema(
schema, lookup_names=data.keys(), schema_overrides=schema_overrides
)
if not column_names:
column_names = list(data)
if data and _NUMPY_AVAILABLE:
# if there are 3 or more numpy arrays of sufficient size, we multi-thread:
count_numpy = sum(
int(
_check_for_numpy(val)
and isinstance(val, np.ndarray)
and len(val) > 1000
)
for val in data.values()
)
if count_numpy >= 3:
            # multi-threading is easier to manage from python here; we cannot
            # have multiple threads running python while releasing the GIL in
            # pyo3 (it will deadlock). note: 'multiprocessing.dummy' is threaded.
import multiprocessing.dummy
pool_size = thread_pool_size()
with multiprocessing.dummy.Pool(pool_size) as pool:
data = dict(
zip(
column_names,
pool.map(
lambda t: pl.Series(t[0], t[1])
if isinstance(t[1], np.ndarray)
else t[1],
list(data.items()),
),
)
)
if not data and schema_overrides:
data_series = [
pl.Series(
name,
[],
dtype=schema_overrides.get(name),
strict=strict,
nan_to_null=nan_to_null,
)._s
for name in column_names
]
else:
data_series = [
s._s
for s in _expand_dict_values(
data,
schema_overrides=schema_overrides,
strict=strict,
nan_to_null=nan_to_null,
).values()
]
data_series = _handle_columns_arg(data_series, columns=column_names, from_dict=True)
pydf = PyDataFrame(data_series)
if schema_overrides and pydf.dtypes() != list(schema_overrides.values()):
pydf = _post_apply_columns(
pydf, column_names, schema_overrides=schema_overrides, strict=strict
)
return pydf
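

# A minimal illustration of the dict path (assuming the public `pl.DataFrame`
# constructor, which routes Mapping input through `dict_to_pydf`):
#
#   pl.DataFrame({"a": [1, 2], "b": ["x", "y"]}, schema_overrides={"a": pl.UInt8})
#
# builds one Series per key (applying the "a" -> UInt8 override when each
# Series is constructed) and wraps the resulting PySeries list in a PyDataFrame.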


def _unpack_schema(
schema: SchemaDefinition | None,
*,
schema_overrides: SchemaDict | None = None,
n_expected: int | None = None,
lookup_names: Iterable[str] | None = None,
) -> tuple[list[str], SchemaDict]:
"""
Unpack column names and create dtype lookup.
Works for any (name, dtype) pairs or schema dict input,
overriding any inferred dtypes with explicit dtypes if supplied.
"""
def _normalize_dtype(dtype: Any) -> PolarsDataType:
"""Parse non-Polars data types as Polars data types."""
if is_polars_dtype(dtype, include_unknown=True):
return dtype
else:
return py_type_to_dtype(dtype)

    def _parse_schema_overrides(
schema_overrides: SchemaDict | None = None,
) -> dict[str, PolarsDataType]:
"""Parse schema overrides as a dictionary of name to Polars data type."""
if schema_overrides is None:
return {}
return {
name: _normalize_dtype(dtype) for name, dtype in schema_overrides.items()
}

    schema_overrides = _parse_schema_overrides(schema_overrides)
# Fast path for empty schema
if not schema:
columns = (
[f"column_{i}" for i in range(n_expected)] if n_expected is not None else []
)
return columns, schema_overrides
# determine column names from schema
if isinstance(schema, Mapping):
column_names: list[str] = list(schema)
schema = list(schema.items())
else:
column_names = []
for i, col in enumerate(schema):
if isinstance(col, str):
unnamed = not col and col not in schema_overrides
col = f"column_{i}" if unnamed else col
else:
col = col[0]
column_names.append(col)
# determine column dtypes from schema and lookup_names
lookup: dict[str, str] | None = (
{
col: name
for col, name in zip_longest(column_names, lookup_names)
if name is not None
}
if lookup_names
else None
)
column_dtypes: dict[str, PolarsDataType] = {}
for col in schema:
if isinstance(col, str):
continue
name, dtype = col
if dtype is None:
continue
else:
dtype = _normalize_dtype(dtype)
name = lookup.get(name, name) if lookup else name
column_dtypes[name] = dtype # type: ignore[assignment]
# apply schema overrides
if schema_overrides:
column_dtypes.update(schema_overrides)
return column_names, column_dtypes
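

# For example (illustrative): _unpack_schema([("a", int), "b"],
# schema_overrides={"b": pl.String}) returns (["a", "b"], {"a": Int64,
# "b": String}); python types are normalised via `py_type_to_dtype`, and
# explicit overrides take precedence over schema-declared dtypes.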


def _handle_columns_arg(
data: list[PySeries],
columns: Sequence[str] | None = None,
*,
from_dict: bool = False,
) -> list[PySeries]:
"""Rename data according to columns argument."""
if columns is None:
return data
elif not data:
return [pl.Series(name=c)._s for c in columns]
elif len(data) != len(columns):
msg = f"dimensions of columns arg ({len(columns)}) must match data dimensions ({len(data)})"
raise ValueError(msg)
if from_dict:
series_map = {s.name(): s for s in data}
if all((col in series_map) for col in columns):
return [series_map[col] for col in columns]
for i, c in enumerate(columns):
if c != data[i].name():
data[i] = data[i].clone()
data[i].rename(c)
return data


def _post_apply_columns(
pydf: PyDataFrame,
columns: SchemaDefinition | None,
structs: dict[str, Struct] | None = None,
schema_overrides: SchemaDict | None = None,
*,
strict: bool = True,
) -> PyDataFrame:
"""Apply 'columns' param *after* PyDataFrame creation (if no alternative)."""
pydf_columns, pydf_dtypes = pydf.columns(), pydf.dtypes()
columns, dtypes = _unpack_schema(
(columns or pydf_columns), schema_overrides=schema_overrides
)
column_subset: list[str] = []
if columns != pydf_columns:
if len(columns) < len(pydf_columns) and columns == pydf_columns[: len(columns)]:
column_subset = columns
else:
pydf.set_column_names(columns)
column_casts = []
for i, col in enumerate(columns):
dtype = dtypes.get(col)
pydf_dtype = pydf_dtypes[i]
if dtype == Categorical != pydf_dtype:
column_casts.append(F.col(col).cast(Categorical, strict=strict)._pyexpr)
elif dtype == Enum != pydf_dtype:
column_casts.append(F.col(col).cast(dtype, strict=strict)._pyexpr)
elif structs and (struct := structs.get(col)) and struct != pydf_dtype:
column_casts.append(F.col(col).cast(struct, strict=strict)._pyexpr)
elif dtype is not None and dtype != Unknown and dtype != pydf_dtype:
column_casts.append(F.col(col).cast(dtype, strict=strict)._pyexpr)
if column_casts or column_subset:
pydf = pydf.lazy()
if column_casts:
pydf = pydf.with_columns(column_casts)
if column_subset:
pydf = pydf.select([F.col(col)._pyexpr for col in column_subset])
pydf = pydf.collect()
return pydf
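

# Illustrative: if rows were materialised with a String column "tag" but the
# caller declared {"tag": pl.Categorical}, the cast branch above rewrites the
# frame lazily, i.e. `pydf.lazy()`, then `with_columns` with the collected
# `F.col("tag").cast(Categorical)` expression, then `collect()`.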


def _expand_dict_values(
data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
*,
schema_overrides: SchemaDict | None = None,
strict: bool = True,
order: Sequence[str] | None = None,
nan_to_null: bool = False,
) -> dict[str, Series]:
"""Expand any scalar values in dict data (propagate literal as array)."""
updated_data = {}
if data:
if any(isinstance(val, pl.Expr) for val in data.values()):
msg = (
"passing Expr objects to the DataFrame constructor is not supported"
"\n\nHint: Try evaluating the expression first using `select`,"
" or if you meant to create an Object column containing expressions,"
" pass a list of Expr objects instead."
)
raise TypeError(msg)
dtypes = schema_overrides or {}
data = _expand_dict_data(data, dtypes, strict=strict)
array_len = max((arrlen(val) or 0) for val in data.values())
if array_len > 0:
for name, val in data.items():
dtype = dtypes.get(name)
if isinstance(val, dict) and dtype != Struct:
vdf = pl.DataFrame(val, strict=strict)
if (
len(vdf) == 1
and array_len > 1
and all(not d.is_nested() for d in vdf.schema.values())
):
s_vals = {
nm: vdf[nm].extend_constant(v, n=(array_len - 1))
for nm, v in val.items()
}
st = pl.DataFrame(s_vals).to_struct(name)
else:
st = vdf.to_struct(name)
updated_data[name] = st
elif isinstance(val, pl.Series):
s = val.rename(name) if name != val.name else val
if dtype and dtype != s.dtype:
s = s.cast(dtype, strict=strict)
updated_data[name] = s
elif arrlen(val) is not None or _is_generator(val):
updated_data[name] = pl.Series(
name=name,
values=val,
dtype=dtype,
strict=strict,
nan_to_null=nan_to_null,
)
elif val is None or isinstance( # type: ignore[redundant-expr]
val, (int, float, str, bool, date, datetime, time, timedelta)
):
updated_data[name] = F.repeat(
val, array_len, dtype=dtype, eager=True
).alias(name)
else:
updated_data[name] = pl.Series(
name=name, values=[val] * array_len, dtype=dtype, strict=strict
)
elif all((arrlen(val) == 0) for val in data.values()):
for name, val in data.items():
updated_data[name] = pl.Series(
name, values=val, dtype=dtypes.get(name), strict=strict
)
elif all((arrlen(val) is None) for val in data.values()):
for name, val in data.items():
updated_data[name] = pl.Series(
name,
values=(val if _is_generator(val) else [val]),
dtype=dtypes.get(name),
strict=strict,
)
if order and list(updated_data) != order:
return {col: updated_data.pop(col) for col in order}
return updated_data
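

# Illustrative: scalar values broadcast against the longest column, so
# _expand_dict_values({"a": [1, 2, 3], "b": 0}) yields {"a": Series([1, 2, 3]),
# "b": Series([0, 0, 0])}; the scalar is propagated via `F.repeat(...,
# eager=True)` to match `array_len`.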


def _expand_dict_data(
data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
dtypes: SchemaDict,
*,
strict: bool = True,
) -> Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series]:
"""
Expand any unsized generators/iterators.
(Note that `range` is sized, and will take a fast-path on Series init).
"""
expanded_data = {}
for name, val in data.items():
expanded_data[name] = (
pl.Series(name, val, dtypes.get(name), strict=strict)
if _is_generator(val)
else val
)
return expanded_data


def sequence_to_pydf(
data: Sequence[Any],
schema: SchemaDefinition | None = None,
*,
schema_overrides: SchemaDict | None = None,
strict: bool = True,
orient: Orientation | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
) -> PyDataFrame:
"""Construct a PyDataFrame from a sequence."""
if not data:
return dict_to_pydf({}, schema=schema, schema_overrides=schema_overrides)
return _sequence_to_pydf_dispatcher(
data[0],
data=data,
schema=schema,
schema_overrides=schema_overrides,
strict=strict,
orient=orient,
infer_schema_length=infer_schema_length,
)


@singledispatch
def _sequence_to_pydf_dispatcher(
first_element: Any,
data: Sequence[Any],
schema: SchemaDefinition | None,
*,
schema_overrides: SchemaDict | None,
strict: bool = True,
orient: Orientation | None,
infer_schema_length: int | None,
) -> PyDataFrame:
# note: ONLY python-native data should participate in singledispatch registration
# via top-level decorators. third-party libraries (such as numpy/pandas) should
# first be identified inline (here) and THEN registered for dispatch dynamically
# so as not to break lazy-loading behaviour.
common_params = {
"data": data,
"schema": schema,
"schema_overrides": schema_overrides,
"strict": strict,
"orient": orient,
"infer_schema_length": infer_schema_length,
}
to_pydf: Callable[..., PyDataFrame]
register_with_singledispatch = True
if isinstance(first_element, Generator):
to_pydf = _sequence_of_sequence_to_pydf
data = [list(row) for row in data]
first_element = data[0]
register_with_singledispatch = False
elif isinstance(first_element, pl.Series):
to_pydf = _sequence_of_series_to_pydf
elif _check_for_numpy(first_element) and isinstance(first_element, np.ndarray):
to_pydf = _sequence_of_numpy_to_pydf
elif _check_for_pandas(first_element) and isinstance(
first_element, (pd.Series, pd.Index, pd.DatetimeIndex)
):
to_pydf = _sequence_of_pandas_to_pydf
elif dataclasses.is_dataclass(first_element):
to_pydf = _sequence_of_dataclasses_to_pydf
elif is_pydantic_model(first_element):
to_pydf = _sequence_of_pydantic_models_to_pydf
else:
to_pydf = _sequence_of_elements_to_pydf
if register_with_singledispatch:
_sequence_to_pydf_dispatcher.register(type(first_element), to_pydf)
common_params["first_element"] = first_element
return to_pydf(**common_params)
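

# Illustrative: the first call with (say) a sequence of `datetime` elements
# falls through to `_sequence_of_elements_to_pydf` and registers that handler
# for `datetime`, so subsequent calls with that element type dispatch directly.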


@_sequence_to_pydf_dispatcher.register(list)
def _sequence_of_sequence_to_pydf(
first_element: Sequence[Any] | np.ndarray[Any, Any],
data: Sequence[Any],
schema: SchemaDefinition | None,
*,
schema_overrides: SchemaDict | None,
strict: bool,
orient: Orientation | None,
infer_schema_length: int | None,
) -> PyDataFrame:
if orient is None:
# note: limit type-checking to smaller data; larger values are much more
# likely to indicate col orientation anyway, so minimise extra checks.
if len(first_element) > 1000:
orient = "col" if schema and len(schema) == len(data) else "row"
elif (schema is not None and len(schema) == len(data)) or not schema:
# check if element types in the first 'row' resolve to a single dtype.
row_types = {type(value) for value in first_element if value is not None}
if int in row_types and float in row_types:
row_types.discard(int)
orient = "col" if len(row_types) == 1 else "row"
else:
orient = "row"
if orient == "row":
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides, n_expected=len(first_element)
)
local_schema_override = (
_include_unknowns(schema_overrides, column_names)
if schema_overrides
else {}
)
if (
column_names
and len(first_element) > 0
and len(first_element) != len(column_names)
):
msg = "the row data does not match the number of columns"
raise ShapeError(msg)
unpack_nested = False
for col, tp in local_schema_override.items():
if tp in (Categorical, Enum):
local_schema_override[col] = String
elif not unpack_nested and (tp.base_type() in (Unknown, Struct)):
unpack_nested = contains_nested(
getattr(first_element, col, None).__class__, is_namedtuple
)
if unpack_nested:
dicts = [nt_unpack(d) for d in data]
pydf = PyDataFrame.from_dicts(
dicts, infer_schema_length=infer_schema_length
)
else:
pydf = PyDataFrame.from_rows(
data,
schema=local_schema_override or None,
infer_schema_length=infer_schema_length,
)
if column_names or schema_overrides:
pydf = _post_apply_columns(
pydf, column_names, schema_overrides=schema_overrides, strict=strict
)
return pydf
if orient == "col" or orient is None:
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides, n_expected=len(data)
)
data_series: list[PySeries] = [
pl.Series(
column_names[i],
element,
dtype=schema_overrides.get(column_names[i]),
strict=strict,
)._s
for i, element in enumerate(data)
]
return PyDataFrame(data_series)
msg = f"`orient` must be one of {{'col', 'row', None}}, got {orient!r}"
raise ValueError(msg)
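

# Orientation inference above, by example (illustrative):
#
#   pl.DataFrame([[1, 2], [3, 4]])      # uniform int rows  -> orient="col"
#   pl.DataFrame([[1, "a"], [2, "b"]])  # mixed-type rows   -> orient="row"
#
# (int/float mixes are collapsed to a single numeric type for this check)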


def _sequence_of_series_to_pydf(
first_element: Series,
data: Sequence[Any],
schema: SchemaDefinition | None,
*,
schema_overrides: SchemaDict | None,
strict: bool,
**kwargs: Any,
) -> PyDataFrame:
series_names = [s.name for s in data]
column_names, schema_overrides = _unpack_schema(
schema or series_names,
schema_overrides=schema_overrides,
n_expected=len(data),
)
data_series: list[PySeries] = []
for i, s in enumerate(data):
if not s.name:
s = s.alias(column_names[i])
new_dtype = schema_overrides.get(column_names[i])
if new_dtype and new_dtype != s.dtype:
s = s.cast(new_dtype, strict=strict)
data_series.append(s._s)
data_series = _handle_columns_arg(data_series, columns=column_names)
return PyDataFrame(data_series)


@_sequence_to_pydf_dispatcher.register(tuple)
def _sequence_of_tuple_to_pydf(
first_element: tuple[Any, ...],
data: Sequence[Any],
schema: SchemaDefinition | None,
*,
schema_overrides: SchemaDict | None,
strict: bool,
orient: Orientation | None,
infer_schema_length: int | None,
) -> PyDataFrame:
# infer additional meta information if named tuple
if is_namedtuple(first_element.__class__):
if schema is None:
schema = first_element._fields # type: ignore[attr-defined]
annotations = getattr(first_element, "__annotations__", None)
if annotations and len(annotations) == len(schema):
schema = [
(name, py_type_to_dtype(tp, raise_unmatched=False))
for name, tp in first_element.__annotations__.items()
]
if orient is None:
orient = "row"
# ...then defer to generic sequence processing
return _sequence_of_sequence_to_pydf(
first_element,
data=data,
schema=schema,
schema_overrides=schema_overrides,
strict=strict,
orient=orient,
infer_schema_length=infer_schema_length,
)
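

# Illustrative: for a NamedTuple such as
#
#   class Point(NamedTuple):
#       x: int
#       y: float
#
# `_fields` supplies the column names and the annotations resolve the schema
# to {"x": Int64, "y": Float64}, with orientation defaulting to "row".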


@_sequence_to_pydf_dispatcher.register(dict)
def _sequence_of_dict_to_pydf(
first_element: Any,
data: Sequence[Any],
schema: SchemaDefinition | None,
*,
schema_overrides: SchemaDict | None,
strict: bool,
infer_schema_length: int | None,
**kwargs: Any,
) -> PyDataFrame:
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides
)
dicts_schema = (
_include_unknowns(schema_overrides, column_names or list(schema_overrides))
if column_names
else None
)
pydf = PyDataFrame.from_dicts(
data,
dicts_schema,
schema_overrides,
infer_schema_length=infer_schema_length,
)
# TODO: we can remove this `schema_overrides` block completely
# once https://github.com/pola-rs/polars/issues/11044 is fixed
if schema_overrides:
pydf = _post_apply_columns(
pydf, columns=column_names, schema_overrides=schema_overrides, strict=strict
)
return pydf
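

# Illustrative: pl.DataFrame([{"a": 1}, {"a": 2, "b": "x"}]) infers the union
# schema {"a": Int64, "b": String}, with keys missing from a row read as null.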


@_sequence_to_pydf_dispatcher.register(str)
def _sequence_of_elements_to_pydf(
first_element: Any,
data: Sequence[Any],
schema: SchemaDefinition | None,
schema_overrides: SchemaDict | None,
*,
strict: bool,
**kwargs: Any,
) -> PyDataFrame:
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides, n_expected=1
)
data_series: list[PySeries] = [
pl.Series(
column_names[0],
data,
schema_overrides.get(column_names[0]),
strict=strict,
)._s
]
data_series = _handle_columns_arg(data_series, columns=column_names)
return PyDataFrame(data_series)
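

# Illustrative: a flat sequence of scalars becomes a single column, e.g.
# pl.DataFrame(["x", "y"]) produces one String column named "column_0".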


def _sequence_of_numpy_to_pydf(
first_element: np.ndarray[Any, Any],
**kwargs: Any,
) -> PyDataFrame:
if first_element.ndim == 1:
return _sequence_of_sequence_to_pydf(first_element, **kwargs)
else:
return _sequence_of_elements_to_pydf(first_element, **kwargs)


def _sequence_of_pandas_to_pydf(
first_element: pd.Series[Any] | pd.Index[Any] | pd.DatetimeIndex,
data: Sequence[Any],
schema: SchemaDefinition | None,
schema_overrides: SchemaDict | None,
*,
strict: bool,
**kwargs: Any,
) -> PyDataFrame:
if schema is None:
column_names: list[str] = []
else:
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides, n_expected=1
)
schema_overrides = schema_overrides or {}
data_series: list[PySeries] = []
for i, s in enumerate(data):
name = column_names[i] if column_names else s.name
pyseries = plc.pandas_to_pyseries(name=name, values=s)
dtype = schema_overrides.get(name)
if dtype is not None and dtype != pyseries.dtype():
pyseries = pyseries.cast(dtype, strict=strict)
data_series.append(pyseries)
return PyDataFrame(data_series)


def _sequence_of_dataclasses_to_pydf(
first_element: Any,
data: Sequence[Any],
schema: SchemaDefinition | None,
schema_overrides: SchemaDict | None,
infer_schema_length: int | None,
*,
strict: bool = True,
**kwargs: Any,
) -> PyDataFrame:
"""Initialize DataFrame from Python dataclasses."""
from dataclasses import asdict, astuple
(
unpack_nested,
column_names,
schema_overrides,
overrides,
) = _establish_dataclass_or_model_schema(
first_element, schema, schema_overrides, model_fields=None
)
if unpack_nested:
dicts = [asdict(md) for md in data]
pydf = PyDataFrame.from_dicts(dicts, infer_schema_length=infer_schema_length)
else:
rows = [astuple(dc) for dc in data]
pydf = PyDataFrame.from_rows(
rows, schema=overrides or None, infer_schema_length=infer_schema_length
)
if overrides:
structs = {c: tp for c, tp in overrides.items() if isinstance(tp, Struct)}
pydf = _post_apply_columns(
pydf, column_names, structs, schema_overrides, strict=strict
)
return pydf
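

# Illustrative: flat dataclasses take the cheap `astuple` + `from_rows` path;
# only dataclasses containing nested dataclass fields pay for the full
# `asdict` round-trip through `from_dicts`.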


def _sequence_of_pydantic_models_to_pydf(
first_element: Any,
data: Sequence[Any],
schema: SchemaDefinition | None,
schema_overrides: SchemaDict | None,
infer_schema_length: int | None,
*,
strict: bool,
**kwargs: Any,
) -> PyDataFrame:
"""Initialise DataFrame from pydantic model objects."""
import pydantic # note: must already be available in the env here
old_pydantic = parse_version(pydantic.__version__) < (2, 0)
model_fields = list(
first_element.__fields__ if old_pydantic else first_element.model_fields
)
(
unpack_nested,
column_names,
schema_overrides,
overrides,
) = _establish_dataclass_or_model_schema(
first_element, schema, schema_overrides, model_fields
)
if unpack_nested:
# note: this is an *extremely* slow path, due to the requirement to
# use pydantic's 'dict()' method to properly unpack nested models
dicts = (
[md.dict() for md in data]
if old_pydantic
else [md.model_dump(mode="python") for md in data]
)
pydf = PyDataFrame.from_dicts(dicts, infer_schema_length=infer_schema_length)
elif len(model_fields) > 50:
# 'from_rows' is the faster codepath for models with a lot of fields...
get_values = itemgetter(*model_fields)
rows = [get_values(md.__dict__) for md in data]
pydf = PyDataFrame.from_rows(
rows, schema=overrides, infer_schema_length=infer_schema_length
)
else:
# ...and 'from_dicts' is faster otherwise
dicts = [md.__dict__ for md in data]
pydf = PyDataFrame.from_dicts(
dicts, schema=overrides, infer_schema_length=infer_schema_length
)
if overrides:
structs = {c: tp for c, tp in overrides.items() if isinstance(tp, Struct)}
pydf = _post_apply_columns(
pydf, column_names, structs, schema_overrides, strict=strict
)
return pydf
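

# Illustrative: models with more than 50 fields are read via `itemgetter` over
# `md.__dict__` and fed to `from_rows`; smaller models go through `from_dicts`,
# and nested models fall back to pydantic's own (much slower) serialisation.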


def _establish_dataclass_or_model_schema(
first_element: Any,
schema: SchemaDefinition | None,
schema_overrides: SchemaDict | None,
model_fields: list[str] | None,
) -> tuple[bool, list[str], SchemaDict, SchemaDict]:
"""Shared utility code for establishing dataclasses/pydantic model cols/schema."""
from dataclasses import asdict
unpack_nested = False
if schema:
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides
)
overrides = {col: schema_overrides.get(col, Unknown) for col in column_names}
else:
column_names = []
overrides = {
col: (py_type_to_dtype(tp, raise_unmatched=False) or Unknown)
for col, tp in try_get_type_hints(first_element.__class__).items()
if ((col in model_fields) if model_fields else (col != "__slots__"))
}
if schema_overrides:
overrides.update(schema_overrides)
elif not model_fields:
dc_fields = set(asdict(first_element))
schema_overrides = overrides = {
nm: tp for nm, tp in overrides.items() if nm in dc_fields
}
else:
schema_overrides = overrides
for col, tp in overrides.items():
if tp in (Categorical, Enum):
overrides[col] = String
elif not unpack_nested and (tp.base_type() in (Unknown, Struct)):
unpack_nested = contains_nested(
getattr(first_element, col, None),
is_pydantic_model if model_fields else dataclasses.is_dataclass, # type: ignore[arg-type]
)
if model_fields and len(model_fields) == len(overrides):
overrides = dict(zip(model_fields, overrides.values()))
return unpack_nested, column_names, schema_overrides, overrides


def _include_unknowns(
schema: SchemaDict, cols: Sequence[str]
) -> MutableMapping[str, PolarsDataType]:
"""Complete partial schema dict by including Unknown type."""
return {
col: (
schema.get(col, Unknown) or Unknown # type: ignore[truthy-bool]
)
for col in cols
}


def iterable_to_pydf(
data: Iterable[Any],
schema: SchemaDefinition | None = None,
*,
schema_overrides: SchemaDict | None = None,
strict: bool = True,
orient: Orientation | None = None,
chunk_size: int | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
) -> PyDataFrame:
"""Construct a PyDataFrame from an iterable/generator."""
original_schema = schema
column_names: list[str] = []
dtypes_by_idx: dict[int, PolarsDataType] = {}
if schema is not None:
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides
)
elif schema_overrides:
_, schema_overrides = _unpack_schema(schema, schema_overrides=schema_overrides)
if not isinstance(data, Generator):
data = iter(data)
if orient == "col":
if column_names and schema_overrides:
dtypes_by_idx = {
idx: schema_overrides.get(col, Unknown)
for idx, col in enumerate(column_names)
}
return pl.DataFrame(
{
(column_names[idx] if column_names else f"column_{idx}"): pl.Series(
coldata,
dtype=dtypes_by_idx.get(idx),
strict=strict,
)
for idx, coldata in enumerate(data)
},
)._df

    def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFrame:
return pl.DataFrame(
data=values,
schema=schema,
strict=strict,
orient="row",
infer_schema_length=infer_schema_length,
)

    n_chunks = 0
n_chunk_elems = 1_000_000
if chunk_size:
adaptive_chunk_size = chunk_size
elif column_names:
adaptive_chunk_size = n_chunk_elems // len(column_names)
else:
adaptive_chunk_size = None
df: DataFrame = None # type: ignore[assignment]
chunk_size = max(
(infer_schema_length or 0),
(adaptive_chunk_size or 1000),
)
while True:
values = list(islice(data, chunk_size))
if not values:
break
frame_chunk = to_frame_chunk(values, original_schema)
if df is None:
df = frame_chunk
if not original_schema:
original_schema = list(df.schema.items())
if chunk_size != adaptive_chunk_size:
if (n_columns := len(df.columns)) > 0:
chunk_size = adaptive_chunk_size = n_chunk_elems // n_columns
else:
df.vstack(frame_chunk, in_place=True)
n_chunks += 1
if df is None:
df = to_frame_chunk([], original_schema)
if n_chunks > 0:
df = df.rechunk()
return df._df
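

# Illustrative: with no explicit `chunk_size` and a 4-column first chunk, the
# adaptive chunk size settles at 1_000_000 // 4 = 250_000 rows per vstacked
# chunk, with a final `rechunk` once the source iterable is exhausted.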