Skip to content

Commit

Permalink
Upgrade pandas and use pyarrow for reading event files
Browse files: browse the repository at this point in the history
  • Loading branch information
polyaxon-ci committed Jun 30, 2023
1 parent 1c4af20 commit f5d3918
Show file tree
Hide file tree
Showing 5 changed files with 9 additions and 6 deletions.
2 changes: 0 additions & 2 deletions traceml/tests/test_events_processing/test_df_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,6 @@ def test_numerics_summary(self):
"deviating_of_mean_perc",
"deviating_of_median",
"deviating_of_median_perc",
"top_correlations",
"counts",
"uniques",
"missing",
Expand Down Expand Up @@ -362,7 +361,6 @@ def test_numerics_summary(self):
dmp,
dam,
damp,
"dnumerics2: 100%",
self.size,
self.size,
0,
Expand Down
2 changes: 0 additions & 2 deletions traceml/tests/test_summary/test_dfsummary.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,6 @@ def test_numerics_summary(self):
"deviating_of_mean_perc",
"deviating_of_median",
"deviating_of_median_perc",
"top_correlations",
"counts",
"uniques",
"missing",
Expand Down Expand Up @@ -349,7 +348,6 @@ def test_numerics_summary(self):
dmp,
dam,
damp,
"dnumerics2: 100%",
self.size,
self.size,
0,
Expand Down
2 changes: 2 additions & 0 deletions traceml/traceml/events/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,11 +380,13 @@ def read(
csv,
sep=V1Event._SEPARATOR,
parse_dates=["timestamp"],
engine="pyarrow",
)
else:
df = pd.read_csv(
csv,
sep=V1Event._SEPARATOR,
engine="pyarrow",
)
elif isinstance(data, dict):
df = pd.DataFrame.from_dict(data)
Expand Down
2 changes: 2 additions & 0 deletions traceml/traceml/logging/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,13 @@ def read_csv(cls, data: str, parse_dates: bool = True) -> "V1Logs":
sep=V1Log._SEPARATOR,
parse_dates=["timestamp"],
error_bad_lines=False,
engine="pyarrow",
)
else:
df = pd.read_csv(
csv,
sep=V1Log._SEPARATOR,
engine="pyarrow",
)

return cls.construct(
Expand Down
7 changes: 5 additions & 2 deletions traceml/traceml/processors/df_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,11 @@ def get_median_absolute_deviation(


def get_top_correlations(
df: pd.DataFrame, column: str, threshold: float = 0.65, top: int = 3, df_corr=None
df: pd.DataFrame,
column: str,
threshold: float = 0.65,
top: int = 3,
df_corr: Optional[pd.DataFrame] = None,
) -> Dict:
"""
Returns count of values larger than `multiplier` * `mad`
Expand Down Expand Up @@ -284,7 +288,6 @@ def get_numeric_summary(
) = get_median_absolute_deviation(series, df_length=df_length)
stats["deviating_of_median"] = deviating_of_median
stats["deviating_of_median_perc"] = deviating_of_median_perc
stats["top_correlations"] = get_top_correlations_description(df=df, column=column)
return pd.concat([pd.Series(stats, name=column), columns_stats[column]], sort=True)


Expand Down

0 comments on commit f5d3918

Please sign in to comment.