Upgrade pandas and use pyarrow for reading event files

polyaxon · Jun 30, 2023 · f5d3918
1 parent 1c4af20
commit f5d3918
Show file tree

Hide file tree

Showing 5 changed files with 9 additions and 6 deletions.
diff --git a/traceml/tests/test_events_processing/test_df_processor.py b/traceml/tests/test_events_processing/test_df_processor.py
@@ -331,7 +331,6 @@ def test_numerics_summary(self):
                 "deviating_of_mean_perc",
                 "deviating_of_median",
                 "deviating_of_median_perc",
-                "top_correlations",
                 "counts",
                 "uniques",
                 "missing",
@@ -362,7 +361,6 @@ def test_numerics_summary(self):
                 dmp,
                 dam,
                 damp,
-                "dnumerics2: 100%",
                 self.size,
                 self.size,
                 0,

diff --git a/traceml/tests/test_summary/test_dfsummary.py b/traceml/tests/test_summary/test_dfsummary.py
@@ -318,7 +318,6 @@ def test_numerics_summary(self):
                 "deviating_of_mean_perc",
                 "deviating_of_median",
                 "deviating_of_median_perc",
-                "top_correlations",
                 "counts",
                 "uniques",
                 "missing",
@@ -349,7 +348,6 @@ def test_numerics_summary(self):
                 dmp,
                 dam,
                 damp,
-                "dnumerics2: 100%",
                 self.size,
                 self.size,
                 0,

diff --git a/traceml/traceml/events/schemas.py b/traceml/traceml/events/schemas.py
@@ -380,11 +380,13 @@ def read(
                     csv,
                     sep=V1Event._SEPARATOR,
                     parse_dates=["timestamp"],
+                    engine="pyarrow",
                 )
             else:
                 df = pd.read_csv(
                     csv,
                     sep=V1Event._SEPARATOR,
+                    engine="pyarrow",
                 )
         elif isinstance(data, dict):
             df = pd.DataFrame.from_dict(data)

diff --git a/traceml/traceml/logging/schemas.py b/traceml/traceml/logging/schemas.py
@@ -112,11 +112,13 @@ def read_csv(cls, data: str, parse_dates: bool = True) -> "V1Logs":
                 sep=V1Log._SEPARATOR,
                 parse_dates=["timestamp"],
                 error_bad_lines=False,
+                engine="pyarrow",
             )
         else:
             df = pd.read_csv(
                 csv,
                 sep=V1Log._SEPARATOR,
+                engine="pyarrow",
             )
 
         return cls.construct(

diff --git a/traceml/traceml/processors/df_processors.py b/traceml/traceml/processors/df_processors.py
@@ -202,7 +202,11 @@ def get_median_absolute_deviation(
 
 
 def get_top_correlations(
-    df: pd.DataFrame, column: str, threshold: float = 0.65, top: int = 3, df_corr=None
+    df: pd.DataFrame,
+    column: str,
+    threshold: float = 0.65,
+    top: int = 3,
+    df_corr: Optional[pd.DataFrame] = None,
 ) -> Dict:
     """
     Returns count of values larger than `multiplier` * `mad`
@@ -284,7 +288,6 @@ def get_numeric_summary(
     ) = get_median_absolute_deviation(series, df_length=df_length)
     stats["deviating_of_median"] = deviating_of_median
     stats["deviating_of_median_perc"] = deviating_of_median_perc
-    stats["top_correlations"] = get_top_correlations_description(df=df, column=column)
     return pd.concat([pd.Series(stats, name=column), columns_stats[column]], sort=True)