From 1320c227825c72a182ab3936a33d58ddae979553 Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Mon, 30 Jul 2018 11:12:47 +0100 Subject: [PATCH 1/7] Add support for serialising categorical columns --- root_pandas/readwrite.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py index c54604b..3acf4bf 100644 --- a/root_pandas/readwrite.py +++ b/root_pandas/readwrite.py @@ -353,12 +353,24 @@ def to_root(df, path, key='my_ttree', mode='w', store_index=True, *args, **kwarg from root_numpy import array2root # We don't want to modify the user's DataFrame here, so we make a shallow copy df_ = df.copy(deep=False) + if store_index: name = df_.index.name if name is None: # Handle the case where the index has no name name = '' df_['__index__' + name] = df_.index + + # Convert categorical columns into something root_numpy can serialise + for col in df.select_dtypes(['category']).columns: + name_components = ['__rpCaT', col, str(df[col].cat.ordered)] + df[col].cat.categories + if ['*' not in c for c in name_components]: + sep = '*' + else: + raise ValueError('Unable to find suitable separator for columns') + df_[sep.join(name_components)] = df[col].cat.codes + del df[col] + arr = df_.to_records(index=False) array2root(arr, path, key, mode=mode, *args, **kwargs) From fbc9c72cf0f85affa4e3e35aed7ea9f86c8c8e5f Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Mon, 30 Jul 2018 11:24:04 +0100 Subject: [PATCH 2/7] Bugfix to previous commit --- root_pandas/readwrite.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py index 3acf4bf..5e6a6ce 100644 --- a/root_pandas/readwrite.py +++ b/root_pandas/readwrite.py @@ -363,7 +363,8 @@ def to_root(df, path, key='my_ttree', mode='w', store_index=True, *args, **kwarg # Convert categorical columns into something root_numpy can serialise for col in df.select_dtypes(['category']).columns: - name_components = 
['__rpCaT', col, str(df[col].cat.ordered)] + df[col].cat.categories + name_components = ['__rpCaT', col, str(df[col].cat.ordered)] + name_components.extend(df[col].cat.categories) if ['*' not in c for c in name_components]: sep = '*' else: From d4f2ef4c193bbaf9e9b0a248d12ad34decf2d03f Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Mon, 30 Jul 2018 11:24:42 +0100 Subject: [PATCH 3/7] Add support for loading categorical data --- root_pandas/readwrite.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py index 5e6a6ce..2f5621e 100644 --- a/root_pandas/readwrite.py +++ b/root_pandas/readwrite.py @@ -6,6 +6,7 @@ import numpy as np from numpy.lib.recfunctions import append_fields from pandas import DataFrame, RangeIndex +import pandas as pd from root_numpy import root2array, list_trees import fnmatch from root_numpy import list_branches @@ -312,6 +313,14 @@ def convert_to_dataframe(array, start_index=None): assert len(columns) == len(df.columns), (columns, df.columns) df = df.reindex_axis(columns, axis=1, copy=False) + # Convert categorical columns back to categories + for c in df.columns: + match = re.match(r'__rpCaT\*([^\*]\*(True|False)\*)', c) + if match: + real_name, ordered = match.groups + categories = c.split('*')[3:] + df[c] = pd.Categorical.from_codes(df[c], categories, ordered={'True': True, 'False': False}[ordered]) + return df From 70bef7ab34a505aa8437f352b2ae4a238536e89a Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Mon, 30 Jul 2018 11:30:40 +0100 Subject: [PATCH 4/7] Fix typos in variable names --- root_pandas/readwrite.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py index 2f5621e..1f2e098 100644 --- a/root_pandas/readwrite.py +++ b/root_pandas/readwrite.py @@ -371,15 +371,15 @@ def to_root(df, path, key='my_ttree', mode='w', store_index=True, *args, **kwarg df_['__index__' + name] = df_.index # 
Convert categorical columns into something root_numpy can serialise - for col in df.select_dtypes(['category']).columns: - name_components = ['__rpCaT', col, str(df[col].cat.ordered)] - name_components.extend(df[col].cat.categories) + for col in df_.select_dtypes(['category']).columns: + name_components = ['__rpCaT', col, str(df_[col].cat.ordered)] + name_components.extend(df_[col].cat.categories) if ['*' not in c for c in name_components]: sep = '*' else: raise ValueError('Unable to find suitable separator for columns') - df_[sep.join(name_components)] = df[col].cat.codes - del df[col] + df_[sep.join(name_components)] = df_[col].cat.codes + del df_[col] arr = df_.to_records(index=False) array2root(arr, path, key, mode=mode, *args, **kwargs) From ed96faafcf2663ccce3d8098360aebc95c6a0432 Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Mon, 30 Jul 2018 11:41:40 +0100 Subject: [PATCH 5/7] Fix regex and maintain column order --- root_pandas/readwrite.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py index 1f2e098..a6d2525 100644 --- a/root_pandas/readwrite.py +++ b/root_pandas/readwrite.py @@ -315,11 +315,12 @@ def convert_to_dataframe(array, start_index=None): # Convert categorical columns back to categories for c in df.columns: - match = re.match(r'__rpCaT\*([^\*]\*(True|False)\*)', c) + match = re.match(r'^__rpCaT\*([^\*]+\*(True|False)\*)', c) if match: real_name, ordered = match.groups categories = c.split('*')[3:] df[c] = pd.Categorical.from_codes(df[c], categories, ordered={'True': True, 'False': False}[ordered]) + df.rename(index=str, columns={c: real_name}, inplace=True) return df @@ -378,8 +379,8 @@ def to_root(df, path, key='my_ttree', mode='w', store_index=True, *args, **kwarg sep = '*' else: raise ValueError('Unable to find suitable separator for columns') - df_[sep.join(name_components)] = df_[col].cat.codes - del df_[col] + df_[col] = df_[col].cat.codes + 
df.rename(index=str, columns={col: sep.join(name_components)}, inplace=True) arr = df_.to_records(index=False) array2root(arr, path, key, mode=mode, *args, **kwargs) From 22aaa45fed22aba0b5af49972841dbba3578ecca Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Mon, 30 Jul 2018 11:56:32 +0100 Subject: [PATCH 6/7] And another typo --- root_pandas/readwrite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py index a6d2525..f453912 100644 --- a/root_pandas/readwrite.py +++ b/root_pandas/readwrite.py @@ -317,7 +317,7 @@ def convert_to_dataframe(array, start_index=None): for c in df.columns: match = re.match(r'^__rpCaT\*([^\*]+\*(True|False)\*)', c) if match: - real_name, ordered = match.groups + real_name, ordered = match.groups() categories = c.split('*')[3:] df[c] = pd.Categorical.from_codes(df[c], categories, ordered={'True': True, 'False': False}[ordered]) df.rename(index=str, columns={c: real_name}, inplace=True) @@ -380,7 +380,7 @@ def to_root(df, path, key='my_ttree', mode='w', store_index=True, *args, **kwarg else: raise ValueError('Unable to find suitable separator for columns') df_[col] = df_[col].cat.codes - df.rename(index=str, columns={col: sep.join(name_components)}, inplace=True) + df_.rename(index=str, columns={col: sep.join(name_components)}, inplace=True) arr = df_.to_records(index=False) array2root(arr, path, key, mode=mode, *args, **kwargs) From 7546bfc46935a6ba6c497c2fce6a9aee542afeb9 Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Mon, 30 Jul 2018 12:01:08 +0100 Subject: [PATCH 7/7] And another regex fix --- root_pandas/readwrite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py index f453912..bdd0966 100644 --- a/root_pandas/readwrite.py +++ b/root_pandas/readwrite.py @@ -315,7 +315,7 @@ def convert_to_dataframe(array, start_index=None): # Convert categorical columns back to categories for c in 
df.columns: - match = re.match(r'^__rpCaT\*([^\*]+\*(True|False)\*)', c) + match = re.match(r'^__rpCaT\*([^\*]+)\*(True|False)\*', c) if match: real_name, ordered = match.groups() categories = c.split('*')[3:]