In [1]:
import copy

import numpy as np
import pandas as pd

from hypex.dataset.dataset import Dataset, ExperimentData
from hypex.dataset.roles import TargetRole, InfoRole, FeatureRole

# Dataset



### Create Dataset
    Dataset()
    Initializes a new instance of the Dataset class.

        Args:
            roles (Union[Dict[ABCRole, Union[List[Union[str, int]], str, int]], Dict[Union[str, int], ABCRole]]): A dictionary mapping roles to their corresponding column names and types.
            data (Optional[Union[pd.DataFrame, str]]): The data to be used for the dataset. Can be either a pandas DataFrame or a file path. Defaults to None.
            backend (Optional[BackendsEnum]): The backend to be used for the dataset. Defaults to None.

        Returns:
            Dataset: The newly created Dataset.

In [2]:
ds = Dataset({'a': TargetRole(), 'b': TargetRole(float)})
ds

Empty DataFrame
Columns: []
Index: []

In [3]:
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

ds = Dataset({'a': TargetRole(), 'b': TargetRole(float)}, data=df)
ds

   a    b
0  1  4.0
1  2  5.0
2  3  6.0

### Roles
    Dataset.roles
    Returns the roles of the dataset.

In [4]:
ds.roles

{'a': Target(<class 'int'>), 'b': Target(<class 'float'>)}

### Create empty
    Dataset.create_empty()
    Creates an empty Dataset.
        Parameters:
            backend (BackendsEnum, optional): The backend to use for the Dataset. Defaults to BackendsEnum.pandas.
            roles (dict, optional): The roles to assign to the Dataset. Defaults to None.
            index (list, optional): The index to use for the Dataset. Defaults to None.
        Returns:
            Dataset: The newly created empty Dataset.

In [5]:
ds_empty = Dataset.create_empty()

### Columns
    Dataset.columns
    A property to access the columns in the dataset.

In [6]:
ds_empty.columns

Index([], dtype='object')

In [7]:
ds.columns

Index(['a', 'b'], dtype='object')

In [8]:
ds.columns[0]

'a'

### Index
    Dataset.index
    Returns the index of the dataset.

        Returns:
            Any: The index of the dataset.

In [9]:
ds.index

RangeIndex(start=0, stop=3, step=1)

### Backend
    Dataset.backend
    Returns the backend object associated with this instance of the dataset.

        Returns:
            Backend: The backend object.


In [10]:
type(ds.backend)

hypex.dataset.backends.pandas_backend.PandasDataset

In [11]:
ds.backend

   a    b
0  1  4.0
1  2  5.0
2  3  6.0

### Data
    Dataset.data
    Provides direct access to the data of the backend object.

    Returns:
        Any: The data property of the backend object.


In [12]:
type(ds.data)

pandas.core.frame.DataFrame

In [13]:
ds.data

Unnamed: 0,a,b
0,1,4.0
1,2,5.0
2,3,6.0


# Dataset Methods

### From dict
    Dataset.from_dict()
    Create a new `Dataset` object from a dictionary of data and roles.
        Args:
            data (FromDictTypes): The data to be converted into a `Dataset`.
            roles (Union[Dict[ABCRole, Union[List[Union[str, int]], str, int]], Dict[Union[str, int], ABCRole]]):
                A dictionary mapping roles to column names or indices.
            backend (BackendsEnum, optional): The backend to use for the `Dataset`. Defaults to `BackendsEnum.pandas`.
            index (Any, optional): The index to use for the `Dataset`. Defaults to None.
        Returns:
            Dataset: A new `Dataset` object created from the data and roles.


In [14]:
ds_from_dict = Dataset.from_dict({'a': [1, 2], 'b': [3, 4]}, {'a': TargetRole(), 'b': InfoRole()})

In [15]:
ds_from_dict

   a  b
0  1  3
1  2  4

In [16]:
ds_from_dict.roles

{'a': Target(None), 'b': Info(None)}

In [17]:
ds

   a    b
0  1  4.0
1  2  5.0
2  3  6.0

### Search Columns
    Dataset.search_columns()
    Searches for columns in the dataset based on the given roles and data types.

    Args:
        roles (Union[ABCRole, Iterable[ABCRole]]): The roles to search for.
        tmp_role (bool, optional): Whether to search in temporary roles. Defaults to False.
        search_types (Optional[List], optional): The types to search for. Defaults to None.
    
    Returns:
        List[str]: A list of column names matching the given roles.

In [18]:
# Both columns carry a Target role, but 'b' was declared as TargetRole(float),
# so restricting the search to `int` leaves only 'a' (see the output below).
columns_found = ds.search_columns(TargetRole(), search_types=[int])
columns_found

['a']

In [19]:
ds[columns_found]

   a
0  1
1  2
2  3

### Simple math methods

In [20]:
ds.mean()

        a    b
mean  2.0  5.0

In [21]:
ds.count()

       a  b
count  3  3

In [22]:
ds.log()

          a         b
0  0.000000  1.386294
1  0.693147  1.609438
2  1.098612  1.791759

In [23]:
ds.min()

     a    b
min  1  4.0

### Get item
    Retrieves an item from the dataset based on the specified index and returns a new Dataset object containing the retrieved data along with the corresponding roles.
        
        Parameters:
            item: The index or indices to locate the item(s) in the dataset.
            
        Returns:
            Dataset: A new Dataset object containing the retrieved data and roles.

In [24]:
ds[1]

     1
a  2.0
b  5.0

In [25]:
ds['a'][1]

   1
a  2

In [26]:
ds.data[ds.data[['a', 'b']] == 4]

Unnamed: 0,a,b
0,,4.0
1,,
2,,


In [27]:
ds[ds[['a', 'b']] == 4]

    a    b
0 NaN  4.0
1 NaN  NaN
2 NaN  NaN

### Add column
    Dataset.add_column()
    Adds a new column to the dataset. Overwrites the original data.
        Args:
            data (Union[Dataset, Any]): The data to be added as a column. If `role` is not provided, `data`
                must be an instance of `Dataset`. Otherwise, it can be any type of data.
            role (Optional[Dict[str, ABCRole]]): The role of the column. If not provided, the roles from
                `data` will be updated. Defaults to None.
            index (Optional[Iterable[Hashable]]): The index of the column. Defaults to None.
        Returns:
            Dataset: The updated dataset with the new column added.
        Raises:
            ValueError: If `role` is not provided and `data` is not an instance of `Dataset`.


In [28]:
ds.add_column([7, 8, 9], {'c': TargetRole(int)})
ds

   a    b  c
0  1  4.0  7
1  2  5.0  8
2  3  6.0  9

### Map
    Dataset.map()
    Apply a function to each element of the dataset.
        Args:
            func (Callable): The function to apply to each element.
            na_action (Optional[str]): The action to take when encountering missing values.
                Defaults to None.
            **kwargs: Additional keyword arguments to pass to the function.
        Returns:
            Dataset: A new dataset with the function applied to each element.


In [29]:
ds['a'].map(lambda x: x * 2)

   a
0  2
1  4
2  6

### Unique
    Dataset.unique()
    Returns a dictionary containing the unique values in each column of the dataset.
        Returns:
            Dict[str, List[Any]]: The unique values in the dataset.
        

In [30]:
ds.unique()

{'a': array([1, 2, 3]), 'b': array([4., 5., 6.]), 'c': array([7, 8, 9])}

### Number of unique
    Dataset.nunique()
    Returns a dictionary containing the number of unique values for each column in the dataset.

        Args:
            dropna (bool, optional): If True, rows with missing values are not counted. Defaults to False.

        Returns:
            Dict[str, int]: A dictionary where the keys are the column names and the values are the number of unique values in each column.

In [31]:
ds.nunique()

{'a': 3, 'b': 3, 'c': 3}

### Apply
    Dataset.apply()
    Apply a function to a specific column or row of the dataset.
        Args:
            func (Callable): The function to be applied.
            role (Dict[FieldKeyTypes, ABCRole]): A dictionary mapping column names to their roles.
            axis (int, optional): The axis along which the function is applied. Defaults to 0.
                - 0: Apply the function to each column of the dataset.
                - 1: Apply the function to each row of the dataset (as in the example below, where the function reads several columns of one row).
            **kwargs: Additional keyword arguments to be passed to the function.
        Returns:
            Dataset: A new dataset with the applied function.
        

In [32]:
# With axis=1 the lambda receives one row at a time, so x['a'] and x['c'] are
# the values of columns 'a' and 'c' in that row (e.g. 1 + 7 + 7 = 15 for row 0).
# The result of `apply` is a Dataset with the role {"new": InfoRole()}, so
# `add_column` needs no separate `role` argument here.
ds.add_column(ds.apply(func=lambda x: x['a'] + x['c'] + 7, role={"new": InfoRole()}, axis=1))
ds

   a    b  c   new
0  1  4.0  7  15.0
1  2  5.0  8  17.0
2  3  6.0  9  19.0

In [33]:
ds.roles

{'a': Target(<class 'int'>),
 'b': Target(<class 'float'>),
 'c': Target(<class 'int'>),
 'new': Info(<class 'float'>)}

In [34]:
# `index` gives the row label for each value: 7 -> row 2, 8 -> row 0, 9 -> row 1
# (compare with column 'f' in the output below).
ds.add_column([7, 8, 9], {'f': TargetRole(int)}, index=[2, 0, 1])

   a    b  c   new  f
0  1  4.0  7  15.0  8
1  2  5.0  8  17.0  9
2  3  6.0  9  19.0  7

### Set item
    A method to set an item in the Dataset object using a key and value.

        Parameters:
            key (str): The key to identify the item in the Dataset.
            value (Any): The value to be set for the key. If the value is a Dataset, its data attribute is used.

        Returns:
            None


In [35]:
ds['e'] = [1, 2, 3]
ds



   a    b  c   new  f  e
0  1  4.0  7  15.0  8  1
1  2  5.0  8  17.0  9  2
2  3  6.0  9  19.0  7  3

In [36]:
# NOTE(review): `ds['a']` returns a new Dataset (see "Get item" above), so this
# chained assignment appears to act on that temporary object only — the output
# below shows `ds` unchanged (row 1 of 'a' is still 2). Confirm whether this is
# the intended demonstration.
ds['a'][1] = 1
ds

   a    b  c   new  f  e
0  1  4.0  7  15.0  8  1
1  2  5.0  8  17.0  9  2
2  3  6.0  9  19.0  7  3

### Is in
    Dataset.isin()
    Returns a new Dataset object with the same roles as the current Dataset, but with each value replaced by a boolean indicating whether it is contained in the given iterable.

        Args:
            values (Iterable): An iterable of values to match against.

        Returns:
            Dataset: A new Dataset object of boolean values (rows are not removed; see the example below).

In [37]:
ds.isin([3, 8])

       a      b      c    new      f      e
0  False  False  False  False   True  False
1  False  False   True  False  False  False
2   True  False  False  False  False   True

In [38]:
ds.roles

{'a': Target(<class 'int'>),
 'b': Target(<class 'float'>),
 'c': Target(<class 'int'>),
 'new': Info(<class 'float'>),
 'f': Target(<class 'int'>),
 'e': Info(None)}

### Group by
    Dataset.groupby()
    Groups the data in the Dataset object by the specified column(s) and applies optional aggregation functions.
        Parameters:
            by (Any): The column(s) to group the data by.
            func (Optional[Union[str, List]]): The aggregation function(s) to apply to each group. Defaults to None.
            fields_list (Optional[Union[str, List]]): The columns to include in each group. Defaults to None.
            **kwargs: Additional keyword arguments to pass to the underlying backend's groupby method.
        Returns:
            List[Tuple[Any, Dataset]]: A list of tuples, where each tuple contains the group key and the corresponding Dataset object.
        Note:
            - If `fields_list` is provided, only the specified columns will be included in each group.
            - If `func` is provided, the aggregation function(s) will be applied to each group.
            - The `tmp_roles` attribute of each Dataset object in the returned list will be set to the `tmp_roles` attribute of the original Dataset object.
       

In [39]:
groups_func = ds.groupby('a', func='mean')
print(groups_func)

[(1,         a    b    c   new    f    e
mean  1.0  4.0  7.0  15.0  8.0  1.0), (2,         a    b    c   new    f    e
mean  2.0  5.0  8.0  17.0  9.0  2.0), (3,         a    b    c   new    f    e
mean  3.0  6.0  9.0  19.0  7.0  3.0)]


In [40]:
groups = ds.groupby('a')
print(groups)

[(1,    a    b  c   new  f  e
0  1  4.0  7  15.0  8  1), (2,    a    b  c   new  f  e
1  2  5.0  8  17.0  9  2), (3,    a    b  c   new  f  e
2  3  6.0  9  19.0  7  3)]


In [41]:
# Aggregate only column 'e' within each group of 'a', using two functions at once.
# Each group here holds a single row (see the previous cell's output), which is
# why 'var' is NaN in the output below.
groups_func_fields = ds.groupby('a', func=['mean', 'var'], fields_list='e')
print(groups_func_fields)

[(1,         e
mean  1.0
var   NaN), (2,         e
mean  2.0
var   NaN), (3,         e
mean  3.0
var   NaN)]


### Sort
    Dataset.sort()
    Sorts the data in the Dataset object based on the specified fields.

        Parameters:
            by (Optional[MultiFieldKeyTypes]): The fields to sort the data by. If None, the data will be sorted based on the index. Defaults to None.
            ascending (bool): Whether to sort the data in ascending order. Defaults to True.
            **kwargs: Additional keyword arguments to be passed to the sorting function.

        Returns:
            Dataset: A new Dataset object with the sorted data.

In [42]:
ds.sort('a', ascending=False)

   a    b  c   new  f  e
2  3  6.0  9  19.0  7  3
1  2  5.0  8  17.0  9  2
0  1  4.0  7  15.0  8  1

### Fill nans
    Dataset.fillna()
    Fills missing values in the dataset with the specified values.
        Args:
            values (Union[int, Dict[FieldKeyTypes, FieldKeyTypes]]): The values to fill missing data with. If an integer is provided, all missing values will be filled with that integer. If a dictionary is provided, the keys represent the fields to fill missing data in, and the values represent the values to fill the missing data with.
            method (Optional[str]): The method to use for filling missing data. Must be one of "backfill", "bfill", or "ffill". If not provided, missing data will be filled with the specified values.
            **kwargs: Additional keyword arguments to be passed to the fillna function.
        Returns:
            Dataset: A new Dataset object with the filled data.
        Raises:
            NameError: If an unsupported fill method is provided.


In [43]:
# Work on a deep copy: `add_column` modifies the dataset it is called on, and the
# column with a missing value is only needed for the `fillna` demonstration.
fds = copy.deepcopy(ds)
# Use `np.nan`; the `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there.
fds.add_column([0, np.nan, 2], {'g': InfoRole(int)})
fds

   a    b  c   new  f  e    g
0  1  4.0  7  15.0  8  1  0.0
1  2  5.0  8  17.0  9  2  NaN
2  3  6.0  9  19.0  7  3  2.0

In [44]:
fds.fillna(42)

   a    b  c   new  f  e   g
0  1  4.0  7  15.0  8  1   0
1  2  5.0  8  17.0  9  2  42
2  3  6.0  9  19.0  7  3   2

### Aggregation
    Dataset.agg()
    Perform an aggregation operation using the specified function(s) on the dataset.
        
        Parameters:
            func (Union[str, List]): The aggregation function or list of functions to apply.
        
        Returns:
            The aggregated data after applying the specified function(s).

In [45]:
ds.agg(['sum', 'mean'])

        a     b     c   new     f    e
sum   6.0  15.0  24.0  51.0  24.0  6.0
mean  2.0   5.0   8.0  17.0   8.0  2.0

### Index max
    Dataset.idxmax()
    Returns the index of the maximum value in each column of the dataset.
        
        Returns:
            The dataset with the index of the maximum value in each column.

In [46]:
ds[['a', 'b']].idxmax()

        a  b
idxmax  2  2

### Correlation
    Dataset.corr()
    Calculate the correlation matrix of the columns in the dataset with each other.
        Parameters:
            method (str, optional): The method to use for calculating the correlation. Defaults to "pearson".
            numeric_only (bool, optional): Whether to calculate the correlation only for numeric columns. Defaults to False.
        Returns:
            Dataset: A new Dataset object with the correlation matrix.

In [47]:
ds.corr()

     a    b  c  new  f  e
a    1  1.0  1  1.0  0  1
b    1  1.0  1  1.0  0  1
c    1  1.0  1  1.0  0  1
new  1  1.0  1  1.0  0  1
f    0 -0.5  0 -0.5  1  0
e    1  1.0  1  1.0  0  1

### Quantile
    Dataset.quantile()
    Calculate the quantile of the data.
        Args:
            q (float, optional): The quantile to calculate. Defaults to 0.5.
        Returns:
            Dataset: A new Dataset object with the quantile values calculated.

In [48]:
ds.quantile(0.3)

            a    b    c   new    f    e
quantile  1.6  4.6  7.6  16.2  7.6  1.6

### Select dtypes
    Dataset.select_dtypes()
    Selects columns from the dataset based on the data types.
        Args:
            include (Any, optional): A list of data types to include. Defaults to None.
            exclude (Any, optional): A list of data types to exclude. Defaults to None.
        Returns:
            Dataset: A new dataset with the selected columns and roles.

In [49]:
ds.select_dtypes(include='int')

   a  c  f  e
0  1  7  8  1
1  2  8  9  2
2  3  9  7  3

In [50]:
ds.select_dtypes(exclude='int')

     b   new
0  4.0  15.0
1  5.0  17.0
2  6.0  19.0

### Merge
    Dataset.merge()
    Merges this Dataset with another Dataset using specified columns or indices.
        Parameters:
            right: Dataset - The Dataset to merge with.
            on: Optional[FieldKeyTypes] - Columns to join on from both Datasets.
            left_on: Optional[FieldKeyTypes] - Columns from this Dataset to join on.
            right_on: Optional[FieldKeyTypes] - Columns from the other Dataset to join on.
            left_index: bool - Use the index from this Dataset as the merge key.
            right_index: bool - Use the index from the other Dataset as the merge key.
            suffixes: tuple[str, str] - Suffixes to append to overlapping column names.
            how: Literal["left", "right", "outer", "inner", "cross"] - Type of merge to perform.
        Returns:
            Dataset - A new Dataset resulting from the merge operation.
        Note:
            The merge operation is performed on the specified columns or indices. If no columns or indices are specified, the merge is performed on the indexes of both Datasets.

In [51]:
ds.merge(ds)

   a_x  b_x  c_x  new_x  f_x  e_x  a_y  b_y  c_y  new_y  f_y  e_y
0    1  4.0    7   15.0    8    1    1  4.0    7   15.0    8    1
1    2  5.0    8   17.0    9    2    2  5.0    8   17.0    9    2
2    3  6.0    9   19.0    7    3    3  6.0    9   19.0    7    3

In [52]:
ds.merge(ds, left_on='a', right_index=True, suffixes=('_this', '_other'))

   a  a_this  b_this  c_this  new_this  f_this  e_this  a_other  b_other  \
0  1       1     4.0       7      15.0       8       1        2      5.0   
1  2       2     5.0       8      17.0       9       2        3      6.0   

   c_other  new_other  f_other  e_other  
0        8       17.0        9        2  
1        9       19.0        7        3  

### Transpose
    Dataset.transpose()
    Transposes the dataset and assigns the specified roles to the new columns.
        Args:
            roles (Optional[Union[Dict[Union[str, int], ABCRole], List]]): The roles to be assigned to the columns of the transposed dataset. 
                - If a dictionary is provided, the keys are the names of the columns and the values are the roles to be assigned.
                - If a list is provided, the roles are set to default type - Feature, the new column names are taken from the provided list.
                - If not provided, the roles are set to default type - Feature, the new column names are taken from the index of the original dataset.
        Returns:
            Dataset: The transposed dataset.

In [53]:
ds.transpose({'one': FeatureRole(), '2': InfoRole(), 'III': InfoRole()})

      one     2   III
a     1.0   2.0   3.0
b     4.0   5.0   6.0
c     7.0   8.0   9.0
new  15.0  17.0  19.0
f     8.0   9.0   7.0
e     1.0   2.0   3.0

In [54]:
ds.transpose()

        0     1     2
a     1.0   2.0   3.0
b     4.0   5.0   6.0
c     7.0   8.0   9.0
new  15.0  17.0  19.0
f     8.0   9.0   7.0
e     1.0   2.0   3.0

In [55]:
ds.transpose().roles

{0: Feature(<class 'float'>),
 1: Feature(<class 'float'>),
 2: Feature(<class 'float'>)}

In [56]:
ds.transpose(['one', '2', 'III'])

      one     2   III
a     1.0   2.0   3.0
b     4.0   5.0   6.0
c     7.0   8.0   9.0
new  15.0  17.0  19.0
f     8.0   9.0   7.0
e     1.0   2.0   3.0

### Shuffle
    Dataset.shuffle()
    Shuffles the rows of the dataset.
        Parameters:
            random_state (Optional[int]): Optional random state for reproducibility.
        Returns:
            Dataset: A new Dataset object with shuffled data.

In [57]:
ds.shuffle()

   a    b  c   new  f  e
2  3  6.0  9  19.0  7  3
0  1  4.0  7  15.0  8  1
1  2  5.0  8  17.0  9  2

In [58]:
ds.shuffle()

   a    b  c   new  f  e
2  3  6.0  9  19.0  7  3
0  1  4.0  7  15.0  8  1
1  2  5.0  8  17.0  9  2

In [59]:
ds.shuffle(random_state=42)

   a    b  c   new  f  e
0  1  4.0  7  15.0  8  1
1  2  5.0  8  17.0  9  2
2  3  6.0  9  19.0  7  3

### Rename
    Dataset.rename()
    Renames the columns of the dataset based on the provided dictionary of names.
        Args:
            names (Dict[FieldKeyTypes, FieldKeyTypes]): A dictionary mapping old column names to new column names.
        Returns:
            Dataset: A new Dataset object with the renamed columns.

In [60]:
ds.rename({'a': 'A', 'b': 'B', 'c': 'C'})

   A    B  C   new  f  e
0  1  4.0  7  15.0  8  1
1  2  5.0  8  17.0  9  2
2  3  6.0  9  19.0  7  3

### Replace
    Dataset.replace()
    Replaces values in the dataset.
        Args:
            to_replace (Any, optional): The value to be replaced. Defaults to None.
            value (Any, optional): The new value. Defaults to None.
            regex (bool, optional): Whether to use regular expressions. Defaults to False.
        Returns:
            Dataset: A new dataset with the replaced values.

In [61]:
dsr = copy.deepcopy(ds)
dsr.replace(2, 15)

    a    b  c   new  f   e
0   1  4.0  7  15.0  8   1
1  15  5.0  8  17.0  9  15
2   3  6.0  9  19.0  7   3

### Append
    Dataset.append()
    Append the given `other` dataset to the current dataset.
        Args:
            other (Dataset or List[Dataset]): The dataset(s) to append.
            index (bool, optional): Whether to reset the index of the new dataset.
                Defaults to False.
        Returns:
            Dataset: The new dataset after appending.
        Raises:
            ConcatDataError: If the `other` dataset is not an instance of `Dataset`.
            ConcatBackendError: If the backend of the `other` dataset is different from the current dataset.

In [62]:
ds.append(ds)

   a    b  c   new  f  e
0  1  4.0  7  15.0  8  1
1  2  5.0  8  17.0  9  2
2  3  6.0  9  19.0  7  3
0  1  4.0  7  15.0  8  1
1  2  5.0  8  17.0  9  2
2  3  6.0  9  19.0  7  3

In [63]:
# Seed the shuffle so the cell is reproducible on Restart & Run All; with
# random_state=42 the row order matches the seeded shuffle shown earlier.
# `index=True` resets the index of the combined dataset (0..5 instead of 0..2 twice).
ds.shuffle(random_state=42).append(other=ds, index=True)

   a    b  c   new  f  e
0  1  4.0  7  15.0  8  1
1  2  5.0  8  17.0  9  2
2  3  6.0  9  19.0  7  3
3  1  4.0  7  15.0  8  1
4  2  5.0  8  17.0  9  2
5  3  6.0  9  19.0  7  3

# Experiment Data

In [64]:
ed = ExperimentData(ds)

In [65]:
ed.additional_fields

Empty DataFrame
Columns: []
Index: [0, 1, 2]

In [66]:
ed.additional_fields.loc[:, :]

Empty DataFrame
Columns: []
Index: [0, 1, 2]