In [4]:
import pandas as pd
from pathlib import Path

ACTIVITY = 'activity'
DEVICE = 'device'
START_TIME = 'start_time'
END_TIME = 'end_time'

TIME = 'time'
VALUE = 'value'
NAME = 'name'

def _fix_line(s, i):

        if i == 2082109:
            # Activity work end was not begun in the first place -> remove Activity 
            s = s[:-2]
        if i == 2082361:
            # Activity work begins but never ends -> remove Activity
            s = s[:-2]

        return s

def _get_devices_df(df):
    df = df.copy().drop(ACTIVITY, axis=1)
    bin_mask = (df[VALUE] == 'ON') | (df[VALUE] == 'OFF')

    # preprocess only binary devices to ON-OFF--> False True
    df_binary = df[bin_mask]
    df_binary[VALUE] = (df_binary[VALUE] == 'ON')

    # preprocess only numeric devices
    num_mask = pd.to_numeric(df[VALUE], errors='coerce').notnull()
    df_num = df[num_mask]
    df_num[VALUE] = df_num[VALUE].astype(float)

    # preprocess categorical devices
    df_cat = df[~num_mask & ~bin_mask]

    # join datasets
    df = pd.concat([df_cat, df_binary, df_num], axis=0, ignore_index=True)
    df.columns = [TIME, DEVICE, VALUE]
    df = df.sort_values(by=TIME).reset_index(drop=True)

    return df


def _get_activity_df(df):
    # get all rows containing activities
    df = df.copy()[~df[ACTIVITY].isnull()][[START_TIME, ACTIVITY]]
    df[ACTIVITY] = df[ACTIVITY].astype(str).apply(lambda x: x.strip())

    act_list = list(df[ACTIVITY].unique())
    act_list.sort()
    
    new_df_lst = []
    for i in range(1, len(act_list), 2):
        activity = ' '.join(act_list[i].split(' ')[:-1])
        act_begin = act_list[i-1]
        act_end = act_list[i]
        assert activity in act_begin and activity in act_end
           
        # create subsets for begin and end of chosen activity
        df_res = df[df[ACTIVITY] == act_begin].reset_index(drop=True)
        df_end = df[df[ACTIVITY] == act_end].reset_index(drop=True)
        #assert len(df_res) == len(df_end)
        
        # append sorted end_time to start_time as they should be
        # pairwise together
        df_res[ACTIVITY] = activity
        df_res[END_TIME] = df_end[START_TIME]
        new_df_lst.append(df_res)
    
    # data preparation
    res = pd.concat(new_df_lst)
    res = res.reindex(columns=[START_TIME, END_TIME, ACTIVITY])
    res = res.sort_values(START_TIME)
    res = res.reset_index(drop=True)
    return res

In [None]:
class ActivityDict(dict):
    """ Dictionary with activity pd.DataFrames as values and subject names as keys.
    """

    def __init__(self, obj=None):

        if isinstance(obj, pd.DataFrame):
            obj = obj.copy().reset_index(drop=True)
            super().__init__({'subject': obj})
        elif isinstance(obj, list):
            if isinstance(obj[0], tuple):
                super().__init__({name: df for (name, df) in obj})
            else:
                super().__init__({f'subject_{i}': df for i, df in enumerate(obj)})
        elif isinstance(obj, ActivityDict) or isinstance(obj, dict):
            super().__init__(obj)
        else:
            super().__init__()

    def subjects(self) -> list:
        return list(self.keys())

    def to_json(self, date_unit="ns"):
        """Serialize to json"""
        tmp = {}
        for k, df in self.items():
            tmp[k] = df.to_json(date_unit=date_unit)

        return json.dumps(tmp)

    def read_json(cls, string):
        """Serialize from json"""
        tmp = json.loads(string)
        for k, str in tmp.items():
            tmp[k] = pd.read_json(str)
        return ActivityDict(tmp)

    def nr_acts(self):
        """"""
        return max([len(df_acts[ACTIVITY].unique()) for df_acts in self.values()])

    def get_activity_union(self):
        return list(set([item for v in self.values()
                         for item in v[ACTIVITY].unique()]))

    def apply(self, func):
        """ Applies a function to each dataframe
        """
        for k, df in self.items():
            self[k] = func(df)
        return self

    def min_starttime(self):
        min_lst = []
        for df_acts in self.values():
            if not df_acts.empty:
                min_lst.append(df_acts[START_TIME].iloc[0])
        return min(min_lst)

    def max_endtime(self):
        max_lst = []
        for df_acts in self.values():
            if not df_acts.empty:
                max_lst.append(df_acts[END_TIME].iloc[-1])
        return max(max_lst)

    def concat(self):
        return pd.concat(self.values())

    def copy(self):
        """ Returns a deep copy of itsself
        """
        return ActivityDict({k: v.copy() for k, v in self.items()})

    @classmethod
    def wrap(cls, df_acts):
        if isinstance(df_acts, pd.DataFrame):
            df_acts = df_acts.copy().reset_index(drop=True)  # TODO not here
            df_acts = ActivityDict({'subject': df_acts})
            return df_acts
        elif isinstance(df_acts, list):
            return ActivityDict({f'subject_{i}': df for i, df in enumerate(df_acts)})
        elif isinstance(df_acts, ActivityDict):
            return df_acts
        elif isinstance(df_acts, dict):
            return ActivityDict(df_acts)
        else:
            raise NotImplementedError

    def unwrap(self, inst_type: type):
        if inst_type == ActivityDict:
            return self
        elif inst_type == list:
            return list(self.values())
        elif inst_type == dict:
            return super(self)
        elif inst_type == pd.DataFrame:
            assert len(self) == 1
            return list(self.values())[0]
        else:
            raise NotImplementedError


In [15]:
def load_kyoto_dataset(data_dir: str):
    data_dir = Path(data_dir)

    raw_path = data_dir / 'data'
    corrected_path = data_dir / 'corrected_data.csv'
    
    with open(raw_path, 'r') as f_o, open(corrected_path, 'w') as f_t:
            delimiter = ';'
            for i, line in enumerate(f_o.readlines()):

                # Seperate with tabs and whitespaces and remove empty sets
                s = [sub.split(' ') for sub in line[:-1].split('\t')]
                s = [subsub for sub in s for subsub in sub]
                s = [item for item in s if item != '']

                if not s:
                    # the case for empty lines
                    continue

                # Join timestamp
                s = [' '.join([s[0], s[1]])] + s[2:]

                try:
                    s = _fix_line(s, i)
                except ValueError:
                    continue

                new_line = delimiter.join(s[:3])

                try:
                    s[4] # test if there is an activity
                    new_line += delimiter + " ".join(s[3:])
                except IndexError as e:
                    pass

                f_t.write(new_line + "\n")
            f_t.close()
            f_o.close()
    
    df = pd.read_csv(corrected_path,
                    sep=';',
                    infer_datetime_format=True,
                    na_values=True,
                    names=[START_TIME, 'id', VALUE, ACTIVITY],
                    )
    print(df.iloc[72122:72126])
    # df[START_TIME] = df[START_TIME].apply(lambda x: x.strip())
    df[START_TIME] = pd.to_datetime(df[START_TIME],format="mixed")
    df = df.sort_values(by=START_TIME)\
           .drop_duplicates()
    # Drop when a device is na 
    df = df[~df.iloc[:, :3].isna().any(axis=1)].reset_index(drop=True)
    df_dev = _get_devices_df(df)
    df_act = _get_activity_df(df)
    

    lst_act_res1 = [
        'R1_Wandering_in_room',
        'R1_Sleep',
        'R1_Bed_Toilet_Transition',
        'R1_Personal_Hygiene',
        'R1_Bathing',
        'R1_Work',
        'R1_Meal_Preparation',
        'R1_Leave_Home',
        'R1_Enter_Home',
        'R1_Eating',
        'R1_Watch_TV',
        'R1_Housekeeping',
        'R1_Sleeping_Not_in_Bed'
    ]
    lst_act_res2 = [
        'R2_Wandering_in_room',
        'R2_Meal_Preparation',
        'R2_Eating',
        'R2_Work',
        'R2_Bathing',
        'R2_Leave_Home',
        'R2_Watch_TV',
        'R2_Bed_Toilet_Transition',
        'R2_Enter_Home',
        'R2_Sleep',
        'R2_Personal_Hygiene',
        'R2_Sleeping_Not_in_Bed'
    ]
    dct_act = ActivityDict({
        'resident_1': df_act[df_act[ACTIVITY].isin(lst_act_res1)],
        'resident_2': df_act[df_act[ACTIVITY].isin(lst_act_res2)],
        })

    lst_act = df_act[ACTIVITY].unique()
    lst_dev = df_dev[DEVICE].unique()

    lst_act = df_act[ACTIVITY].unique()
    lst_dev = df_dev[DEVICE].unique()

    return dict(
        activities=dct_act,
        devices=df_dev,
        activity_list=lst_act,
        device_list=lst_dev
    )


In [19]:
dataset = load_kyoto_dataset(Path("../dataset/casas/kyoto2010"))

print("Activities:")
print(dataset['activities']['resident_1'].head())
print(dataset['activities']['resident_2'].head())

print("\nDevices:")
print(dataset['devices'].head())

  df = pd.read_csv(corrected_path,


                       start_time    id value activity
72122  2009-08-30 10:44:06.083411  M048    ON      NaN
72123  2009-08-30 10:44:46.026873  P001  4727      NaN
72124  2009-08-30 10:44:52.005653  M046    ON      NaN
72125         2009-08-30 10:44:53  M047    ON      NaN


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_binary[VALUE] = (df_binary[VALUE] == 'ON')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_num[VALUE] = df_num[VALUE].astype(float)


Activities:
                  start_time                   end_time              activity
0 2009-08-24 00:00:19.034964 2009-08-24 00:04:36.042376  R1_Wandering_in_room
1 2009-08-24 00:04:38.039369 2009-08-24 00:05:21.041611              R1_Sleep
2 2009-08-24 00:05:23.013091 2009-08-24 00:08:30.090362  R1_Wandering_in_room
3 2009-08-24 00:08:31.034365 2009-08-24 00:10:14.017269              R1_Sleep
5 2009-08-24 00:15:47.003067 2009-08-24 00:16:27.004872              R1_Sleep
                   start_time                   end_time              activity
4  2009-08-24 00:15:25.059479 2009-08-24 07:07:50.044921              R2_Sleep
17 2009-08-24 07:07:57.081426 2009-08-24 07:10:43.063058   R2_Personal_Hygiene
19 2009-08-24 07:11:07.052461 2009-08-24 07:43:18.019302              R2_Sleep
20 2009-08-24 07:43:21.019996 2009-08-24 07:47:09.082372  R2_Wandering_in_room
21 2009-08-24 07:47:34.061355 2009-08-24 07:53:53.087228   R2_Meal_Preparation

Devices:
                        time device 