In [13]:
import pandas as pd
from pathlib import Path

ACTIVITY = 'activity'
DEVICE = 'device'
START_TIME = 'start_time'
END_TIME = 'end_time'

TIME = 'time'
VALUE = 'value'
NAME = 'name'

def _fix_data(path, fp_corr):
    """
    as the data is very inconsistent with tabs and spaces this is to make it alright again
    produces: 
        date time,id,value,activity 
    """
    with open(path, 'r') as f_o, open(fp_corr, 'w') as f_t:
        for i, line in enumerate(f_o.readlines()):
            s = line[:-1].split('\t')

            # one tab \t to much 
            if i == 10285:
                s.remove('')

            # the value ON is mislabeled as ON0 
            if i == 275005:
                s[2] = 'ON'

            # The value on of device M019 is mislabeled as O
            if i == 433139:
                s[2] = 'ON'

            # The value on of device M022 is mislabeled as ON`
            if i == 174353:
                s[2] = 'ON'


            new_line = ",".join(s)
            try:
                s[4] # test if there is an activity
                new_line += "," + " ".join(s[4:])
            except IndexError as e:
                pass

            assert len(s) in [3, 4]
                
            f_t.write(new_line + "\n")
        f_t.close()
        f_o.close()

def _get_devices_df(df):
    df = df.copy().drop(ACTIVITY, axis=1)
    bin_mask = (df[VALUE] == 'ON') | (df[VALUE] == 'OFF')

    # preprocess only binary devices to ON-OFF--> False True
    df_binary = df[bin_mask]
    df_binary[VALUE] = (df_binary[VALUE] == 'ON')

    # preprocess only numeric devices
    num_mask = pd.to_numeric(df[VALUE], errors='coerce').notnull()
    df_num = df[num_mask]
    df_num[VALUE] = df_num[VALUE].astype(float)

    # preprocess categorical devices
    df_cat = df[~num_mask & ~bin_mask]

    # join datasets
    df = pd.concat([df_cat, df_binary, df_num], axis=0, ignore_index=True)
    df.columns = [TIME, DEVICE, VALUE]
    df = df.sort_values(by=TIME).reset_index(drop=True)

    return df


def _get_activity_df(df):
    # get all rows containing activities
    df = df.copy()[~df[ACTIVITY].isnull()][[START_TIME, ACTIVITY]]
    df[ACTIVITY] = df[ACTIVITY].astype(str).apply(lambda x: x.strip())

    act_list = list(df[ACTIVITY].unique())
    act_list.sort()
    
    new_df_lst = []
    for i in range(1, len(act_list), 2):
        activity = ' '.join(act_list[i].split(' ')[:-1])
        act_begin = act_list[i-1]
        act_end = act_list[i]
        assert activity in act_begin and activity in act_end
           
        # create subsets for begin and end of chosen activity
        df_res = df[df[ACTIVITY] == act_begin].reset_index(drop=True)
        df_end = df[df[ACTIVITY] == act_end].reset_index(drop=True)
        #assert len(df_res) == len(df_end)
        
        # append sorted end_time to start_time as they should be
        # pairwise together
        df_res[ACTIVITY] = activity
        df_res[END_TIME] = df_end[START_TIME]
        new_df_lst.append(df_res)
    
    # data preparation
    res = pd.concat(new_df_lst)
    res = res.reindex(columns=[START_TIME, END_TIME, ACTIVITY])
    res = res.sort_values(START_TIME)
    res = res.reset_index(drop=True)
    return res

In [14]:
def load_data(folder_path):
    fp = folder_path.joinpath("data")

    fp_corr = folder_path.joinpath('corrected_data.csv')
    _fix_data(fp, fp_corr)

    df = pd.read_csv(fp_corr,
                    sep=",",
                    #parse_dates=True,
                    infer_datetime_format=True,
                    na_values=True,
                    names=[START_TIME, 'id', VALUE, ACTIVITY],
                    engine='python'  #to ignore warning for fallback to python engine because skipfooter
                    #dtyp
                    )
    df[START_TIME] = pd.to_datetime(df[START_TIME],format='mixed')
    df = df.sort_values(by=START_TIME).reset_index(drop=True)

    df_dev = _get_devices_df(df)
    df_act = _get_activity_df(df)

    lst_act = df_act[ACTIVITY].unique()
    lst_dev = df_dev[DEVICE].unique()

    return dict(
        activities=df_act,
        devices=df_dev,
        activity_list=lst_act,
        device_list=lst_dev
    )

In [15]:
dataset = load_data(Path("../dataset/casas/Milan"))

print("Activities:")
print(dataset['activities'].head())

print("\nDevices:")
print(dataset['devices'].head())

  df = pd.read_csv(fp_corr,


Activities:
                  start_time                   end_time          activity
0 2009-10-16 03:55:53.000080 2009-10-16 03:58:28.000002     Bed_to_Toilet
1 2009-10-16 03:58:44.000068 2009-10-16 08:40:01.000075             Sleep
2 2009-10-16 08:42:01.000077 2009-10-16 08:42:56.000081      Morning_Meds
3 2009-10-16 08:43:59.000024 2009-10-16 08:44:29.000026          Watch_TV
4 2009-10-16 08:45:38.000076 2009-10-16 08:58:52.000004  Kitchen_Activity

Devices:
                        time device  value
0 2009-10-16 00:01:04.000059   M017   True
1 2009-10-16 00:01:06.000046   M009   True
2 2009-10-16 00:01:07.000064   M017  False
3 2009-10-16 00:01:08.000081   M019   True
4 2009-10-16 00:01:09.000028   M009  False


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_binary[VALUE] = (df_binary[VALUE] == 'ON')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_num[VALUE] = df_num[VALUE].astype(float)
