In [1]:
import pathlib
from dataset import Dataset, BAD_MOOD

DATA_DIR = pathlib.Path('data')

# Path.glob yields matches in arbitrary order, so sort them to make the choice
# reproducible: the lexicographically last name is used, which for date-stamped
# exports such as daylio_export_YYYY_MM_DD.csv is the most recent one.
csv_files = sorted(DATA_DIR.glob('*.csv'))
if not csv_files:
    # Fail with an actionable message instead of a bare StopIteration.
    raise FileNotFoundError(f'no CSV export found in {DATA_DIR.resolve()}')
path = csv_files[-1]
print('using file', path.name)

df = Dataset(csv_file_path=path)

using file daylio_export_2024_02_07.csv
Dataset(644 entries; last [1 hour 55 minutes ago]; average mood: 3.850)


# API

### `.head`

Use the `.head` method to look at the latest entries of the dataset

In [2]:
df.head()

Dataset(644 entries; last [1 hour 55 minutes ago]; average mood: 3.850)
[07.02.2024 09:31] 3.5 work, coding, relaxed, bored, did something
[07.02.2024 00:28] 3.5 relaxed, excited, home
[06.02.2024 22:42] 4.0 excited, piano, relaxed, satisfied, home
[06.02.2024 20:10] 4.0 coding, tired, relaxed, happy, home
[06.02.2024 17:17] 4.0 did something, bored, satisfied, coding
...


In [3]:
df.head(2)

Dataset(644 entries; last [1 hour 55 minutes ago]; average mood: 3.850)
[07.02.2024 09:31] 3.5 work, coding, relaxed, bored, did something
[07.02.2024 00:28] 3.5 relaxed, excited, home
...


In [4]:
# prints all entries
df.head(-1)

Dataset(644 entries; last [1 hour 55 minutes ago]; average mood: 3.850)
[07.02.2024 09:31] 3.5 work, coding, relaxed, bored, did something
[07.02.2024 00:28] 3.5 relaxed, excited, home
[06.02.2024 22:42] 4.0 excited, piano, relaxed, satisfied, home
[06.02.2024 20:10] 4.0 coding, tired, relaxed, happy, home
[06.02.2024 17:17] 4.0 did something, bored, satisfied, coding
[06.02.2024 00:44] 5.0 excited, sex, relaxed, satisfied, date, Lauren, nervous
[05.02.2024 20:30] 4.0 grateful, relaxed, unsure, date, Lauren, worried
[05.02.2024 15:58] 3.5 relaxed, nervous
[05.02.2024 14:13] 3.5 study, unsure, exam
[05.02.2024 12:10] 3.5 class, grateful, unsure
[05.02.2024 01:00] 2.0 anxious, sad, date, Lauren, nervous
[04.02.2024 22:28] 4.5 tired, relaxed, satisfied, date, Lauren
[04.02.2024 18:04] 4.0 Damir, tired, relaxed, date, Lauren, swimming
[04.02.2024 13:48] 3.5 relaxed
[04.02.2024 12:20] 4.0 grateful, satisfied, happy, date, Lauren
[04.02.2024 01:09] 5.0 grateful, tired, relaxed, date, Lauren


### `.sub`

Use the `.sub` method to filter entries and get a subset of the original dataset

- by included activities ('or' operator: all the entries which have at least one of the listed activities)

In [5]:
cycling_or_city_df = df.sub(incl_act={'cycling', 'city'})
cycling_or_city_df.head()

Dataset(72 entries; last [10 days 16 hours 39 minutes ago]; average mood: 4.181)
[27.01.2024 18:47] 4.0 tired, city, satisfied
[27.01.2024 16:03] 4.0 tired, bored, satisfied, city
[25.01.2024 20:03] 4.0 shopping, relaxed, satisfied, cycling, home
[13.01.2024 16:48] 4.0 city, excited, grateful, movies & series, date, Lauren, home
[05.01.2024 14:06] 4.0 city, anxious, grateful, unsure, date, Lauren
...


- by excluded activities (entries which don't have any of the listed activities)

_for both `incl_act` and `excl_act` the argument can be either `str` or `set[str]`_

In [6]:
without_friends = df.sub(excl_act='friends')
without_friends.head()

Dataset(549 entries; last [1 hour 55 minutes ago]; average mood: 3.827)
[07.02.2024 09:31] 3.5 work, coding, relaxed, bored, did something
[07.02.2024 00:28] 3.5 relaxed, excited, home
[06.02.2024 22:42] 4.0 excited, piano, relaxed, satisfied, home
[06.02.2024 20:10] 4.0 coding, tired, relaxed, happy, home
[06.02.2024 17:17] 4.0 did something, bored, satisfied, coding
...


- by mood values (can be either a value or a Container of values)

In [7]:
bad_mood = df.sub(mood=BAD_MOOD)
bad_mood

Dataset(21 entries; last [2 days 10 hours 26 minutes ago]; average mood: 2.214)

- by included activities ('and' operator: all entries which have all the listed activities)

_Note: this is achieved by subsetting the dataset twice; since the `.sub` method returns a new dataset, the calls can be chained in one line._

In [8]:
cycling_and_swimming = df.sub(incl_act='cycling').sub(incl_act='swimming')
cycling_and_swimming.head()
print(cycling_and_swimming.activities().most_common(7))

Dataset(10 entries; last [3 months 26 days 15 hours 16 minutes ago]; average mood: 4.850)
[13.10.2023 20:10] 5.0 tired, cycling, swimming
[16.09.2023 20:58] 5.0 tired, relaxed, satisfied, swimming, cycling
[04.09.2023 20:44] 5.0 Damir, tired, relaxed, satisfied, swimming, cycling
[22.08.2023 20:27] 5.0 tired, friends, satisfied, swimming, cycling
[19.08.2023 20:32] 5.0 Damir, tired, relaxed, friends, swimming, cycling
...
[('tired', 10), ('cycling', 10), ('swimming', 10), ('satisfied', 7), ('relaxed', 6), ('friends', 6), ('Damir', 5)]


- by a particular date

In [9]:
import datetime

july22 = df.sub(when=datetime.date(2023, 7, 22))
# july22 = df.sub(when='22.07.2023') # or this
july22.head(n=-1)

Dataset(6 entries; last [6 months 19 days 13 hours 9 minutes ago]; average mood: 3.583)
[22.07.2023 22:17] 3.5 relaxed, home
[22.07.2023 19:34] 4.0 relaxed, chess, home
[22.07.2023 17:09] 3.5 study, relaxed, home, cooking
[22.07.2023 14:22] 3.5 unsure, coding
[22.07.2023 13:30] 3.0 anxious, friends, dota, home
[22.07.2023 11:30] 4.0 home, satisfied


- by a substring (or substrings) in the `note`

In [10]:
df_ktane = df.sub(note_contains='ktane')
df_ktane.head(-1)

Dataset(7 entries; last [2 months 16 days 13 hours 45 minutes ago]; average mood: 4.000)
[22.11.2023 21:41] 4.0 amused, gaming, Leha, movies & series, satisfied, Martina, home
[10.10.2023 22:39] 3.5 amused, gaming, productive, Leha, friends, stressed, home
[06.10.2023 20:45] 4.0 gaming, excited, Leha, friends, happy
[04.10.2023 20:13] 5.0 new place, grateful, movies & series, friends, satisfied, happy, Lauren
[07.07.2023 20:00] 3.5 gaming, study, shopping, coding, Leha, friends, unsure, walking, dota
[03.07.2023 20:05] 4.0 gaming, excited, coding, tired, Leha, friends, happy, stressed, home
[01.07.2023 20:00] 4.0 gaming, excited, movies & series, Leha, relaxed, friends, happy, Martina, home


- by a predicate function (which takes an entry as an argument and returns a boolean)

In [11]:
from dataset import Entry


def pred(entry: Entry) -> bool:
    """Keep only Sunday entries whose mood is above 4 and whose note is not empty."""
    # 6 means Sunday, assuming full_date follows the datetime.weekday()
    # convention (Monday == 0).
    if entry.full_date.weekday() != 6:
        return False
    if not entry.mood > 4:
        return False
    return entry.note != ''


df_sunday_good_mood_has_note = df.sub(predicate=pred)
df_sunday_good_mood_has_note

Dataset(20 entries; last [2 days 12 hours 58 minutes ago]; average mood: 4.775)

### `.mood`

Use the `.mood` method to get the average mood of all the entries in the dataset. This value is also shown when calling `.head()`.

In [12]:
round(cycling_or_city_df.mood(), 3)

4.181

### `.activities`

Use the `.activities` method to get a Counter object of all activities in the dataset

In [13]:
cnt = df.activities()
# The three most frequent activities, then the five rarest (rarest first).
top_three = cnt.most_common(3)
bottom_five = list(reversed(cnt.most_common()))[:5]
print(f'most common: {top_three}')
print(f'least common: {bottom_five}')


most common: [('home', 368), ('satisfied', 218), ('relaxed', 212)]
least common: [('meeting', 1), ('Dad', 1), ('sex', 1), ('guitar', 2), ('homework', 2)]


### `.get_datetimes`

Use the `.get_datetimes` method to get the list of all points in time when an entry was created

In [14]:
df.get_datetimes()[:5]

[datetime.datetime(2024, 2, 7, 9, 31),
 datetime.datetime(2024, 2, 7, 0, 28),
 datetime.datetime(2024, 2, 6, 22, 42),
 datetime.datetime(2024, 2, 6, 20, 10),
 datetime.datetime(2024, 2, 6, 17, 17)]

### `.group_by_day`
Use the `.group_by_day` method to get a dictionary mapping dates to a list of entries created on that date.
The dates are sorted in ascending order.

Note: this method (like many others) is cached with `lru_cache`; this is safe because the `Dataset` is weakly immutable.

In [15]:
groups = df.group_by_day()
# Peek at the first group only; nothing is printed when there are no groups.
first_group = next(iter(groups.items()), None)
if first_group is not None:
    day, entries = first_group
    print(f'{day}: {len(entries)}')

2023-07-01: 1


### `.stats`

Use the `.stats` method to get a custom StatsResult object which contains the following information:

In [16]:
help(df.stats)

Help on method stats in module dataset:

stats() -> utils.StatsResult method of dataset.Dataset instance
    Returns the following statistics:
        - mood (avg ± std)
        - note length [num symbols] (avg ± std)
        - entries frequency [entries per day] (median)
    as a StatsResult object.



In [17]:
df.sub(incl_act='home').stats()

Mood: 3.787 ± 0.528
Note length: 33.046 ± 64.357 symbols
Entries frequency: 2.812 entries per day (once every 8 hours 32 minutes)

### `.__iter__`
`Dataset` defines `__iter__` method, so it can be used in `for` loops. The entries are sorted in descending order by the date of creation.

In [18]:
for entry in df:
    print(entry)
    break
# or
df_iter = iter(df)
print(next(df_iter), next(df_iter), sep='\n')

[07.02.2024 09:31] 3.5 work, coding, relaxed, bored, did something
[07.02.2024 09:31] 3.5 work, coding, relaxed, bored, did something
[07.02.2024 00:28] 3.5 relaxed, excited, home


### `.__getitem__`
`Dataset` defines a `__getitem__` method whose argument is either
- a date in a string format (e.g. `'01.01.2024'`), in which case it returns a list of entries created on that date; or
- a datetime in a string format (e.g. `'01.01.2024 12:00'`), in which case it returns the single entry created at that datetime or raises an error if there are no entries created at that datetime.

Note: the first use case is equivalent to `df.sub(when='01.01.2024').entries[::-1]` (it's reversed because in the root dataset the entries are sorted in descending order by the date of creation).

In [19]:
july22_entries = df['22.07.2023']
assert july22_entries == july22.entries[::-1]
# or
entry = df['22.07.2023 17:09']; print(entry)

[22.07.2023 17:09] 3.5 study, relaxed, home, cooking


# Analysis examples

## Mood analysis

In [20]:
activity = 'cycling'
mood_with, mood_without = df.mood_with_without(activity)

In [21]:
print(f'''"{activity}"
with: {mood_with:.2f}
without: {mood_without:.2f}
change: {(mood_with - mood_without)/mood_without:.1%}'''
)

"cycling"
with: 4.37
without: 3.82
change: 14.3%


## Complete analysis

In [22]:
complete_analysis = df.complete_analysis()

In [23]:
print(f'analysed {len(complete_analysis)} actvities')

# Show the first five and the last five rows of the analysis. With fewer than
# ten rows the two slices would overlap and print some activities twice, so in
# that case every row is shown exactly once instead.
n_edge = 5
if len(complete_analysis) < 2 * n_edge:
    shown_rows = complete_analysis
else:
    shown_rows = complete_analysis[:n_edge] + complete_analysis[-n_edge:]

for _name, _with, _without, _change, _num_occ in shown_rows:
    print(f'[{_name:^15}]: {_change:.1%} (with: {_with:.2f}, without: {_without:.2f}); occured {_num_occ} times')
# TODO: maybe make this a pretty figure?

analysed 47 actvities
[     happy     ]: 18.3% (with: 4.38, without: 3.70); occured 145 times
[   swimming    ]: 16.7% (with: 4.46, without: 3.82); occured 27 times
[   grateful    ]: 15.4% (with: 4.37, without: 3.79); occured 67 times
[    cycling    ]: 14.3% (with: 4.37, without: 3.82); occured 34 times
[     Damir     ]: 13.9% (with: 4.36, without: 3.83); occured 22 times
[     exam      ]: -14.5% (with: 3.30, without: 3.86); occured 10 times
[      sad      ]: -19.0% (with: 3.13, without: 3.87); occured 15 times
[   stressed    ]: -19.1% (with: 3.12, without: 3.86); occured 12 times
[    anxious    ]: -23.1% (with: 3.01, without: 3.92); occured 47 times
[     angry     ]: -40.2% (with: 2.32, without: 3.88); occured 11 times


## Mood plot

In [24]:
df.mood_plot()

## Bar plot by hour/day/weekday/month

In [25]:
df.by_time_bar_plot('hour')

In [26]:
df.by_time_bar_plot('day')

In [27]:
df.by_time_bar_plot('weekday')

In [28]:
df.by_time_bar_plot('month')

## Other features

In [29]:
df.note_length_plot(400)

In [30]:
df_dota = df.sub(incl_act='dota')

In [31]:
df_dota

Dataset(29 entries; last [24 days 22 hours 26 minutes ago]; average mood: 3.397)

In [32]:
df_dota.stats()

Mood: 3.397 ± 0.573
Note length: 12.138 ± 21.037 symbols
Entries frequency: 0.491 entries per day (once every 2 days 0 hour 53 minutes)

In [33]:
df_dota.mood_plot()