In [1]:
import pathlib
from dataset import Dataset, BAD_MOOD

DATA_DIR = pathlib.Path('data')

# Collect every CSV export up front so that an empty directory produces a clear
# error instead of the bare StopIteration raised by next() on an empty glob.
csv_files = sorted(DATA_DIR.glob('*.csv'))
if not csv_files:
    raise FileNotFoundError(f'no *.csv export found in {DATA_DIR.resolve()}')

# glob() yields files in no guaranteed order; taking the lexicographically
# greatest path makes the choice deterministic and, for date-stamped names
# such as daylio_export_YYYY_MM_DD.csv, selects the most recent export.
path = csv_files[-1]
print('using file', path.name)

df = Dataset(csv_file_path=path)

using file daylio_export_2024_01_29.csv
Dataset(610 entries; last [39 minutes ago]; average mood: 3.852)


# API

### `.head`

Use the `.head` method to look at the latest entries of the dataset

In [2]:
df.head()

Dataset(610 entries; last [39 minutes ago]; average mood: 3.852)
[29.01.2024 13:50] 3.5 procrastinating, home, satisfied
[29.01.2024 00:11] 4.0 home, relaxed, satisfied
[28.01.2024 20:06] 3.5 study, walking, home, satisfied
[28.01.2024 16:16] 3.5 bored, relaxed, procrastinating, home
[27.01.2024 18:47] 4.0 city, satisfied, tired
...


In [3]:
df.head(2)

Dataset(610 entries; last [39 minutes ago]; average mood: 3.852)
[29.01.2024 13:50] 3.5 procrastinating, home, satisfied
[29.01.2024 00:11] 4.0 home, relaxed, satisfied
...


In [4]:
# prints all entries
df.head(-1)

Dataset(610 entries; last [39 minutes ago]; average mood: 3.852)
[29.01.2024 13:50] 3.5 procrastinating, home, satisfied
[29.01.2024 00:11] 4.0 home, relaxed, satisfied
[28.01.2024 20:06] 3.5 study, walking, home, satisfied
[28.01.2024 16:16] 3.5 bored, relaxed, procrastinating, home
[27.01.2024 18:47] 4.0 city, satisfied, tired
[27.01.2024 16:03] 4.0 bored, satisfied, city, tired
[27.01.2024 10:59] 3.5 excited
[27.01.2024 09:00] 4.0 family, Mom, satisfied, grateful
[26.01.2024 23:03] 3.5 piano, satisfied, tired
[26.01.2024 20:54] 4.0 swimming, Damir, satisfied, tired
[26.01.2024 15:24] 4.0 home, relaxed, satisfied
[26.01.2024 13:55] 3.5 home, class
[26.01.2024 12:14] 3.0 study, unsure, home, worried
[25.01.2024 23:13] 3.5 home, coding
[25.01.2024 20:03] 4.0 shopping, cycling, relaxed, home, satisfied
[25.01.2024 19:30] 4.0 Mom, amused, Azat, home, family
[25.01.2024 16:02] 3.5 anxious, Leha, friends, gaming, unsure, home
[25.01.2024 01:03] 4.0 excited, home, coding, tired
[24.01.2024 

### `.sub`

Use the `.sub` method to filter entries and get a subset of the original dataset

- by included activities ('or' operator: all the entries which have at least one of the listed activities)

In [5]:
cycling_or_city_df = df.sub(incl_act={'cycling', 'city'})
cycling_or_city_df.head()

Dataset(72 entries; last [1 day 19 hours 42 minutes ago]; average mood: 4.181)
[27.01.2024 18:47] 4.0 city, satisfied, tired
[27.01.2024 16:03] 4.0 bored, satisfied, city, tired
[25.01.2024 20:03] 4.0 shopping, cycling, relaxed, home, satisfied
[13.01.2024 16:48] 4.0 Lauren, movies & series, excited, grateful, city, date, home
[05.01.2024 14:06] 4.0 Lauren, anxious, grateful, city, date, unsure
...


- by excluded activities (entries which don't have any of the listed activities)

_for both `incl_act` and `excl_act` the argument can be either `str` or `set[str]`_

In [6]:
without_friends = df.sub(excl_act='friends')
without_friends.head()

Dataset(519 entries; last [39 minutes ago]; average mood: 3.829)
[29.01.2024 13:50] 3.5 procrastinating, home, satisfied
[29.01.2024 00:11] 4.0 home, relaxed, satisfied
[28.01.2024 20:06] 3.5 study, walking, home, satisfied
[28.01.2024 16:16] 3.5 bored, relaxed, procrastinating, home
[27.01.2024 18:47] 4.0 city, satisfied, tired
...


- by mood values (can be either a value or a Container of values)

In [7]:
bad_mood = df.sub(mood=BAD_MOOD)
bad_mood

Dataset(20 entries; last [5 days 6 hours 19 minutes ago]; average mood: 2.225)

- by included activities ('and' operator: all entries which have all the listed activities)

_Note: this is achieved by calling `.sub` twice in a chain; since the `.sub` method returns a new dataset, it can be done in one line._

In [8]:
cycling_and_swimming = df.sub(incl_act='cycling').sub(incl_act='swimming')
cycling_and_swimming.head()
print(cycling_and_swimming.activities().most_common(7))

Dataset(10 entries; last [3 months 17 days 18 hours 19 minutes ago]; average mood: 4.850)
[13.10.2023 20:10] 5.0 swimming, cycling, tired
[16.09.2023 20:58] 5.0 swimming, relaxed, tired, cycling, satisfied
[04.09.2023 20:44] 5.0 swimming, relaxed, tired, Damir, cycling, satisfied
[22.08.2023 20:27] 5.0 swimming, friends, tired, cycling, satisfied
[19.08.2023 20:32] 5.0 swimming, friends, relaxed, tired, Damir, cycling
...
[('swimming', 10), ('cycling', 10), ('tired', 10), ('satisfied', 7), ('relaxed', 6), ('friends', 6), ('Damir', 5)]


- by a particular date

In [9]:
import datetime

july22 = df.sub(when=datetime.date(2023, 7, 22))
# july22 = df.sub(when='22.07.2023') # or this
july22.head(n=-1)

Dataset(6 entries; last [6 months 10 days 16 hours 12 minutes ago]; average mood: 3.583)
[22.07.2023 22:17] 3.5 home, relaxed
[22.07.2023 19:34] 4.0 home, relaxed, chess
[22.07.2023 17:09] 3.5 study, home, cooking, relaxed
[22.07.2023 14:22] 3.5 unsure, coding
[22.07.2023 13:30] 3.0 anxious, friends, dota, home
[22.07.2023 11:30] 4.0 home, satisfied


- by a substring (or substrings) in the `note`

In [10]:
df_ktane = df.sub(note_contains='ktane')
df_ktane.head(-1)

Dataset(7 entries; last [2 months 7 days 16 hours 48 minutes ago]; average mood: 4.000)
[22.11.2023 21:41] 4.0 Martina, movies & series, amused, Leha, gaming, home, satisfied
[10.10.2023 22:39] 3.5 amused, Leha, friends, stressed, gaming, productive, home
[06.10.2023 20:45] 4.0 excited, friends, Leha, gaming, happy
[04.10.2023 20:13] 5.0 Lauren, new place, movies & series, friends, grateful, happy, satisfied
[07.07.2023 20:00] 3.5 study, walking, shopping, friends, dota, Leha, gaming, coding, unsure
[03.07.2023 20:05] 4.0 excited, friends, Leha, tired, stressed, gaming, coding, happy, home
[01.07.2023 20:00] 4.0 Martina, movies & series, excited, relaxed, friends, Leha, gaming, happy, home


- by a predicate function (which takes an entry as an argument and returns a boolean)

In [11]:
from dataset import Entry


def pred(entry: Entry) -> bool:
    """Return True for an entry made on a Sunday with mood above 4 and a non-empty note."""
    # weekday() counts from Monday == 0, so 6 is Sunday.
    if entry.full_date.weekday() != 6:
        return False
    # Checked only for Sunday entries, mirroring left-to-right short-circuiting.
    if not entry.mood > 4:
        return False
    return entry.note != ''


df_sunday_good_mood_has_note = df.sub(predicate=pred)
df_sunday_good_mood_has_note

Dataset(18 entries; last [7 days 15 hours 15 minutes ago]; average mood: 4.778)

### `.mood`

Use the `.mood` method to get the average mood of all the entries in the dataset. This value is also shown when calling `.head()`.

In [12]:
round(cycling_or_city_df.mood(), 3)

4.181

### `.activities`

Use the `.activities` method to get a Counter object of all activities in the dataset

In [13]:
# Counter of how many times each activity occurs in the dataset.
cnt = df.activities()
print(f'most common: {cnt.most_common(3)}')
# most_common() without an argument lists every activity from most to least frequent;
# the [-1:-6:-1] slice walks that list backwards from its end, i.e. the (up to) five
# rarest activities, rarest first.
print(f'least common: {cnt.most_common()[-1:-6:-1]}')


most common: [('home', 354), ('satisfied', 205), ('relaxed', 194)]
least common: [('meeting', 1), ('Dad', 1), ('guitar', 2), ('homework', 2), ('snacking', 3)]


### `.get_datetimes`

Use the `.get_datetimes` method to get the list of all points in time when an entry was created

In [14]:
df.get_datetimes()[:5]

[datetime.datetime(2024, 1, 29, 13, 50),
 datetime.datetime(2024, 1, 29, 0, 11),
 datetime.datetime(2024, 1, 28, 20, 6),
 datetime.datetime(2024, 1, 28, 16, 16),
 datetime.datetime(2024, 1, 27, 18, 47)]

### `.group_by_day`
Use the `.group_by_day` method to get a dictionary mapping dates to a list of entries created on that date.
The dates are sorted in ascending order.

Note: this method (like many others) is cached with `lru_cache`; this is safe because the Dataset is weakly immutable.

In [15]:
groups = df.group_by_day()
# Peek at the first (date, entries) pair only; nothing is printed if there are no groups.
first_group = next(iter(groups.items()), None)
if first_group is not None:
    day, entries = first_group
    print(f'{day}: {len(entries)}')

2023-07-01: 1


### `.stats`

Use the `.stats` method to get a custom StatsResult object which contains the following information:

In [16]:
help(df.stats)

Help on method stats in module dataset:

stats() -> utils.StatsResult method of dataset.Dataset instance
    Returns the following statistics:
        - mood (avg ± std)
        - note length [num symbols] (avg ± std)
        - entries frequency [entries per day] (median)
    as a StatsResult object.



In [17]:
df.sub(excl_act='home').stats()

Mood: 3.945 ± 0.633
Note length: 48.852 ± 49.946 symbols
Entries frequency: 1.667 entries per day

### `.__iter__`
`Dataset` defines the `__iter__` method, so it can be used in `for` loops. The entries are yielded in descending order by the date of creation (newest first).

In [18]:
# A `for` loop walks the entries one by one; break after the first to keep the output short.
for entry in df:
    print(entry)
    break
# or drive the iterator by hand: each next() call yields the following entry
df_iter = iter(df)
print(next(df_iter), next(df_iter), sep='\n')

[29.01.2024 13:50] 3.5 procrastinating, home, satisfied
[29.01.2024 13:50] 3.5 procrastinating, home, satisfied
[29.01.2024 00:11] 4.0 home, relaxed, satisfied


### `.__getitem__`
`Dataset` defines the `__getitem__` method, which accepts either:
- a date string (e.g. `'01.01.2024'`), in which case it returns a list of the entries created on that date; or
- a datetime string (e.g. `'01.01.2024 12:00'`), in which case it returns the single entry created at that datetime, or raises an error if there is no such entry.

Note: the first use case is equivalent to `df.sub(when='01.01.2024').entries[::-1]` (it's reversed because in the root dataset the entries are sorted in descending order by the date of creation).

In [19]:
# Date-only key: the list of entries created on that day.
july22_entries = df['22.07.2023']
# Same entries as the `july22` subset built earlier, in reverse order.
assert july22_entries == july22.entries[::-1]
# Date-and-time key: the single entry created at that datetime.
entry = df['22.07.2023 17:09']
print(entry)

[22.07.2023 17:09] 3.5 study, home, cooking, relaxed


# Analysis examples

## Mood analysis

In [20]:
activity = 'cycling'
mood_with, mood_without = df.mood_with_without(activity)

In [21]:
print(f'''"{activity}"
with: {mood_with:.2f}
without: {mood_without:.2f}
change: {(mood_with - mood_without)/mood_without:.1%}'''
)

"cycling"
with: 4.37
without: 3.82
change: 14.3%


## Complete analysis

In [22]:
complete_analysis = df.complete_analysis()

In [23]:
# NOTE(review): the spellings 'actvities' and 'occured' in the strings below are kept
# as-is so that the printed text keeps matching the recorded output.
print(f'analysed {len(complete_analysis)} actvities')
# Show the five leading and the five trailing rows of the analysis. With ten rows
# or fewer the two slices would overlap and print some rows twice, so in that case
# every row is shown exactly once instead.
if len(complete_analysis) > 10:
    _shown = complete_analysis[:5] + complete_analysis[-5:]
else:
    _shown = complete_analysis
# Each row unpacks into (name, mood with, mood without, relative change, occurrences).
# The loop variables are underscore-prefixed, so they do not overwrite the
# `mood_with` / `mood_without` values computed in an earlier cell.
for _name, _with, _without, _change, _num_occ in _shown:
    print(f'[{_name:^15}]: {_change:.1%} (with: {_with:.2f}, without: {_without:.2f}); occured {_num_occ} times')
# TODO: maybe make this a pretty figure?

analysed 46 actvities
[     happy     ]: 18.6% (with: 4.38, without: 3.69); occured 140 times
[   swimming    ]: 17.2% (with: 4.48, without: 3.82); occured 26 times
[   grateful    ]: 16.0% (with: 4.40, without: 3.79); occured 60 times
[     Damir     ]: 14.3% (with: 4.38, without: 3.83); occured 21 times
[    cycling    ]: 14.3% (with: 4.37, without: 3.82); occured 34 times
[    worried    ]: -12.9% (with: 3.36, without: 3.86); occured 11 times
[      sad      ]: -16.9% (with: 3.21, without: 3.87); occured 14 times
[   stressed    ]: -18.9% (with: 3.14, without: 3.87); occured 11 times
[    anxious    ]: -22.6% (with: 3.03, without: 3.92); occured 46 times
[     angry     ]: -40.3% (with: 2.32, without: 3.88); occured 11 times


## Mood plot

In [24]:
df.mood_plot()

## Bar plot by hour/day/weekday/month

In [25]:
df.by_time_bar_plot('hour')

In [26]:
df.by_time_bar_plot('day')

In [27]:
df.by_time_bar_plot('weekday')

In [28]:
df.by_time_bar_plot('month')

## Other features

In [29]:
df.note_length_plot(400)