In [1]:
import pathlib
from dataset import Dataset, BAD_MOOD

DATA_DIR = pathlib.Path('data')

# glob() yields files in no guaranteed order, so sort by name to make the
# choice reproducible. With export names like `daylio_export_YYYY_MM_DD.csv`
# the last name is the most recent export (assumes that naming scheme).
csv_files = sorted(DATA_DIR.glob('*.csv'))
if not csv_files:
    raise FileNotFoundError(f'no *.csv export found in {DATA_DIR.resolve()}')
path = csv_files[-1]
print('using file', path.name)

df = Dataset(csv_file_path=path)
df.stats()

using file daylio_export_2024_07_11.csv
Dataset(1274 entries; last [46 minutes 28 seconds ago]; mood: 3.811 ± 0.525)


Mood: 3.811 ± 0.525
Note length: 44.495 ± 53.236 symbols
Entries frequency: 5.806 entries per day (once every 4 hours 8 minutes)

# API

### `.head`

Use the `.head` method to look at the latest entries of the dataset

In [2]:
df.head()

Dataset(1274 entries; last [46 minutes 29 seconds ago]; mood: 3.811 ± 0.525)
[11.07.2024 23:00] 4.5 Leha, excited, factorio, friends, home, relaxed, satisfied
[11.07.2024 19:05] 4.0 excited, home, piano
[11.07.2024 16:50] 4.0 coding, excited, relaxed, work, worked
[10.07.2024 23:30] 4.0 excited, home, movies & series, satisfied
[10.07.2024 20:59] 4.0 guitar, home, relaxed, satisfied
...


In [3]:
df.head(2)

Dataset(1274 entries; last [46 minutes 29 seconds ago]; mood: 3.811 ± 0.525)
[11.07.2024 23:00] 4.5 Leha, excited, factorio, friends, home, relaxed, satisfied
[11.07.2024 19:05] 4.0 excited, home, piano
...


In [None]:
# prints all entries
df.head(-1)

### `.sub`

Use the `.sub` method to filter entries and get a subset of the original dataset

- by included activities ('or' operator: all the entries which have at least one of the listed activities)

In [5]:
cycling_or_city_df = df.sub(include={'cycling', 'city'})
cycling_or_city_df.head()

Dataset(91 entries; last [7 days 8 hours 17 minutes 35 seconds ago]; mood: 4.126 ± 0.561)
[04.07.2024 15:29] 3.0 city, tired, unsure, walking
[30.06.2024 20:10] 4.0 cycling, relaxed, satisfied, walking
[28.06.2024 22:28] 4.0 cycling, relaxed, satisfied, swimming, tired
[23.06.2024 20:54] 3.0 Lauren, city, sad, tired, unsure
[14.06.2024 16:45] 4.0 Damir, city, excited, relaxed, satisfied
...


- by excluded activities (entries which don't have any of the listed activities)

_for both `include` and `exclude` the argument can be either `str` or `set[str]`_

In [6]:
without_friends = df.sub(exclude='friends')
without_friends.head()

Dataset(1144 entries; last [4 hours 41 minutes 36 seconds ago]; mood: 3.791 ± 0.520)
[11.07.2024 19:05] 4.0 excited, home, piano
[11.07.2024 16:50] 4.0 coding, excited, relaxed, work, worked
[10.07.2024 23:30] 4.0 excited, home, movies & series, satisfied
[10.07.2024 20:59] 4.0 guitar, home, relaxed, satisfied
[10.07.2024 20:28] 4.0 amused, group project, home, relaxed, satisfied
...


- by mood values (can be either a value or a Container of values)

In [7]:
bad_mood = df.sub(mood=BAD_MOOD)
bad_mood

Dataset(36 entries; last [2 months 26 days 13 hours 7 minutes 36 seconds ago]; mood: 2.278 ± 0.252)

- by included activities ('and' operator: all entries which have all the listed activities)

_Note: this is achieved by subsetting the dataset twice; since the `.sub` method returns a new dataset, the two calls can be chained in one line._

In [8]:
cycling_and_swimming = df.sub(include='cycling').sub(include='swimming')
cycling_and_swimming.head()
print(cycling_and_swimming.activities().most_common(7))

Dataset(11 entries; last [13 days 1 hour 18 minutes 36 seconds ago]; mood: 4.773 ± 0.518)
[28.06.2024 22:28] 4.0 cycling, relaxed, satisfied, swimming, tired
[13.10.2023 20:10] 5.0 cycling, swimming, tired
[16.09.2023 20:58] 5.0 cycling, relaxed, satisfied, swimming, tired
[04.09.2023 20:44] 5.0 Damir, cycling, relaxed, satisfied, swimming, tired
[22.08.2023 20:27] 5.0 cycling, friends, satisfied, swimming, tired
...
[('swimming', 11), ('cycling', 11), ('tired', 11), ('satisfied', 8), ('relaxed', 7), ('friends', 6), ('Damir', 5)]


In [9]:
df.sub(include='cycling', exclude='swimming')

Dataset(33 entries; last [11 days 3 hours 36 minutes 37 seconds ago]; mood: 4.136 ± 0.577)

- by a substring (or substrings) in the `note`

In [10]:
df_ktane = df.sub(note_contains='ktane')
df_ktane.head(-1)

Dataset(6 entries; last [2 months 3 days 12 hours 11 minutes 37 seconds ago]; mood: 3.833 ± 0.258)
[08.05.2024 15:35] 3.5 Leha, annoyed, excited, friends, gaming, unsure
[08.03.2024 17:26] 4.0 Leha, amused, annoyed, friends, gaming, happy, home, satisfied
[22.11.2023 21:41] 4.0 Leha, Martina, amused, gaming, home, movies & series, satisfied
[07.07.2023 20:00] 3.5 Leha, coding, dota, friends, gaming, shopping, study, unsure, walking
[03.07.2023 20:05] 4.0 Leha, coding, excited, friends, gaming, happy, home, stressed, tired
[01.07.2023 20:00] 4.0 Leha, Martina, excited, friends, gaming, happy, home, movies & series, relaxed


- by a predicate function (which takes an entry as an argument and returns a boolean)

In [11]:
from dataset import Entry


def pred(entry: Entry) -> bool:
    """Accept an entry made on a Sunday with mood above 4 and a non-empty note."""
    # datetime's weekday() counts from Monday == 0, so 6 is Sunday
    if entry.full_date.weekday() != 6:
        return False
    if not (entry.mood > 4):
        return False
    return entry.note != ''


df_sunday_good_mood_has_note = df.sub(predicate=pred)
df_sunday_good_mood_has_note

Dataset(31 entries; last [25 days 1 hour 39 minutes 37 seconds ago]; mood: 4.677 ± 0.243)

### `.mood` and `.std`

Use the `.mood` method to get the average mood of all the entries in the dataset and the `.std` method to get the standard deviation of the mood values.

These values are also shown when calling `.head()`.

In [12]:
print(f"{cycling_or_city_df.mood():.3f} \u00B1 {cycling_or_city_df.std():.3f}")

4.126 ± 0.561


or just use `.mood_std`:

In [16]:
cycling_or_city_df.mood_std()

4.126 ± 0.561

### `.activities`

Use the `.activities` method to get a Counter object of all activities in the dataset

In [17]:
cnt = df.activities()
# Rank once (most frequent first), then slice both ends of the ranking.
ranked = cnt.most_common()
print(f'most common: {ranked[:3]}')
print(f'least common: {ranked[:-6:-1]}')

most common: [('home', 757), ('relaxed', 486), ('satisfied', 448)]
least common: [('cinema', 3), ('reading', 3), ('photography', 3), ('Dad', 3), ('guitar', 4)]


### `.get_datetimes`

Use the `.get_datetimes` method to get the list of all points in time when an entry was created. The values are sorted in descending order by the date of creation.

In [18]:
df.get_datetimes()[:5]

[datetime.datetime(2024, 7, 11, 23, 0),
 datetime.datetime(2024, 7, 11, 19, 5),
 datetime.datetime(2024, 7, 11, 16, 50),
 datetime.datetime(2024, 7, 10, 23, 30),
 datetime.datetime(2024, 7, 10, 20, 59)]

### `.group_by`
Use the `.group_by` method to get a dictionary mapping groups to the list of entries in that group.

The grouping key is one of `['day', 'month']`, and the entries are sorted in ascending order (oldest first).

Note: this method (like many others) is cached with `lru_cache`; caching is safe here because the Dataset is weakly immutable.

In [19]:
groups = df.group_by('day')
# Show only the first group to keep the output short.
for day in groups:
    entries = groups[day]
    print(f'{day}: {len(entries)}')
    break

2023-07-01: 1


In [20]:
groups = df.group_by('month')
# Show only the first group to keep the output short.
for month in groups:
    entries = groups[month]
    print(f'{month}: {len(entries)}')
    break

2023-07-01: 95


### `.stats`

Use the `.stats` method to get a custom StatsResult object which contains the following information:

In [21]:
help(df.stats)

Help on method stats in module dataset:

stats() -> utils.StatsResult method of dataset.Dataset instance
    Returns the following statistics:
        - mood (avg ± std)
        - note length [num symbols] (avg ± std)
        - entries frequency [entries per day] (median)
    as a StatsResult object.



In [22]:
df.sub(include='home').stats()

Mood: 3.769 ± 0.480
Note length: 37.494 ± 55.643 symbols
Entries frequency: 4.091 entries per day (once every 5 hours 52 minutes)

### `.__iter__`
`Dataset` defines `__iter__` method, so it can be used in `for` loops. The entries are sorted in descending order by the date of creation.

In [23]:
# Option 1: a regular `for` loop (stopped after the first entry)
for entry in df:
    print(entry)
    break

# Option 2: drive the iterator by hand
df_iter = iter(df)
first_entry = next(df_iter)
second_entry = next(df_iter)
print(first_entry, second_entry, sep='; ')

[11.07.2024 23:00] 4.5 Leha, excited, factorio, friends, home, relaxed, satisfied
[11.07.2024 23:00] 4.5 Leha, excited, factorio, friends, home, relaxed, satisfied; [11.07.2024 19:05] 4.0 excited, home, piano


### `.__getitem__`
`Dataset` defines `__getitem__` method where the argument is 
- a date in a string format (e.g. `'01.01.2024'`), returns a new Dataset with all entries on that date;
- a slice object (e.g. `slice('01.01.2024', '01.01.2025')`), returns a new Dataset with all entries which were created between the two dates (the "stop" date is not included);

In [24]:
july22 = df['22.07.2023']

In [25]:
july22

Dataset(6 entries; last [11 months 20 days 11 hours 30 minutes 27 seconds ago]; mood: 3.583 ± 0.376)

- by a particular date period (by using slices)

In [26]:
ny_eve = df['29.12.2023':'03.01.2024']
ny_eve

Dataset(16 entries; last [6 months 8 days 11 hours 52 minutes 27 seconds ago]; mood: 3.969 ± 0.499)

In [27]:
end_of_november_2023 = df['29.11.2023':'01.12.2023']
end_of_november_2023.head()

Dataset(5 entries; last [7 months 11 days 2 hours 10 minutes 28 seconds ago]; mood: 3.700 ± 0.274)
[30.11.2023 23:37] 3.5 Lauren, city, date, tired
[30.11.2023 17:06] 4.0 coding, excited, home, satisfied
[30.11.2023 11:07] 3.5 Leha, coding, home, unsure
[29.11.2023 21:20] 3.5 Azat, Leha, amused, gaming, social
[29.11.2023 16:42] 4.0 coding, excited, happy, home, satisfied


In [28]:
before_aug_2023 = df[:'01.08.2023']
before_aug_2023.head()

Dataset(95 entries; last [11 months 11 days 14 hours 53 minutes 28 seconds ago]; mood: 3.589 ± 0.688)
[31.07.2023 18:54] 3.5 cleaning, relaxed
[31.07.2023 17:22] 3.0 home, study, tired, unsure
[31.07.2023 16:45] 4.0 cooking, home, relaxed, satisfied, study
[31.07.2023 12:00] 3.5 bored, home, procrastinating, relaxed, unsure
[30.07.2023 23:26] 4.0 Lauren, amused, friends, grateful, happy, home, laundry
...


### `__call__`
Return a list of entries for a particular day. The argument is a date in a string format (e.g. `'01.01.2024'`).

In [29]:
ny_entries = df('31.12.2023')
ny_entries

[[31.12.2023 16:48] 4.5 Lauren, cooking, date, excited, happy, home, movies & series,
 [31.12.2023 16:48] 2.5 Lauren, annoyed, city, date, photography, sad,
 [31.12.2023 21:51] 3.5 Lauren, cooking, date, home, movies & series, satisfied,
 [31.12.2023 22:21] 4.0 Dad, Mom, family, happy, home, nervous,
 [31.12.2023 23:57] 4.0 Lauren, date, happy]

### `@ <datetime-like>` (or `.at`)
Returns a single entry created at a particular datetime or `None` if there is no entry at that datetime.

Input: a datetime in a string format (e.g. `'01.01.2024 12:00'`) or a `datetime.datetime` object.

In [30]:
import datetime

# The same lookup written two ways: with a 'DD.MM.YYYY HH:MM' string
# and with a `datetime.datetime` object.
entry1 = df @ '22.07.2023 17:09' # same as df.at('22.07.2023 17:09')
entry2 = df @ datetime.datetime(2023, 7, 22, 17, 9) # same as df.at(datetime.datetime(...))

# Both forms must resolve to the same entry.
assert entry1 == entry2
print(entry1)

[22.07.2023 17:09] 3.5 cooking, home, relaxed, study


# Analysis examples

## Mood analysis

In [31]:
activity = 'cycling'
mood_with_without = df.mood_with_without(activity)
mood_with_without

MoodWithWithout(with_=4.295 ± 0.622, without=3.793 ± 0.513)

In [32]:
print(activity, mood_with_without, sep='\n')

cycling
with: 4.295 ± 0.622
without: 3.793 ± 0.513
change: 13.23%


## Complete analysis

In [34]:
complete_analysis = df.complete_analysis()

In [None]:
# Print the first and the last `_edge` rows of the analysis.
# NOTE(review): presumably `complete_analysis` is ordered by mood change, so
# these are the strongest effects at both ends - confirm against the method.
_edge = 8
if len(complete_analysis) > 2 * _edge:
    _shown = complete_analysis[:_edge] + complete_analysis[-_edge:]
else:
    # With 2 * _edge rows or fewer the two slices would overlap and rows
    # would be printed twice, so show everything exactly once instead.
    _shown = complete_analysis

print(f'analysed {len(complete_analysis)} activities')
for _name, _mood_with_without, _num_occ in _shown:
    print(f'[{_name:^15}]: {_mood_with_without.calc_change():.1%} (with: {_mood_with_without.with_}, without: {_mood_with_without.without}); occurred {_num_occ} times')

## Mood scatter plot (per day/week/month)

In [36]:
df.mood_plot('day')

In [37]:
df.mood_plot('week')

In [38]:
df.mood_plot('month')

## Mood bar plot by hour/day/weekday/month

In [40]:
df.by_time_bar_plot('hour')

In [41]:
df.by_time_bar_plot('day')

In [42]:
df.by_time_bar_plot('weekday')

In [43]:
df.by_time_bar_plot('month')

## Entries times differences

In [44]:
df.sub(include='home').entries_differences()

## Monthly activity effect on mood

In [46]:
df.mood_change_activity('study')

No 'study' in December 2023


## Other features

In [47]:
df.note_length_plot()

In [2]:
df.generate_activity_correlation_matrix()

# Monthly Report Templates

In [4]:
import datetime


def generate_report_template(month: int, year: int):
    """Write a Markdown report skeleton for one calendar month.

    The file is created in the current working directory as
    ``<year>-<month>.md`` (month zero-padded) and overwrites any existing
    file of that name. It contains a ``# <Month name> <year>`` heading
    followed by an HTML comment listing every entry of that month, grouped
    by day.

    Args:
        month: Calendar month to report on, 1-12.
        year: Calendar year to report on.

    Raises:
        ValueError: If ``month`` is outside 1-12.
    """
    if not 1 <= month <= 12:
        raise ValueError(f'month must be in 1..12, got {month}')

    # Slice the dataset from the first day of the month up to the first day
    # of the next month (the stop date of a Dataset slice is not included).
    if month == 12:
        next_month, next_year = 1, year + 1
    else:
        next_month, next_year = month + 1, year
    _from = f'01.{month:02d}.{year}'
    _to = f'01.{next_month:02d}.{next_year}'
    df_month = df[_from:_to]

    # Build the comment body from parts and join once at the end.
    parts = [f'total entries: {len(df_month)}\n\n']
    for day, entries in df_month.group_by('day').items():
        parts.append(f' -- {day:%d.%m.%Y, %a} --\n')
        for e in entries:
            parts.append(f'@{e.full_date.time():%H:%M}: {e.mood} {", ".join(e.activities)}\n  {e.note}\n')
        parts.append('\n')
    events_comment = ''.join(parts)

    # The title and file name use the requested month/year, not the end of
    # the slice range. Month name comes from strftime, so it follows the
    # active locale.
    month_word = datetime.date(1900, month, 1).strftime('%B')
    with open(f'{year}-{month:02d}.md', 'w', encoding='utf-8') as f:
        f.write(
            f'# {month_word} {year}\n\n'
            f'<!---\n{events_comment}--->'
        )

In [5]:
# generate_report_template(6, 2024)

# Other