In [1]:
import pathlib
from dataset import Dataset, BAD_MOOD

DATA_DIR = pathlib.Path('data')

# Glob order depends on the filesystem, so sort by file name to make the
# choice deterministic. Exports named like `daylio_export_YYYY_MM_DD.csv`
# sort chronologically, which makes the last one the newest export.
csv_files = sorted(DATA_DIR.glob('*.csv'))
if not csv_files:
    # Fail with an actionable message instead of a bare StopIteration.
    raise FileNotFoundError(f'no CSV export found in {DATA_DIR.resolve()}')
path = csv_files[-1]
print('using file', path.name)

df = Dataset(csv_file_path=path)

using file daylio_export_2024_03_06.csv
Dataset(747 entries; last [2 hours 3 minutes 1 second ago]; mood: 3.803 ± 0.592


# API

### `.head`

Use the `.head` method to look at the latest entries of the dataset

In [2]:
df.head()

Dataset(747 entries; last [1 hour 42 minutes 25 seconds ago]; mood: 3.803 ± 0.592
[06.03.2024 17:33] 3.5 coding, tired, productive, worked
[05.03.2024 23:12] 4.0 home, happy, date, Lauren, satisfied, relaxed
[05.03.2024 21:41] 3.5 unsure, home, date, Lauren, annoyed
[05.03.2024 18:00] 4.0 productive, coding, worked, tired, work
[05.03.2024 15:13] 3.5 unsure, coding, worked, satisfied, work
...


In [3]:
df.head(2)

Dataset(747 entries; last [1 hour 42 minutes 29 seconds ago]; mood: 3.803 ± 0.592
[06.03.2024 17:33] 3.5 coding, tired, productive, worked
[05.03.2024 23:12] 4.0 home, happy, date, Lauren, satisfied, relaxed
...


In [None]:
# prints all entries
df.head(-1)

### `.sub`

Use the `.sub` method to filter entries and get a subset of the original dataset

- by included activities ('or' operator: all the entries which have at least one of the listed activities)

In [4]:
cycling_or_city_df = df.sub(include={'cycling', 'city'})
cycling_or_city_df.head()

Dataset(75 entries; last [11 days 6 hours 10 minutes 34 seconds ago]; mood: 4.173 ± 0.517
[24.02.2024 13:05] 4.0 city, grateful, sad, unsure
[16.02.2024 19:30] 3.5 date, city, Lauren
[09.02.2024 23:06] 4.5 city, grateful, happy, date, Lauren, satisfied
[27.01.2024 18:47] 4.0 tired, satisfied, city
[27.01.2024 16:03] 4.0 tired, satisfied, bored, city
...


- by excluded activities (entries which don't have any of the listed activities)

_for both `include` and `exclude` the argument can be either `str` or `set[str]`_

In [5]:
without_friends = df.sub(exclude='friends')
without_friends.head()

Dataset(647 entries; last [1 hour 42 minutes 39 seconds ago]; mood: 3.777 ± 0.591
[06.03.2024 17:33] 3.5 coding, tired, productive, worked
[05.03.2024 23:12] 4.0 home, happy, date, Lauren, satisfied, relaxed
[05.03.2024 21:41] 3.5 unsure, home, date, Lauren, annoyed
[05.03.2024 18:00] 4.0 productive, coding, worked, tired, work
[05.03.2024 15:13] 3.5 unsure, coding, worked, satisfied, work
...


- by mood values (can be either a value or a Container of values)

In [6]:
bad_mood = df.sub(mood=BAD_MOOD)
bad_mood

Dataset(33 entries; last [3 days 8 hours 55 minutes 41 seconds ago]; mood: 2.273 ± 0.253

- by included activities ('and' operator: all entries which have all the listed activities)

_Note: this is achieved by subsetting the dataset twice; since the `.sub` method returns a new dataset, the calls can be chained in one line._

In [7]:
cycling_and_swimming = df.sub(include='cycling').sub(include='swimming')
cycling_and_swimming.head()
print(cycling_and_swimming.activities().most_common(7))

Dataset(10 entries; last [4 months 23 days 7 hours 5 minutes 45 seconds ago]; mood: 4.850 ± 0.474
[13.10.2023 20:10] 5.0 tired, cycling, swimming
[16.09.2023 20:58] 5.0 cycling, tired, satisfied, relaxed, swimming
[04.09.2023 20:44] 5.0 cycling, tired, satisfied, relaxed, Damir, swimming
[22.08.2023 20:27] 5.0 cycling, tired, satisfied, friends, swimming
[19.08.2023 20:32] 5.0 cycling, swimming, tired, relaxed, Damir, friends
...
[('tired', 10), ('cycling', 10), ('swimming', 10), ('satisfied', 7), ('relaxed', 6), ('friends', 6), ('Damir', 5)]


- by a particular date

In [8]:
import datetime

july22 = df.sub(when=datetime.date(2023, 7, 22))
# july22 = df.sub(when='22.07.2023') # or this
july22.head(n=-1)

Dataset(6 entries; last [7 months 14 days 22 hours 58 minutes 50 seconds ago]; mood: 3.583 ± 0.376
[22.07.2023 22:17] 3.5 home, relaxed
[22.07.2023 19:34] 4.0 home, relaxed, chess
[22.07.2023 17:09] 3.5 home, relaxed, study, cooking
[22.07.2023 14:22] 3.5 coding, unsure
[22.07.2023 13:30] 3.0 home, dota, anxious, friends
[22.07.2023 11:30] 4.0 satisfied, home


- by a particular date period (by using slices)

In [9]:
ny_eve = df.sub(when=slice('29.12.2023', '03.01.2024'))
ny_eve

Dataset(16 entries; last [2 months 2 days 23 hours 20 minutes 53 seconds ago]; mood: 3.969 ± 0.499

- by a substring (or substrings) in the `note`

In [10]:
df_ktane = df.sub(note_contains='ktane')
df_ktane.head(-1)

Dataset(7 entries; last [3 months 13 days 15 hours 34 minutes 58 seconds ago]; mood: 4.000 ± 0.500
[22.11.2023 21:41] 4.0 Martina, gaming, home, satisfied, movies & series, amused, Leha
[10.10.2023 22:39] 3.5 productive, gaming, stressed, home, amused, friends, Leha
[06.10.2023 20:45] 4.0 gaming, happy, excited, friends, Leha
[04.10.2023 20:13] 5.0 new place, grateful, happy, Lauren, satisfied, movies & series, friends
[07.07.2023 20:00] 3.5 Leha, gaming, unsure, coding, study, walking, dota, shopping, friends
[03.07.2023 20:05] 4.0 Leha, gaming, stressed, coding, home, happy, tired, excited, friends
[01.07.2023 20:00] 4.0 Leha, Martina, gaming, home, happy, excited, movies & series, relaxed, friends


- by a predicate function (which takes an entry as an argument and returns a boolean)

In [11]:
from dataset import Entry


def pred(entry: Entry) -> bool:
    """Select Sunday entries whose mood is above 4 and whose note is not empty."""
    # `weekday()` numbers Monday as 0, so Sunday is 6.
    if entry.full_date.weekday() != 6:
        return False
    if not entry.mood > 4:
        return False
    return entry.note != ''


df_sunday_good_mood_has_note = df.sub(predicate=pred)
df_sunday_good_mood_has_note

Dataset(23 entries; last [10 days 14 minutes 11 seconds ago]; mood: 4.739 ± 0.255

### `.mood` and `.mood_std`

Use the `.mood` method to get the average mood of all the entries in the dataset and the `.mood_std` method to get the standard deviation of the mood values.

These values are also shown when calling `.head()`.

In [13]:
print(
    round(cycling_or_city_df.mood(), 3),
    round(cycling_or_city_df.mood_std(), 3)
)

4.173 0.517


### `.activities`

Use the `.activities` method to get a Counter object of all activities in the dataset

In [14]:
cnt = df.activities()
print(f'most common: {cnt.most_common(3)}')
print(f'least common: {cnt.most_common()[-1:-6:-1]}')

most common: [('home', 432), ('satisfied', 235), ('relaxed', 233)]
least common: [('Dad', 1), ('guitar', 2), ('homework', 2), ('snacking', 3), ('cinema', 3)]


### `.get_datetimes`

Use the `.get_datetimes` method to get the list of all points in time when an entry was created. The values are sorted in descending order by the date of creation.

In [15]:
df.get_datetimes()[:5]

[datetime.datetime(2024, 3, 6, 17, 33),
 datetime.datetime(2024, 3, 5, 23, 12),
 datetime.datetime(2024, 3, 5, 21, 41),
 datetime.datetime(2024, 3, 5, 18, 0),
 datetime.datetime(2024, 3, 5, 15, 13)]

### `.group_by_day`
Use the `.group_by_day` method to get a dictionary mapping dates to a list of entries created on that date.
The dates are sorted in ascending order.

Note: this method (like many others) is cached with `lru_cache`; this is safe because the Dataset is weakly immutable.

In [16]:
groups = df.group_by_day()
for day, entries in groups.items():
    print(f'{day}: {len(entries)}')
    break

2023-07-01: 1


### `.stats`

Use the `.stats` method to get a custom StatsResult object which contains the following information:

In [17]:
help(df.stats)

Help on method stats in module dataset:

stats() -> utils.StatsResult method of dataset.Dataset instance
    Returns the following statistics:
        - mood (avg ± std)
        - note length [num symbols] (avg ± std)
        - entries frequency [entries per day] (median)
    as a StatsResult object.



In [18]:
df.sub(include='home').stats()

Mood: 3.744 ± 0.533
Note length: 38.042 ± 63.749 symbols
Entries frequency: 3.013 entries per day (once every 7 hours 58 minutes)

### `.__iter__`
`Dataset` defines `__iter__` method, so it can be used in `for` loops. The entries are sorted in descending order by the date of creation.

In [19]:
for entry in df:
    print(entry)
    break
# or
df_iter = iter(df)
print(next(df_iter), next(df_iter), sep='\n')

[06.03.2024 17:33] 3.5 coding, tired, productive, worked
[06.03.2024 17:33] 3.5 coding, tired, productive, worked
[05.03.2024 23:12] 4.0 home, happy, date, Lauren, satisfied, relaxed


### `.__getitem__`
`Dataset` defines `__getitem__` method where the argument is 
- a date in a string format (e.g. `'01.01.2024'`), returns all entries on that date in the dataset;
- a slice object (e.g. `slice('01.01.2024', '01.01.2025')`), returns all entries in the dataset which were created between the two dates (the "stop" date is not included);

Note: the first use case is equivalent to `df.sub(when='01.01.2024').entries[::-1]` (it's reversed because in the root dataset the entries are sorted in descending order by the date of creation).

In [20]:
july22_entries = df['22.07.2023']
assert july22_entries == july22.entries[::-1]

In [21]:
end_of_november_2023 = df['29.11.2023':'01.12.2023']
end_of_november_2023

[[29.11.2023 16:42] 4.0 satisfied, coding, home, happy, excited,
 [29.11.2023 21:20] 3.5 social, gaming, Azat, amused, Leha,
 [30.11.2023 11:07] 3.5 coding, home, unsure, Leha,
 [30.11.2023 17:06] 4.0 coding, excited, satisfied, home,
 [30.11.2023 23:37] 3.5 tired, date, city, Lauren]

In [None]:
before_aug_2023 = df[:'01.08.2023']
before_aug_2023

### `.__call__` (or `.at`)
Returns a single entry created at a particular datetime or raises an error if there are no entries created at that datetime.

Input: a datetime in a string format (e.g. `'01.01.2024 12:00'`).

In [23]:
# look up the single entry created at exactly this date and time
entry = df('22.07.2023 17:09') # or df.at(...)
entry

[22.07.2023 17:09] 3.5 home, relaxed, study, cooking

# Analysis examples

## Mood analysis

In [24]:
activity = 'cycling'
mood_with, mood_without = df.mood_with_without(activity)

In [25]:
print(f'''"{activity}"
with: {mood_with:.2f}
without: {mood_without:.2f}
change: {(mood_with - mood_without)/mood_without:.1%}'''
)

"cycling"
with: 4.37
without: 3.78
change: 15.7%


## Complete analysis

In [26]:
complete_analysis = df.complete_analysis()

In [27]:
print(f'analysed {len(complete_analysis)} actvities')
for _name, _with, _without, _change, _num_occ in complete_analysis[:8] + complete_analysis[-8:]:
    print(f'[{_name:^15}]: {_change:.1%} (with: {_with:.2f}, without: {_without:.2f}); occured {_num_occ} times')

analysed 51 actvities
[     happy     ]: 19.5% (with: 4.36, without: 3.65); occured 158 times
[   swimming    ]: 18.1% (with: 4.46, without: 3.78); occured 27 times
[    cycling    ]: 15.7% (with: 4.37, without: 3.78); occured 34 times
[   grateful    ]: 15.6% (with: 4.33, without: 3.74); occured 80 times
[     Damir     ]: 15.3% (with: 4.36, without: 3.79); occured 22 times
[   satisfied   ]: 11.6% (with: 4.10, without: 3.67); occured 235 times
[    Lauren     ]: 10.5% (with: 4.11, without: 3.72); occured 166 times
[     city      ]: 9.8% (with: 4.15, without: 3.78); occured 50 times
[     exam      ]: -13.2% (with: 3.31, without: 3.81); occured 13 times
[    worried    ]: -17.9% (with: 3.14, without: 3.83); occured 28 times
[   stressed    ]: -18.1% (with: 3.12, without: 3.81); occured 12 times
[     sick      ]: -20.0% (with: 3.06, without: 3.82); occured 18 times
[      sad      ]: -21.1% (with: 3.02, without: 3.83); occured 23 times
[    anxious    ]: -22.4% (with: 3.00, without: 

## Mood plot

In [28]:
df.mood_plot()

## Bar plot by hour/day/month

In [29]:
df.by_time_bar_plot('hour')

In [30]:
df.by_time_bar_plot('day')

In [31]:
df.by_time_bar_plot('weekday')

In [32]:
df.by_time_bar_plot('month')

## Other features

In [33]:
df.note_length_plot(400)

# Making a network

In [40]:
from itertools import combinations
from collections import Counter

from pyvis.network import Network


def color_gradient(start: str, finish: str, t: float) -> str:
    """Return the colour that lies a fraction ``t`` of the way from start to finish.

    Each RGB channel is interpolated linearly and truncated to an integer.

    Args:
        start: colour returned for ``t == 0``, written as ``'#rrggbb'``.
        finish: colour returned for ``t == 1``, written as ``'#rrggbb'``.
        t: interpolation factor. Values outside ``[0, 1]`` are clamped to that
            range, because extrapolating would push channels outside 0..255
            and produce a malformed hex string.

    Returns:
        The interpolated colour as a lowercase ``'#rrggbb'`` string.

    Raises:
        ValueError: if a channel of ``start`` or ``finish`` is not valid hex.
    """
    t = min(1.0, max(0.0, t))
    channels = []
    # The two hex digits of the R, G and B channels start at offsets 1, 3 and 5.
    for offset in (1, 3, 5):
        lo = int(start[offset:offset + 2], 16)
        hi = int(finish[offset:offset + 2], 16)
        channels.append(int(lo + t * (hi - lo)))
    return '#{:02x}{:02x}{:02x}'.format(*channels)


def network_from_df(df: Dataset) -> Network:
    """Build a pyvis co-occurrence network of the 20 most common activities.

    Each of the 20 most frequent activities in ``df`` becomes a node whose
    ``value`` is its occurrence count. The node tooltip shows that count and,
    when available, the mood change taken from ``df.complete_analysis()``.
    Two nodes are joined by an edge whose ``value`` is the number of times the
    two activities appear together in an entry.

    Args:
        df: the dataset whose entries and activities are turned into a graph.

    Returns:
        An undirected ``Network`` with nodes and edges added; layout and
        rendering are left to the caller.
    """
    # `font_color` is documented as a colour string, not a boolean; light
    # labels keep the text readable on the dark '#222222' background.
    net = Network(height='1000px', width='100%', directed=False, bgcolor='#222222', font_color='white')

    most_common = df.activities().most_common(20)
    activity_to_change = {ca.activity: ca.change for ca in df.complete_analysis()}

    for activity, count in most_common:
        title = f'occured {count} times'
        # NOTE(review): it is not visible here whether complete_analysis()
        # covers every activity, so omit the mood line rather than raise
        # KeyError when an activity is missing from it.
        change = activity_to_change.get(activity)
        if change is not None:
            title += f';\nmood change: {change:.1%}'
        net.add_node(activity, label=activity, value=count, title=title)

    common_activities = {activity for activity, _ in most_common}
    edges_counter = Counter(
        # Sort each pair so (a, b) and (b, a) count as the same undirected edge.
        tuple(sorted(pair))
        for entry in df
        # Drop uncommon activities before pairing so only relevant pairs are generated.
        for pair in combinations([a for a in entry.activities if a in common_activities], 2)
    )
    for (source, target), count in edges_counter.items():
        net.add_edge(source, target, value=count)
    return net

In [44]:
net = network_from_df(df)
net.force_atlas_2based(spring_strength=0.02, gravity=-30, damping=0.2)
net.show('network.html', notebook=False)

network.html
