In [1]:
import pathlib
from dataset import Dataset, BAD_MOOD

# Loading the dataset

In [2]:
DATA_DIR = pathlib.Path('data')

# Path.glob() yields matches in arbitrary, OS-dependent order, so sort them to
# make the chosen file reproducible across machines and re-runs.
csv_files = sorted(DATA_DIR.glob('*.csv'))
if not csv_files:
    # Fail with an actionable message instead of a bare StopIteration.
    raise FileNotFoundError(f"no '*.csv' files found in '{DATA_DIR.resolve()}'")

# With date-stamped names such as data_2024_01_24.csv the last name in sorted
# order is the most recent export (assumes that naming scheme holds).
path = csv_files[-1]
print(path.name)

data_2024_01_24.csv


In [3]:
df = Dataset(csv_file_path=path)

Dataset(591 entries; last [4 days 17 hours 17 minutes ago]; average mood: 3.859)


In [4]:
df.head()

Dataset(591 entries; last [4 days 17 hours 17 minutes ago]; average mood: 3.859)
[24.01.2024 18:41] 4.0 satisfied, productive, coding, did something, tired
[24.01.2024 18:33] 4.5 happy, grateful
[24.01.2024 02:16] 3.5 home, relaxed
[24.01.2024 00:57] 3.5 home, tired
[23.01.2024 22:12] 5.0 happy, satisfied, swimming, Damir, tired, friends
...


# API

### `.sub`

Use the `.sub` method to filter entries and get a subset of the original dataset

- by included activities ('or' operator: all the entries which have at least one of the listed activities)

In [5]:
cycling_or_city_df = df.sub(incl_act={'cycling', 'city'})
cycling_or_city_df.head()

Dataset(69 entries; last [15 days 19 hours 10 minutes ago]; average mood: 4.188)
[13.01.2024 16:48] 4.0 date, movies & series, Lauren, city, excited, home, grateful
[05.01.2024 14:06] 4.0 date, Lauren, city, anxious, unsure, grateful
[04.01.2024 20:56] 4.0 city, Damir, tired
[31.12.2023 16:48] 2.5 date, Lauren, sad, city, annoyed
[29.12.2023 14:36] 4.5 happy, new place, date, Lauren, city, excited, grateful
...


- by excluded activities (entries which don't have any of the listed activities)

_for both `incl_act` and `excl_act` the argument can be either `str` or `set[str]`_

In [6]:
without_friends = df.sub(excl_act='friends')
without_friends.head()

Dataset(501 entries; last [4 days 17 hours 17 minutes ago]; average mood: 3.835)
[24.01.2024 18:41] 4.0 satisfied, productive, coding, did something, tired
[24.01.2024 18:33] 4.5 happy, grateful
[24.01.2024 02:16] 3.5 home, relaxed
[24.01.2024 00:57] 3.5 home, tired
[23.01.2024 18:47] 4.0 satisfied, productive, coding, did something, tired
...


- by mood values (can be either a value or a Container of values)

In [7]:
bad_mood = df.sub(mood=BAD_MOOD)
bad_mood

Dataset(19 entries; last [28 days 19 hours 10 minutes ago]; average mood: 2.211)

- by included activities ('and' operator: all entries which have all the listed activities)

In [8]:
 #* Note: the 'and' filter is achieved by calling `.sub` twice, once per activity;
 #* since the `.sub` method returns a new dataset, the calls can be chained in one line
cycling_and_swimming = df.sub(incl_act='cycling').sub(incl_act='swimming')
cycling_and_swimming.head()
print(cycling_and_swimming.activities().most_common(7))

Dataset(10 entries; last [3 months 17 days 15 hours 48 minutes ago]; average mood: 4.850)
[13.10.2023 20:10] 5.0 cycling, swimming, tired
[16.09.2023 20:58] 5.0 satisfied, swimming, cycling, tired, relaxed
[04.09.2023 20:44] 5.0 satisfied, swimming, Damir, cycling, tired, relaxed
[22.08.2023 20:27] 5.0 satisfied, swimming, cycling, tired, friends
[19.08.2023 20:32] 5.0 swimming, Damir, cycling, tired, friends, relaxed
...
[('cycling', 10), ('swimming', 10), ('tired', 10), ('satisfied', 7), ('relaxed', 6), ('friends', 6), ('Damir', 5)]


- by a particular date

In [9]:
import datetime

july22 = df.sub(when=datetime.date(2023, 7, 22)) # or when='22.07.2023'
july22.head(n=-1)

Dataset(6 entries; last [6 months 10 days 13 hours 41 minutes ago]; average mood: 3.583)
[22.07.2023 22:17] 3.5 home, relaxed
[22.07.2023 19:34] 4.0 home, chess, relaxed
[22.07.2023 17:09] 3.5 home, cooking, study, relaxed
[22.07.2023 14:22] 3.5 coding, unsure
[22.07.2023 13:30] 3.0 dota, home, anxious, friends
[22.07.2023 11:30] 4.0 satisfied, home


- by a substring (or substrings) in the `note`

In [10]:
df_ktane = df.sub(note_contains='ktane')
df_ktane.head(-1)

Dataset(7 entries; last [2 months 7 days 14 hours 17 minutes ago]; average mood: 4.000)
[22.11.2023 21:41] 4.0 satisfied, Martina, movies & series, Leha, amused, home, gaming
[10.10.2023 22:39] 3.5 productive, stressed, Leha, amused, home, friends, gaming
[06.10.2023 20:45] 4.0 happy, Leha, excited, friends, gaming
[04.10.2023 20:13] 5.0 happy, satisfied, new place, movies & series, Lauren, friends, grateful
[07.07.2023 20:00] 3.5 walking, dota, shopping, coding, Leha, study, friends, gaming, unsure
[03.07.2023 20:05] 4.0 happy, coding, stressed, Leha, excited, home, tired, friends, gaming
[01.07.2023 20:00] 4.0 happy, Martina, movies & series, Leha, excited, home, friends, gaming, relaxed


- by a predicate function (which takes an entry as an argument and returns a boolean)

In [11]:
from dataset import Entry


def pred(entry: Entry) -> bool:
    """Accept an entry made on a Sunday with mood above 4 and a non-empty note."""
    # weekday() == 6 is the Sunday check
    if entry.full_date.weekday() != 6:
        return False
    if not entry.mood > 4:
        return False
    return entry.note != ''


df_sunday_good_mood_has_note = df.sub(predicate=pred)
df_sunday_good_mood_has_note

Dataset(18 entries; last [7 days 12 hours 44 minutes ago]; average mood: 4.778)

### `.head`

Use the `.head` method to look at the latest entries of the dataset

In [12]:
cycling_or_city_df.head()

Dataset(69 entries; last [15 days 19 hours 10 minutes ago]; average mood: 4.188)
[13.01.2024 16:48] 4.0 date, movies & series, Lauren, city, excited, home, grateful
[05.01.2024 14:06] 4.0 date, Lauren, city, anxious, unsure, grateful
[04.01.2024 20:56] 4.0 city, Damir, tired
[31.12.2023 16:48] 2.5 date, Lauren, sad, city, annoyed
[29.12.2023 14:36] 4.5 happy, new place, date, Lauren, city, excited, grateful
...


In [13]:
cycling_or_city_df.head(2)

Dataset(69 entries; last [15 days 19 hours 10 minutes ago]; average mood: 4.188)
[13.01.2024 16:48] 4.0 date, movies & series, Lauren, city, excited, home, grateful
[05.01.2024 14:06] 4.0 date, Lauren, city, anxious, unsure, grateful
...


In [14]:
# prints all entries
cycling_or_city_df.head(-1)

Dataset(69 entries; last [15 days 19 hours 10 minutes ago]; average mood: 4.188)
[13.01.2024 16:48] 4.0 date, movies & series, Lauren, city, excited, home, grateful
[05.01.2024 14:06] 4.0 date, Lauren, city, anxious, unsure, grateful
[04.01.2024 20:56] 4.0 city, Damir, tired
[31.12.2023 16:48] 2.5 date, Lauren, sad, city, annoyed
[29.12.2023 14:36] 4.5 happy, new place, date, Lauren, city, excited, grateful
[23.12.2023 14:22] 4.0 happy, satisfied, new place, date, Lauren, city, tired
[17.12.2023 19:22] 4.0 happy, grateful, date, Lauren, city, tired, relaxed
[17.12.2023 14:10] 3.5 date, Lauren, bored, city, tired
[13.12.2023 15:27] 5.0 happy, satisfied, date, Lauren, city
[13.12.2023 13:03] 3.5 city, nervous
[10.12.2023 20:00] 4.5 city, satisfied, walking, tired
[30.11.2023 23:37] 3.5 city, Lauren, date, tired
[24.11.2023 21:50] 4.0 happy, satisfied, social, city, Damir, tired
[30.10.2023 18:07] 5.0 happy, date, Lauren, city, tired, grateful
[26.10.2023 22:06] 4.0 happy, satisfied, walk

### `.mood`

Use the `.mood` method to get the average mood of all the entries in the dataset. This value is also shown when calling `.head()`.

In [15]:
round(cycling_or_city_df.mood(), 3)

4.188

### `.activities`

Use the `.activities` method to get a Counter object of all activities in the dataset

In [16]:
cnt = df.activities()
# Rank every activity once (most frequent first), then slice both ends.
ranked = cnt.most_common()
top_three = ranked[:3]
bottom_five = ranked[:-6:-1]  # last five, rarest first
print(f'most common: {top_three}')
print(f'least common: {bottom_five}')


most common: [('home', 341), ('satisfied', 195), ('relaxed', 190)]
least common: [('helping', 1), ('meeting', 1), ('Dad', 1), ('guitar', 2), ('photography', 2)]


### `.get_datetimes`

Use the `.get_datetimes` method to get the list of all points in time when an entry was created

In [17]:
df.get_datetimes()[:5]

[datetime.datetime(2024, 1, 24, 18, 41),
 datetime.datetime(2024, 1, 24, 18, 33),
 datetime.datetime(2024, 1, 24, 2, 16),
 datetime.datetime(2024, 1, 24, 0, 57),
 datetime.datetime(2024, 1, 23, 22, 12)]

### `.group_by_day`
Use the `.group_by_day` method to get a dictionary mapping dates to a list of entries created on that date.
The dates are sorted in ascending order.

Note: this method (like many others) uses lru_cache and since the Dataset is weakly-immutable, it is safe to use it.

In [18]:
groups = df.group_by_day()
# Peek at the first (date, entries) pair only; nothing is printed if there are none.
first_group = next(iter(groups.items()), None)
if first_group is not None:
    day, entries = first_group
    print(f'{day}: {len(entries)}')

2023-07-01: 1


### `.stats`

Use the `.stats` method to get a custom StatsResult object which contains the following information:

In [19]:
help(df.stats)

Help on method stats in module dataset:

stats() -> dataset.StatsResult method of dataset.Dataset instance
    Returns the following statistics:
        - mood (avg ± std)
        - note length [num symbols] (avg ± std)
        - entries frequency [entries per day] (median)
    as a StatsResult object.



In [20]:
df.sub(excl_act='home').stats()

Mood: 3.948 ± 0.639
Note length: 49.340 ± 50.125 symbols
Entries frequency: 1.627 entries per day

### `.__iter__`
`Dataset` defines `__iter__` method, so it can be used in `for` loops. The entries are sorted in descending order by the date of creation.

In [21]:
# A `for` loop iterates over the dataset directly; stop after the first entry
for entry in df:
    print(entry)
    break
# or
# drive the iterator by hand: each next() call yields the following entry
df_iter = iter(df)
print(next(df_iter), next(df_iter), sep='\n')

[24.01.2024 18:41] 4.0 satisfied, productive, coding, did something, tired
[24.01.2024 18:41] 4.0 satisfied, productive, coding, did something, tired
[24.01.2024 18:33] 4.5 happy, grateful


### `.__getitem__`
`Dataset` defines `__getitem__` method where the argument is a date in a string format (e.g. `'01.01.2024'`). It returns a list of entries created on that date.

Note: It is equivalent to `df.sub(when='01.01.2024').entries[::-1]` (it's reversed because in the root dataset the entries are sorted in descending order by the date of creation).

In [22]:
july22_entries = df['22.07.2023']
# `july22` is the subset built earlier with df.sub(when=...); its entries are in
# the opposite order, hence the reversal before comparing
assert july22_entries == july22.entries[::-1]

# Analysis examples

## Mood analysis

In [23]:
activity = 'cycling'
mood_with, mood_without = df.mood_with_without(activity)

In [24]:
# Relative mood difference between entries with and without the activity
relative_change = (mood_with - mood_without) / mood_without
print(
    f'"{activity}"',
    f'with: {mood_with:.2f}',
    f'without: {mood_without:.2f}',
    f'change: {relative_change:.1%}',
    sep='\n',
)

"cycling"
with: 4.38
without: 3.83
change: 14.4%


## Complete analysis

In [25]:
complete_analysis = df.complete_analysis()

In [26]:
print(f'analysed {len(complete_analysis)} actvities')
# Show the first 5 and the last 5 rows of the analysis. With 10 rows or fewer
# the two slices would overlap and print some rows twice, so in that case
# every row is shown exactly once instead.
N_SHOWN = 5
if len(complete_analysis) > 2 * N_SHOWN:
    shown_rows = complete_analysis[:N_SHOWN] + complete_analysis[-N_SHOWN:]
else:
    shown_rows = complete_analysis
for _name, _with, _without, _change, _num_occ in shown_rows:
    print(f'[{_name:^15}]: {_change:.1%} (with: {_with:.2f}, without: {_without:.2f}); occured {_num_occ} times')
# TODO: maybe make this a pretty figure?

analysed 46 actvities
[     happy     ]: 18.6% (with: 4.38, without: 3.70); occured 140 times
[   swimming    ]: 17.5% (with: 4.50, without: 3.83); occured 25 times
[   grateful    ]: 16.0% (with: 4.41, without: 3.80); occured 59 times
[     Damir     ]: 14.6% (with: 4.40, without: 3.84); occured 20 times
[    cycling    ]: 14.4% (with: 4.38, without: 3.83); occured 33 times
[     dota      ]: -12.5% (with: 3.40, without: 3.88); occured 29 times
[      sad      ]: -17.0% (with: 3.21, without: 3.87); occured 14 times
[   stressed    ]: -19.0% (with: 3.14, without: 3.87); occured 11 times
[    anxious    ]: -22.7% (with: 3.03, without: 3.93); occured 44 times
[     angry     ]: -40.4% (with: 2.32, without: 3.89); occured 11 times


## Mood plot

In [27]:
df.mood_plot()

## Bar plot by hour/day/weekday/month

In [28]:
df.by_time_bar_plot('hour')

In [29]:
df.by_time_bar_plot('day')

In [30]:
df.by_time_bar_plot('weekday')

In [31]:
df.by_time_bar_plot('month')

## Other features

In [32]:
df.note_length_plot(400)