In [None]:
import pathlib

from src.dataset import Dataset, Entry
from src.entry_condition import (
    A,
    DateIn,
    MoodIn,
    NoteHas,
    Predicate,
    register,
)

# TODO: add dummy data if the real file is missing
path = next(pathlib.Path("data").glob("*.csv"))
print(f"using file: {path.name} ({path.stat().st_size * 2**-20:.3f} Mb)\n")

df = Dataset(path)
print(df.stats())

register(set(df.activities()))

# API

### `.head`

Use the `.head` method to look at the latest entries of the dataset

In [None]:
df.head()

In [None]:
df.head(2)

In [None]:
# prints all entries
df.head(-1)

### `.sub`

Use the `.sub` method to filter entries and get a subset of the original dataset

- by included activities ('or' operator: all the entries which have at least one of the listed activities)

In [None]:
cycling_or_city_df = df.sub(include={"cycling", "city"})
cycling_or_city_df.head()

- by excluded activities (entries which don't have any of the listed activities)

_for both `include` and `exclude` the argument can be either `str` or `set[str]`_

In [None]:
without_friends_df = df.sub(exclude="friends")
without_friends_df.head()

- by included activities ('and' operator: all entries which have all the listed activities)

_Note: this is achieved by subscribing the dataset twice; since .sub method returns a new dataset, it can be done in one line._

In [None]:
cycling_and_swimming_df = df.sub(include="cycling").sub(include="swimming")
cycling_and_swimming_df.head()
print(cycling_and_swimming_df.activities().most_common(7))

This means "cycling and not swimming" since all the separately passed conditions must be true.

In [None]:
cycling_and_not_swimming_df = df.sub(include="cycling", exclude="swimming")
cycling_and_not_swimming_df.head()

> Note that all the conditions can be efficiently replaced by one `EntryCondition` object.
This is a more intuitive way to subscribe a dataset interactively and **is the recommended way to do it**.

Use the `A` object to create the base activity-filters (e.g. `A("home")`) and binary bitwise operators (`&`, `|` and `~`) to construct a condition.

Here is an alternative dataset creation for all the examples from above:

In [None]:
df.sub(A("cycling") | A("city"))

In [None]:
df.sub(A("study") & ~A("home"))

In [None]:
df.sub(A("friends"))

In [None]:
df.sub(A("cycling") & ~A("swimming"))

In [None]:
df.sub(~A("cycling") & A("swimming"))

Initializing `A` with an unknown activity will result in an error:

In [None]:
A("stydy")

Here is a more detailed condition creation and comparison

In [None]:
cycling_and_swimming = A("cycling") & A("swimming")
print(f"Condition: {cycling_and_swimming}")

cycling_and_swimming_df_alt = df.sub(cycling_and_swimming)
assert cycling_and_swimming_df_alt == cycling_and_swimming_df

For this slightly exaggerated example, consider the old and the new ways to construct a dataset (as well as the predicate-based example).

Say, we would like to filter all the entries such that there is 

(`gaming` with `Leha` and without `Azat`) or (`factorio` with `Azat` and without `Leha`)

In [None]:
# this_condition = A("gaming") & A("Leha") & ~A("Azat") | A("factorio") & A("Azat") & ~A("Leha")
this_condition = A("gaming & Leha & !Azat | factorio & Azat & !Leha")  #* new feature!
print(this_condition)

this_df = df.sub(this_condition)
this_df

When did I watch something with someone, but not at home?

In [None]:
# the old way
movies_not_at_home_with_someone_old_df = (
    df
    .sub(include=set(df.people().keys()))
    .sub(include="movies and series", exclude="home")
)
movies_not_at_home_with_someone_old_df

In [None]:
# the new way
movies_not_at_home_with_someone_df = df.sub(
    # A("movies and series") & ~A("home") & A.people()
    A("movies and series & !home & people")  # the newest way
)
print(movies_not_at_home_with_someone_df)

assert movies_not_at_home_with_someone_df == movies_not_at_home_with_someone_old_df

In [None]:
df.sub(A("movies and series") & A("home") & A.people())

- by a subtring (or substrings) in the `note`

In [None]:
df_ktane = df.sub(note_contains="ktane")
df_ktane.head(-1, verbose=True)

> This is also possible using `NoteHas`

In [None]:
assert df.sub(NoteHas("ktane")) == df_ktane

Or with at least one of many substrings:

In [None]:
df.sub(NoteHas("кот", "соба", "животн"))

- by a predicate function (which takes an entry as an argument and returns a boolean)

In [None]:
def pred(entry: Entry) -> bool:
    """It is sunday, the mood > 4, the note is not empty"""
    return entry.full_date.weekday() == 6 and entry.mood > 4 and entry.note != ""


df_sunday_good_mood_has_note = df.sub(Predicate(pred))
df_sunday_good_mood_has_note

In [None]:
# or by wrapping the predicate in a Predicate object to combine with other conditions

df_sunday_good_mood_has_note_new = df.sub(Predicate(pred))
print(df_sunday_good_mood_has_note_new)

assert df_sunday_good_mood_has_note == df_sunday_good_mood_has_note_new

### `.mood` and `.std`

Use the `.mood` method to get the average mood of all the entries in the dataset and the `.std` method to get the standard deviation of the mood values.

These values are also shown when calling `.head()`.

In [None]:
print(f"{cycling_or_city_df.mood():.3f} \u00b1 {cycling_or_city_df.std():.3f}")

or just use `.mood_std`:

In [None]:
cycling_or_city_df.mood_std()

### `.activities`

Use the `.activities` method to get a Counter object of all activities in the dataset

In [None]:
cnt = df.activities()
print(f"most common: {cnt.most_common(3)}")
print(f"least common: {cnt.most_common()[-1:-6:-1]}")

### `.people`

Use the `.people` method to get a Counter object of all people in the dataset

In [None]:
df.people()

### `.get_datetimes`

Use the `.get_datetimes` method to get the list of all points in time when an entry was created. The values are sorted in descending order by the date of creation.

In [None]:
df.get_datetimes()[:5]

### `.group_by`
Use the `.group_by` method to get a dictionary mapping groups to the list of entries in that group.

The groups are one of `['day', 'month']` and the entries are sorted in ascending order.

Note: this method (like many others) uses lru_cache and since the Dataset is weakly-immutable, it is safe to use it.

In [None]:
groups = df.group_by("day")
for day, entries in groups.items():
    print(f"{day}: {len(entries)}")
    break

In [None]:
groups = df.group_by("month")
for day, entries in groups.items():
    print(f"{day}: {len(entries)}")
    break

### `.stats`

Use the `.stats` method to get a custom StatsResult object which contains the following information:

In [None]:
df.sub(A("home")).stats()

### `for _ in df` (`.__iter__`)
`Dataset` defines `__iter__` method, so it can be used in `for` loops. The entries are sorted in descending order by the date of creation.

In [None]:
for entry in df:
    print(entry)
    break
# or
df_iter = iter(df)
print(next(df_iter), next(df_iter), sep="; ")

### `df[...]` (`.__getitem__`)
`Dataset` defines `__getitem__` method where the argument is 
- a date in a string format (e.g. `'01.01.2024'`), returns a new Dataset with all entries on that date;
- a slice object (e.g. `slice('01.01.2024', '01.01.2025')`), returns a new Dataset with all entries which were created between the two dates (the "stop" date is not included);

In [None]:
july22 = df["22.07.2023"]
july22

- by a particular date period (by using slices)

In [None]:
ny_eve = df["29.12.2023":"03.01.2024"]
ny_eve

In [None]:
end_of_november_2023 = df["29.11.2023":"01.12.2023"]
end_of_november_2023.head()

In [None]:
before_aug_2023 = df[:"01.08.2023"]
before_aug_2023.head()

> Note: it is now also possible to specify the date interval using an `EntryCondition`-like object:

The arguments are either date `%d.%m.%Y` (e.g. "29.08.2000"), date `%d %b %Y` (e.g. 03 Jan 2024) or an empty string (unbounded).

In [None]:
df.sub(DateIn("29 Dec 2023", "03 Jan 2024")) == ny_eve

In [None]:
df.sub(DateIn("29 Nov 2023", "01 Dec 2023")) == end_of_november_2023

In [None]:
df.sub(DateIn("", "01 Aug 2023")) == before_aug_2023

### `df @ datetime-like` (or `.at`)
Returns a single entry created at a particular datetime or `None` if there is no entry at that datetime.

Input: a datetime in a string format (e.g. `'01.01.2024 12:00'`) or a `datetime.datetime` object.

In [None]:
import datetime

entry1 = df @ "22.07.2023 17:09"
entry2 = df.at("22.07.2023 17:09")
entry3 = df @ datetime.datetime(2023, 7, 22, 17, 9)
entry4 = df.at(datetime.datetime(2023, 7, 22, 17, 9))

assert entry1 == entry2 == entry3 == entry4
print(entry1)

# Analysis examples

## Mood analysis

In [None]:
activity = "cycling"
mood_with_without = df.mood_with_without(activity)
print(repr(mood_with_without))
print(mood_with_without.calc_change())

In [None]:
print(activity, mood_with_without, sep="\n")

## Complete analysis

In [None]:
complete_analysis = df.complete_analysis()

print(f"analysed {len(complete_analysis)} actvities")
for _name, _mood_with_without, _num_occ in (
    complete_analysis[:8] + complete_analysis[-8:]
):
    print(
        f"[{_name:^15}]: {_mood_with_without.calc_change():.1%}"
        f"(with: {_mood_with_without.with_}, without: {_mood_with_without.without}); occured {_num_occ} times"
    )

## Tags

In [None]:
all_tags = df.build_tags()

## Books: timeline, highlights and plot

In [None]:
from collections import defaultdict

from IPython.core.display import HTML
from IPython.display import display

from src.books_timeline import get_timeline_html
from src.clippings import get_all_grouped_highlights


book_tags = df.get_book_tags()
print(f"Found {len(book_tags)} book tags")


# author stats
author_groups = defaultdict(list)
for book_tag in book_tags:
    author_groups[book_tag.author].append(book_tag)

In [None]:
# number of books per author
for author, tags in author_groups.items():
    print(f"{author:<15} {len(tags)}")

Highlights info

In [None]:
# compile and group highlight by the book title
BOOK_GROUPS = get_all_grouped_highlights()

# number of highlighs per book
for title, hightlights in BOOK_GROUPS.items():
    print(
        f"{title:<50} {len(hightlights):<3} ({sum(len(h.text) for h in hightlights):,} total characters)"
    )

# assign highlights to book tags, if possible
for book_tag in book_tags:
    book_tag.try_assign_highlights(BOOK_GROUPS)

In [None]:
display(HTML(get_timeline_html(book_tags)))

Now, the plot

In [None]:
df.books_read_plot(groupby="month")

## Mood scatter plot (per day/week/month)

In [None]:
df.mood_plot("week")

## Mood bar plot by hour/day/month

In [None]:
df.by_time_bar_plot("hour")

In [None]:
df.by_time_bar_plot("day")

In [None]:
df.by_time_bar_plot("weekday")

In [None]:
df.by_time_bar_plot("month")

## Calendar Heatmaps!

Now, these do look fancy, don't they!

In [None]:
df.show_calendar_plot()

Of course, we can also subset the dataset to, say, take a look at a particular activity

In [None]:
# when I was at home (some of my trips are clearly visible)
df.sub(A("home")).show_calendar_plot()

In [None]:
# when I played factorio alone
df.sub(A("factorio") & ~A.people()).show_calendar_plot()

In [None]:
# when I studied not at home
df.sub(A("study") & ~A("home")).show_calendar_plot()

In [None]:
# bad-to-meh mood?
df.sub(MoodIn(1, 3.1)).show_calendar_plot()

## Entries times differences

In [None]:
df.sub(A("home")).stats()

In [None]:
df.sub(A("home")).entries_differences()

## Activities' effect on mood

In [None]:
df.activities_effect_on_mood()

## Wordclouds!

In [None]:
df.plot_wordcloud()

In [None]:
df.sub(NoteHas("#книга")).plot_wordcloud()

## Monthly activity effect on mood

In [None]:
df.mood_change_activity("study")

## Other features

In [None]:
df.people_frequency()

In [None]:
df.note_length_plot(groupby="month")

In [None]:
df.generate_activity_correlation_matrix()

# Monthly Report Templates

In [None]:
from src.monthly_report import generate_report_template


generate_report_template(1, 2025, df)

# Other

In [19]:
def valid_parenthesis(string: str) -> bool:
    balance = 0
    for char in string:
        if char == "(":
            balance += 1
        elif char == ")":
            balance -= 1
        if balance < 0:
            return False
    return balance == 0

In [42]:
from src.entry_condition import compile, And, Or, Has, Not, EntryCondition

In [None]:
def top_level_op_idx(string: str) -> int:
    balance = 0
    op_or_idx = -1
    op_and_idx = -1
    for i, char in enumerate(string):
        if char == "(":
            balance += 1
        elif char == ")":
            balance -= 1
        elif balance == 0:
            if char == "|":
                op_or_idx = i
            elif char == "&":
                op_and_idx = i
    if op_or_idx != -1:
        return op_or_idx
    if op_and_idx != -1:
        return op_and_idx
    raise ValueError(f"No top-level operator found: {string}")


def build_entry_condition(string: str) -> EntryCondition:
    if not string:
        raise ValueError("Empty string")
    top_op_idx = top_level_op_idx(string)
    top_op = string[top_op_idx]
    left = string[:top_op_idx].strip()
    right = string[top_op_idx + 1 :].strip()
    if top_op == "|":
        return Or(build_entry_condition(left), build_entry_condition(right))
    elif top_op == "&":
        return And(build_entry_condition(left), build_entry_condition(right))
    else:
        raise ValueError(f"Invalid operator: {top_op}")


def build(string: str) -> EntryCondition:
    if not valid_parenthesis(string):
        raise ValueError("Invalid parenthesis")
    return build_entry_condition(string)

In [69]:
e1 = "(sad | angry) & happy"
e2 = "(annoyed & ()) | (!(sad | !worried) & happy) & (!sport | city)"

In [70]:
ec = build(e2)
print(ec)

ValueError: Unknown activity: '())'.