Skip to content

Commit

Permalink
Handle split EPs, standardise series punctuation (#45)
Browse files Browse the repository at this point in the history
### Added

* `album`: 
- Handling unnamed (after removal of catalognum and artist names) split
EPs that
have two artists. In accordance with [title guidelines], the EP is named
by separating the artists
    with a slash.
- Following the [title guidelines], the standard series format now
applies to
**Vol/Vol.**, **Volume**, **Pt** too. Previously we only considered
**Part**.
    - **Compilation - Volume 2**
    - Compilation Volume 2 -> **Compilation, Volume 2**
    - If the series is at the beginning of the album name, it is moved to the end
      - Vol. 2 - Compilation -> **Compilation, Vol 2**
- We also ensure that abbreviations are followed by a period and a space, and
that leading zeroes are removed
      - Vol02 -> **Vol. 2**

  - Replace **(Remixes)** -> **Remixes**

* `albumtype`: the EP albumtype is recognized for split EPs.

### Updated

* `album`: 
  - Remove **+ Some remix**
    - **Album ~~+ Someone's Remix~~**

* `catalognum`: do not treat **RD-9** (Behringer RD-9) as a catalognum
* `title`: 
  - Remove **Presented by...**
    * **Title ~~[Presented by Artist]~~**
    * **Title ~~(Presented by Artist)~~**
- Remove the preceding number prefix when all album tracks have it and it
consists of two digits
    * **01 Title, Other Title**
    * **1 Title, 2 Other Title**
    * **~~01~~ Title, ~~02~~ Other Title**.

### Fixed

* All **zero width space** characters (`\u200b`) are now removed before
parsing.

* `album`: 
- Add the missing **EP** and **LP** bits to the album name in many cases where
they are found in the comments
- Fix series numbering format: when it is delimited by some character,
keep it.
    Otherwise, separate it with a comma
    - **Album - Part 2**
    - Album Part 2 -> **Album, Part 2**
  - Tackled some edge cases where label name wrongly stayed on the album
    - **~~Label:~~ Album**
    - **~~Label -~~ Album**
  - Remove **Bonus**
  • Loading branch information
snejus committed May 20, 2023
1 parent 25d0b8a commit 18fa4b5
Show file tree
Hide file tree
Showing 14 changed files with 1,223 additions and 887 deletions.
55 changes: 55 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,58 @@
## [0.17.0] 2023-05-20

[0.17.0]: https://github.com/snejus/beetcamp/releases/tag/0.17.0

### Added

* `album`:
- Handling unnamed (after removal of catalognum and artist names) split EPs that
have two artists. In accordance with [title guidelines], the EP is named by separating the artists
with a slash.
- Following the [title guidelines], the standard series format now applies to
**Vol/Vol.**, **Volume**, **Pt** too. Previously we only considered **Part**.
- **Compilation - Volume 2**
- Compilation Volume 2 -> **Compilation, Volume 2**
- If the series is at the beginning of the album name, it is moved to the end
- Vol. 2 - Compilation -> **Compilation, Vol 2**
- We also ensure that abbreviations are followed by a period and a space, and that leading zeroes are removed
- Vol02 -> **Vol. 2**

- Replace **(Remixes)** -> **Remixes**

* `albumtype`: the EP albumtype is recognized for split EPs.

### Updated

* `album`:
- Remove **+ Some remix**
- **Album ~~+ Someone's Remix~~**

* `catalognum`: do not treat **RD-9** (Behringer RD-9) as a catalognum
* `title`:
- Remove **Presented by...**
* **Title ~~[Presented by Artist]~~**
* **Title ~~(Presented by Artist)~~**
- Remove the preceding number prefix when all album tracks have it and it consists of two digits
* **01 Title, Other Title**
* **1 Title, 2 Other Title**
* **~~01~~ Title, ~~02~~ Other Title**.

### Fixed

* All **zero width space** characters (`\u200b`) are now removed before parsing.

* `album`:
- Add the missing **EP** and **LP** bits to the album name in many cases where they are found in the comments
- Fix series numbering format: when it is delimited by some character, keep it.
Otherwise, separate it with a comma
- **Album - Part 2**
- Album Part 2 -> **Album, Part 2**
- Tackled some edge cases where label name wrongly stayed on the album
- **~~Label:~~ Album**
- **~~Label -~~ Album**
- Remove **Bonus**


## [0.16.3] 2023-02-13

[0.16.3]: https://github.com/snejus/beetcamp/releases/tag/0.16.3
Expand Down
47 changes: 9 additions & 38 deletions beetsplug/bandcamp/_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class MediaInfo(NamedTuple):


CATALOGNUM_CONSTRAINT = r"""(?<![]/@-])(\b
(?!\W|LC[ ]|VA[\d ]+|[EL]P\W|[^\n.]+[ ](?:20\d\d|VA[ \d]+)|(?i:vol|disc|number))
(?!\W|LC[ ]|VA[\d ]+|[EL]P\W|[^\n.]+[ ](?:20\d\d|VA[ \d]+)|(?i:vol|disc|number|rd-9))
{}
\b(?!["%]))"""
_cat_pat = CATALOGNUM_CONSTRAINT.format(
Expand Down Expand Up @@ -90,25 +90,25 @@ class MediaInfo(NamedTuple):
r"^([A-J]{1,3}[12]?\.?\d|[AB]+(?=\W{2,}))(?:(?!-\w)[^\w(]|_)+", re.I + re.M
),
"vinyl_name": re.compile(r"[1-5](?= ?(xLP|LP|x))|single|double|triple", re.I),
"clean_incl": re.compile(
r" *(\(?incl|\((inc|tracks|.*remix( |es)))([^)]+\)|.*)", re.I
),
"tidy_eplp": re.compile(r"\S*(?:Double )?(\b[EL]P\b)\S*", re.I),
}
rm_strings = [
"limited edition",
r"^[EL]P( \d+)?",
r"^Vol(ume)?\W*(?!.*\)$)\d+",
r"\((digital )?album\)",
r"\(single\)",
r"^v/?a\W*|va$|vinyl(-only)?|compiled by.*",
r"^v/?a\W*|va$",
r"\Wvinyl\W|vinyl-only",
"compiled by.*",
r"[\[(]presented by.*",
r"free download|\([^()]*free(?!.*mix)[^()]*\)",
"(\W|\W )bonus( \w+)*",
r"[+][\w ]+remix|\(with remixes\)",
]

_remix_pat = r"(?P<remix>((?P<remixer>[^])]+) )?\b((re)?mix|edit|bootleg)\b[^])]*)"
# fmt: off
CLEAN_PATTERNS = [
(re.compile(fr"(([\[(])|(^| ))\*?({'|'.join(rm_strings)})(?(2)[])]|( |$))", re.I), ""), # noqa
(re.compile(rf"(([\[(])|(^| ))\*?({'|'.join(rm_strings)})(?(2)[])]|( |$))", re.I), ""), # noqa
(re.compile(r" -(\S)"), r" - \1"), # hi -bye -> hi - bye
(re.compile(r"(\S)- "), r"\1 - "), # hi- bye -> hi - bye
(re.compile(r" +"), " "), # hi bye -> hi bye
Expand All @@ -118,10 +118,9 @@ class MediaInfo(NamedTuple):
(re.compile(rf"(\({_remix_pat})$", re.I), r"\1)"), # bye - (Some Mix -> bye - (Some Mix) # noqa
(re.compile(rf"- *({_remix_pat})$", re.I), r"(\1)"), # bye - Some Mix -> bye (Some Mix) # noqa
(re.compile(r'(^|- )[“"]([^”"]+)[”"]( \(|$)'), r"\1\2\3"), # "bye" -> bye; hi - "bye" -> hi - bye # noqa
(re.compile(r"\((?i:(the )?(remixes))\)"), r"\2"), # Album (Remixes) -> Album Remixes # noqa
]
# fmt: on
keep_label_pat = r"^{0}[^ ]|\({0}|\w {0} \w|\w {0}$"
clean_label_pat = r"(\W\W+{0}\W*|\W*{0}(\W\W+|$)|(^\W*{0}\W*$))(VA)?\d*"


class Helpers:
Expand Down Expand Up @@ -205,34 +204,6 @@ def clean_name(name: str) -> str:
name = pat.sub(repl, name).strip()
return name

@staticmethod
def clean_album(name: str, *args: str, label: str = "") -> str:
"""Return clean album name.
Catalogue number and artists to be removed are given as args.
"""
name = PATTERNS["clean_incl"].sub("", name)
name = PATTERNS["ft"].sub(" ", name)
name = re.sub(r"^\[(.*)\]$", r"\1", name)

for arg in [re.escape(arg) for arg in filter(None, args)] + [
r"Various Artists?\b(?! [A-z])( \d+)?"
]:
name = re.sub(rf" *(?i:(compiled )?by|vs|\W*split w) {arg}", "", name)
if not re.search(rf"\w {arg} \w|of {arg}", name, re.I):
name = re.sub(
rf"(^|[^'\])\w]|_|\b)+(?i:{arg})([^'(\[\w]|_|(\d+$))*", " ", name
).strip()

if label:
label = re.escape(label)
if not re.search(keep_label_pat.format(label), name):
name = re.sub(clean_label_pat.format(label), " ", name, re.I).strip()

name = Helpers.clean_name(name)
# uppercase EP and LP, and remove surrounding parens / brackets
name = PATTERNS["tidy_eplp"].sub(lambda x: x.group(1).upper(), name)
return name.strip(" /")

@staticmethod
def get_genre(keywords, config, label):
# type: (Iterable[str], JSONDict, str) -> Iterable[str]
Expand Down
115 changes: 30 additions & 85 deletions beetsplug/bandcamp/_metaguru.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from ._helpers import PATTERNS, Helpers, MediaInfo
from ._tracks import Track, Tracks
from .album import AlbumName

if sys.version_info.minor > 7:
from functools import cached_property # pylint: disable=ungrouped-imports
Expand Down Expand Up @@ -49,6 +50,7 @@ class Metaguru(Helpers):
config: JSONDict
media_formats: List[MediaInfo]
_tracks: Tracks
_album_name: AlbumName

def __init__(self, meta: JSONDict, config: Optional[JSONDict] = None) -> None:
self.meta = meta
Expand All @@ -60,11 +62,14 @@ def __init__(self, meta: JSONDict, config: Optional[JSONDict] = None) -> None:
self.config = config or {}
self.va_name = beets_config["va_name"].as_str() or self.va_name
self._tracks = Tracks.from_json(meta)
self._album_name = AlbumName(
meta, self.all_media_comments, self._tracks.albums_in_titles
)

@classmethod
def from_html(cls, html: str, config: Optional[JSONDict] = None) -> "Metaguru":
try:
meta = re.search(PATTERNS["meta"], html).group() # type: ignore[union-attr]
meta = re.search(PATTERNS["meta"], html.replace("\u200b", "")).group() # type: ignore[union-attr] # noqa
except AttributeError as exc:
raise AttributeError("Could not find release metadata JSON") from exc
else:
Expand Down Expand Up @@ -92,44 +97,6 @@ def comments(self) -> str:
def all_media_comments(self) -> str:
return "\n".join([*[m.description for m in self.media_formats], self.comments])

@cached_property
def official_album_name(self) -> str:
"""Check description for the album name header and return whatever follows it
if found.
"""
m = re.search(r"(Title: ?|Album(:|/Single) )([^\n]+)", self.all_media_comments)
if m:
return m.group(3).strip()
return ""

@cached_property
def parsed_album_name(self) -> str:
"""
Search for the album name in the following order and return the first match:
1. Album name is found in *all* track names
2. When 'EP' or 'LP' is in the release name, album name is what precedes it.
3. If some words are enclosed in quotes in the release name, it is assumed
to be the album name. Remove the quotes in such case.
"""
album_in_tracks = {t.album for t in self._tracks if t.album}
if len(album_in_tracks) == 1:
return list(album_in_tracks)[0]

album = self.album_name
for pat in [
r"(((&|#?\b(?!Double|VA|Various)(\w|[^\w| -])+) )+[EL]P)",
r"((['\"])([^'\"]+)\2( VA\d+)*)( |$)",
]:
m = re.search(pat, album)
if m:
album = m.group(1).strip()
return re.sub(r"^['\"](.+)['\"]$", r"\1", album)
return album

@cached_property
def album_name(self) -> str:
return self.meta.get("name") or ""

@cached_property
def label(self) -> str:
m = re.search(r"Label:([^/,\n]+)", self.all_media_comments)
Expand All @@ -155,14 +122,18 @@ def original_albumartist(self) -> str:
aartist = m.group(1).strip() if m else self.meta["byArtist"]["name"]
return re.sub(r" +// +", ", ", aartist)

@cached_property
def original_album(self) -> str:
return self._album_name.original

@cached_property
def bandcamp_albumartist(self) -> str:
"""Return the official release albumartist.
It is correct in half of the cases. In others, we usually find the label name.
"""
aartist = self.original_albumartist
if self.label == aartist:
split = self.clean_album(self.album_name, self.catalognum).split(" - ")
split = AlbumName.clean(self.original_album, [self.catalognum]).split(" - ")
if len(split) > 1:
aartist = split[0]

Expand Down Expand Up @@ -284,6 +255,15 @@ def albumartist(self) -> str:
def vinyl_disctitles(self) -> str:
return " ".join([m.title for m in self.media_formats if m.name == "Vinyl"])

@cached_property
def album_name(self) -> str:
return self._album_name.get(
self.catalognum,
self.tracks.original_artists,
self.tracks.artists,
self.label,
)

def _search_albumtype(self, word: str) -> bool:
"""Return whether the given word (ep or lp) matches the release albumtype.
True when one of the following conditions is met:
Expand All @@ -295,10 +275,10 @@ def _search_albumtype(self, word: str) -> bool:
sentences = re.split(r"[.]\s+|\n", self.all_media_comments)
word_pat = re.compile(rf"\b{word}\b", re.I)
catnum_pat = re.compile(rf"{word}\d", re.I)
name_pat = re.compile(rf"\b(this|{re.escape(self.clean_album_name)})\b", re.I)
name_pat = re.compile(rf"\b(this|{re.escape(self.album_name)})\b", re.I)
return bool(
catnum_pat.search(self.catalognum)
or word_pat.search(self.album_name + " " + self.vinyl_disctitles)
or word_pat.search(self.original_album + " " + self.vinyl_disctitles)
or any(word_pat.search(s) and name_pat.search(s) for s in sentences)
)

Expand All @@ -318,7 +298,9 @@ def is_lp(self) -> bool:
@cached_property
def is_ep(self) -> bool:
"""Return whether the release is an EP."""
return self._search_albumtype("ep")
return self._search_albumtype("ep") or (
" / " in self.album_name and len(self.tracks.artists) == 2
)

def check_albumtype_in_descriptions(self) -> str:
"""Count 'lp', 'album' and 'ep' words in the release and media descriptions
Expand All @@ -342,7 +324,7 @@ def first_one(artist: str) -> str:

truly_unique = set(map(first_one, self.tracks.artists))
return (
bool(re.search(r"compilation|best of|anniversary", self.album_name, re.I))
self._album_name.mentions_compilation
or self._search_albumtype("compilation")
or (len(truly_unique) > 3 and len(self.tracks) > 4)
)
Expand Down Expand Up @@ -378,7 +360,7 @@ def albumtypes(self) -> str:
if self.is_single_album:
albumtypes.add("single")
for word in ["remix", "rmx", "edits", "live", "soundtrack"]:
if word in self.album_name.lower():
if word in self.original_album.lower():
albumtypes.add(word.replace("rmx", "remix").replace("edits", "remix"))
if len(self.tracks.remixers) == len(self.tracks):
albumtypes.add("remix")
Expand Down Expand Up @@ -417,43 +399,6 @@ def genre(self) -> Optional[str]:

return ", ".join(sorted(genres)).strip() or None

@cached_property
def eplp_album_comments(self) -> str:
"""Parse comments looking for an indication of an album in the following format
(Capital-case Album Name) (EP or LP)
and return the matching album name if found.
"""
m = re.search(r"((?!The|This)\b[A-Z][^ \n]+\b )+[EL]P", self.all_media_comments)
return m.group() if m else ""

@cached_property
def clean_album_name(self) -> str:
to_clean = {self.catalognum}
if self.official_album_name:
album = self.official_album_name
else:
album = self.parsed_album_name or self.album_name
to_clean |= set(self.tracks.full_artists)
to_clean |= set(self.tracks.artists)

part = ""
m = re.search(r"\W+(part [\w-]+)", self.album_name, re.I)
if m:
album = album.replace(m.group(), "")
part = f" ({m.group(1)})"
album = self.clean_album(
album, *sorted(to_clean, key=len, reverse=True), label=self.label
)

if album.startswith("("):
album = self.album_name

album = album or self.eplp_album_comments or self.catalognum or self.album_name
if part:
album += part

return album

@property
def _common(self) -> JSONDict:
return {
Expand All @@ -473,7 +418,7 @@ def get_fields(self, fields: Iterable[str], src: object = None) -> JSONDict:

@property
def _common_album(self) -> JSONDict:
common_data: JSONDict = {"album": self.clean_album_name}
common_data: JSONDict = {"album": self.album_name}
fields = ["label", "catalognum", "albumtype", "country"]
if NEW_BEETS:
fields.extend(["genre", "style", "comments", "albumtypes"])
Expand Down Expand Up @@ -512,7 +457,7 @@ def singleton(self) -> TrackInfo:
track.track_id = track.data_url
return track

def _album(self, media: MediaInfo) -> AlbumInfo:
def get_media_album(self, media: MediaInfo) -> AlbumInfo:
"""Return album for the appropriate release format."""
self.media = media
include_digi = self.config.get("include_digital_only_tracks")
Expand Down Expand Up @@ -546,4 +491,4 @@ def _album(self, media: MediaInfo) -> AlbumInfo:
@cached_property
def albums(self) -> Iterable[AlbumInfo]:
"""Return album for the appropriate release format."""
return list(map(self._album, self.media_formats))
return list(map(self.get_media_album, self.media_formats))
Loading

0 comments on commit 18fa4b5

Please sign in to comment.