In [None]:
import pandas as pd

In [None]:
# Load the album table; its "genre" column is what gets normalised further down.
df = pd.read_csv('albums_genres_grouped.csv')

In [None]:
from itertools import chain
# Canonical genre labels searched for inside each raw genre tag.
# List order is significant to map_to_final_genre, which scans it front to back.
GENRE_KEYWORDS = [
    "pop", "rock", "rap",
    "hip hop", "trap", "country",
    "metal", "r&b", "jazz",
    "indie", "electronic", "house",
    "techno", "reggae", "folk",
    "funk", "soul", "punk",
    "blues", "alternative", "classical",
    "ambient", "k-pop", "kids",
    "afrobeats", "relaxed", "latin",
]

# Hand-curated tag groupings: each key is a target genre label and each value
# lists the raw tags that should be relabelled to it. The dict is flattened
# into MANUAL_GENRE_MAP below, where map_to_final_genre looks tags up by exact
# match after lower-casing and stripping, so entries must be written lowercase.
# A tag listed under more than one key resolves to the key that appears last.
# Note that "religious" is a label used only here; it is not in GENRE_KEYWORDS.
GENRE_GROUPS = {
    "rock": [
        "psychedelic", "post-rock", "post-grunge", "grunge", "shoegaze", "canadian"
    ],
    "pop": [
        "female vocalist", "female vocalists", "5 seconds of summer", "synthpop", 
        "tatemcrae", "my top songs", "sexy", "bee gees", "hyperpop", "digital tendencies",
        "2-step", "acoustic", "comedy", "singer-songwriter", "3", "disco", "wedding",
        "30", "peter", "midnights", "poptron", "doo-wop", "male vocals", "new wave", 
        "later", "joaoaksnes", "madonna", "portals", "aoty", "mayhem", "guts", 
        "short n' sweet", "soty", "cypriot", "breakbeat", "chinese", "german",
        "absolute bangers", "danish", "azerbaijan", "hardstyle", "japanese", "argentina",
        "indian", "india", "bhangra", "nigeria", "bollywood", "italian", "morocco",
        "traditional", "ukrainian"
    ],
    "latin": [
        "puerto rico", "mexico", "sad sierreno", "lada del 602", "corridos tumbados", 
        "seen live", "ramito de violeta", "chickencore", "corona", "cuck", "luar la l",
        "peso pluma", "spanish", "bachata", "salsa", "mierda", "banda", "duranguense",
        "romantico grupero", "colombia", "drum and bass", "sertanejo", "pagode", 
        "samba", "corrido tumbado", "mexican", "spain"
    ],
    "relaxed": [
        "ambient", "sleep", "reiki", "instrumental", "lo-fi", "rain", "nature sounds",
        "chill", "nature", "noise", "eargasm", "chillout", "white noise", "piano", 
        "hindi"
    ],
    "rap": [
        "harder than diamonds", "peak", "drill", "heavy", "my scribbled", "drake", 
        "baby keem", "g59", "j cole", "tag lil tecca-lot of me", "kanye wes", "bronx drill",
        "transitions", "auto-tagged", "gunna", "mumble crap", "boom bap", "quirky", 
        "underrated", "phonk", "chipmunk soul", "diss", "worst album ever", "nitrous", 
        "juggin", "4 out of 5", "czech", "sematary grave man from the haunted mound real nazgul skincarver keeper of da trees haunted mound lord",
        "grime", "juice wrld", "polo g", "lil uzi vert", "kanye west"
    ],
    "reggae": [
        "reggaeton", "party", "dancehall", "love"
    ],
    "hip hop": [
        "linedance", "nice", "florida", "don toliver", "southern hip-hop", "dr congo", 
        "mother", "sampling", "plugg", "ebm", "egyptian", "a cappella", "ghana", 
        "cumbia 420", "hip-hop", "melodic hip-hop"
    ],
    "metal": [
        "metalcore", "progressive metalcore", "rage", "post-hardcore", "demonic"
    ],
    "r&b": [
        "rnb", "aggressive", "king billionheir", "love at first listen", "personal favourites", 
        "sza", "british", "3 out of 5", "steve lacy"
    ],
    "k-pop": [
        "bts", "kpop", "korean", "jersey club", "it boy global"
    ],
    "religious": [
        "christian", "ccm", "worship", "gospel", "musiclist", "experimental", "hariharan", "thai"
    ],
    "electronic": [
        "depressive", "indietronica", "synthwave", "childish gamblingo", "featuring", 
        "trance", "ass", "downtempo", "dance", "60s", "polish", "russian", "norway", 
        "eurodance", "remix", "frenchcore", "uk garage"
    ],
    "indie": [
        "songs i crank my hog to", "bossa nova", "wsum 91.7 fm madison", 
        "songs i like to play whilst walking down the street at night music", 
        "gambling addiction", "emo", "darkwave", "slowcore", "stolen", "life changing", 
        "vinyl", "gothangelz", "ai", "ukranian", "opm"
    ],
    "afrobeats": [
        "afrobeats", "kenyan"
    ],
    "country": [
        "usa", "fearless", "linedance 2021", "texas", "feel good", "furry", 
        "linedance catalan", "american", "haunted", "linedance 2022"
    ],
    "kids": [
        "soundtrack", "video game music", "australian", "disney", "musical", "infantil"
    ],
    "trap": [
        "detroit trap"
    ],
    "funk": [
        "brazil"
    ],
    "folk": [
        "arabic"
    ]
}
# Invert GENRE_GROUPS into a flat {raw tag: genre label} lookup.
# Pairs are emitted group by group, so a tag repeated in a later group
# overwrites the label it received from an earlier one.
MANUAL_GENRE_MAP = dict(
    chain.from_iterable(
        ((raw_tag, label) for raw_tag in members)
        for label, members in GENRE_GROUPS.items()
    )
)


def map_to_final_genre(genre_str, keywords=None, manual_map=None):
    """Normalise one raw genre tag to a canonical genre label.

    The tag is converted with ``str()``, stripped and lower-cased, then
    resolved by the first rule that applies:

    1. The tag is itself a canonical keyword -> that keyword.
    2. The tag has an exact entry in the manual map -> the mapped label.
       This runs before the substring scan so that curated entries such as
       ``"kpop"`` or ``"indietronica"`` are not hijacked by a keyword that
       merely occurs inside them (``"pop"``, ``"indie"``).
    3. A canonical keyword occurs inside the tag -> the first such keyword in
       list order, skipping any keyword that is only matching because it is
       part of a longer matching keyword (``"rap"`` inside ``"trap"``,
       ``"pop"`` inside ``"k-pop"``).
    4. Otherwise the normalised tag is returned unchanged.

    Parameters
    ----------
    genre_str : object
        Raw genre value. Non-strings are stringified, so a missing value
        (NaN) comes back as the string ``"nan"``.
    keywords : sequence of str, optional
        Canonical genre labels, in priority order. Defaults to the
        module-level ``GENRE_KEYWORDS``.
    manual_map : mapping of str to str, optional
        Exact-match overrides keyed by lowercase tag. Defaults to the
        module-level ``MANUAL_GENRE_MAP``.

    Returns
    -------
    str
        The canonical label, or the normalised input when nothing matched.
    """
    if keywords is None:
        keywords = GENRE_KEYWORDS
    if manual_map is None:
        manual_map = MANUAL_GENRE_MAP

    genre_str_lower = str(genre_str).strip().lower()

    # An exact canonical name always stands for itself.
    if genre_str_lower in keywords:
        return genre_str_lower

    # Curated exact matches are more specific than any substring heuristic.
    if genre_str_lower in manual_map:
        return manual_map[genre_str_lower]

    # Substring scan in priority order. A keyword is "shadowed" when a longer
    # keyword that contains it also matched; the longest match can never be
    # shadowed, so a non-empty match list always yields a result here.
    matches = [kw for kw in keywords if kw in genre_str_lower]
    for kw in matches:
        shadowed = any(kw != other and kw in other for other in matches)
        if not shadowed:
            return kw

    # Return as-is if not matched
    return genre_str_lower

# Derive the cleaned label for every row from its raw "genre" value.
df["genre_cleaned"] = df["genre"].map(map_to_final_genre)


In [None]:
# Eyeball the distinct cleaned labels to spot tags that fell through unmapped.
df["genre_cleaned"].unique()

In [None]:
# Collapse closely related cleaned genres into broader buckets and persist.
#
# This cell works on the in-memory `df` produced by the cleaning cell above.
# Re-reading the CSV here would throw away the freshly computed
# `genre_cleaned` column: on a first run that raises KeyError, and on later
# runs it would silently reuse whatever labels an earlier save left on disk.
genre_map = {
    'hip hop': 'rap/hip hop',
    'rap': 'rap/hip hop',
    'soul': 'soul/jazz',
    'funk': 'soul/jazz',
    'jazz': 'soul/jazz',
    'folk': 'country/folk',
    'country': 'country/folk'
}

# Apply the mapping; labels without an entry in genre_map are kept unchanged.
df['genre_condensed'] = df['genre_cleaned'].replace(genre_map)

# Writes back to the same file the notebook loaded, now including the
# genre_cleaned and genre_condensed columns.
df.to_csv('albums_genres_grouped.csv', index=False)