In [1]:
import pandas as pd
import os
import ast # To safely handle text that looks like lists

# --- ⚙️ Configuration ---
# Folder containing the extracted music dataset (relative to where this runs).
MUSIC_DATA_DIR = '../data/music/'

# ❗ IMPORTANT: set this to the actual name of the CSV file inside MUSIC_DATA_DIR.
CSV_FILENAME = 'Music_Info_with_Mood.csv'

# Destination folder for the processed output, and the full path of the CSV
# that process_music_data() writes into it.
OUTPUT_DIR = '../data/music_processed/'
OUTPUT_CSV_PATH = os.path.join(OUTPUT_DIR, 'processed_music_tags.csv')

def safe_parse_tags(tag_string):
    """
    Convert a raw tag value into a flat list of tag strings.

    The value may be the text of a Python literal (e.g. '["Happy", "Upbeat"]'),
    a plain word (e.g. 'Happy'), or an already-parsed / non-string object.
    Every returned element is a ``str`` so the result can be passed straight
    to ``' '.join(...)``.

    Args:
        tag_string: The raw value to parse. Usually a string; other types are
            tolerated and stringified.

    Returns:
        list[str]: The tags found. Empty for ``None``, NaN, or blank strings.
    """
    # Missing values carry no tags. `x != x` is the dependency-free NaN test.
    if tag_string is None or (
        isinstance(tag_string, float) and tag_string != tag_string
    ):
        return []

    if isinstance(tag_string, str):
        text = tag_string.strip()
        if not text:
            return []
        try:
            # Tries to evaluate strings like '["Happy", "Upbeat"]'
            evaluated = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, RecursionError):
            # literal_eval can raise any of these on malformed input (for
            # example TypeError for an unhashable dict key), so treat the
            # text as a single plain tag (e.g. Happy).
            return [text]
    else:
        # Already a Python object (e.g. a list or a number): normalise below.
        evaluated = tag_string

    # Sets have no defined order; sort so the output is deterministic.
    if isinstance(evaluated, (set, frozenset)):
        return sorted(str(item) for item in evaluated)

    # Lists and tuples are both "several tags"; stringify each element so
    # mixed content such as ["a", 1] cannot break a later str.join.
    if isinstance(evaluated, (list, tuple)):
        return [str(item) for item in evaluated]

    # A single literal (e.g. 'Happy' or 123) becomes a one-item list.
    return [str(evaluated)]

def process_music_data():
    """
    Build the processed tag CSV from the raw music dataset.

    Reads ``CSV_FILENAME`` from ``MUSIC_DATA_DIR``, keeps the ``artist_name``,
    ``title``, ``genre`` and ``mood`` columns, drops rows missing ``genre`` or
    ``mood``, and combines the genre with the parsed mood tags into one
    lowercase, space-separated ``tags`` string per song. The result
    (``artist_name``, ``title``, ``tags``) is written to ``OUTPUT_CSV_PATH``
    and a short sample is printed.

    Returns:
        None. If the input file or a required column is missing, an error
        message is printed and the function returns without writing anything.
    """
    input_path = os.path.join(MUSIC_DATA_DIR, CSV_FILENAME)

    print(f"Attempting to load music dataset from: {input_path}")

    try:
        df = pd.read_csv(input_path)
    except FileNotFoundError:
        print("\n---! 🔴 ERROR: FILE NOT FOUND !---")
        print(f"Could not find the file: '{CSV_FILENAME}' inside '{MUSIC_DATA_DIR}'")
        print("Please double-check the folder and make sure the CSV_FILENAME variable is correct.")
        return

    print("✅ Dataset loaded successfully. Cleaning and processing...")

    try:
        # 1. Select only the columns we need
        df = df[['artist_name', 'title', 'genre', 'mood']].copy()
    except KeyError as e:
        print("\n---! 🔴 ERROR: COLUMN NOT FOUND !---")
        print(f"A required column was not found in your CSV file: {e}")
        return

    # 2. Drop songs that don't have mood or genre tags
    df = df.dropna(subset=['mood', 'genre'])

    # 3. Build one lowercase tag string per song: the genre followed by the
    #    parsed mood tags. A plain comprehension is used instead of
    #    DataFrame.apply(axis=1) because it also works when no rows are left
    #    (apply can hand back a DataFrame for an empty frame, which cannot be
    #    assigned to a single column) and avoids per-row Series overhead.
    #    str() guards against non-string genres or tags breaking str.join.
    tags = [
        ' '.join([str(genre)] + [str(tag) for tag in safe_parse_tags(mood)]).lower()
        for genre, mood in zip(df['genre'], df['mood'])
    ]

    # 4. Create the final, clean dataframe (assign works on a fresh copy)
    processed_df = df[['artist_name', 'title']].assign(tags=tags)

    # 5. Save the result. exist_ok=True avoids the check-then-create race;
    #    the prior isdir() check is only used to decide whether to report it.
    output_dir_existed = os.path.isdir(OUTPUT_DIR)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    if not output_dir_existed:
        print(f"Created directory: {OUTPUT_DIR}")

    processed_df.to_csv(OUTPUT_CSV_PATH, index=False)

    print("\n--- ✅ Music Preprocessing Complete ---")
    print(f"Processed music data saved to: {OUTPUT_CSV_PATH}")
    print("----------------------------------")
    print("\nSample of the final data:")
    print(processed_df.head())

# Run the preprocessing pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    process_music_data()

Attempting to load music dataset from: ../data/music/Music_Info_with_Mood.csv
✅ Dataset loaded successfully. Cleaning and processing...

--- ✅ Music Preprocessing Complete ---
Processed music data saved to: ../data/music_processed/processed_music_tags.csv
----------------------------------

Sample of the final data:
              artist_name             title            tags
2                 Nirvana   Come as You Are   rnb energetic
4               Radiohead             Creep   rnb energetic
8                Coldplay     The Scientist  rock energetic
10  Red Hot Chili Peppers  Under the Bridge         pop fun
12               Gorillaz    Feel Good Inc.   rnb energetic
