In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# Path to the collection.nml file
file_path = '../data/collection.nml'

# Parse the XML file
tree = ET.parse(file_path)
root = tree.getroot()

# Print the structure of the XML to understand it better
print(f"Root tag: {root.tag}")
print(f"Root attributes: {root.attrib}")
print(f"Child tags: {[child.tag for child in root]}")

# Find all ENTRY elements
entries = root.findall('.//ENTRY')
print(f"Number of entries found: {len(entries)}")

# If entries are found, print the first one to see its structure
if entries:
    first_entry = entries[0]
    print("\nFirst entry attributes:", first_entry.attrib)
    print("First entry children:", [child.tag for child in first_entry])

Root tag: NML
Root attributes: {'VERSION': '19'}
Child tags: ['HEAD', 'MUSICFOLDERS', 'COLLECTION', 'SETS', 'PLAYLISTS', 'INDEXING']
Number of entries found: 4151

First entry attributes: {'MODIFIED_DATE': '2024/12/11', 'MODIFIED_TIME': '28725', 'AUDIO_ID': 'AQ0BNUMjUjIiMjMjJmRDVTRDNTQ1RWR1RjdGVGVTd2Y3RlSVVGY4NIZqp1M1ZiV2dlRFiYZ2iGJYiIaYV3VjVkdIWWWGY1dHN4ZUg2dplXuXZVeHiXd3qXaHbMyZzKiMmZd3WranyIp9bKmYnLda3K67u/6ZmHUjZ1J2Z2RFWHVFaIUmZmZ5MREREkERERETIRERESMRARERREeIZ4aIrKef/LZ4d824m5iLy8dol2hmd4h3mXeGiHqp7pmJiHvsc3dDIzIjIzIiUyMlQzREQzM1ZXh2MlZjSHeFNEZ2RViGJGdnWIQyI0IzMzVjM0VUQzQkM0Q0IiQREREREQEREA==', 'TITLE': 'try again (a cappella)', 'ARTIST': 'aaliyah'}
First entry children: ['LOCATION', 'ALBUM', 'MODIFICATION_INFO', 'INFO', 'TEMPO', 'LOUDNESS', 'MUSICAL_KEY', 'CUE_V2']


In [2]:
# Extract artist and title from all entries
songs = []
for entry in entries:
    artist = entry.get('ARTIST', '')
    title = entry.get('TITLE', '')
    if artist and title:  # Only include entries that have both artist and title
        songs.append({'Artist': artist, 'Title': title})

# Create a DataFrame for better visualization
songs_df = pd.DataFrame(songs)

# Display the number of songs found
print(f"Found {len(songs_df)} songs with both artist and title information")

# Display the first 10 songs
songs_df.head(10)

Found 3003 songs with both artist and title information


Unnamed: 0,Artist,Title
0,aaliyah,try again (a cappella)
1,Absolute Zero & Subphonics,The Code
2,Adam F,Brand New Funk
3,Alix Perez,Down The Line (feat. MC Fats)
4,Alix Perez,Fade Away
5,Alix Perez,Forsaken feat. Peven Everett & SpectraSoul
6,Alix Perez,Revolve-Her
7,Alix Perez,Never Left
8,Alix Perez,The Cut Deepens ft. Foreign Beggars
9,Ancronix,Skin it Back


In [3]:
# Function to filter songs by artist or title
def filter_songs(df, artist=None, title=None):
    """
    Filter songs by artist or title (case-insensitive).
    
    Parameters:
    df (DataFrame): DataFrame containing songs
    artist (str): Artist name to filter by (optional)
    title (str): Title to filter by (optional)
    
    Returns:
    DataFrame: Filtered DataFrame
    """
    filtered_df = df.copy()
    
    if artist:
        filtered_df = filtered_df[filtered_df['Artist'].str.lower().str.contains(artist.lower())]
    
    if title:
        filtered_df = filtered_df[filtered_df['Title'].str.lower().str.contains(title.lower())]
    
    return filtered_df

# Example: Filter songs by artist
artist_search = "Alix Perez"  # Replace with any artist you want to search for
filtered_by_artist = filter_songs(songs_df, artist=artist_search)
print(f"Found {len(filtered_by_artist)} songs by {artist_search}")
filtered_by_artist.head(10)

Found 27 songs by Alix Perez


Unnamed: 0,Artist,Title
3,Alix Perez,Down The Line (feat. MC Fats)
4,Alix Perez,Fade Away
5,Alix Perez,Forsaken feat. Peven Everett & SpectraSoul
6,Alix Perez,Revolve-Her
7,Alix Perez,Never Left
8,Alix Perez,The Cut Deepens ft. Foreign Beggars
94,Alix Perez,Annie's Song (S.P.Y remix)
546,Alix Perez Feat. Foreign Beggars,Dark Days Feat. Foreign Beggars (Original Mix)
764,"Alix Perez, Skeptical",Without a Trace (Original Mix)
792,Alix Perez,Myriads (Jubei Remix)


In [4]:
# Example: Filter songs by title
title_search = "Fade"  # Replace with any title you want to search for
filtered_by_title = filter_songs(songs_df, title=title_search)
print(f"Found {len(filtered_by_title)} songs with '{title_search}' in the title")
filtered_by_title.head(10)

Found 3 songs with 'Fade' in the title


Unnamed: 0,Artist,Title
4,Alix Perez,Fade Away
924,"Enei, Charli Brix",Faded feat. Charli Brix (Original Mix)
1455,"LSB, Drs",Faded (Workforce Remix)


In [5]:
# Example: Filter songs by both artist and title
artist_search = "Alix"
title_search = "Fade"
filtered_combined = filter_songs(songs_df, artist=artist_search, title=title_search)
print(f"Found {len(filtered_combined)} songs by '{artist_search}' with '{title_search}' in the title")
filtered_combined

# Save the full list to a CSV file
songs_df.to_csv('../data/songs_list.csv', index=False)
print(f"Saved full list of {len(songs_df)} songs to '../data/songs_list.csv'")

Found 1 songs by 'Alix' with 'Fade' in the title
Saved full list of 3003 songs to '../data/songs_list.csv'


In [6]:
# Extract more detailed information including genre and album
detailed_songs = []
for entry in entries:
    artist = entry.get('ARTIST', '')
    title = entry.get('TITLE', '')
    
    # Initialize with empty values
    album = ""
    genre = ""
    
    # Get album information
    album_elem = entry.find('./ALBUM')
    if album_elem is not None and 'TITLE' in album_elem.attrib:
        album = album_elem.attrib['TITLE']
    
    # Get genre information
    info_elem = entry.find('./INFO')
    if info_elem is not None and 'GENRE' in info_elem.attrib:
        genre = info_elem.attrib['GENRE']
    
    if artist and title:  # Only include entries that have both artist and title
        detailed_songs.append({
            'Artist': artist, 
            'Title': title,
            'Album': album,
            'Genre': genre
        })

# Create a DataFrame for better visualization
detailed_df = pd.DataFrame(detailed_songs)

# Display the number of songs found
print(f"Found {len(detailed_df)} songs with detailed information")

# Display the first 10 songs with detailed information
detailed_df.head(10)

# Save the detailed list to a CSV file
detailed_df.to_csv('../data/detailed_songs_list.csv', index=False)
print(f"Saved detailed list of {len(detailed_df)} songs to '../data/detailed_songs_list.csv'")

Found 3003 songs with detailed information
Saved detailed list of 3003 songs to '../data/detailed_songs_list.csv'


In [7]:
# Basic statistics and analysis

# Count unique artists
unique_artists = detailed_df['Artist'].nunique()
print(f"Number of unique artists: {unique_artists}")

# Top 10 artists by number of songs
top_artists = detailed_df['Artist'].value_counts().head(10)
print("\nTop 10 artists by number of songs:")
print(top_artists)

# Top 10 genres
if 'Genre' in detailed_df.columns:
    genre_counts = detailed_df['Genre'].value_counts().head(10)
    print("\nTop 10 genres:")
    print(genre_counts)

# Visualize top artists (if matplotlib is available)
try:
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(12, 6))
    top_artists.plot(kind='bar')
    plt.title('Top 10 Artists by Number of Songs')
    plt.xlabel('Artist')
    plt.ylabel('Number of Songs')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
except ImportError:
    print("\nMatplotlib not available for visualization")

# Summary of the collection
print("\nSummary of the music collection:")
print(f"Total songs: {len(detailed_df)}")
print(f"Unique artists: {unique_artists}")
print(f"Unique albums: {detailed_df['Album'].nunique()}")
if 'Genre' in detailed_df.columns:
    print(f"Unique genres: {detailed_df['Genre'].nunique()}")

Number of unique artists: 1060

Top 10 artists by number of songs:
Artist
Total Science              74
Ed Rush & Optical          66
Bad Company                59
S.P.Y                      56
Spirit                     37
Calibre                    37
Cause 4 Concern            36
Dillinja                   35
Break                      34
Artificial Intelligence    32
Name: count, dtype: int64

Top 10 genres:
Genre
Drum & Bass          2080
                      589
Electronic            129
Electro                33
Jungle                 28
Dance                  26
Drum And Bass          26
Drum n Bass            12
Drum and Bass          10
Drum & Bass Other      10
Name: count, dtype: int64

Matplotlib not available for visualization

Summary of the music collection:
Total songs: 3003
Unique artists: 1060
Unique albums: 1280
Unique genres: 38
