# Data collection and cleaning

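This notebook loads the raw user, top-artist, top-track and top-album tables, keeps only the most active users, removes duplicates, normalises the text and playcount columns, and joins the user attributes onto each table.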

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Read the four raw CSV exports: the user list, followed by each user's
# top artists, top tracks and top albums (in that order).
users, top_artists, top_tracks, top_albums = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/users.csv",
        "data/user_top_artists.csv",
        "data/user_top_tracks.csv",
        "data/user_top_albums.csv",
    )
)

In [8]:
# Show the first rows of every table under a caption.
# sep="\n" puts the same line break between caption and frame that two
# consecutive print() calls would, so the text output is unchanged.
print("Users table\n", users.head(), sep="\n")
print("\nUsers top artists' table\n", top_artists.head(), sep="\n")
print("\nUsers top tracks' table\n", top_tracks.head(), sep="\n")
print("\nUsers top albums' table\n", top_albums.head(), sep="\n")

Users table

   user_id         country  total_scrobbles
0        1   United States           204985
1        2          France           211013
2        3           Spain           244871
3        4          Israel            69401
4        5  United Kingdom            95449

Users top artists' table

   user_id  rank       artist_name  playcount  \
0        1     1   Crystal Castles       1034   
1        1     2         Radiohead        972   
2        1     3          Ladytron        831   
3        1     4  Ghostface Killah        801   
4        1     5             UNKLE        722   

                                   mbid  
0  b1570544-93ab-4b2b-8398-131735394202  
1  a74b1b7f-71a5-4011-9441-d0b5e4122711  
2  b45335d1-5219-4262-a44d-936aa36eeaed  
3  3b39abeb-0064-4eed-9ddd-ee47a45c54cb  
4  6648391e-7890-4f6c-b939-976f215195d3  

Users top tracks' table

   user_id  rank               track_name          artist_name  playcount  \
0        1     1                Ice Cream  New

In [23]:
# Keep only users with more than 500,000 total scrobbles; .copy() detaches
# the result from the unfiltered frame.
users = users.loc[users["total_scrobbles"].gt(500000)].copy()
top_user_ids = set(users["user_id"])

# Restrict every top-N table to the rows belonging to those users.
top_artists, top_tracks, top_albums = (
    frame[frame["user_id"].isin(top_user_ids)]
    for frame in (top_artists, top_tracks, top_albums)
)

In [24]:
# Remove repeated rows, keyed on the columns that identify one entry in each
# table. The first occurrence of each key is kept (drop_duplicates default).
#
# Track and album titles are not unique across artists, so the artist is part
# of the key for those tables: keying on (user_id, title) alone would discard
# a different artist's track/album that merely shares a title.
#
# NOTE(review): this cell runs before the text-normalisation cell, so names
# that differ only in case or surrounding whitespace are not treated as
# duplicates here.
users = users.drop_duplicates(subset="user_id")
top_artists = top_artists.drop_duplicates(subset=["user_id", "artist_name"])
top_tracks = top_tracks.drop_duplicates(
    subset=["user_id", "artist_name", "track_name"]
)
top_albums = top_albums.drop_duplicates(
    subset=["user_id", "artist_name", "album_name"]
)

In [25]:
def clean_text(text):
    """Normalise a free-text name: trim surrounding whitespace and lower-case it.

    Parameters
    ----------
    text : object
        Usually a ``str``. Missing scalars (``None``, ``NaN``, ``pd.NA``) are
        accepted, as are non-string scalars such as a name that was parsed
        as a number.

    Returns
    -------
    str
        ``""`` when ``text`` is missing; otherwise its string form with
        leading/trailing whitespace removed, in lower case.
    """
    if pd.isna(text):
        return ""
    # str() keeps non-string scalars from raising AttributeError on .strip();
    # for values that are already str it changes nothing.
    return str(text).strip().lower()

# Run clean_text over the name columns of each table, writing the cleaned
# values back into the same column of the same frame.
for df, cols in zip(
    (top_artists, top_tracks, top_albums),
    (
        ["artist_name"],
        ["artist_name", "track_name"],
        ["artist_name", "album_name"],
    ),
):
    for col in cols:
        df[col] = df[col].map(clean_text)

# An empty-string mbid carries no identifier; mark it as missing (pd.NA).
for df in (top_artists, top_tracks, top_albums):
    df["mbid"] = df["mbid"].replace("", pd.NA)

In [26]:
# Normalize playcount per user (so they sum to 1)
def normalize_playcount(df, group_col="user_id"):
    """Add each row's share of its group's total playcount.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``playcount`` column and the ``group_col`` column.
    group_col : str, default "user_id"
        Column whose values define the groups to normalise within.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows and columns of ``df`` plus
        ``playcount_norm``: ``playcount`` divided by the sum of ``playcount``
        over the row's group. Within a group whose total is non-zero these
        values sum to 1. ``df`` itself is not modified.
    """
    group_totals = df.groupby(group_col)["playcount"].transform("sum")
    # assign() builds a new frame instead of writing into the caller's
    # object, which may be a filtered slice of another frame.
    return df.assign(playcount_norm=df["playcount"] / group_totals)

# For each top-N table: add the per-user normalised playcount, then left-join
# the users table on user_id so every row also carries its user's columns.
# NOTE(review): the results are assigned back to the same names, so running
# this cell a second time merges the users columns again.
top_artists, top_tracks, top_albums = (
    normalize_playcount(frame).merge(users, on="user_id", how="left")
    for frame in (top_artists, top_tracks, top_albums)
)
