In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
Data = pd.read_csv('out/pitchfork_reviews.csv')
Data.head()


Unnamed: 0,artist,album,score,genre,label,reviewer,year,review_date,length,url
0,Jeff Tweedy,Twilight Override,8.0,Rock,dBpm,Elizabeth Nelson,2025.0,2025-09-27,1262,https://www.pitchfork.com/reviews/albums/jeff-...
1,Geese,Getting Killed,9.0,Rock,Partisan / Play It Again Sam,Sam Sodomsky,2025.0,2025-09-26,827,https://www.pitchfork.com/reviews/albums/geese...
2,Piotr Kurek,Songs and Bodies,7.8,Experimental,Unsound,Philip Sherburne,2025.0,2025-09-26,602,https://www.pitchfork.com/reviews/albums/piotr...
3,Paul St. Hilaire,w/ the Producers,7.1,Electronic,Kynant / N.E.W.S.,Daniel Bromfield,2025.0,2025-09-26,556,https://www.pitchfork.com/reviews/albums/paul-...
4,Ø,Sysivalo,8.3,Electronic,Sähkö,Philip Sherburne,2025.0,2025-09-25,1170,https://www.pitchfork.com/reviews/albums/o-sys...


In [3]:
Data['genre'].head(20)

0                                         Rock
1                                         Rock
2                                 Experimental
3                                   Electronic
4                                   Electronic
5                                   Electronic
6                            Electronic / Rock
7                                          Rap
8                                         Rock
9                                         Rock
10                                Experimental
11    Electronic / Experimental / Folk/Country
12                                     Pop/R&B
13                                Folk/Country
14                                  Electronic
15                                  Electronic
16                                        Rock
17                                        Rock
18                                        Rock
19                                     Pop/R&B
Name: genre, dtype: object

In [4]:
Data['genre'].unique()

array(['Rock', 'Experimental', 'Electronic', 'Electronic / Rock', 'Rap',
       'Electronic / Experimental / Folk/Country', 'Pop/R&B',
       'Folk/Country', 'Pop/R&B / Electronic / Rap', 'Jazz',
       'Pop/R&B / Rock', 'Metal', 'Electronic / Pop/R&B',
       'Jazz / Rock / Experimental', 'Metal / Rock', 'Pop/R&B / Jazz',
       'Which?', 'Folk/Country / Rock', 'Rap / Rock',
       'Rock / Experimental', 'Experimental / Pop/R&B', 'Rap / Pop/R&B',
       'Rap / Experimental', 'Electronic / Experimental',
       'Electronic / Rap', 'Experimental / Electronic',
       'Electronic / Rock / Experimental', '+ambién',
       'Folk/Country / Pop/R&B', 'Rock / Electronic / Folk/Country',
       'Rap / Electronic', 'Electronic / Experimental / Jazz',
       'Pop/R&B / Electronic', 'Jazz / Experimental',
       'Experimental / Rap', 'Rock / Electronic',
       'Experimental / Folk/Country', 'Rock / Experimental / Jazz',
       'Experimental / Rock', 'Rock / Pop/R&B', 'Experimental / Jazz',
    

In [5]:
Data2 = Data.copy()  # keep original safe

Data2['genre_clean'] = (
    Data2['genre']
    .astype(str) # make sure it's a string
    .str.replace('⁄', '/', regex=False)  # in case some rows use a weird slash
    .str.strip() # remove spaces at start/end
)

Data2[['genre', 'genre_clean']].head(20)

Unnamed: 0,genre,genre_clean
0,Rock,Rock
1,Rock,Rock
2,Experimental,Experimental
3,Electronic,Electronic
4,Electronic,Electronic
5,Electronic,Electronic
6,Electronic / Rock,Electronic / Rock
7,Rap,Rap
8,Rock,Rock
9,Rock,Rock


In [6]:
Data2['genre_clean'] = Data2['genre_clean'].replace({
    'Pop/R&B': 'Pop_R&B',
    'Folk/Country': 'Folk_Country'
})

In [7]:
Data2['genre_list'] = Data2['genre_clean'].apply(
    lambda x: [p.strip() for p in x.split('/') if p.strip() != ""]
) # take every value in genre_clean, split on /, trim spaces, ignore empty pieces, store the result in a new column called genre_list

Data2[['genre_clean', 'genre_list']].head(25)

Unnamed: 0,genre_clean,genre_list
0,Rock,[Rock]
1,Rock,[Rock]
2,Experimental,[Experimental]
3,Electronic,[Electronic]
4,Electronic,[Electronic]
5,Electronic,[Electronic]
6,Electronic / Rock,"[Electronic, Rock]"
7,Rap,[Rap]
8,Rock,[Rock]
9,Rock,[Rock]


In [8]:
valid_genres = {
    'rock': 'Rock',
    'electronic': 'Electronic',
    'experimental': 'Experimental',
    'rap': 'Rap',
    'pop_r&b': 'Pop/R&B',       # remember: underscore version
    'folk_country': 'Folk/Country',
    'jazz': 'Jazz',
    'metal': 'Metal',
    'global': 'Global'
}

# Goes through each list and keep only the ones in valid_genres
def keep_only_real_genres(parts):
    kept = []
    for p in parts:
        key = p.lower()
        if key in valid_genres:
            kept.append(valid_genres[key])   # store the nice version
    # remove duplicates but keep order
    return list(dict.fromkeys(kept))

Data2['genre_cleaned'] = Data2['genre_list'].apply(keep_only_real_genres)
Data2[['genre', 'genre_list', 'genre_cleaned']].head(30)

Unnamed: 0,genre,genre_list,genre_cleaned
0,Rock,[Rock],[Rock]
1,Rock,[Rock],[Rock]
2,Experimental,[Experimental],[Experimental]
3,Electronic,[Electronic],[Electronic]
4,Electronic,[Electronic],[Electronic]
5,Electronic,[Electronic],[Electronic]
6,Electronic / Rock,"[Electronic, Rock]","[Electronic, Rock]"
7,Rap,[Rap],[Rap]
8,Rock,[Rock],[Rock]
9,Rock,[Rock],[Rock]


In [9]:

Data2['genre_clean'] = (
    Data2['genre']
    .astype(str)
    .str.replace('⁄', '/', regex=False)
    .str.strip()
)

Data2['genre_clean'] = (
    Data2['genre_clean']
    .str.replace('Folk/Country', 'Folk_Country', regex=False)
    .str.replace('Pop/R&B', 'Pop_R&B', regex=False)
)

Data2['genre_list'] = Data2['genre_clean'].apply(
    lambda x: [p.strip() for p in x.split('/') if p.strip() != ""]
)

valid_genres = {
    'rock': 'Rock',
    'electronic': 'Electronic',
    'experimental': 'Experimental',
    'rap': 'Rap',
    'pop_r&b': 'Pop/R&B',
    'folk_country': 'Folk/Country',
    'jazz': 'Jazz',
    'metal': 'Metal',
    'global': 'Global'
}

def keep_only_real_genres(parts):
    kept = []
    for p in parts:
        key = p.lower()
        if key in valid_genres:
            kept.append(valid_genres[key])
    return list(dict.fromkeys(kept))

Data2['genre_cleaned'] = Data2['genre_list'].apply(keep_only_real_genres)

Data2[['genre', 'genre_list', 'genre_cleaned']].head(30)

Unnamed: 0,genre,genre_list,genre_cleaned
0,Rock,[Rock],[Rock]
1,Rock,[Rock],[Rock]
2,Experimental,[Experimental],[Experimental]
3,Electronic,[Electronic],[Electronic]
4,Electronic,[Electronic],[Electronic]
5,Electronic,[Electronic],[Electronic]
6,Electronic / Rock,"[Electronic, Rock]","[Electronic, Rock]"
7,Rap,[Rap],[Rap]
8,Rock,[Rock],[Rock]
9,Rock,[Rock],[Rock]


In [10]:
all_genres = set()

for lst in Data2['genre_cleaned']:
    for g in lst:
        all_genres.add(g)

sorted(all_genres)

['Electronic',
 'Experimental',
 'Folk/Country',
 'Global',
 'Jazz',
 'Metal',
 'Pop/R&B',
 'Rap',
 'Rock']

In [11]:
Data2['genre_len'] = Data2['genre_cleaned'].apply(len)
Data2['genre_len'].value_counts()

genre_len
1    7834
2     798
0     210
3      82
4       2
Name: count, dtype: int64

In [12]:
Data2[Data2['genre_len'] == 0][['genre']].head(50)

Unnamed: 0,genre
97,Which?
216,+ambién
518,Reprise / Southern
669,topo2
688,Naya Beat
728,Music From Memory
745,Light in the Attic
770,forms of minutiae
810,American Dreams
865,Rawkus


In [13]:
mask = Data2['genre_len'] == 0
# Replace with ['Unknown'] so the column stays a list
Data2.loc[mask, 'genre_cleaned'] = [['Unknown']] * mask.sum()

In [14]:
Data2.loc[mask, ['genre', 'genre_cleaned']].head(10)
# See how many 'Unknown' we have now
Data2['genre_cleaned'].value_counts().head(10)

genre_cleaned
[Rock]                        2676
[Rap]                         1406
[Electronic]                  1141
[Pop/R&B]                     1033
[Experimental]                 735
[Folk/Country]                 400
[Jazz]                         241
Unknown                        210
[Metal]                        182
[Electronic, Experimental]      81
Name: count, dtype: int64

In [15]:
def ensure_list(x):
    if isinstance(x, list):
        return x
    elif pd.isna(x):
        return []
    else:
        return [x]

Data2['genre_cleaned'] = Data2['genre_cleaned'].apply(ensure_list)
Data2['genre_cleaned'].value_counts().head(10)

genre_cleaned
[Rock]                        2676
[Rap]                         1406
[Electronic]                  1141
[Pop/R&B]                     1033
[Experimental]                 735
[Folk/Country]                 400
[Jazz]                         241
[Unknown]                      210
[Metal]                        182
[Electronic, Experimental]      81
Name: count, dtype: int64

In [17]:
def get_main_genre(val):
    if isinstance(val, list):
        if len(val) == 0:
            return 'Unknown'
        else:
            return val[0]   # take the first genre
    elif isinstance(val, str):
        return val
    else:
        return 'Unknown'

Data2['main_genre'] = Data2['genre_cleaned'].apply(get_main_genre)
Data2['main_genre'].value_counts()

main_genre
Rock            2781
Rap             1471
Electronic      1387
Pop/R&B         1162
Experimental     909
Folk/Country     462
Jazz             309
Unknown          210
Metal            203
Global            32
Name: count, dtype: int64

I cleaned the genre column by fixing formatting, removing label-only values, standardizing real genres, replacing invalid cases with ‘Unknown,’ and creating a main_genre column based on the first genre originally listed for each album.

In [18]:
Data2[['artist', 'album', 'genre', 'genre_cleaned', 'main_genre']].sample(10, random_state=42)


Unnamed: 0,artist,album,genre,genre_cleaned,main_genre
6170,Together Pangea,Dispassionate EP,Rock,[Rock],Rock
5832,Terry Riley,Sun Rings,Experimental,[Experimental],Experimental
286,Smerz,Big city life,Pop/R&B,[Pop/R&B],Pop/R&B
8484,Portugal. The Man,Woodstock,Rock,[Rock],Rock
5684,Sufjan Stevens,The Decalogue,Folk/Country / Experimental,"[Folk/Country, Experimental]",Folk/Country
761,Leon Bridges,Leon,Pop/R&B,[Pop/R&B],Pop/R&B
3684,Nite Jewel,No Sun,Pop/R&B,[Pop/R&B],Pop/R&B
7879,Francis and the Lights,Just for Us,Pop/R&B,[Pop/R&B],Pop/R&B
1421,Frost Children,Hearth Room,Electronic,[Electronic],Electronic
1543,Drake,For All the Dogs,Rap,[Rap],Rap


In [19]:
Data2['main_genre'].value_counts()

main_genre
Rock            2781
Rap             1471
Electronic      1387
Pop/R&B         1162
Experimental     909
Folk/Country     462
Jazz             309
Unknown          210
Metal            203
Global            32
Name: count, dtype: int64

In [20]:
Data2.to_csv('out/pitchfork_reviews_clean_MAO.csv', index=False)