# JSON READER FOR DEEZER DATASETS

In [1]:
import json
import pandas as pd

# Disclaimer: this cell processes a JSON file that maps each key to a list of genre names.
# It is not tied to one dataset: point `file_path` at the RO, HU or HR genres file as needed.

# Path to the JSON file
file_path = 'RO_genres.json'

# Load JSON data from the file. The encoding is passed explicitly so the result
# does not depend on the platform's default text encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Replace spaces with underscores in all keys and in every genre name
updated_data = {
    key.replace(' ', '_'): [genre.replace(' ', '_') for genre in value]
    for key, value in json_data.items()
}

# Flatten the lists of genres and create a set to ensure uniqueness
unique_genres_set = set(genre for genres_list in updated_data.values() for genre in genres_list)

# Convert the set back to a list. The list is sorted because the iteration order
# of a set of strings changes from one Python process to the next, which would
# otherwise shuffle the feature columns every time the notebook is re-run.
ALL_GENRES = sorted(unique_genres_set)

# Collect [key, number_of_genres] pairs (kept for inspection; nothing is printed here)
list_lengths = [[key, len(value)] for key, value in updated_data.items()]

# From here on `json_data` refers to the normalized mapping
json_data = updated_data

# Encode every entry as a 0/1 flag per genre in ALL_GENRES, followed by its
# class label (the first genre of its list).
converted_data = {}
classes = []
for key, genres_list in json_data.items():
    primary_genre = genres_list[0]
    genre_flags = [int(genre in genres_list) for genre in ALL_GENRES]
    converted_data[key] = genre_flags + [primary_genre]
    classes.append(primary_genre)

output_file_path = "RO.content"
# Write one tab-separated line per entry: the key, then the 0/1 genre flags,
# then the class label that was appended to the sequence.
with open(output_file_path, 'w') as content_file:
    content_file.writelines(
        '\t'.join(str(item) for item in (key, *binary_sequence)) + '\n'
        for key, binary_sequence in converted_data.items()
    )

In [2]:
# Re-export the edge list as a tab-separated text file (no index column)
output_file_path = 'RO.cites'
ro_data = pd.read_csv('RO_edges.csv')
ro_data.to_csv(path_or_buf=output_file_path, sep='\t', index=False)

### Check whether the labels are balanced; if they are not, continue with the reduction steps below

In [3]:
from collections import Counter
import random

# Tally how many entries carry each class label (the first genre of each entry)
genre_counts = Counter(classes)

# One "label: count" line per label, in first-seen order
for genre in genre_counts:
    print(f"{genre}: {genre_counts[genre]}")

Films/Games: 4154
Reggae: 1973
Pop: 5658
Dance: 9029
Alternative: 1444
R&B: 1178
International_Pop: 2416
Rap/Hip_Hop: 4440
Indie_Rock: 2206
Dancefloor: 1508
Electro_Hip_Hop: 184
Classical: 397
Hard_Rock: 441
Electro: 645
Jazz: 224
Comedy: 257
Metal: 276
Film_Scores: 716
Spirituality_&_Religion: 143
Kids: 307
Techno/House: 795
Country: 127
Indie_Pop/Folk: 529
Dirty_South: 502
Folk: 91
Blues: 83
Latin_Music: 350
Urban_Cowboy: 13
Rock: 530
Indie_Pop: 164
Disco: 97
Rock_&_Roll/Rockabilly: 163
Opera: 215
Singer_&_Songwriter: 91
Modern: 5
Indie_Rock/Rock_pop: 53
Indian_Music: 5
Dancehall/Ragga: 59
Soul_&_Funk: 32
Soundtracks: 47
Classical_Period: 12
Oldschool_R&B: 18
Asian_Music: 28
Chill_Out/Trip-Hop/Lounge: 16
East_Coast: 1
Bolero: 8
Brazilian_Music: 44
Instrumental_jazz: 22
Trance: 7
Tropical: 7
Contemporary_R&B: 3
Alternative_Country: 4
Jazz_Hip_Hop: 13
Grime: 7
Chicago_Blues: 1
Baroque: 6
TV_Soundtracks: 6
Dubstep: 2
African_Music: 6
Electric_Blues: 3
Vocal_jazz: 1
Sports: 2
Country_Blu

# REDUCE DEEZER DATASET

## Group every genre with fewer than 360 observations into a single "Others" genre

In [4]:
# Count the occurrences of each class label (the first genre of every entry)
genre_counts = Counter(classes)

# Labels seen fewer than `threshold` times are merged into "Others"
threshold = 360

# Identify genres to aggregate
genres_to_aggregate = [genre for genre, count in genre_counts.items() if count < threshold]
# Set copy of the same names, used for constant-time membership tests below
rare_genres = set(genres_to_aggregate)

# Genres that are not aggregated, kept in ALL_GENRES order
# (a plain set difference would yield them in an arbitrary order)
new_ALL_GENRES = [genre for genre in ALL_GENRES if genre not in rare_genres]

# Only the first genre of each entry is used as its label, so only that one
# needs to be mapped to 'Others'.
clases_updated = []
for key, genres_list in json_data.items():
    first_genre = genres_list[0]
    clases_updated.append('Others' if first_genre in rare_genres else first_genre)

# Print the counts for each genre after grouping, formatted as dict entries
# so the output can be pasted into a dict literal
updated_genre_counts = Counter(clases_updated)
for genre, count in updated_genre_counts.items():
    print(f"'{genre}': {count},")


'Films/Games': 4154,
'Reggae': 1973,
'Pop': 5658,
'Dance': 9029,
'Alternative': 1444,
'R&B': 1178,
'International_Pop': 2416,
'Rap/Hip_Hop': 4440,
'Indie_Rock': 2206,
'Dancefloor': 1508,
'Others': 3212,
'Classical': 397,
'Hard_Rock': 441,
'Electro': 645,
'Film_Scores': 716,
'Techno/House': 795,
'Indie_Pop/Folk': 529,
'Dirty_South': 502,
'Rock': 530,


In [5]:

# COPY HERE THE OUTPUT OF THE PREVIOUS CELL
genre_counts = {
    'Films/Games': 4154,
    'Reggae': 1973,
    'Pop': 5658,
    'Dance': 9029,
    'Alternative': 1444,
    'R&B': 1178,
    'International_Pop': 2416,
    'Rap/Hip_Hop': 4440,
    'Indie_Rock': 2206,
    'Dancefloor': 1508,
    'Others': 3212,
    'Classical': 397,
    'Hard_Rock': 441,
    'Electro': 645,
    'Film_Scores': 716,
    'Techno/House': 795,
    'Indie_Pop/Folk': 529,
    'Dirty_South': 502,
    'Rock': 530,
}

# GROUP THE GENRES IN THE CATEGORIES YOU WANT
genre_groups = {
    'Films/Games': 'Films',
    'Reggae': 'Reggae',
    'Pop': 'Pop',
    'Dance': 'Electronic',
    'Alternative': 'Alternative',
    'R&B': 'Alternative',
    'International_Pop': 'Pop',
    'Rap/Hip_Hop': 'Hip-Hop',
    'Indie_Rock': 'Rock',
    'Dancefloor': 'Electronic',
    'Classical': 'Films',
    'Hard_Rock': 'Rock',
    'Electro': 'Electronic',
    'Film_Scores': 'Films',
    'Techno/House': 'Electronic',
    'Indie_Pop/Folk': 'Alternative',
    'Dirty_South': 'Hip-Hop',
    'Rock': 'Rock',
}

# Add each genre's count to the group it maps to; a genre without an entry
# in genre_groups is counted under 'Others'.
group_counts = Counter()
for genre in genre_counts:
    group_counts[genre_groups.get(genre, 'Others')] += genre_counts[genre]

# Target size per group: the mean group size, rounded down
target_count = sum(group_counts.values()) // len(group_counts)

# Group names in the order they were first encountered
new_ALL_GENRES = list(group_counts)

# Per-group tally, starting at zero for every group
max_count = dict.fromkeys(new_ALL_GENRES, 0)

# Print the final group counts
for group in new_ALL_GENRES:
    print(f"{group}: {group_counts[group]}")

# Rebuild the encoded data with the grouped labels, keeping at most
# `target_count` entries for every group whose total reaches the target.
updated_converted_data = {}
clases_updated = []

for key, genres_list in json_data.items():
    # Map every genre to its group; unmapped genres become 'Others'
    labels = [genre_groups.get(genre, 'Others') for genre in genres_list]
    primary_label = labels[0]  # the first label is the class of this entry

    # Groups below the target are kept in full; the others are capped
    is_capped_group = group_counts[primary_label] >= target_count
    if is_capped_group and max_count[primary_label] >= target_count:
        continue  # cap already reached for this group: drop the entry
    if is_capped_group:
        max_count[primary_label] += 1

    # 0/1 flag per group in new_ALL_GENRES order, followed by the class label
    group_flags = [int(group in labels) for group in new_ALL_GENRES]
    group_flags.append(primary_label)

    clases_updated.append(primary_label)
    updated_converted_data[key] = group_flags



Films: 5267
Reggae: 1973
Pop: 8074
Electronic: 11977
Alternative: 3151
Hip-Hop: 4942
Rock: 3177
Others: 3212


In [6]:
# Display the group names; their order is the column order of the 0/1 flags
# in the reduced feature vectors.
new_ALL_GENRES

['Films',
 'Reggae',
 'Pop',
 'Electronic',
 'Alternative',
 'Hip-Hop',
 'Rock',
 'Others']

## Final dataset with balanced labels

In [7]:

# Tally how many of the kept entries carry each group label
genre_counts_updated = Counter(clases_updated)

# One "label: count" line per label, in first-seen order
for genre in genre_counts_updated:
    print(f"{genre}: {genre_counts_updated[genre]}")

Films: 5221
Reggae: 1973
Pop: 5221
Electronic: 5221
Alternative: 3151
Hip-Hop: 4942
Rock: 3177
Others: 3212


In [8]:
output_file_path = "RO_reduced.content"
# Write one tab-separated line per kept entry: the key, then the 0/1 group
# flags, then the class label that was appended to the sequence.
with open(output_file_path, 'w') as reduced_file:
    reduced_file.writelines(
        '\t'.join(str(item) for item in (key, *binary_sequence)) + '\n'
        for key, binary_sequence in updated_converted_data.items()
    )

In [9]:
ro_data = pd.read_csv('RO_edges.csv')
# Specify the output file path
output_file_path = 'RO_reduced.cites'

# The reduced content file only holds the entries kept in `updated_converted_data`.
# Drop every edge that touches a removed entry; otherwise this file would be a
# copy of the full edge list and would reference nodes absent from the
# reduced content file.
# NOTE(review): assumes the first two CSV columns are the edge endpoints and that
# their values, rendered as strings, match the JSON keys - confirm against the data.
kept_nodes = set(updated_converted_data)
endpoints = ro_data.iloc[:, :2].astype(str)
keep_edge = endpoints.isin(kept_nodes).all(axis=1)

# Fail loudly instead of silently writing an empty edge list when no id matches
if len(ro_data) and not keep_edge.any():
    raise ValueError(
        "No edge has both endpoints among the kept entries; "
        "check that the ids in RO_edges.csv match the keys of the genres JSON."
    )
ro_data = ro_data[keep_edge]

# Write the filtered edge list to a tab-separated text file
ro_data.to_csv(output_file_path, sep='\t', index=False)