# Preparation

## Imports

In [2]:
from typing import Tuple, List
import os
import time
import random
import requests
import json
import dotenv
import pymongo
import neo4j
import spotipy
from faker import Faker
from google import genai
import numpy as np

## Connect to MongoDB

In [None]:
# Load MONGODB_* variables from the local .env file, if present.
dotenv.load_dotenv()

mongo_user = os.getenv("MONGODB_USERNAME")
mongo_pass = os.getenv("MONGODB_PASSWORD")
if not mongo_user or not mongo_pass:
    # Without this check the literal text "None" would be sent as a credential.
    raise RuntimeError("MONGODB_USERNAME and MONGODB_PASSWORD must be set (e.g. in .env)")

# The credentials are handed over as keyword options instead of being
# interpolated into the URI: inside a URI, reserved characters such as
# ":", "/", "@" or "%" in a password must be percent-escaped or the URI is
# parsed incorrectly. The .env file must therefore hold the raw values.
mongo_client = pymongo.MongoClient(
    ("mongodb+srv://projeto-bd.9scqvyv.mongodb.net/"
     "?retryWrites=true&w=majority&appName=projeto-bd"),
    username = mongo_user,
    password = mongo_pass,
    server_api = pymongo.server_api.ServerApi(
        version = "1",
        strict = True,
        deprecation_errors = True
    )
)

# Database handle used by every MongoDB cell in this notebook.
mongodb_db = mongo_client["music_catalog"]

## Connect to Neo4j

In [None]:
# Load NEO4J_* variables from the local .env file, if present.
dotenv.load_dotenv()

# Driver for the remote Neo4j instance; username and password come from the
# environment.
neo4j_db = neo4j.GraphDatabase.driver(
    "neo4j+s://10ab7e50.databases.neo4j.io",
    auth=(os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD")),
)

# Check right away that the server can be reached.
neo4j_db.verify_connectivity()

## Connect to Spotify API

In [None]:
dotenv.load_dotenv()

# No client id/secret is passed explicitly here.
# NOTE(review): presumably spotipy reads them from the environment variables
# loaded above - confirm which variable names it expects.
spotify_credentials = spotipy.oauth2.SpotifyClientCredentials()
spotify_api = spotipy.Spotify(auth_manager=spotify_credentials)

## Connect to Gemini API

In [None]:
# Load GEMINI_API_KEY from the local .env file, if present.
dotenv.load_dotenv()

# `gemini` is the object returned by `chats.create`; every
# `gemini.send_message(...)` call in this notebook goes through it.
# NOTE(review): a chat object presumably carries earlier messages along with
# each new prompt - confirm that is intended for independent bio requests.
gemini = genai.Client(api_key=os.getenv("GEMINI_API_KEY")).chats.create(model="gemini-2.5-flash")

## Init Faker

In [None]:
# Shared Faker instance; used below for placeholder bios, person names and
# password hashes.
fake = Faker()

# Artists

## Create Entities

### MongoDB

In [None]:
# Create the collection explicitly; an already existing collection is only
# reported, not treated as fatal.
try:
    mongodb_db.create_collection("artists")
except pymongo.errors.CollectionInvalid as e:
    print(e)

# The index is requested unconditionally: asking for an index that already
# exists with the same definition is a no-op, whereas keeping this call inside
# the try block meant it was skipped whenever the collection already existed.
mongodb_db.artists.create_index("releases.id", unique=True)

### Neo4j

In [None]:
# Uniqueness constraints for the graph side of the artist data.
artist_constraints = (
    "CREATE CONSTRAINT FOR (a:Artist) REQUIRE a.id IS UNIQUE",
    "CREATE CONSTRAINT FOR (g:Genre) REQUIRE g.name IS UNIQUE",
    "CREATE CONSTRAINT FOR (r:Release) REQUIRE r.id IS UNIQUE",
)

# One try block per statement: with a single shared try, the first error
# (e.g. a constraint that already exists) skipped every remaining constraint.
for statement in artist_constraints:
    try:
        neo4j_db.execute_query(statement)
    except neo4j.exceptions.ClientError as e:
        print(e.message)

## Insert Ryan's Artists

### Retrieve Artists

In [3]:
# Artist identifiers to import; each one is later passed to spotify_api.artist().
with open("data/ryans_artists.json", "r") as ids_file:
    artist_ids = json.loads(ids_file.read())

artist_ids

['3Ri4H12KFyu98LMjSoij5V',
 '6vwjIs0tbIiseJMR3pqwiL',
 '6qqNVTkY8uBg9cP3Jd7DAH',
 '4zllGt4ePrRAIaWiEu5pyz',
 '1Ffb6ejR6Fe5IamqA5oRUF',
 '4yRSUmhuSJ3KcIMljdh4fH',
 '3P4vW5tzQvmuoNaFQqzy9q',
 '3l0CmX0FuQjFxr8SK7Vqag',
 '7kWnE981vITXDnAD2cZmCV',
 '6V267iCF72e4eP1L3ZkKQt',
 '4XpPveeg7RuYS3CgLo75t9',
 '0NbQe5CNgh4YApOCDuHSjb',
 '6qpmcQfxhmNxNMCJUrpHwe',
 '0GDGKpJFhVpcjIGF8N6Ewt',
 '7yYa2im7sawSzuVkXx8W21',
 '6sHCvZe1PHrOAuYlwTLNH4',
 '6LE9lW3E48cGM8tk5UGv30',
 '6FfjnGXMhxSsJTuGLWBDth',
 '6fb3I3Q54izgnOMtiZbOBA',
 '7nKz8GVqHk0bUGmBm6wm3E',
 '7rqJQQxuUOCk052MK5kLsH',
 '3ALVPmg5sZexSVD2m9atEt',
 '7o6cOczXTB8ioTAAJTbESf',
 '1ejkQAcOu9cl7kEbZ3Nb8b',
 '3MZsBdqDrRTJihTHQrO6Dq',
 '6XyY86QOPPrYVGvF9ch6wz',
 '4G9wSdX0klmoHfjm9i6DLd',
 '4qY6XGFQwZubu0oKBJeVki',
 '0LCI5aIo6Wd80D1AzHEu0F',
 '0P9pI1DLcVTkobNcZ7Tb1N',
 '27e4QBDvN4daYHHokUpWZY',
 '6Ai0kQ1MZABOQLVZNGozBB',
 '6MwPCCR936cYfM1dLsGVnl',
 '5BIBb9b6B9bKsebZFtgIVB',
 '77SW9BnxLY8rJ0RciFqkHh',
 '2kxP07DLgs4xlWz8YHlvfh',
 '6jqZZA8CPEkhFaVjBJ4Ctl',
 

### Insert into DBs

In [None]:
def release_tracks(release_id: str) -> list:
    """Collect all tracks of a release through ``spotify_api``.

    The ``album_tracks`` response is paginated; pages are followed with
    ``spotify_api.next`` until a page has no ``next`` link.

    Args:
        release_id: Identifier passed to ``spotify_api.album_tracks``.

    Returns:
        A list of dicts with the keys ``track_number``, ``name`` and
        ``duration`` (copied from the item's ``duration_ms`` field), in the
        order the items were returned.
    """
    page = spotify_api.album_tracks(release_id)
    tracks = []

    while True:
        tracks.extend(
            {
                "track_number": item["track_number"],
                "name": item["name"],
                "duration": item["duration_ms"],
            }
            for item in page["items"]
        )

        if not page["next"]:
            break
        page = spotify_api.next(page)

    return tracks

def artist_releases(artist_id: str) -> list:
    """Collect the albums of an artist, including their tracks.

    Requests ``spotify_api.artist_albums`` with ``album_type="album"`` and
    follows every result page. Albums that list more than one artist are
    skipped.

    Args:
        artist_id: Identifier passed to ``spotify_api.artist_albums``.

    Returns:
        A list of dicts with the keys ``id``, ``name``, ``release_date``,
        ``tracks`` (the result of ``release_tracks``) and ``ratings`` (always
        an empty list).
    """
    page = spotify_api.artist_albums(artist_id, album_type="album")
    releases = []

    while True:
        for album in page["items"]:
            # Only albums credited to a single artist are kept.
            if len(album["artists"]) <= 1:
                releases.append({
                    "id": album["id"],
                    "name": album["name"],
                    "release_date": album["release_date"],
                    "tracks": release_tracks(album["id"]),
                    "ratings": [],
                })

        if not page["next"]:
            break
        page = spotify_api.next(page)

    return releases

# Import every artist from `artist_ids` into MongoDB (full document) and Neo4j
# (artist, genre and release nodes plus their relationships).
for artist_id in artist_ids:
    # Skip artists stored by an earlier run, as the top-artists import does.
    # Without this check a rerun aborts on the first artist with a
    # DuplicateKeyError, after having fetched all of its releases again.
    artist_in_db = mongodb_db.artists.count_documents({"_id": artist_id}, limit = 1) > 0
    if artist_in_db:
        continue

    response = spotify_api.artist(artist_id)

    artist = dict()
    artist["_id"] = artist_id
    artist["name"] = response["name"]
    artist["genres"] = response["genres"]
    # Popularity is stored on the Neo4j node only, not in the MongoDB document.
    artist_popularity = response["popularity"]
    # Placeholder text from Faker.
    artist["bio"] = fake.paragraph(nb_sentences=25)
    artist["qt_followers"] = 0
    artist["releases"] = artist_releases(artist_id)

    # Artists without any usable release are not stored at all.
    if len(artist["releases"]) <= 0:
        continue

    mongodb_db.artists.insert_one(artist)

    neo4j_db.execute_query(
        """
        MERGE (a:Artist {id: $id})
        ON CREATE SET a.popularity = $popularity
        """,
        id = artist["_id"],
        popularity = artist_popularity,
    )

    for genre in artist["genres"]:
        neo4j_db.execute_query(
            """
            MATCH (a:Artist {id: $artistId})
            MERGE (g:Genre {name: $name})
            MERGE (a)-[:BELONGS_TO]->(g)
            """,
            artistId = artist["_id"],
            name = genre,
        )

    for release in artist["releases"]:
        neo4j_db.execute_query(
            """
            MATCH (a:Artist {id: $artistId})
            MERGE (r:Release {id: $id})
            MERGE (a)-[:RELEASED]->(r)
            """,
            artistId = artist["_id"],
            id = release["id"],
        )

## Insert Top Artists

### Retrieve Artists

In [None]:
# Artist names to look up on Spotify: the general list followed by the
# Brazilian list. The encoding is given explicitly because the names contain
# non-ASCII characters (e.g. "Beyoncé") and the platform default encoding is
# not UTF-8 everywhere.
with open("data/top_artists.json", "r", encoding="utf-8") as f:
    artist_names = json.load(f)
with open("data/top_artists_br.json", "r", encoding="utf-8") as f:
    artist_names.extend(json.load(f))

artist_names

['Bruno Mars',
 'The Weeknd',
 'Lady Gaga',
 'Ed Sheeran',
 'Billie Eilish',
 'Coldplay',
 'Rihanna',
 'Taylor Swift',
 'Bad Bunny',
 'Justin Bieber',
 'Kendrick Lamar',
 'Drake',
 'David Guetta',
 'Ariana Grande',
 'Calvin Harris',
 'SZA',
 'Sabrina Carpenter',
 'Maroon 5',
 'J Balvin',
 'Dua Lipa',
 'Post Malone',
 'Katy Perry',
 'Shakira',
 'Pitbull',
 'Eminem',
 'Sia',
 'Travis Scott',
 'Kanye West',
 'Chris Brown',
 'Lana Del Rey',
 'Miley Cyrus',
 'Black Eyed Peas',
 'Beyoncé',
 'Imagine Dragons',
 'Benson Boone',
 'Tate McRae',
 'KAROL G',
 'Daddy Yankee',
 'Marshmello',
 'Arctic Monkeys',
 'Future',
 'Alex Warren',
 'Adele',
 'OneRepublic',
 'Doja Cat',
 'Linkin Park',
 'Teddy Swims',
 'Khalid',
 'Sam Smith',
 'Lil Wayne',
 'Queen',
 'Rauw Alejandro',
 'The Chainsmokers',
 'Halsey',
 'Harry Styles',
 'Sean Paul',
 'Playboi Carti',
 'Elton John',
 'Michael Jackson',
 'Arijit Singh',
 'Kesha',
 'Nicki Minaj',
 'Camila Cabello',
 'Hozier',
 'Olivia Rodrigo',
 'Selena Gomez',
 'som

### Insert into DBs

In [None]:
def release_tracks(release_id: str) -> list:
    """Collect all tracks of a release through ``spotify_api``.

    Same helper as the one defined in the "Insert Ryan's Artists" section;
    this cell defines it again under the same name.

    Args:
        release_id: Identifier passed to ``spotify_api.album_tracks``.

    Returns:
        A list of dicts with the keys ``track_number``, ``name`` and
        ``duration`` (copied from the item's ``duration_ms`` field), in the
        order the items were returned.
    """
    page = spotify_api.album_tracks(release_id)
    tracks = []

    # Consume the current page, then move on while a ``next`` link exists.
    while True:
        tracks.extend(
            {
                "track_number": item["track_number"],
                "name": item["name"],
                "duration": item["duration_ms"],
            }
            for item in page["items"]
        )

        if not page["next"]:
            break
        page = spotify_api.next(page)

    return tracks

def artist_releases(artist_id: str) -> list:
    """Collect the albums of an artist, including their tracks.

    Same helper as the one defined in the "Insert Ryan's Artists" section;
    this cell defines it again under the same name. Albums that list more
    than one artist are skipped.

    Args:
        artist_id: Identifier passed to ``spotify_api.artist_albums``.

    Returns:
        A list of dicts with the keys ``id``, ``name``, ``release_date``,
        ``tracks`` (the result of ``release_tracks``) and ``ratings`` (always
        an empty list).
    """
    page = spotify_api.artist_albums(artist_id, album_type="album")
    releases = []

    while True:
        for album in page["items"]:
            # Only albums credited to a single artist are kept.
            if len(album["artists"]) <= 1:
                releases.append({
                    "id": album["id"],
                    "name": album["name"],
                    "release_date": album["release_date"],
                    "tracks": release_tracks(album["id"]),
                    "ratings": [],
                })

        if not page["next"]:
            break
        page = spotify_api.next(page)

    return releases

# Resolve every name in `artist_names` to an artist via search, then import
# that artist into MongoDB and Neo4j unless it is already stored.
for artist_name in artist_names:
    response = spotify_api.search(
        q = artist_name,
        type = "artist",
        offset = 0,
    )

    # A search can come back empty; indexing [0] blindly would abort the
    # whole import with an IndexError.
    matches = response["artists"]["items"]
    if not matches:
        print(f"No artist found for {artist_name!r}, skipping")
        continue
    # The first search hit is taken as the artist meant by the name.
    artist_id = matches[0]["id"]

    artist_in_db = mongodb_db.artists.count_documents({"_id": artist_id}, limit = 1) > 0
    if artist_in_db:
        continue

    response = spotify_api.artist(artist_id)

    artist = dict()
    artist["_id"] = artist_id
    artist["name"] = response["name"]
    artist["genres"] = response["genres"]
    # Popularity is stored on the Neo4j node only, not in the MongoDB document.
    artist_popularity = response["popularity"]
    # Placeholder text from Faker.
    artist["bio"] = fake.paragraph(nb_sentences=25)
    artist["qt_followers"] = 0
    artist["releases"] = artist_releases(artist_id)

    # Artists without any usable release are not stored at all.
    if len(artist["releases"]) <= 0:
        continue

    mongodb_db.artists.insert_one(artist)

    neo4j_db.execute_query(
        """
        MERGE (a:Artist {id: $id})
        ON CREATE SET a.popularity = $popularity
        """,
        id = artist["_id"],
        popularity = artist_popularity,
    )

    for genre in artist["genres"]:
        neo4j_db.execute_query(
            """
            MATCH (a:Artist {id: $artistId})
            MERGE (g:Genre {name: $name})
            MERGE (a)-[:BELONGS_TO]->(g)
            """,
            artistId = artist["_id"],
            name = genre,
        )

    for release in artist["releases"]:
        neo4j_db.execute_query(
            """
            MATCH (a:Artist {id: $artistId})
            MERGE (r:Release {id: $id})
            MERGE (a)-[:RELEASED]->(r)
            """,
            artistId = artist["_id"],
            id = release["id"],
        )

## Change Bios

### Generate Bios with Gemini

In [None]:
# Artists whose bio has not been rewritten yet (no "updated" marker).
# The result is materialised up front, with only the fields the loop needs:
# the loop sleeps 10 s per artist, and a server-side cursor left idle for that
# long between batches can be closed by the server before it is exhausted.
artists_pending = list(
    mongodb_db.artists.find(
        {
            "updated": {
                "$exists": False,
            },
        },
        {
            "name": True,
        },
    )
)
updated_count = 0

for artist in artists_pending:
    try:
        bio = gemini.send_message(
            f"Give me a bio for the music artist {artist['name']}. Respond with only a paragraph-long bio.",
        ).text.strip()

        mongodb_db.artists.update_one(
            {
                "_id": artist["_id"],
            },
            {
                "$set": {
                    "bio": bio,
                    # Marker that lets an interrupted run resume where it stopped.
                    "updated": True,
                }
            }
        )

        updated_count += 1
        # Pause between requests to stay under the API rate limit.
        time.sleep(10)
    except genai.errors.ClientError as e:
        # Any client error ends the run; 429 (rate limit) is expected and
        # therefore not printed.
        if not e.code == 429:
            print(e)
        break

# Overall progress: total artists vs. artists carrying the "updated" marker.
progress = list(
    mongodb_db.artists.aggregate([
        {
            "$group": {
                "_id": None,
                "total": {
                    "$sum": 1,
                },
                "updated": {
                    "$sum": {
                        "$cond": [
                            {
                                "$ifNull": [
                                    "$updated",
                                    False,
                                ],
                            },
                            1,
                            0,
                        ],
                    },
                },
            },
        },
    ])
)
# $group yields no document at all for an empty collection.
result = progress[0] if progress else {"total": 0, "updated": 0}

print(f"Progress: {result['updated']}/{result['total']}")
print(f"Bios updated: {updated_count}")

### Remove `updated` Property

In [None]:
# Drop the "updated" marker from every artist document that carries it.
marked_artists = {"updated": {"$exists": True}}
drop_marker = {"$unset": {"updated": None}}

result = mongodb_db.artists.update_many(marked_artists, drop_marker)

print(f"Removed from {result.modified_count} artists")

# Users

## Create Entities

### MongoDB

In [None]:
# Create the collection explicitly; an already existing collection is only
# reported, not treated as fatal.
try:
    mongodb_db.create_collection("users")
except pymongo.errors.CollectionInvalid as e:
    print(e)

# The unique index is requested unconditionally: asking for an index that
# already exists with the same definition is a no-op, whereas keeping this call
# inside the try block meant it was skipped whenever the collection already
# existed. The user-generation cell relies on this index to reject duplicates.
mongodb_db.users.create_index("username", unique=True)

### Neo4j

In [None]:
# Usernames must be unique on the graph side as well.
user_constraint = "CREATE CONSTRAINT FOR (u:User) REQUIRE u.username IS UNIQUE"

try:
    neo4j_db.execute_query(user_constraint)
except neo4j.exceptions.ClientError as error:
    # Report the server's message instead of failing the cell.
    print(error.message)

## Insert into DBs

### Randoms

In [None]:
# Number of generation attempts; attempts whose username already exists are
# counted as failures instead of being retried.
QT_TRIES = 1_000
qt_fails = 0

for _ in range(QT_TRIES):
    first_name = fake.first_name()
    last_name = fake.last_name()
    number = random.randint(0, 99)

    user = {
        "username": f"{first_name.lower()}_{last_name.lower()}{number:02}",
        "password": fake.sha256(),
    }
    # Roughly 75% of users get a display name; about half of those get a bio.
    if random.random() < 0.75:
        user["name"] = f"{first_name} {last_name}"
        if random.random() < 0.5:
            user["bio"] = fake.paragraph(nb_sentences=10)
    user["friends"] = []
    user["artists_followed"] = []
    user["ratings"] = []

    try:
        mongodb_db.users.insert_one(user)
    except pymongo.errors.DuplicateKeyError:
        # Username collision: nothing is written to Neo4j for this attempt.
        qt_fails += 1
    else:
        neo4j_db.execute_query(
            """
            MERGE (u:User {username: $username})
            """,
            username = user["username"],
        )

print(f"Finished with {QT_TRIES - qt_fails} users inserted.")

### Personalized

In [None]:
# Pick four user documents (whichever the query returns first) and remove them
# from both databases.
users_to_delete = list(mongodb_db.users.find({}, {"username": 1}).limit(4))

usernames = []
for doomed_user in users_to_delete:
    usernames.append(doomed_user["username"])

print(f"Users to delete: {usernames}")

username_filter = {"username": {"$in": usernames}}
mongo_result = mongodb_db.users.delete_many(username_filter)

print(f"Deleted {mongo_result.deleted_count} documents from MongoDB")

# DETACH DELETE removes the nodes together with their relationships.
neo4j_result = neo4j_db.execute_query(
    """
    MATCH (u:User)
    WHERE u.username IN $usernames
    DETACH DELETE u
    """,
    usernames=usernames
)

deleted_nodes = neo4j_result.summary.counters.nodes_deleted
print(f"Deleted {deleted_nodes} nodes from Neo4j")

In [None]:
# Read the hand-written user profiles. The encoding is explicit so names and
# bios with non-ASCII characters decode the same on every platform, and the
# file is closed before any database work starts.
with open("data/personalized_users.json", "r", encoding="utf-8") as f:
    personalized_users = json.load(f)

for user in personalized_users:
    document = dict()
    document["username"] = user["username"]
    document["password"] = fake.sha256()
    document["name"] = user["name"]
    document["bio"] = user["bio"]
    document["friends"] = []
    document["artists_followed"] = []
    document["ratings"] = []

    try:
        mongodb_db.users.insert_one(document)
    except pymongo.errors.DuplicateKeyError:
        # Rerunning the cell must not abort on the first existing user; this
        # matches how the random-user cell treats username collisions.
        print(f"User {document['username']} already exists in MongoDB, not inserted again")

    # MERGE creates the node only when it is missing, so it is safe to run
    # even when the MongoDB insert was skipped.
    neo4j_db.execute_query(
        """
        MERGE (u:User {username: $username})
        """,
        username = document["username"],
    )

## Change Bios

### Generate Bios with Gemini

In [None]:
# Usernames left out of the bio rewrite and of the progress count.
# NOTE(review): presumably the personalized accounts - confirm.
BIO_EXCLUDED_USERNAMES = ["sahudy", "ryansakurai", "viniciuscastro", "caike_sant0s"]

# Users that have a bio which has not been rewritten yet (no "updated" marker).
# The result is materialised up front, with only the fields the loop needs:
# the loop sleeps 5 s per user on top of the API latency, and a server-side
# cursor left idle for too long between batches can be closed by the server.
users_pending = list(
    mongodb_db.users.find(
        {
            "bio": {
                "$exists": True,
            },
            "updated": {
                "$exists": False,
            },
            "username": {
                "$nin": BIO_EXCLUDED_USERNAMES,
            },
        },
        {
            "username": True,
            "name": True,
        },
    )
)

updated_count = 0

for user in users_pending:
    try:
        bio = gemini.send_message(
            f"Create a random informal bio in first person for a person called {user['name']}. Respond with only the bio",
        ).text.strip()

        mongodb_db.users.update_one(
            {
                "username": user["username"],
            },
            {
                "$set": {
                    "bio": bio,
                    # Marker that lets an interrupted run resume where it stopped.
                    "updated": True,
                }
            },
        )

        updated_count += 1
        # Pause between requests to stay under the API rate limit.
        time.sleep(5)
    except genai.errors.ClientError as e:
        # Any client error ends the run; 429 (rate limit) is expected and
        # therefore not printed.
        if not e.code == 429:
            print(e)
        break

# Overall progress among the users eligible for a rewrite.
progress = list(
    mongodb_db.users.aggregate([
        {
            "$match": {
                "bio": {
                    "$exists": True,
                },
                "username": {
                    "$nin": BIO_EXCLUDED_USERNAMES,
                },
            },
        },
        {
            "$group": {
                "_id": None,
                "total": {
                    "$sum": 1,
                },
                "updated": {
                    "$sum": {
                        "$cond": [
                            {
                                "$ifNull": [
                                    "$updated",
                                    False,
                                ],
                            },
                            1,
                            0,
                        ],
                    },
                },
            },
        },
    ])
)
# $group yields no document at all when nothing matches.
result = progress[0] if progress else {"total": 0, "updated": 0}

print(f"Progress: {result['updated']}/{result['total']}")
print(f"Bios updated: {updated_count}")

### Remove `updated` Property

In [None]:
# Drop the "updated" marker from every user document that carries it.
marked_users = {"updated": {"$exists": True}}
drop_marker = {"$unset": {"updated": None}}

result = mongodb_db.users.update_many(marked_users, drop_marker)

print(f"Removed from {result.modified_count} users")

# Interactions

## Pull entity identifiers

### Artists

In [None]:
# Only the identifiers are needed from MongoDB; popularity lives in Neo4j.
cursor = mongodb_db.artists.find({}, {"_id": True})

# Parallel lists: popularities[i] belongs to ids[i].
artists = {
    "ids": [],
    "popularities": [],
}

for artist_doc in cursor:
    artists["ids"].append(artist_doc["_id"])

    query_result = neo4j_db.execute_query(
        """
        MATCH (a:Artist {id: $id})
        RETURN a.popularity AS popularity
        """,
        id = artist_doc["_id"],
    )
    # Unpack the three-part result; only the record list is used.
    matched_nodes, _, _ = query_result
    artists["popularities"].append(matched_nodes[0]["popularity"])

artists

### Releases

In [None]:
# Artist ids together with the ids of their embedded releases.
cursor = mongodb_db.artists.find({}, {"_id": True, "releases.id": True})

# Parallel lists: entry i of each list describes the same release. A release
# takes the popularity of its artist.
releases = {
    "ids": [],
    "artists": [],
    "popularities": [],
}

for artist in cursor:
    records, _, _ = neo4j_db.execute_query(
        """
        MATCH (a:Artist {id: $id})
        RETURN a.popularity AS popularity
        """,
        id = artist["_id"],
    )

    for release_entry in artist["releases"]:
        release_id = release_entry["id"]
        owner_id = artist["_id"]
        owner_popularity = records[0]["popularity"]

        releases["ids"].append(release_id)
        releases["artists"].append(owner_id)
        releases["popularities"].append(owner_popularity)

releases

### Users

In [None]:
# All usernames stored in MongoDB; the projection drops every other field.
cursor = mongodb_db.users.find({}, {"_id": False, "username": True})

usernames = [user_doc["username"] for user_doc in cursor]

usernames

## API call functions

In [None]:
def get_follows(username: str) -> List[str]:
    """Fetch what a user follows from the local API.

    Sends ``GET /v1/users/{username}/follows`` to ``127.0.0.1:5000`` and
    parses the response body as JSON.

    Args:
        username: Inserted into the request path as is.

    Returns:
        The ``id`` of every entry in the response's ``items`` list, in
        response order.
    """
    url = f"http://127.0.0.1:5000/v1/users/{username}/follows"
    payload = json.loads(requests.get(url).content)

    followed_ids = []
    for entry in payload["items"]:
        followed_ids.append(entry["id"])
    return followed_ids

In [None]:
def _post_to_user(username: str, resource: str, payload: dict) -> Tuple[bool, dict]:
    """POST ``payload`` as JSON to ``/v1/users/{username}/{resource}`` on the local API.

    Returns:
        ``(succeeded, body)``. ``succeeded`` is True for 2xx status codes.
        ``body`` is the parsed JSON response; when the response body is empty
        or not JSON, a dict with the keys ``code`` (the HTTP status as text)
        and ``message`` (the raw body text) is returned instead, so callers
        can always report ``body['code']`` and ``body['message']``.
    """
    response = requests.post(
        f"http://127.0.0.1:5000/v1/users/{username}/{resource}",
        json = payload,
    )

    try:
        body = json.loads(response.content)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        body = {"code": str(response.status_code), "message": response.text}

    return (
        200 <= response.status_code < 300,
        body,
    )

def create_friendship(username1: str, username2: str) -> Tuple[bool, dict]:
    """Ask the API to add ``username2`` to the friends of ``username1``.

    Returns:
        ``(succeeded, body)`` as described in ``_post_to_user``.
    """
    return _post_to_user(username1, "friends", {"username": username2})

def follow_artist(username: str, artist_id: str) -> Tuple[bool, dict]:
    """Ask the API to make ``username`` follow the artist ``artist_id``.

    Returns:
        ``(succeeded, body)`` as described in ``_post_to_user``.
    """
    return _post_to_user(username, "follows", {"id": artist_id})

def rate_release(username: str, release_id: str, rating: int) -> Tuple[bool, dict]:
    """Ask the API to store ``rating`` by ``username`` for the release ``release_id``.

    Returns:
        ``(succeeded, body)`` as described in ``_post_to_user``.
    """
    return _post_to_user(username, "ratings", {"id": release_id, "rating": rating})

## Friendships

In [None]:
# Give every user between 10 and 40 random friends through the API. Users
# already carrying the "friends_pop" marker are skipped, so the cell can be
# rerun after an interruption.
for username in usernames:
    user = mongodb_db.users.find_one({"username": username}, {"friends_pop": True})
    if "friends_pop" in user:
        continue

    # Draw distinct friends directly, excluding the user itself. The previous
    # retry-until-unique loop around random.choices could never finish when
    # fewer other users exist than the drawn target.
    candidates = [other for other in usernames if other != username]
    target_qt = min(random.randint(10, 40), len(candidates))
    friends = random.sample(candidates, k=target_qt)

    for friend in friends:
        print(f"{username} ←→ {friend}", end=" ")

        succeeded, response = create_friendship(username, friend)
        if not succeeded:
            if not response["code"] == "FriendshipAlreadyExists":
                # Unexpected failure: report it and stop with this user.
                print("\033[91mX\033[0m")
                print()
                print(f"{response['code']}: {response['message']}")
                break
            # The friendship already exists (e.g. created from the other side).
            print("\033[93mO\033[0m")
            continue

        print("\033[92m✓\033[0m")

    # NOTE(review): the marker is also set after an unexpected failure above,
    # so such a user is not retried on the next run - confirm that is intended.
    mongodb_db.users.update_one(
        {
            "username": username,
        },
        {
            "$set": {
                "friends_pop": True,
            },
        },
    )

## Follows

In [None]:
# Number of artists that can actually be drawn (positive weight). Drawing more
# distinct artists than this is impossible, and the retry loop below would
# never terminate.
selectable_artists = sum(1 for weight in artists["popularities"] if weight > 0)

# Make every user follow between 10 and 30 artists, drawn with a probability
# proportional to the artist's popularity. Users already carrying the
# "follows_pop" marker are skipped.
for username in usernames:
    user = mongodb_db.users.find_one({"username": username}, {"follows_pop": True})
    if "follows_pop" in user:
        continue

    target_qt = min(random.randint(10, 30), selectable_artists)
    artist_ids = []

    # random.choices draws with replacement; redraw until all picks differ.
    while len(set(artist_ids)) != target_qt:
        artist_ids = random.choices(
            artists["ids"],
            weights = artists["popularities"],
            k = target_qt,
        )

    for artist_id in artist_ids:
        print(f"{username} → {artist_id}")

        succeeded, response = follow_artist(username, artist_id)
        if not succeeded:
            print("\033[91mX\033[0m")
            print(f"{response['code']}: {response['message']}")
            break

        print("\033[92m✓\033[0m")

    # NOTE(review): the marker is also set after a failure above, so such a
    # user is not retried on the next run - confirm that is intended.
    mongodb_db.users.update_one(
        {
            "username": username,
        },
        {
            "$set": {
                "follows_pop": True,
            },
        },
    )

## Ratings

In [None]:
# Make every user rate between 30 and 60 releases. Releases are drawn with a
# probability proportional to their artist's popularity, doubled when the user
# follows that artist. Users already carrying the "ratings_pop" marker are
# skipped.
for username in usernames:
    user = mongodb_db.users.find_one(
        {
            "username": username,
        },
        {
            "ratings_pop": True,
        }
    )
    if "ratings_pop" in user:
        continue

    # A set keeps the membership test below constant-time per release.
    follows = set(get_follows(username))

    weights = [
        popularity * 2 if release_artist in follows else popularity
        for release_artist, popularity in zip(releases["artists"], releases["popularities"])
    ]

    # Never ask for more distinct releases than can actually be drawn;
    # otherwise the retry loop below would never terminate.
    selectable_releases = sum(1 for weight in weights if weight > 0)
    target_qt = min(random.randint(30, 60), selectable_releases)

    release_ids = []
    # random.choices draws with replacement; redraw until all picks differ.
    while len(set(release_ids)) != target_qt:
        release_ids = random.choices(
            releases["ids"],
            weights = weights,
            k = target_qt,
        )

    # Ratings: normal distribution (mean 5, std 3), clipped to 0..10 and
    # rounded to integers.
    ratings = np.random.normal(5, 3, target_qt)
    ratings = np.clip(ratings, 0, 10)
    ratings = np.round(ratings).astype(int).tolist()

    for release_id, rating in zip(release_ids, ratings):
        print(f"{username} → {release_id}")

        succeeded, response = rate_release(username, release_id, rating)
        if not succeeded:
            print("\033[91mX\033[0m")
            print(f"{response['code']}: {response['message']}")
            break

        print("\033[92m✓\033[0m")

    # NOTE(review): the marker is also set after a failure above, so such a
    # user is not retried on the next run - confirm that is intended.
    mongodb_db.users.update_one(
        {
            "username": username,
        },
        {
            "$set": {
                "ratings_pop": True,
            },
        },
    )

# Close Connections

In [None]:
# Release both database connections, MongoDB first.
for connection in (mongo_client, neo4j_db):
    connection.close()