In [6]:
# Generate Synthetic Social Mentions (Correlated with Popularity)

import numpy as np
import pandas as pd
import random
from datetime import timedelta

# Seed both random number generators used by this script (NumPy's global
# RNG first, then the stdlib `random` module) so the synthetic posts are
# reproducible from run to run.
RANDOM_SEED = 42
for _seed_rng in (np.random.seed, random.seed):
    _seed_rng(RANDOM_SEED)

# 1) Load the merged dataset
# The path is relative to the working directory the notebook runs from.
merged_csv_path = "../data/merged.csv"
merged = pd.read_csv(merged_csv_path)
print("Merged dataset loaded: {}".format(merged.shape))

# 2) Define base parameters
# Platforms a synthetic post can be attributed to. The order matters: the
# generation loop samples from this list with a positional weight vector
# (p=[0.4, 0.2, 0.2, 0.15, 0.05]), so reordering or resizing this list
# requires updating those weights as well.
sources = ["X", "Instagram", "Facebook", "YouTube", "TikTok"]
# --- Expanded text templates for realistic social posts ---

# Templates for posts labelled "positive". Each entry is a str.format
# template filled with the keyword arguments `artist` and `song`; a template
# may use either placeholder, both, or (in the other lists) neither.
# One template is picked uniformly at random per generated post.
positive_templates = [
    "Absolutely obsessed with {song} by {artist}!!! ❤️🔥🔥🔥",
    "{artist} just dropped pure magic — {song} is INSANE!!! 😍🔥",
    "Can’t stop blasting {song}! {artist} did it again!!! 💯🔥",
    "{artist}'s {song} = perfection. I’m speechless 😭❤️",
    "{song} deserves all the awards 🏆🔥",
    "WOW!!! {artist} nailed it with {song}!!! 😍🔥🔥🔥",
    "{artist} outdid themselves — {song} is an instant classic 💎🔥",
    "Every second of {song} gives me chills 😍🙌🔥",
    "BANGER ALERT 🚨 {song} by {artist} is on repeat all day!!! 💥🔥",
    "GOAT behavior from {artist} 😍🔥 {song} is unreal!!!",
    "This {song} by {artist} just healed my soul 😭❤️🔥",
    "Best thing I’ve heard all year! {artist} really snapped with {song} 🔥🔥🔥",
    "Can we talk about how PERFECT {song} by {artist} is?! 😍💯",
    "Still can’t get over {song} — {artist} really did THAT 🔥🔥🔥",
    "Pure serotonin in a song 💫 {artist}'s {song} is magical ✨",
    "{song} by {artist} is everything I needed today ❤️🔥🔥",
    "Unreal vocals, insane beat — {song} is a masterpiece!!! 💥💯🔥",
    "I’m in LOVE with {song} 😍🔥 {artist} never misses!!!",
    "This one’s a straight 10/10 — {artist} blessed us with {song} 🙌🔥",
    "Repeat. Repeat. Repeat. {song} by {artist} = perfection ❤️🔥",
    "If happiness had a sound, it’d be {song} by {artist} 😍💫🔥",
    "{artist} raised the bar to the sky with {song} 🚀🔥🔥🔥",
    "How can {artist} keep making hits like {song}?! Unreal!!! 😍🔥🔥",
    "{song} = pure joy 🥹🔥 Thank you {artist}!!!",
]

# Templates for posts labelled "neutral". Filled with str.format using the
# keyword arguments `artist` and `song`. Several entries reference only one
# of the two placeholders, and "it exists. that’s all" references neither,
# so the resulting text may not mention the artist or the song at all.
neutral_templates = [
    "{song} by {artist}. nothing more to say",
    "heard {song} by {artist} once",
    "just another track from {artist}",
    "played {song} in the background",
    "it exists. that’s all",
    "{artist} released {song}. no thoughts",
    "no reaction to {song}",
    "heard {song}. moved on",
    "{song} is out there. sure",
    "{artist} made {song}. that’s it",
    "{song} didn’t catch my attention",
    "saw {artist}'s {song} mentioned somewhere",
    "not paying much attention to {song}",
    "it’s just another release by {artist}",
    "{song} came out. whatever",
]

# Templates for posts labelled "negative" (the fall-through case when the
# sampled mood is neither "positive" nor "neutral"). Filled with str.format
# using the keyword arguments `artist` and `song`; some entries use only one
# of the two placeholders.
negative_templates = [
    "Honestly, {song} by {artist} didn’t live up to the hype 😕",
    "Not feeling {artist}'s new track {song} at all.",
    "I expected better from {artist}. {song} is disappointing.",
    "Why is {song} trending? {artist} has done better before.",
    "{artist} really missed with {song}.",
    "{artist}'s {song} feels rushed and uninspired.",
    "Can’t believe how bad {song} turned out 😬",
    "This might be {artist}'s weakest song yet.",
    "{song} by {artist} just isn’t hitting the way it should.",
    "Skip {song} — {artist} dropped the ball this time.",
    "The lyrics on {song} are a mess, sorry {artist}.",
    "Did anyone else find {artist}'s {song} super disappointing?",
    "I wanted to like {song}, but it’s just not good 😔",
    "Not sure what {artist} was going for with {song}, but it didn’t work.",
    "{song} was overhyped. Nothing special from {artist}.",
    "Ugh, {artist}'s {song} is awful 😩 skip it.",
    "This new {song} from {artist} is straight garbage 💀",
    "{artist} seriously flopped with {song} 🤦‍♂️",
    "Can’t believe {song} got released — total fail 😬",
    "Worst thing {artist} has put out in years 😭",
    "No way people actually like {song} 😒",
    "The production on {song} is trash. Sorry {artist}.",
    "{artist}'s {song} hurt my ears 😵‍💫",
    "This {song} by {artist} ruined my mood 💔",
    "Total disaster from {artist} — {song} is unlistenable 🚫",
]

# 3) Generate synthetic posts correlated with popularity
#
# For every row of `merged` (one artist/song pair per row) a random number of
# posts is produced. Popularity drives three things: how many posts the row
# gets, how likely each post is to be positive, and how many mentions each
# post carries. The order of the random draws below is significant: changing
# it changes the generated dataset even with the same seeds.

# Loop-invariant lookup tables.
MOOD_LABELS = ["positive", "neutral", "negative"]
# Positional weights for `sources`.
SOURCE_WEIGHTS = [0.4, 0.2, 0.2, 0.15, 0.05]
# (minimum popularity, [P(positive), P(neutral), P(negative)]), checked from
# the highest threshold down; the first matching row wins.
MOOD_WEIGHTS_BY_MIN_POPULARITY = [
    (80, [0.8, 0.15, 0.05]),
    (50, [0.6, 0.25, 0.15]),
    (30, [0.4, 0.3, 0.3]),
]
# Used when popularity is below every threshold above.
LOW_POPULARITY_MOOD_WEIGHTS = [0.25, 0.3, 0.45]
TEMPLATES_BY_MOOD = {
    "positive": positive_templates,
    "neutral": neutral_templates,
    "negative": negative_templates,
}
# Post dates are drawn from the 90 days starting here (offsets 0..89).
FIRST_POST_DAY = pd.Timestamp("2020-01-01")

rows = []
for _, record in merged.iterrows():
    artist_name = record["artist_display"]
    song_title = record["song_display"]

    # Missing, NaN, or non-positive popularity falls back to a mid value.
    popularity = record.get("popularity_final", 50)
    if pd.isna(popularity) or popularity <= 0:
        popularity = 50

    # Sentiment probabilities scale with popularity
    mood_weights = next(
        (
            weights
            for min_popularity, weights in MOOD_WEIGHTS_BY_MIN_POPULARITY
            if popularity >= min_popularity
        ),
        LOW_POPULARITY_MOOD_WEIGHTS,
    )

    # Number of posts for this row (more popular → more posts): Poisson with
    # mean popularity / 20, but never fewer than one post.
    # e.g., popularity=80 → ~4 posts
    post_count = max(1, int(np.random.poisson(lam=popularity / 20)))

    for _post_index in range(post_count):
        mood = np.random.choice(MOOD_LABELS, p=mood_weights)
        source = np.random.choice(sources, p=SOURCE_WEIGHTS)

        template = random.choice(TEMPLATES_BY_MOOD[mood])
        text = template.format(artist=artist_name, song=song_title)

        day_offset = np.random.randint(0, 90)
        date = FIRST_POST_DAY + timedelta(days=day_offset)

        # Mentions are normally distributed around 10x popularity (std 100),
        # truncated to an int and floored at 10.
        sampled_mentions = np.random.normal(500 * (popularity / 50), 100)
        mentions = max(10, int(sampled_mentions))

        rows.append([
            artist_name,
            record["artist_norm"],
            song_title,
            date,
            text,
            mood,
            mentions,
            source,
        ])


# 4) Create DataFrame and export
# Column order must match the order of the values appended to `rows`.
SENTIMENT_COLUMNS = [
    "artist_display",
    "artist_norm",
    "song_display",
    "date",
    "text",
    "mood_label",
    "mentions",
    "source",
]
sentiment_raw = pd.DataFrame(rows, columns=SENTIMENT_COLUMNS)

post_total = len(sentiment_raw)
artist_total = sentiment_raw["artist_norm"].nunique()
print(f"Generated {post_total:,} synthetic social posts for {artist_total:,} artists.")

# Written without the index so the CSV contains only the columns above.
sentiment_raw.to_csv("../data/sentiment.csv", index=False)
print("Exported synthetic posts to ../data/sentiment.csv")

# Show a random sample as a quick sanity check (notebook-only `display`).
display(sentiment_raw.sample(5))

Merged dataset loaded: (29488, 27)
Generated 68,625 synthetic social posts for 9,989 artists.
Exported synthetic posts to ../data/sentiment.csv


Unnamed: 0,artist_display,artist_norm,song_display,date,text,mood_label,mentions,source
55534,Taylor Dayne,taylor dayne,Tell It To My Heart(Dub),2020-01-29,not paying much attention to Tell It To My Hea...,neutral,531,X
56792,Marvin Gaye,marvin gaye,The End Of Our Road,2020-03-13,This new The End Of Our Road from Marvin Gaye ...,negative,50,Facebook
49538,Tommy Roe,tommy roe,Sheila,2020-01-14,Total disaster from Tommy Roe — Sheila is unli...,negative,374,Instagram
50083,Pitbull Featuring Akon,pitbull featuring akon,Shut It Down,2020-02-01,Pitbull Featuring Akon's Shut It Down = perfec...,positive,267,X
3117,Travis Scott,travis scott,Antidote,2020-01-15,Can we talk about how PERFECT Antidote by Trav...,positive,814,Facebook
