In [None]:
# ========================================
# NOTEBOOK 1: ADVANCED EDA + FEATURE ENGINEERING
# ========================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" look for every figure drawn below.
sns.set(style="whitegrid")

# ======================
# 1. Load data
# ======================
# Read the raw CSV (path is relative to the working directory) into the
# frame that all later sections operate on.
source_csv = "data_w_genres.csv"
df = pd.read_csv(source_csv)

# Structural overview: dimensions, column names and the first rows.
column_names = df.columns.tolist()
print("Shape:", df.shape)
print("Columns:", column_names)
print("\nHead:")
# NOTE(review): `display` is not imported in this file — presumably the
# IPython/Jupyter builtin; confirm before running as a plain script.
preview = df.head()
display(preview)

# Count NaNs per column and list the columns with the most gaps first.
missing_per_column = df.isna().sum()
print("\nMissing values:")
print(missing_per_column.sort_values(ascending=False))

# ======================
# 2. Genre extraction and basic cleaning
# ======================
def extract_main_genre(x):
    """Return the first genre named in a stringified genre list.

    The value is expected to look like the ``repr`` of a Python list, e.g.
    ``"['pop', 'rock']"``. It is parsed with ``ast.literal_eval`` so that
    names containing an apostrophe (which ``repr`` wraps in double quotes,
    e.g. ``["children's music"]``) come back intact instead of losing the
    apostrophe and keeping a stray quote character.

    Args:
        x: A cell value: a stringified list, a plain comma-separated
            string, an actual list/tuple, or a missing value.

    Returns:
        The first non-empty genre name as a string, or ``np.nan`` when the
        value is missing or contains no genre.
    """
    import ast  # local import: only this function needs it

    if isinstance(x, (list, tuple)):
        # Already a sequence; pd.isna() would return an array here and make
        # the truth test below ambiguous.
        candidates = list(x)
    else:
        if pd.isna(x):
            return np.nan
        text = str(x).strip()
        try:
            # literal_eval only evaluates literals, so it is safe on file data.
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            candidates = list(parsed)
        else:
            # Not a well-formed list literal: fall back to splitting on
            # commas and trimming brackets plus surrounding quotes.
            candidates = [
                piece.strip().strip("'\"")
                for piece in text.strip("[]").split(",")
            ]

    for candidate in candidates:
        name = str(candidate).strip()
        if name:
            return name
    return np.nan

# Derive one representative genre per row from the raw "genres" column.
df["main_genre"] = df["genres"].map(extract_main_genre)

# Frequency tables: the 20 most common values of each column.
genre_frequencies = df["main_genre"].value_counts()
print("\nTop 20 main genres:")
print(genre_frequencies.head(20))

# NOTE(review): if every row is a distinct artist these counts are all 1 —
# confirm against the dataset.
artist_frequencies = df["artists"].value_counts()
print("\nTop 20 artists:")
print(artist_frequencies.head(20))

# Popularity is the main target, so rows without it are dropped.
df = df.dropna(subset=["popularity"])

# ======================
# 3. Feature engineering
# ======================
# Conversion factor and the cut-offs above which a row gets flagged.
ms_per_minute = 60000.0
instrumental_cutoff = 0.8
acoustic_cutoff = 0.7

# Duration expressed in minutes instead of milliseconds.
df["duration_min"] = df["duration_ms"].div(ms_per_minute)

# 0/1 flags: strictly greater than the cut-off counts as 1.
df["is_instrumental"] = df["instrumentalness"].gt(instrumental_cutoff).astype(int)
df["is_acoustic"] = df["acousticness"].gt(acoustic_cutoff).astype(int)

# Integer code per distinct main genre; pandas assigns -1 to missing values.
genre_as_category = df["main_genre"].astype("category")
df["main_genre_encoded"] = genre_as_category.cat.codes

engineered_cols = [
    "duration_min",
    "is_instrumental",
    "is_acoustic",
    "main_genre_encoded",
]
print("\nEngineered features added:", engineered_cols)

# ======================
# 4. Distributions of main numeric features
# ======================
# Candidate columns; any name absent from the frame is skipped.
numeric_candidates = (
    "acousticness",
    "danceability",
    "duration_ms",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
    "valence",
    "popularity",
    "key",
    "mode",
    "count",
    "duration_min",
)
numeric_cols = [name for name in numeric_candidates if name in df.columns]


def _plot_histogram(values, label):
    """Draw and show a 50-bin histogram with a KDE overlay for one column."""
    plt.figure(figsize=(8, 4))
    sns.histplot(values, bins=50, kde=True)
    plt.title(f"Distribution of {label}")
    plt.xlabel(label)
    plt.tight_layout()
    plt.show()


# One figure per numeric column, ignoring missing values.
for col in numeric_cols:
    _plot_histogram(df[col].dropna(), col)

# Binary engineered features: bar chart of how often each value occurs.
for col in ("is_instrumental", "is_acoustic"):
    if col not in df.columns:
        continue
    flag_counts = df[col].value_counts()
    plt.figure(figsize=(4, 4))
    flag_counts.plot(kind="bar")
    plt.title(f"Counts of {col}")
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()

# ======================
# 5. Correlation analysis
# ======================
# Pairwise correlations between the numeric columns (DataFrame.corr defaults).
corr = df[numeric_cols].corr()

# Heatmap of the full matrix; cell values are not annotated.
plt.figure(figsize=(12, 10))
sns.heatmap(corr, cmap="coolwarm", annot=False)
plt.title("Correlation matrix of numeric features")
plt.tight_layout()
plt.show()

# Correlation with popularity, strongest positive first. The list includes
# popularity's correlation with itself.
if "popularity" in df.columns:
    popularity_column = corr["popularity"]
    pop_corr = popularity_column.sort_values(ascending=False)
    print("\nCorrelation of each feature with popularity:")
    print(pop_corr)

# ======================
# 6. Popularity by genre and artist
# ======================
def _mean_popularity_of_most_frequent(frame, column, how_many):
    """Return the most frequent values of a column and their mean popularity.

    The first element is the index of the `how_many` most frequent values;
    the second is the mean popularity per value, sorted ascending.
    """
    frequent_values = frame[column].value_counts().head(how_many).index
    matching_rows = frame[frame[column].isin(frequent_values)]
    averages = matching_rows.groupby(column)["popularity"].mean().sort_values()
    return frequent_values, averages


def _plot_average_popularity(averages, title):
    """Draw and show a horizontal bar chart of average popularity."""
    plt.figure(figsize=(10, 6))
    averages.plot(kind="barh")
    plt.title(title)
    plt.xlabel("Average popularity")
    plt.tight_layout()
    plt.show()


# The ten most frequent main genres.
top_genres, genre_popularity = _mean_popularity_of_most_frequent(
    df, "main_genre", 10
)
_plot_average_popularity(
    genre_popularity, "Average popularity by main genre (top 10)"
)

# The fifteen artists with the most rows.
# NOTE(review): if every row is a distinct artist, all counts tie at 1 and
# this selection is arbitrary — confirm against the dataset.
top_artists, artist_popularity = _mean_popularity_of_most_frequent(
    df, "artists", 15
)
_plot_average_popularity(
    artist_popularity, "Average popularity by artist (top 15 by count)"
)

# ======================
# 7. Relationships with popularity
# ======================
# Features to plot against popularity; names missing from the frame are skipped.
rel_candidates = (
    "danceability",
    "energy",
    "acousticness",
    "instrumentalness",
    "loudness",
    "valence",
    "tempo",
    "duration_min",
)
rel_cols = [name for name in rel_candidates if name in df.columns]

# Plot at most 8000 rows; the fixed seed keeps the sample reproducible.
sample_size = min(8000, len(df))
sample = df.sample(n=sample_size, random_state=42)

# One semi-transparent scatter plot per feature.
for col in rel_cols:
    plt.figure(figsize=(7, 5))
    sns.scatterplot(x=col, y="popularity", data=sample, alpha=0.25)
    plt.title(f"{col} vs popularity")
    plt.tight_layout()
    plt.show()

# ======================
# 8. Pairplot for a subset of features
# ======================
# Columns for the pairwise grid; names missing from the frame are skipped.
pair_candidates = (
    "danceability",
    "energy",
    "acousticness",
    "valence",
    "popularity",
)
pair_cols = [name for name in pair_candidates if name in df.columns]

# Draw the grid from at most 2000 rows, sampled with a fixed seed.
pair_row_count = min(2000, len(df))
pair_sample = df[pair_cols].sample(n=pair_row_count, random_state=42)
sns.pairplot(pair_sample)
plt.show()

# ======================
# 9. Save cleaned and engineered dataset
# ======================
# Write the frame, including the engineered columns, to the working
# directory without the row index.
output_csv = "data_cleaned.csv"
df.to_csv(output_csv, index=False)
print("\nSaved cleaned dataset as data_cleaned.csv")
