# Project Name

Team:
1. Renee Dhanaraj
2. Aditi Verma
3. Chris Park
4. Aryan Ahuja

In [5]:
# sklearn imports
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor

from sklearn.metrics import accuracy_score, mean_squared_error, f1_score

# data manipulation imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <636BF463-1886-392D-B8B3-6011C44DCEE9> /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]


## Importing dataset

In [None]:
original_dataset_df = pd.read_csv('assignment2/dataset.csv')
original_dataset_df.head()

## Data Cleaning

In [None]:
# creating a copy to clean to avoid contamination of original data
df = original_dataset_df.copy()

In [None]:
# information about the dataset, such as number of entries, column names, non-null counts, and data types
df.info()

In [None]:
# getting the summary statistics of the dataset (of numerical features)
df.describe()

In [None]:
# checking for missing values in each column
df.isnull().sum()

In [4]:
# dropping all the rows with any missing/null values since there are very few
df = df.dropna()

# dropping unnecessary columns that won't help in prediction like id
df = df.drop(columns=["Unnamed: 0", "track_id"])

# IF WE PLAN TO NOT USE NLP FOR NATURAL LANGUAGE PROCESSING, UNCOMMENT AND DROP TEXT COLUMNS
# df = df.drop(columns=["artists", "album_name", "track_name", "track_genre"])

# making sure the "explicit" column is of type integer and not boolean
df["explicit"] = df["explicit"].astype(int)

NameError: name 'df' is not defined

In [5]:
# sanity checks for the important numeric columns
print("Popularity range:", df["popularity"].min(), "to", df["popularity"].max())
print("Duration range (ms):", df["duration_ms"].min(), "to", df["duration_ms"].max())
print("Tempo range:", df["tempo"].min(), "to", df["tempo"].max())
print("Loudness range:", df["loudness"].min(), "to", df["loudness"].max())

# duplicate check
duplicates_mask = df.duplicated(subset=["artists", "album_name", "track_name"])
print("Number of duplicate entries based on artists, album_name, track_name:", duplicates_mask.sum())


NameError: name 'df' is not defined

In [6]:
# sanity checks for categorical columns
print("Explicit values:", df["explicit"].value_counts())
print("\nMode values:", df["mode"].value_counts())
print("\nTime signature values:", df["time_signature"].value_counts())
print("\nKey values:", df["key"].value_counts().sort_index())



NameError: name 'df' is not defined

In [7]:
# checking for outliers in numerical columns using box plots
numerical_columns = ["popularity", "duration_ms", "tempo", "loudness", "danceability", "energy", "speechiness", "acousticness", "instrumentalness", "liveness", "valence"]
for col in numerical_columns:
    plt.figure(figsize=(8, 4))
    plt.boxplot(df[col], vert=False)
    plt.title(f'Box plot of {col}')
    plt.xlabel(col)
    plt.show()

# We can see that some features are skewer, but if we train models like XGBoost, for ensemble methods, we need not do any transformations
# There are some outliers as well, but we still need to keep them since they are real songs and removing them would lead to loss of information
# Hence, we will not be doing any outlier removal or transformations for skewness at this point

# If required for training (linear models or nerual networks), we can always do log transformations or apply standardization/normalization later

NameError: name 'plt' is not defined

In [8]:
df.head()

NameError: name 'df' is not defined

In [9]:
df.info()
df.isnull().sum()

NameError: name 'df' is not defined

## Exploratory Data Analysis (EDA)

In [10]:
#attempt visualizing a headmap 
#use only numeric columns
numeric_df = df.select_dtypes(include=['int64', 'float64'])

#compute correlation matrix
corr = numeric_df.corr()

#plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr, cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Heatmap of Numeric Features")
plt.show()

""" OBSERVATIONS:
Generally a lot more negative corrolation than positive ones.

Song Type
    Negative corrolation: 
        energy + acousticness
        loudness + acousticness
        loudness + instrumentalness
        valence + instrumentalness
    
    Positive corrolation:
        loudness + energy
        loudness + danceability
        danceability + valence
        energy + valence
        speechiness + explicit

Popularity
    Negative corrolation:
        popularity + instrumentalness
        popularity + duration_ms

    No positive corrolation :((
"""

NameError: name 'df' is not defined

In [11]:
# Try normalizing using log_scale to address highly skewed datasets
plt.figure(figsize=(12, 8))
sns.heatmap(corr, norm=mcolors.LogNorm(), cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Heatmap of Numeric Features (Log Scaled)")
plt.show()

""" OBSERVATIONS:
Generally a lot more positive corrolations

Popularity
    Negative corrolation:
        energy
        tempo

    Positive corrolation:
        explicit
        danceability
        loudness
        time_signature
        
"""
# dig deeper into the significant corrolations

NameError: name 'plt' is not defined

##### More EDA that is related to popularity

In [12]:
plt.scatter(df["popularity"], df["duration_ms"])
plt.xlabel("Popularity")
plt.ylabel("Duration")
plt.title("Popularity vs duration")
plt.show()
# df["duration"]

NameError: name 'plt' is not defined

In [13]:
plt.scatter(df["popularity"], df["danceability"])
plt.xlabel("Popularity")
plt.ylabel("Dancibility")
plt.title("Popularity vs Dancibility")
plt.show()

plt.hexbin(df["popularity"], df["danceability"], gridsize=30, cmap='Blues')
plt.colorbar(label='Count')
plt.xlabel("Popularity")
plt.ylabel("Dancibility")
plt.title("Popularity vs Danceability (Hexbin)")
plt.show()

NameError: name 'plt' is not defined

In [14]:
plt.scatter(df["popularity"], df["energy"])
plt.xlabel("Popularity")
plt.ylabel("energy")
plt.title("Popularity vs energy")
plt.show()

plt.hexbin(df["popularity"], df["energy"], gridsize=30, cmap='Blues')
plt.colorbar(label='Count')
plt.xlabel("Popularity")
plt.ylabel("Dancibility")
plt.title("Popularity vs Energy (Hexbin)")
plt.show()

NameError: name 'plt' is not defined

In [15]:
popular_songs = df[df.popularity > 50]
plt.hist(popular_songs["acousticness"])
plt.xlabel("acusitc")
plt.title("Acustic distribution for songs with >50 popularity")

NameError: name 'df' is not defined

In [16]:
popular_songs = df[df.popularity > 0.7]
plt.hist(popular_songs["danceability"])
plt.xlabel("danceability")
plt.title("danceability distribution for songs with > 0.7 dancability")

NameError: name 'df' is not defined

In [17]:
df["energy_q"] = pd.qcut(df["energy"], 10)
df["danceability_q"] = pd.qcut(df["danceability"], 10)
pivot = df.pivot_table(values="popularity", index="danceability_q", columns="energy_q", aggfunc="mean")
sns.heatmap(pivot, cmap="viridis")

NameError: name 'pd' is not defined

In [18]:
#Average popularity per artist
artist_pop = df.groupby("artists")["popularity"].mean()


NameError: name 'df' is not defined

In [19]:
#normalize numerical columns
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_norm = df.copy()
df_norm[numerical_columns] = scaler.fit_transform(df[numerical_columns])

plt.figure(figsize=(14, 6))

df_melted = df_norm[numerical_columns].melt(var_name="Feature", value_name="Normalized")
sns.boxplot(
    data=df_melted,
    x="Feature",
    y="Normalized",
    hue="Feature",       
    palette="Set2",
    dodge=False         
)
plt.xticks(rotation=45, ha="right")
plt.title("Normalized Boxplots for All Numerical Features")
plt.ylabel("Normalized Value (0–1)")
plt.xlabel("Feature")
plt.tight_layout()
plt.show()

NameError: name 'df' is not defined

In [20]:
plt.figure(figsize=(10, 5))
sns.histplot(df['popularity'], bins=30, kde=True, color='skyblue')
plt.title('Popularity Distribution')
plt.xlabel('Popularity')
plt.ylabel('Count')
plt.show()

NameError: name 'plt' is not defined

## Feature Engineering & Model Training

In [21]:
"""
    feature engineering for popularity
"""
df.loc[df['popularity'] < 40, 'popularity'] = 0   # Low
df.loc[(df['popularity'] >= 40) & (df['popularity'] < 70), 'popularity'] = 1  # Medium
df.loc[df['popularity'] >= 70, 'popularity'] = 2  # High

df['popularity'].value_counts()
df.head()


NameError: name 'df' is not defined

In [22]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def feature(df, col):
    X = df[col].copy()
    y = df["popularity"].astype(float)
    return X, y

def precision_at_k(predictions, y_true, ks=[1, 100, 1000, 10000]):
    precs = []
    for K in ks:
        top_pred = predictions[:min(K, len(predictions))]
        # true positives: ground truth values at top predicted indices
        true_positives = np.sum(y_true[top_pred] == 1)
        precs.append(true_positives / len(top_pred))
    return precs

def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def evaluate(model, X, y, name="set"):
    y_pred = model.predict(X)
    print(f"\n=== {name.upper()} ===")
    print("R²:", r2_score(y, y_pred))
    print("MAE:", mean_absolute_error(y, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y, y_pred)))
    print("MSE:", mse(y, y_pred))

In [23]:
import random
random.seed(0)

# shuffle and split df_norm
df_shuffled = df_norm.sample(frac=1, random_state=0).reset_index(drop=True)

N = len(df_shuffled)

# 60% train, 20% valid, 20% test
train_end = int(0.6 * N)
valid_end = int(0.8 * N)

dfTrain = df_shuffled.iloc[:train_end]
dfValid = df_shuffled.iloc[train_end:valid_end]
dfTest  = df_shuffled.iloc[valid_end:]

"""FROM LOG-SCALED HEATMAP
    explicit          binary (0-1)
    danceability      float64
    loudness          float64
    time_signature    int64 (0-4)
    energy            float64
    tempo             float64

    key               int64 (0-11) 
"""

heatmap_feat = [
    "explicit",
    "danceability",
    "loudness",
    "tempo",
    "energy",
    "time_signature"
]

all_feat = [
    "explicit",
    "danceability",
    "energy",
    "loudness",
    "tempo",
    "acousticness",
    "speechiness",
    "instrumentalness",
    "liveness",
    "valence",
    "duration_ms",
    "key",
    "mode",
    "time_signature"
]

X_train, y_train = feature(dfTrain, all_feat)
X_valid, y_valid = feature(dfValid, all_feat)
X_test,  y_test  = feature(dfTest,  all_feat)


NameError: name 'df_norm' is not defined

In [24]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled  = scaler.transform(X_test)


NameError: name 'X_train' is not defined

In [25]:
""" LINEAR REGRESSION
"""

from sklearn.linear_model import LinearRegression

# fit linear regression
linearModel = LinearRegression()
linearModel.fit(X_train_scaled, y_train)

#evaluate
# Validation performance
evaluate(linearModel, X_valid_scaled, y_valid, "validation")


# Test performance
evaluate(linearModel, X_test_scaled,  y_test,  "test")

NameError: name 'X_train_scaled' is not defined

In [None]:
""" RIDGE REGRESSION
    Linear regression is performing badly, (negative R2 score)
    Try accounting for corrolation in data

    Bad features
"""

from sklearn.linear_model import Ridge

# Fit Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)

# Predictions
y_pred_valid = ridge.predict(X_valid_scaled)
y_pred_test  = ridge.predict(X_test_scaled)

# Evaluate on validation
evaluate(ridge, X_valid_scaled, y_valid, "Ridge Regression - Validation")

# Evaluate on test
evaluate(ridge, X_test_scaled,  y_test,  "Ridge Regression - Test")


In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    random_state=0,
    n_jobs=-1
)

rf.fit(X_train, y_train)  
evaluate(rf, X_valid, y_valid, "Random Forest - Validation")
evaluate(rf, X_test,  y_test,  "Random Forest - Test")


## Model Evaluation