In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import re
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from textblob import TextBlob

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from IPython.display import HTML, display
import ipywidgets as widgets
from matplotlib.animation import FuncAnimation

In [2]:
# Load the processed dataset from its CSV (relative path).
processed_csv_path = "../data/processed_data (1).csv"
df = pd.read_csv(processed_csv_path)

In [4]:
# Sanity check on the loaded frame: (number of rows, number of columns).
df.shape

(67499, 32)

In [3]:
# Number of rows in the loaded frame.
len(df)

67499

### Data Preprocessing

In [5]:
def dedup(df):
    """
    Remove repeated songs, keeping the first row of each (artist, track) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'artist_name' and 'track_name' columns (they are
        concatenated with `+`, so presumably strings).

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per (artist_name, track_name) pair plus an
        'artist_song' column holding artist_name + track_name. The frame
        passed in is left unmodified.
    """
    # Build the combined key column-wise on a new frame: cheaper than a
    # row-wise apply, works on an empty frame, and does not write into the
    # caller's data.
    keyed = df.assign(artist_song=df['artist_name'] + df['track_name'])
    # Compare the two source columns rather than the concatenated key, so
    # pairs such as ("ab", "c") and ("a", "bc") are not treated as duplicates.
    return keyed.drop_duplicates(subset=['artist_name', 'track_name'])

In [6]:
def select_cols(df):
    """
    Keep only the useful columns, in a fixed order.

    The result holds the track identifiers, the audio attributes, and the
    popularity / genre columns. Raises KeyError if any of them is missing.
    """
    identifier_cols = ['track_uri', 'artist_name', 'track_name']
    audio_cols = [
        'danceability', 'energy', 'key', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
        'valence', 'tempo',
    ]
    context_cols = ["artist_pop", "genres", "track_pop"]
    return df[identifier_cols + audio_cols + context_cols]

In [7]:
def ohe(df, column_name, new_name):
    """
    One-hot encode a single categorical column.

    Only suitable for columns holding exactly one category per row; it does
    not handle attributes that carry several values per datapoint. No
    weighting is applied here, so every category counts the same.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    column_name : str
        Name of the categorical column in `df`.
    new_name : str
        Prefix for the generated columns.

    Returns
    -------
    pandas.DataFrame
        Indicator columns named "<new_name> | <category>", with a fresh
        0..n-1 index.
    """
    dummies = pd.get_dummies(df[column_name])
    dummies.columns = [f"{new_name!s} | {category!s}" for category in dummies.columns]
    return dummies.reset_index(drop=True)

In [8]:
def getSubjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity
def getPolarity(text):
    """Return the polarity component of TextBlob's sentiment for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

In [9]:
def getAnalysis(score, task):
    """
    Bucket a sentiment score into a categorical label.

    Parameters
    ----------
    score : float
        Subjectivity or polarity score to classify.
    task : str
        'Subjectivity' selects the subjectivity buckets; any other value
        selects the polarity buckets.

    Returns
    -------
    str
        Subjectivity: 'low' (score < 1/3), 'high' (score > 2/3), otherwise
        'mid'. Polarity: 'negative' (score < 0), 'mid' (score == 0),
        otherwise 'high'.
    """
    if task == 'Subjectivity':
        if score < 1/3:
            return 'low'
        # The upper cut-off used to be 1/3 as well, which left 'mid' reachable
        # only for a score of exactly 1/3. Splitting at 2/3 gives three equal
        # bands (assuming subjectivity lies in [0, 1] -- confirm with TextBlob).
        if score > 2/3:
            return 'high'
        return 'mid'

    # Polarity labels are kept as-is because they become one-hot column names.
    if score < 0:
        return 'negative'
    if score == 0:
        return 'mid'
    return 'high'

In [10]:
def preprocess(df):
    """
    Deduplicate songs, keep the useful columns, and tokenize the genres.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw song frame accepted by `dedup` and `select_cols`; its 'genres'
        column must hold space-separated strings.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with 'genres' converted from a string into a
        list of the pieces between single spaces.
    """
    selected = select_cols(dedup(df))
    # assign() builds a new frame instead of writing a column into the
    # selection returned by select_cols, which avoids pandas'
    # SettingWithCopyWarning.
    return selected.assign(
        genres=selected['genres'].apply(lambda genre: genre.split(" "))
    )

In [13]:
def create_feature_values(songDF):
    """
    Build the numeric feature matrix for a preprocessed song frame.

    Parameters
    ----------
    songDF : pandas.DataFrame
        Needs the columns 'genres' (an iterable of genre strings per row),
        'track_name', 'key', 'mode', 'tempo', 'loudness', 'track_pop',
        'artist_pop' and 'track_uri'.

    Returns
    -------
    pandas.DataFrame
        One row per song. Columns are concatenated in this order: TF-IDF
        genre weights ("genre | <token>", without the "unknown" token),
        weighted one-hot key / mode / subjectivity / polarity, min-max scaled
        numeric attributes, and 'track_uri'.

    Notes
    -----
    Adds 'subjectivity' and 'polarity' label columns to `songDF` itself.
    """
    # TF-IDF for genres. fit_transform expects one whitespace-separated
    # string of genres per song, so the genre lists are joined back together.
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(songDF['genres'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=['genre' + " | " + i for i in tfidf.get_feature_names_out()],
    )
    # Drop the placeholder genre, encoded as the word "unknown". drop()
    # returns a new frame, so the result has to be reassigned; errors='ignore'
    # keeps this working when no song carries the placeholder.
    genre_df = genre_df.drop(columns='genre | unknown', errors='ignore')

    # Sentiment analysis of the track title, bucketed into labels.
    songDF['subjectivity'] = songDF['track_name'].apply(getSubjectivity).apply(lambda x: getAnalysis(x, 'Subjectivity'))
    songDF['polarity'] = songDF['track_name'].apply(getPolarity).apply(lambda x: getAnalysis(x, 'Polarity'))

    # One-hot encoding; the multipliers scale each group's indicator values.
    key_ohe = ohe(songDF, 'key', 'key') * 0.5
    mode_ohe = ohe(songDF, 'mode', 'mode') * 0.5
    subject_ohe = ohe(songDF, 'subjectivity', 'subjectivity') * 0.3
    polar_ohe = ohe(songDF, 'polarity', 'polarity') * 0.3

    # Scale tempo, loudness, track_pop, artist_pop (data normalization).
    # NOTE: 'loundess_scaled' keeps its original spelling so that existing
    # consumers of this column name keep working.
    scale = MinMaxScaler()
    attr_scaled = pd.DataFrame(
        scale.fit_transform(songDF[['tempo', "loudness", "track_pop", "artist_pop"]]),
        columns=['tempo_scaled', "loundess_scaled", "track_pop_scaled", "artist_pop_scaled"],
    )

    # Every piece carries a fresh 0..n-1 index, so concat lines up row by row.
    processed_df = pd.concat(
        [genre_df, key_ohe, mode_ohe, subject_ohe, polar_ohe, attr_scaled,
         songDF[["track_uri"]].reset_index(drop=True)],
        axis=1,
    )
    return processed_df

### Building the feature matrix and clustering helpers


In [43]:
# Run the pipeline: clean the loaded frame, then derive the feature matrix.
preprocessed_data = preprocess(df)
processed_data = create_feature_values(preprocessed_data)
# pop() returns the identifier column and removes it from the matrix in place,
# leaving only the feature columns.
track_uri = processed_data.pop('track_uri')

In [96]:
def apply_clustering(scaled_features, n_clusters):
    """
    Fit K-Means on the feature matrix and assign a cluster to every row.

    Parameters
    ----------
    scaled_features : array-like or pandas.DataFrame
        Feature matrix, one row per sample.
    n_clusters : int
        Number of clusters to fit.

    Returns
    -------
    tuple
        (fitted KMeans estimator, cluster labels with one entry per row of
        `scaled_features`).
    """
    # n_init='auto' matches the settings used by plot_elbow_method, so the
    # number of clusters picked there is fitted the same way here.
    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=42)
    # Fit on all rows. The earlier `[:4]` slice trained on just the first four
    # samples, returned only four labels, and failed for n_clusters > 4.
    cluster_labels = kmeans.fit_predict(scaled_features)
    return kmeans, cluster_labels

In [84]:
def plot_elbow_method(scaled_features, max_clusters=10):
    """
    Plot K-Means inertia for k = 1 .. max_clusters.

    One model is fitted per cluster count (fixed random_state) and the
    resulting curve is shown so the "elbow" can be picked by eye.
    """
    cluster_counts = range(1, max_clusters + 1)
    # inertia_ of each fitted model is what the y-axis reports as SSE.
    sse = [
        KMeans(n_clusters=k, n_init='auto', random_state=42).fit(scaled_features).inertia_
        for k in cluster_counts
    ]

    _, ax = plt.subplots(figsize=(10, 8))
    ax.plot(cluster_counts, sse, marker='o')
    ax.set_title('Elbow Method for Optimal Number of Clusters')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Sum of Squared Distances (SSE)')
    plt.show()

In [6]:
# Load the test playlist from its CSV (relative path).
test_playlist_path = "../data/test_playlist.csv"
test_songs = pd.read_csv(test_playlist_path)

In [9]:
# Write the first five rows of the test playlist to a small sample file.
test_songs.head(5).to_csv("test_songs.csv")