In [1]:
# importing modules

import sys

sys.path.append('../')
from connection import create_spotify_oauth, get_audio_features, get_token, get_tracks, \
    tracks_to_df, audio_features_to_df, get_artist_info, artist_info_to_df, get_album_info, \
        album_info_to_df, get_similar_artists

from itertools import chain

import json
import re
import os
import glob
import ast

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, Normalizer
from sklearn.feature_extraction.text import FeatureHasher

In [2]:
# token and authentication variables
# Both helpers come from the project-local `connection` module imported above.
# NOTE(review): get_token() presumably relies on the OAuth object created first —
# keep this call order unless connection.py shows otherwise.

# NOTE(review): `sp_ouath` looks like a misspelling of "sp_oauth"; the name is left
# unchanged here in case code outside this view refers to it.
sp_ouath = create_spotify_oauth()
# token_info is later handed to get_similar_artists() when imputing missing genres
token_info = get_token()

In [3]:
# importing data (all playlists)
# tracks_list = []
# artists_list = []
# albums_list = []

# for file in glob.glob('../../data/raw/*tracks*'):
#     tracks = pd.read_csv(file)
#     tracks_list.append(tracks)

# for file in glob.glob('../../data/raw/*artists*'):
#     artists = pd.read_csv(file)
#     artists_list.append(artists)

# for file in glob.glob('../../data/raw/*albums*'):
#     albums = pd.read_csv(file)
#     albums_list.append(albums)

# tracks_df = pd.concat(tracks_list)
# artists_df = pd.concat(artists_list)
# albums_df = pd.concat(albums_list)

In [4]:
# importing data (individual playlists)
# The three raw exports of playlist 1 are read in a fixed order (tracks, albums, artists).
# index_col=False stops pandas from promoting the first CSV column to the index.

tracks_df, albums_df, artists_df = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (
        "../../data/raw/p1_tracks.csv",
        "../../data/raw/p1_albums.csv",
        "../../data/raw/p1_artists.csv",
    )
)

In [5]:
# Building the master dataframe from tracks and albums
#
# Steps, in order:
#   1. left-join the album columns onto every track (no `on=` is given, so pandas
#      joins on whatever column names the two frames share)
#   2. parse the stringified artist-id lists back into Python lists
#   3. explode so that each (track, artist) pair gets its own row
#   4. recode the explicit flag from True/False to 1/0
master_df = (
    tracks_df
    .merge(albums_df, how='left')
    .assign(artist_id=lambda frame: frame['artist_id'].map(ast.literal_eval))
    .explode('artist_id')
    .assign(explicit=lambda frame: frame['explicit'].replace({True: 1, False: 0}))
)


In [6]:
# processing artist genres

def _parse_genres(raw):
    """Convert one raw `artist_genres` cell into a list of clean genre names.

    The CSV stores each genre list as its string representation
    (e.g. "['pop', 'dance pop']"), so the value is parsed with
    ast.literal_eval, which keeps apostrophes that belong to a genre name.
    Values that are not a valid list literal (for example a plain genre string
    left over from a previous run of this cell) fall back to stripping brackets
    and quotes and splitting on commas.

    Parameters
    ----------
    raw : object
        Cell value: a stringified list, an actual list/tuple, a plain string,
        None or NaN.

    Returns
    -------
    list of str
        Whitespace-trimmed genre names, or [''] when there is no genre, so that
        genre-less artists remain identifiable after exploding.
    """
    if isinstance(raw, (list, tuple)):
        candidates = list(raw)
    elif raw is None or (isinstance(raw, float) and pd.isna(raw)):
        # a missing value means "no genres", not a genre literally called 'nan'
        candidates = []
    else:
        text = str(raw)
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            candidates = list(parsed)
        else:
            # fallback: strip the exterior [] and '' using re, then split on ','
            candidates = re.sub(r"[\[\]']", '', text).split(',')

    # trim the blank left behind after every ', ' separator and drop empty entries
    genres = [str(genre).strip() for genre in candidates]
    genres = [genre for genre in genres if genre]
    return genres or ['']


artists_df['artist_genres'] = artists_df['artist_genres'].apply(_parse_genres)

# unnest the list of genres into separate rows, rest of features are duplicated
artists_df = artists_df.explode('artist_genres')

# split off the artists without any genre; the remaining rows stay in artists_df
missing_mask = artists_df['artist_genres'] == ''
artists_missing_genres = artists_df.loc[missing_mask].copy()
artists_df = artists_df.loc[~missing_mask]

# impute genres only when at least one artist is actually missing them
# (the check must look at artists_missing_genres: artists_df no longer holds empty genres)
if not artists_missing_genres.empty:

    # one row per artist is enough, this avoids repeating the API call for the same artist
    artists_missing_genres = artists_missing_genres.drop_duplicates(subset='artist_id')

    imputed_genres = []
    for artist_id in artists_missing_genres['artist_id']:
        similar_artists = get_similar_artists(artist_id, token_info) # API call

        # collect the genres of every similar artist, flattened and de-duplicated;
        # sorted so the result does not depend on set iteration order
        genres = sorted(set(chain.from_iterable(
            similar_artist['genres'] for similar_artist in similar_artists if similar_artist['genres']
        )))

        # if after imputation there are still no genres, mark it off as not imputable
        imputed_genres.append(genres or ['genre not imputable'])

    # object-dtype Series keeps each list as a single cell value until it is exploded
    artists_missing_genres['artist_genres'] = pd.Series(
        imputed_genres, index=artists_missing_genres.index, dtype=object
    )
    artists_missing_genres = artists_missing_genres.explode('artist_genres')

    # concat the imputed artist genres with the original df
    artists_df = pd.concat([artists_df, artists_missing_genres], ignore_index=False)

# dropping duplicate records, this occurs if a playlist has multiple tracks from the same artist
artists_df = artists_df.drop_duplicates(subset=['artist_id', 'artist_genres'])

In [7]:
# processing album release dates and album type

# parse the release dates (format="mixed" infers the format per value) and keep only the year
master_df['album_release_date'] = (
    pd.to_datetime(master_df['album_release_date'], format="mixed").dt.year
)

# encode the album type as an integer code; EPs are labelled 'single' and therefore map to 0
master_df['album_type'] = master_df['album_type'].replace(
    to_replace={'single': 0, 'album': 1, 'compilation': 2}
)

In [8]:
# attach the artist columns to the master dataframe (one row per track/artist/genre),
# then discard the two suffixed "Unnamed: 0" columns produced by the merge
master_df = (
    master_df
    .merge(artists_df, on='artist_id', how='left')
    .drop(columns=["Unnamed: 0_x", "Unnamed: 0_y"])
)

In [25]:
# column groups, transformers, and pipelines

# Columns grouped by the kind of data they hold.
# ID and display columns are kept out of the model; they are meant to be merged back in
# later for the web app.
column_groups = dict(
    id_cols=['track_id', 'artist_id', 'album_id'],
    display_cols=['track_name', 'preview_url', 'artist_img_300', 'album_name', 'album_label',
                  'album_cover_640', 'album_cover_300', 'album_cover_64'],
    discrete_cols=['artist_number', 'album_tracks'],
    ordinal_cols=['key', 'time_signature', 'album_release_date'],
    cat_lai_cols=['album_type'],  # leave as is
    continuous_cols=['tempo', 'duration_ms', 'song_popularity', 'danceability', 'energy', 'loudness',
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence',
                     'artist_followers', 'artist_popularity', 'album_popularity'],
    binary_cols=['explicit', 'mode'],
    categorical_cols=['artist_genres', 'artist_name'],
)

# model input: every master_df column that is neither an ID nor a display column
excluded_cols = set(column_groups['id_cols']) | set(column_groups['display_cols'])
model_cols = [name for name in master_df.columns if name not in excluded_cols]
cols_to_transform = master_df[model_cols]

# continuous columns are standardized
continuous_transformer = Pipeline(steps=[('standardizing', StandardScaler())])

# ordinal columns are integer-encoded
ordinal_transformer = Pipeline(steps=[('encoding', OrdinalEncoder())])

# both transformers combined; columns not listed below pass through unchanged
column_transformers = ColumnTransformer(
    transformers=[
        ('continuous', continuous_transformer, column_groups['continuous_cols']),
        ('ordinal', ordinal_transformer, column_groups['ordinal_cols']),
    ],
    remainder='passthrough',
)

# the preprocessing step wrapped in a pipeline
pipe = Pipeline(steps=[('preprocessing', column_transformers)])

# fit + transform first, then label the output with the feature names reported by the pipeline
transformed_values = pipe.fit_transform(cols_to_transform)
cols_transformed = pd.DataFrame(transformed_values, columns=pipe.get_feature_names_out())


In [None]:
# feature hashing artist genres ~~~~ IGNOREEEE

# (will need to turn this into a function that passes into column transformer)

# artists_genres_dict = artists_df['artist_genres'].to_dict() # convert to dict
# X = text_transformer.fit_transform(artists_df['artist_genres']) # apply transformer
# X = X.toarray() # convert output to array
# X_ = pd.DataFrame(data=X, columns=[f'genre_{i}' for i in range(X.shape[1])], index=artists_df.index) # convert array to df
# artists_df = pd.concat([artists_df, X_], axis=1) # merge artist df with the hashed genres
# artists_df.drop(['artist_genres'], axis=1, inplace=True) # dropped the og genre column since it's not needed anymore

In [None]:
# # THIS IS OLD CODE // SOME PARTS NEEDED FOR REMERGING ID/DISPLAY COLS WITH MASTER DF POST-MODEL

# # resetting indexes (to avoid errors during merge)
# # artists_df.reset_index(inplace=True)
# # albums_df.reset_index(inplace=True)
# # tracks_df.reset_index(inplace=True)

# # to avoid duplicate columns in master_df merging below
# # artists_df.drop(['artist_followers','artist_popularity'], axis=1, inplace=True)

# # remerging id / display cols back with the transformed df's
# artists_post_transform = pd.concat([artists_df[artists_cols['id_cols'] + artists_cols['display_cols']], 
#                                     artists_transformed, artists_df], axis=1)
# albums_post_transform = pd.concat([albums_df[albums_cols['id_cols'] + albums_cols['display_cols']], 
#                                    albums_transformed], axis=1)
# tracks_post_transform = pd.concat([tracks_df[tracks_cols['id_cols'] + tracks_cols['display_cols']], 
#                                    tracks_transformed], axis=1)

# # remerging hashed artist genre columns
# artists_post_transform = artists_post_transform.loc[:,~artists_post_transform.columns.duplicated()]

# # merge tracks and artists on 'artist_id'
# master_df = pd.merge(tracks_post_transform, artists_post_transform, on='artist_id', how='outer')

# # merge the result with albums on 'album_id'
# master_df = pd.merge(master_df, albums_post_transform, on='album_id', how='outer')