In [1]:
import pandas as pd
from tqdm import tqdm
import os
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

from textblob import TextBlob

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
print(os.getcwd())

c:\qtri\Self-learning\Spotify Recommendation\Spotify_Recommendation


### Data Import

In [3]:
df = pd.read_csv("../data/processed_data (1).csv")

In [4]:
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'pos', 'artist_name', 'track_uri',
       'artist_uri', 'track_name', 'album_uri', 'duration_ms_x', 'album_name',
       'name', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 'analysis_url',
       'duration_ms_y', 'time_signature', 'artist_pop', 'genres', 'track_pop'],
      dtype='object')

### Preprocessing
1. Useful datat selection (dedup and drop first 2 columns)
2. Bring uris to its basic format (artist_uri, album_uri, uri) (but we don't use them tho)
3. List concatenation for genres

In [5]:
def dedup(df):
    df['artist_song'] = df.apply(lambda row: row['artist_name']+row['track_name'], axis = 1)
    return df.drop_duplicates('artist_song')

In [6]:
#but we don't need to use these uris thou
#songDF['track_uri'] = songDF['track_uri'].apply(lambda x: re.findall(r'\w+$', x)[0])
#songDF['album_uri'] = songDF['album_uri'].apply(lambda x: re.findall(r'\w+$', x)[0])

In [7]:
def select_cols(df):
       '''
       Select useful columns
       '''
       return df[['track_uri', 'artist_name','track_name','danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', "artist_pop", "genres", "track_pop"]]

### Feature Generation / Feature Engineer
1. One-hot encoding
2. Sentiment Analysis
3. TF-IDF
4. Normalization

In [8]:
# OHE 
# this only applies to categorical columns with one value in each datapoint, won't work for attribute with several values
# this doesn't add weights to different categories, so every option weights the same
def ohe(df, column_name, new_name):
    ohe_df = pd.get_dummies(df[column_name])
    feature_name = ohe_df.columns
    new_name = [ str(new_name) + " | " + str(i) for i in feature_name]
    ohe_df.columns = new_name
    return ohe_df.reset_index(drop=True)

In [None]:
#Sentiment Analysis. Conduct with track name to find polarity and subjectivity.

In [9]:
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

In [10]:
def getAnalysis(score, task):
    if task == 'Subjectivity':
        if score < 1/3:
            return 'low'
        elif score > 1/3:
            return 'high'
        else:    
            return 'mid'
    else:
        if score < 0:
            return 'negative'
        elif score == 0:
            return 'mid'
        else:
            return 'high'

### Feature Engineer summary

In [11]:
def preprocess(df):
    df = dedup(df)
    df = select_cols(df)
    df['genres'] = df['genres'].apply(lambda genre: genre.split(" "))
    return df

In [12]:
def create_feature_values(songDF):
    
    #TF-IDF for genres
    tfidf = TfidfVectorizer()
    tfidf_matrix =  tfidf.fit_transform(songDF['genres'].apply(lambda x: " ".join(x))) #fit_transform inputs a sentence of genres seperated by a whitespcae
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + " | " + i for i in tfidf.get_feature_names_out()]
    genre_df.drop(columns='genre | unknown') # Drop unknown genre, specified by a word "unknown"
    genre_df.reset_index(drop = True, inplace=True)

    # Sentiment analysis
    songDF['subjectivity'] = songDF['track_name'].apply(getSubjectivity).apply(lambda x: getAnalysis(x, 'Subjectivity'))
    songDF['polarity'] = songDF['track_name'].apply(getPolarity).apply(lambda x: getAnalysis(x, 'Polarity'))

    # One-hot Encoding
    key_ohe = ohe(songDF, 'key','key') * 0.5
    mode_ohe = ohe(songDF, 'mode','mode') * 0.5
    subject_ohe = ohe(songDF, 'subjectivity', 'subjectivity') * 0.3
    polar_ohe = ohe(songDF, 'polarity', 'polarity') * 0.3

    # Scale tempo, loudness, track_pop, artist_pop (Data Normaliation)
    scale = MinMaxScaler()
    attr_scaled = pd.DataFrame(scale.fit_transform(songDF[['tempo', "loudness", "track_pop", "artist_pop"]]), columns = ['tempo_scaled', "loundess_scaled", "track_pop_scaled", "artist_pop_scaled"])

    processed_df = pd.concat([genre_df, key_ohe, mode_ohe, subject_ohe, polar_ohe, attr_scaled, songDF[["track_uri"]].reset_index(drop=True)], axis = 1)
    return processed_df

In [31]:
preprocessed_df = preprocess(df)
complete_feature_set = create_feature_values(preprocessed_df)

### Content-based filtering algo

In [15]:
playlistDF = pd.read_csv("data/test_playlist.csv")
playlistDF = playlistDF[playlistDF['name']=="Mom's playlist"]
playlistDF = preprocess(playlistDF)

In [17]:
def classify_playlist(processed_data, input_data):
    #we don't want to recommend existing songs, so we need to filter out songs in train_data that are already in the input_data.
    train_data_in_playlist = processed_data[processed_data['track_uri'].isin(input_data['track_uri'])]
    train_data_notin_playlist = processed_data[~processed_data['track_uri'].isin(input_data['track_uri'])]
    return train_data_in_playlist.sum(axis = 0), train_data_notin_playlist

In [18]:
# Generate the features
complete_feature_set_playlist_vector, complete_feature_set_nonplaylist = classify_playlist(complete_feature_set, playlistDF)

In [66]:
def generate_playlist_recos(df, features, nonplaylist_features):
    '''
    Generated recommendation based on songs in aspecific playlist.
    ---
    Input: 
    df (pandas dataframe): spotify dataframe
    features (pandas series): summarized playlist feature (single vector)
    nonplaylist_features (pandas dataframe): feature set of songs that are not in the selected playlist
        
    Output: 
    non_playlist_df_top_40: Top 40 recommendations for that playlist
    '''
    
    non_playlist_df = df[df['track_uri'].isin(nonplaylist_features['track_uri'].values)]
    # Find cosine similarity between the playlist and the complete song set
    feature_temp = features.drop(['track_uri'])
    non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop(['track_uri'], axis = 1).values, feature_temp.values.reshape(1, -1))[:,0]
    non_playlist_df_top_40 = non_playlist_df.sort_values('sim',ascending = False).head(40)
    
    return non_playlist_df_top_40, non_playlist_df

In [67]:
# Genreate top 40 recommendation

top40, all = generate_playlist_recos(preprocessed_df, complete_feature_set_playlist_vector, complete_feature_set_nonplaylist)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop(['track_uri'], axis = 1).values, feature_temp.values.reshape(1, -1))[:,0]


In [70]:
rcm_songs = top40[["track_name", "artist_name"]]

In [72]:
rcm_songs.to_csv("recommended_songs_for_Mom.csv", index = False)