In [2]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
from random import choice
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
def decode_lyrics(s):
    s = s.encode('ascii', 'ignore')
    s = s.decode()
    s = s.replace('\n', ' ')
    s = s.replace('-', ' ')
    s = re.sub(r'[\[].*?[\)\]]', ' ', s)
    
    return s

In [5]:
!pwd

/home/ubuntu/vinyl-record-recommender/src/server_src


In [6]:
dfs = pickle.load(open('dfs_w_lyrics.pkl', 'rb'))
tracks = pd.concat(dfs)
tracks.reset_index(inplace=True)
lead_artist = []
feat_artists = []

for artists in tracks.artist:
    lead_artist.append(artists[0])
    feat_artists.append(artists[1:])

tracks['lead_artist'] = lead_artist
tracks['feat_artists'] = feat_artists
tracks['album_release'] = tracks.album + ' by ' + tracks.lead_artist

In [7]:
lyrics_flag = pd.isna(tracks['lyrics'])
lyrical_db = tracks[~lyrics_flag]
lyrical_db = lyrical_db[['album_release', 'lyrics']]

In [8]:
lyrics_by_album = lyrical_db.groupby('album_release')['lyrics'].apply(lambda x: ' '.join(x))

In [11]:
lyrics_by_album = lyrics_by_album.apply(decode_lyrics)

In [6]:
lyrics_by_album = lyrics_by_album.apply(decode_lyrics)

In [7]:
mapping = pd.Series(lyrics_by_album.reset_index().index, index=lyrics_by_album.reset_index().album_release)
mapping

album_release
- Ugh, those feels again by Snoh Aalegra                0
17 by XXXTENTACION                                      1
A Fever You Can't Sweat Out by Panic! At The Disco      2
A Letter To My Younger Self by Quinn XCII               3
A Love Letter To You 3 by Trippie Redd                  4
                                                     ... 
you broke me first by Tate McRae                      489
you ruined new york city for me by FLETCHER           490
zero_one by The Living Tombstone                      491
~how i'm feeling~ by Lauv                             492
÷ (Deluxe) by Ed Sheeran                              493
Length: 494, dtype: int64

In [13]:
album_df = lyrics_by_album.reset_index()
album_df

Unnamed: 0,album_release,lyrics
0,"- Ugh, those feels again by Snoh Aalegra",Even if we catch the sunrise It's only a momen...
1,17 by XXXTENTACION,"17. A collection of nightmares, thoughts, an..."
2,A Fever You Can't Sweat Out by Panic! At The D...,(Niemieckich spotkao si z szerokim rozgosem ...
3,A Letter To My Younger Self by Quinn XCII,I think the back door creaks I think the sun...
4,A Love Letter To You 3 by Trippie Redd,DJ on the beat so it's a banger (It ain't ov...
...,...,...
489,you broke me first by Tate McRae,You broke me first Maybe you don't like t...
490,you ruined new york city for me by FLETCHER,I poured you a glass of wine And drank it fo...
491,zero_one by The Living Tombstone,Quarter to two and I haven't got out of bed ...
492,~how i'm feeling~ by Lauv,"Another life, another story She walked out, ..."


### BASELINE MODEL

### Manually test albums

### HYPERPARAMETERS

In [10]:
max_features = 1000

In [11]:
vc = CountVectorizer(max_features=max_features)

In [12]:
album_df.set_index('album_release', inplace=True)

In [13]:
lyrics = album_df.lyrics

In [14]:
M = vc.fit_transform(lyrics)

In [15]:
words = vc.get_feature_names()

In [16]:
lda = LatentDirichletAllocation(n_components=30,
                               n_jobs=-1)

In [17]:
lda.fit(M)

LatentDirichletAllocation(n_components=30, n_jobs=-1)

In [18]:
theta = lda.transform(M)
phi = lda.components_

theta, phi = (np.around(x,2) for x in (theta, phi))
theta = pd.DataFrame(theta, index=lyrics.index)
phi = pd.DataFrame(phi, columns=words)

print('Theta:')
display(theta)
print('Phi:')
display(phi)

Theta:


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
album_release,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"- Ugh, those feels again by Snoh Aalegra",0.00,0.00,0.0,0.00,0.0,0.00,0.09,0.00,0.00,0.0,...,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.74
17 by XXXTENTACION,0.00,0.00,0.0,0.00,0.0,0.00,0.00,0.00,0.00,0.0,...,0.00,0.00,0.96,0.00,0.0,0.0,0.00,0.0,0.0,0.00
A Fever You Can't Sweat Out by Panic! At The Disco,0.00,0.41,0.0,0.00,0.0,0.00,0.00,0.16,0.00,0.0,...,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.00
A Letter To My Younger Self by Quinn XCII,0.00,0.40,0.0,0.00,0.0,0.00,0.07,0.03,0.00,0.0,...,0.00,0.00,0.00,0.02,0.0,0.0,0.00,0.0,0.0,0.19
A Love Letter To You 3 by Trippie Redd,0.06,0.08,0.0,0.00,0.0,0.00,0.00,0.00,0.00,0.0,...,0.00,0.14,0.00,0.00,0.0,0.0,0.50,0.0,0.0,0.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
you broke me first by Tate McRae,0.00,0.00,0.0,0.00,0.0,0.00,0.00,0.00,0.00,0.0,...,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.93
you ruined new york city for me by FLETCHER,0.00,0.41,0.0,0.00,0.0,0.05,0.00,0.00,0.00,0.0,...,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.50
zero_one by The Living Tombstone,0.00,0.34,0.0,0.00,0.0,0.00,0.00,0.12,0.00,0.0,...,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.12
~how i'm feeling~ by Lauv,0.00,0.28,0.0,0.01,0.0,0.00,0.06,0.00,0.02,0.0,...,0.05,0.00,0.06,0.00,0.0,0.0,0.00,0.0,0.0,0.42


Phi:


Unnamed: 0,12,2015,2016,2017,2018,2020,21,about,above,act,...,yes,yet,yo,you,young,your,youre,yours,yourself,yuh
0,0.03,0.03,0.03,0.03,0.03,0.03,92.37,40.87,0.03,40.94,...,15.84,0.03,67.55,2160.08,5.03,989.98,0.03,14.7,0.03,2.34
1,7.23,0.03,0.03,0.03,0.03,1.09,0.03,650.42,40.68,13.56,...,68.68,56.98,0.55,14637.03,319.45,3917.45,19.31,231.94,86.19,0.03
2,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,...,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03
3,11.23,53.81,73.65,216.81,0.03,0.03,13.96,0.03,0.03,0.03,...,0.03,0.03,12.64,14.05,30.38,56.31,0.03,0.03,0.03,0.03
4,7.27,0.03,0.03,0.03,0.03,0.03,2.65,32.13,20.44,37.03,...,6.42,5.39,6.7,1313.38,219.42,719.64,0.03,0.03,47.94,0.03
5,0.03,0.03,0.03,0.03,0.03,1.24,0.03,3.47,0.03,0.03,...,0.06,0.03,68.41,384.98,0.03,55.63,0.03,0.03,6.22,0.03
6,0.03,3.14,0.03,0.03,0.03,0.03,0.03,47.14,13.62,1.18,...,47.61,0.9,0.24,2796.66,0.48,617.64,0.03,3.96,33.25,11.3
7,5.44,0.03,0.03,0.03,0.03,0.03,0.03,253.57,35.39,32.07,...,9.85,134.32,1.98,1488.12,20.55,342.18,62.35,15.67,26.49,0.03
8,1.04,0.03,0.03,0.03,0.03,9.31,0.03,0.03,0.03,0.03,...,11.56,0.03,1924.87,38.04,1.96,0.03,0.03,0.03,0.03,0.03
9,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,...,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03


In [27]:
album_avg_feature.index

Index(['- Ugh, those feels again by Snoh Aalegra', '17 by XXXTENTACION',
       'A Fever You Can't Sweat Out by Panic! At The Disco',
       'A Letter To My Younger Self by Quinn XCII',
       'A Love Letter To You 3 by Trippie Redd',
       'A Love Letter To You 4 by Trippie Redd',
       'A Love Letter To You by Trippie Redd',
       'AI YoungBoy 2 by YoungBoy Never Broke Again', 'AM by Arctic Monkeys',
       'ANTI (Deluxe) by Rihanna',
       ...
       'thank u, next by Ariana Grande', 'us by gnash',
       'x (Deluxe Edition) by Ed Sheeran', 'xx by The xx', 'ye by Kanye West',
       'you broke me first by Tate McRae',
       'you ruined new york city for me by FLETCHER',
       'zero_one by The Living Tombstone', '~how i'm feeling~ by Lauv',
       '÷ (Deluxe) by Ed Sheeran'],
      dtype='object', name='album_release', length=494)

In [25]:
album_features = pd.concat([album_avg_feature, theta], axis=1)

In [26]:
album_features

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,20,21,22,23,24,25,26,27,28,29
album_release,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"- Ugh, those feels again by Snoh Aalegra",0.588143,0.490429,6.500000,-9.584857,0.357143,0.138764,0.435457,1.188218e-02,0.161943,0.438693,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
17 by XXXTENTACION,0.689273,0.250527,4.636364,-15.489545,0.272727,0.216018,0.618655,1.213968e-02,0.163082,0.343664,...,0.00,0.61,0.00,0.00,0.00,0.03,0.00,0.00,0.02,0.00
A Fever You Can't Sweat Out by Panic! At The Disco,0.500462,0.790615,7.076923,-5.923923,0.307692,0.138192,0.188285,7.261538e-02,0.323800,0.538385,...,0.00,0.00,0.00,0.04,0.08,0.00,0.00,0.00,0.00,0.11
A Letter To My Younger Self by Quinn XCII,0.712750,0.550750,4.416667,-6.694333,0.583333,0.076492,0.309169,4.810325e-04,0.144858,0.527917,...,0.00,0.05,0.00,0.00,0.01,0.00,0.00,0.02,0.00,0.33
A Love Letter To You 3 by Trippie Redd,0.676250,0.585125,5.562500,-7.203437,0.500000,0.101037,0.139309,3.096875e-04,0.147212,0.292437,...,0.00,0.00,0.00,0.00,0.05,0.01,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
you broke me first by Tate McRae,0.642000,0.374000,4.000000,-9.386000,1.000000,0.054500,0.786000,0.000000e+00,0.090600,0.079900,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.69
you ruined new york city for me by FLETCHER,0.567800,0.627000,6.400000,-5.829800,1.000000,0.137620,0.357740,6.548800e-05,0.191400,0.471000,...,0.00,0.00,0.00,0.00,0.00,0.02,0.09,0.00,0.00,0.73
zero_one by The Living Tombstone,0.525300,0.899800,5.500000,-4.177600,0.700000,0.077480,0.003538,1.905360e-04,0.369790,0.581860,...,0.00,0.41,0.00,0.00,0.00,0.00,0.00,0.02,0.00,0.28
~how i'm feeling~ by Lauv,0.670667,0.451952,6.571429,-8.344190,0.857143,0.105490,0.300174,1.571429e-06,0.163971,0.405752,...,0.01,0.06,0.01,0.00,0.00,0.01,0.00,0.00,0.00,0.13


# Recommender Functions

In [27]:
tfidf = TfidfVectorizer(stop_words='english')
lyric_vec = tfidf.fit_transform(lyrics_by_album)
print(f'{tracks.album.nunique()} albums.')
print(f'{lyric_vec.shape[1]} unique single n-grams in album corpus.')

508 albums.
59938 unique single n-grams in album corpus.


In [42]:
from sklearn.preprocessing import StandardScaler

In [43]:
scaler = StandardScaler()

In [44]:
X_test = album_features.values

In [57]:
X_scaled = scaler.fit_transform(X_test)

In [28]:
X = album_features.values
lyric_vec = tfidf.fit_transform(lyrics_by_album)

In [34]:
def recommended_albums_lyrics(sample_album, lyric_vec):
    similarity_matrix = cosine_similarity(lyric_vec, lyric_vec)
    
    album_index = mapping[sample_album]
    similarity_score = list(enumerate(similarity_matrix[album_index]))
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    
    # get score of 10 most similar movies. Ignore first movie.
    similarity_score = similarity_score[1:10]
    
    # return movie names using the mapping series
    
    album_indices = [i[0] for i in similarity_score]
    
    return (album_df['album_release'].iloc[album_indices])

In [64]:
def recommended_albums_topics(sample_album, X):
    similarity_matrix = cosine_similarity(X, X)
    
    album_index = mapping[sample_album]
    similarity_score = list(enumerate(similarity_matrix[album_index]))
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    
    # get score of 10 most similar movies. Ignore first movie.
    similarity_score = similarity_score[1:6]
    
    # return movie names using the mapping series
    
    album_indices = [i[0] for i in similarity_score]
    
    return (album_df['album_release'].iloc[album_indices])

In [31]:
album_df.reset_index(inplace=True)

In [12]:
album_df

NameError: name 'album_df' is not defined

In [47]:
for album in lyrical_db.album_release.unique():
    print(album)

After Hours by The Weeknd
ASTROWORLD by Travis Scott
American Teen by Khalid
AI YoungBoy 2 by YoungBoy Never Broke Again
A Love Letter To You 4 by Trippie Redd
Artist 2.0 by A Boogie Wit da Hoodie
ANTI (Deluxe) by Rihanna
A Love Letter To You 3 by Trippie Redd
A Letter To My Younger Self by Quinn XCII
AM by Arctic Monkeys
Appetite For Destruction by Guns N' Roses
Artist 2.0 (Deluxe) by A Boogie Wit da Hoodie
AT.LONG.LAST.A$AP by A$AP Rocky
A Fever You Can't Sweat Out by Panic! At The Disco
American Beauty/American Psycho by Fall Out Boy
A Love Letter To You by Trippie Redd
Artist by A Boogie Wit da Hoodie
beerbongs & bentleys by Post Malone
Blonde by Frank Ocean
BALLADS 1 by Joji
B4 The Storm by Internet Money
Birds In The Trap Sing McKnight by Travis Scott
Birds In The Trap Sing McKnight by Young Thug
Beauty Behind The Madness by The Weeknd
Blurryface by Twenty One Pilots
Be Like That (feat. Swae Lee & Khalid) by Kane Brown
Baby On Baby by DaBaby
Blueberry Faygo by Lil Mosey
Breaking 

In [95]:
recommended_albums_lyrics('Hotel Diablo by Machine Gun Kelly', lyric_vec)

343                    There's Really A Wolf by Russ
137             Hoodie SZN by A Boogie Wit da Hoodie
16     Artist 2.0 (Deluxe) by A Boogie Wit da Hoodie
19                    B4 The Storm by Internet Money
268                                 Perception by NF
17              Artist 2.0 by A Boogie Wit da Hoodie
342                                 The Search by NF
42                             Circles by Mac Miller
106                        Ghetto Gospel by Rod Wave
Name: album_release, dtype: object

In [98]:
recs = recommended_albums_topics('Hot Pink by Doja Cat', X_scaled)

In [99]:
for r in recs:
    print(r)

Breaking Me by Topic
Romance by Shawn Mendes
Love Yourself 結 'Answer' by BTS
Changes by Justin Bieber
You Never Walk Alone by BTS
