### Requirements

For text processing
- pip install gensim
- pip install pyldavis

For data cleaning - removing any language other than English
- pip install pycld2

To test for encoding in dataset
- pip install chardet

In [1]:
# import required packages/libraries
import pandas as pd

### Data Review

In [2]:
# check encoding type of data
import chardet
file = 'final_lyrics.csv'
with open(file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}

In [3]:
# open/read data
lyrics_data = pd.read_csv('final_lyrics.csv', encoding='utf-8')
# drop first col
lyrics_df = lyrics_data.iloc[: , 1:]

In [4]:
print(lyrics_df.head())

  Genre    Artist                  Title  \
0   Pop  dua lipa              new rules   
1   Pop  dua lipa        don’t start now   
2   Pop  dua lipa                  idgaf   
3   Pop  dua lipa  blow your mind (mwah)   
4   Pop  dua lipa             be the one   

                                              Lyrics  
0  one one one one one   talkin' in my sleep at n...  
1  if you don't wanna see me   did a full 80 craz...  
2  you call me all friendly tellin' me how much y...  
3  i know it's hot i know we've got something tha...  
4  i see the moon i see the moon i see the moon o...  


In [5]:
print(len(lyrics_df))

243406


In [6]:
# lyric check
lyrics_df['Lyrics'][8]

"common love isn't for us we created something phenomenal don't you agree don't you agree you got me feeling\u2005diamond\u2005rich nothing on this\u2005planet compares to it don't you agree don't\u2005you agree  pre who needs to go to sleep when i got you next to me   all night i'll riot with you i know you got my back and you know i got you so come on come on come on come on come on come on let's get physical lights out follow the noise baby keep on dancing like you ain't got a choice so come on come on come on come on come on let's get physical   adrenaline keeps on rushing in love the simulation we're dreaming in don't you agree don't you agree i don't wanna live another life 'cause this one's pretty nice living it up  pre who needs to go to sleep when i got you next to me   all night i'll riot with you i know you got my back and you know i got you so come on come on come on come on come on come on let's get physical lights out follow the noise baby keep on dancing like you ain't g

### Data Cleaning

In [7]:
import pycld2 as cld2

In [8]:
lyrics_df['Lyrics'] =  [str(lyrics.encode('ascii', 'replace')).replace('b"','').replace('?',' ').replace('"','').replace('\\n', ' ').replace("b'",'').replace('instrumental','').replace('[\[],:*!?]','').replace('(','').replace(')','').replace('.','').replace(',','').replace('\\','').replace('verse','').replace('!','').replace('chorus','').replace('*','')
               for lyrics in lyrics_df['Lyrics'].str.decode('unicode_escape')]

In [9]:
# lyric check
lyrics_df['Lyrics'][8]

"common love isn't for us we created something phenomenal don't you agree don't you agree you got me feeling   diamond   rich nothing on this   planet compares to it don't you agree don't   you agree  pre who needs to go to sleep when i got you next to me   all night i'll riot with you i know you got my back and you know i got you so come on come on come on come on come on come on let's get physical lights out follow the noise baby keep on dancing like you ain't got a choice so come on come on come on come on come on let's get physical   adrenaline keeps on rushing in love the simulation we're dreaming in don't you agree don't you agree i don't wanna live another life 'cause this one's pretty nice living it up  pre who needs to go to sleep when i got you next to me   all night i'll riot with you i know you got my back and you know i got you so come on come on come on come on come on come on let's get physical lights out follow the noise baby keep on dancing like you ain't got a choice 

In [10]:
# lyric check
lyrics_df['Lyrics'][42]

"hwasa                                                                 don't you agree don't you agree                                                    just   wasting time don't you   agree don't you agree bae  pre hwasa who needs to   go to sleep when i got you next to me   hwasa dua lipa                                                                       so   come on come   on come on come on come on come on let's get physical                                                                       so come on come on come on come on come on let's get physical   dua lipa adrenaline keeps on rushing in love the simulation we're dreaming in don't you agree don't you agree i don't wanna live another life 'cause this one's pretty nice living it up  pre dua lipa who needs to go to sleep when i got you next to me   dua lipa all night i'll riot with you i know you got my back and you know i got you so come on come on come on come on come on come on let's get physical lights out follow the n

In [11]:
# lyric check
lyrics_df['Lyrics'][23]

"dababy billboard baby dua lipa make 'em dance when it come on everybody lookin' for a dancefloor to run on   dua lipa if you wanna run away with me i know a galaxy and i can take you for a ride i had a premonition that we fell into a rhythm where the music don't stop for life glitter in the sky glitter in my eyes shining just the way i like if you're feeling like you need a little bit of company you met me at the perfect time  pre dua lipa you want me i want you baby my sugarboo i'm levitating the milky way we're renegading yeah yeah y  ah yeah yeah   dua lipa i got you moonlight you're my starlight i need you all night com   on dance with me i'm levitating you moonlight you're my starlight you're the moonlight i need you all night come on dance with me i'm levitating   dababy i'm one of the greatest ain't no debatin' on it let's go i'm still levitated i'm heavily medicated ironic i gave 'em love and they end up hatin' on me go she told me she love me and she been waitin' been fightin

Filter for only English Lyrics

In [14]:
en_lyrics = []
for i in range(len(lyrics_df)):
    _, _, _, detected_language = cld2.detect(lyrics_df['Lyrics'][i],  returnVectors=True)
    if len(detected_language) == 1:
        if detected_language[0][2] == 'ENGLISH':
#             lyrics_df.drop(i, inplace = True)
            en_lyrics.append(i)

lyrics_df = lyrics_df.iloc[en_lyrics,:] 
        
len(lyrics_df)
# data cut from 243406 to 211873

211874

In [15]:
# reset df index
lyrics_df.reset_index(inplace=True)
lyrics_df

Unnamed: 0,index,Genre,Artist,Title,Lyrics
0,0,Pop,dua lipa,new rules,one one one one one talkin' in my sleep at n...
1,1,Pop,dua lipa,don’t start now,if you don't wanna see me did a full 80 craz...
2,2,Pop,dua lipa,idgaf,you call me all friendly tellin' me how much y...
3,3,Pop,dua lipa,blow your mind (mwah),i know it's hot i know we've got something tha...
4,4,Pop,dua lipa,be the one,i see the moon i see the moon i see the moon o...
...,...,...,...,...,...
211869,243401,Country,edens edge,who am i drinking tonight,I gotta say Boy after only just a couple of da...
211870,243402,Country,edens edge,liar,I helped you find her diamond ring You made me...
211871,243403,Country,edens edge,last supper,Look at the couple in the corner booth Looks a...
211872,243404,Country,edens edge,christ alone live in studio,When I fly off this mortal earth And I'm measu...


### Pre-process Data

In [16]:
# import custom filters
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_numeric, stem_text
from gensim.parsing.preprocessing import strip_multiple_whitespaces, strip_non_alphanum, remove_stopwords, strip_short

from gensim import corpora, models, similarities

In [17]:
# define custom filters
CUSTOM_FILTERS = [lambda x: x.lower(), #lowercase
                  strip_multiple_whitespaces,# remove repeating whitespaces
                  strip_numeric, # remove numbers
                  remove_stopwords,# remove stopwords
                  strip_short, # remove words less than minsize=3 characters long
#                   stem_text # return porter-stemmed text,
                 ]

In [18]:
text_preprocess = []
for i in range(len(lyrics_df)):
    text_preprocess.append(preprocess_string(lyrics_df['Lyrics'][i], CUSTOM_FILTERS))
lyrics_df['text_preprocessing'] = text_preprocess

In [19]:
lyrics_df

Unnamed: 0,index,Genre,Artist,Title,Lyrics,text_preprocessing
0,0,Pop,dua lipa,new rules,one one one one one talkin' in my sleep at n...,"[talkin', sleep, night, makin', crazy, mind, m..."
1,1,Pop,dua lipa,don’t start now,if you don't wanna see me did a full 80 craz...,"[don't, wanna, crazy, thinking, 'bout, way, he..."
2,2,Pop,dua lipa,idgaf,you call me all friendly tellin' me how much y...,"[friendly, tellin', miss, that's, funny, guess..."
3,3,Pop,dua lipa,blow your mind (mwah),i know it's hot i know we've got something tha...,"[know, it's, hot, know, we've, got, money, can..."
4,4,Pop,dua lipa,be the one,i see the moon i see the moon i see the moon o...,"[moon, moon, moon, you're, looking, sun, i'm, ..."
...,...,...,...,...,...,...
211869,243401,Country,edens edge,who am i drinking tonight,I gotta say Boy after only just a couple of da...,"[gotta, boy, couple, dates, you're, hands, out..."
211870,243402,Country,edens edge,liar,I helped you find her diamond ring You made me...,"[helped, diamond, ring, try, tomorrow, you'll,..."
211871,243403,Country,edens edge,last supper,Look at the couple in the corner booth Looks a...,"[look, couple, corner, booth, looks, lot, like..."
211872,243404,Country,edens edge,christ alone live in studio,When I fly off this mortal earth And I'm measu...,"[fly, mortal, earth, i'm, measured, depth, gir..."


In [20]:
#create dictionary
lyrics_dictionary = corpora.Dictionary(lyrics_df['text_preprocessing'])
print(lyrics_dictionary)

Dictionary(260879 unique tokens: ["'cause", "'em", "ain't", 'baby', 'backwards']...)


In [21]:
# convert tokenized documents to vectors
corpus = [lyrics_dictionary.doc2bow(text) for text in lyrics_df['text_preprocessing']]

In [22]:
# print human readable output
# notice original word order lost - only word and frequency remain
[[(lyrics_dictionary[id], freq) for id, freq in cp]for cp in corpus]

[[("'cause", 4),
  ("'em", 11),
  ("ain't", 3),
  ('baby', 1),
  ('backwards', 1),
  ('bed', 3),
  ('breathe', 1),
  ("callin'", 3),
  ('count', 11),
  ('crazy', 1),
  ("doesn't", 2),
  ("don't", 33),
  ('drunk', 3),
  ('eat', 1),
  ('feel', 1),
  ('finally', 1),
  ('forwards', 1),
  ('friend', 7),
  ("gettin'", 4),
  ('gonna', 3),
  ('got', 11),
  ('gotta', 6),
  ("he's", 6),
  ('heart', 1),
  ("hopin'", 1),
  ("i'm", 2),
  ('keeps', 1),
  ('kick', 3),
  ('know', 7),
  ('learn', 3),
  ('let', 7),
  ('like', 1),
  ('love', 5),
  ('makes', 2),
  ("makin'", 1),
  ('mind', 2),
  ('morning', 3),
  ('new', 11),
  ('night', 1),
  ('pattern', 1),
  ('perfect', 1),
  ('phone', 3),
  ('pick', 3),
  ('practice', 1),
  ('pre', 3),
  ("pullin'", 1),
  ("pushin'", 1),
  ('read', 1),
  ('refrain', 2),
  ('rehearse', 1),
  ('repeat', 1),
  ('rules', 10),
  ('save', 1),
  ('sleep', 2),
  ("standin'", 1),
  ("talkin'", 1),
  ('tell', 10),
  ('times', 2),
  ('tryna', 1),
  ('turn', 2),
  ('uhooh', 1),
 

## Part 1.1: TF-IDF and LDA

In [23]:
# initialize tfidf model
tfidf = models.TfidfModel(corpus)

# apply transformation to entire corpus
transformed_tfidf = tfidf[corpus]

In [24]:
# LDA on tfidf
lda = models.LdaMulticore(transformed_tfidf, num_topics=4, id2word=lyrics_dictionary)

lda.show_topics()

[(0,
  '0.002*"life" + 0.002*"world" + 0.002*"it\'s" + 0.002*"blood" + 0.001*"god" + 0.001*"i\'m" + 0.001*"eyes" + 0.001*"like" + 0.001*"time" + 0.001*"light"'),
 (1,
  '0.003*"got" + 0.003*"like" + 0.003*"shit" + 0.003*"nigga" + 0.003*"ain\'t" + 0.003*"i\'m" + 0.002*"fuck" + 0.002*"niggas" + 0.002*"money" + 0.002*"yeah"'),
 (2,
  '0.007*"love" + 0.005*"don\'t" + 0.005*"i\'m" + 0.005*"baby" + 0.005*"know" + 0.004*"you\'re" + 0.004*"want" + 0.004*"it\'s" + 0.004*"i\'ll" + 0.004*"can\'t"'),
 (3,
  '0.006*"christmas" + 0.002*"merry" + 0.001*"sleigh" + 0.001*"mistletoe" + 0.001*"reindeer" + 0.001*"caroline" + 0.001*"bells" + 0.000*"love" + 0.000*"baby" + 0.000*"presents"')]

In [25]:
# add topic to df

from operator import itemgetter

topic_list = []
for i in range(len(lyrics_df)):
    pp_lyrics = preprocess_string(lyrics_df['Lyrics'][i], CUSTOM_FILTERS)
    bow_lyrics = lyrics_dictionary.doc2bow(pp_lyrics)
    topics = lda.get_document_topics(bow_lyrics)
    topic = max(topics, key=itemgetter(1))[0]
    topic_list.append(topic)
    
lyrics_df['topic'] = topic_list
lyrics_df

Unnamed: 0,index,Genre,Artist,Title,Lyrics,text_preprocessing,topic
0,0,Pop,dua lipa,new rules,one one one one one talkin' in my sleep at n...,"[talkin', sleep, night, makin', crazy, mind, m...",2
1,1,Pop,dua lipa,don’t start now,if you don't wanna see me did a full 80 craz...,"[don't, wanna, crazy, thinking, 'bout, way, he...",2
2,2,Pop,dua lipa,idgaf,you call me all friendly tellin' me how much y...,"[friendly, tellin', miss, that's, funny, guess...",2
3,3,Pop,dua lipa,blow your mind (mwah),i know it's hot i know we've got something tha...,"[know, it's, hot, know, we've, got, money, can...",2
4,4,Pop,dua lipa,be the one,i see the moon i see the moon i see the moon o...,"[moon, moon, moon, you're, looking, sun, i'm, ...",2
...,...,...,...,...,...,...,...
211869,243401,Country,edens edge,who am i drinking tonight,I gotta say Boy after only just a couple of da...,"[gotta, boy, couple, dates, you're, hands, out...",1
211870,243402,Country,edens edge,liar,I helped you find her diamond ring You made me...,"[helped, diamond, ring, try, tomorrow, you'll,...",2
211871,243403,Country,edens edge,last supper,Look at the couple in the corner booth Looks a...,"[look, couple, corner, booth, looks, lot, like...",2
211872,243404,Country,edens edge,christ alone live in studio,When I fly off this mortal earth And I'm measu...,"[fly, mortal, earth, i'm, measured, depth, gir...",0


In [26]:
# Topic Visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()


In [27]:
# Visualization: feed the LDA model into the pyLDAvis instance
lda_viz = gensimvis.prepare(lda, corpus, lyrics_dictionary)
lda_viz

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


## Part 1.2: Similarity Calculation - Cosine Distance 

In [50]:
# open text file of lyric
input_file = 'input_lyrics1.txt'
with open(input_file, 'r') as f:
    input_lyrics = f.read()
    # clean input lyrics
    input_lyrics = input_lyrics.replace('b"','').replace('?',' ').replace('"','').replace('\\n', ' ').replace("b'",'').replace('instrumental','').replace('[\[],:*!?]','').replace('(','').replace(')','').replace('.','').replace(',','').replace('\\','').replace('verse','').replace('!','').replace('chorus','').replace('*','')
    # pre-process
    input_lyrics_preprocess = preprocess_string(input_lyrics, CUSTOM_FILTERS)
    # TF-IDF - vectorize lyrics
    bow_input_lyrics = lyrics_dictionary.doc2bow(input_lyrics_preprocess)
    # LDA - get topic
    input_lyrics_topics = lda.get_document_topics(bow_input_lyrics)
    input_lyrics_lda = max(input_lyrics_topics, key=itemgetter(1))[0] 
    # apply transformation to input lyrics
    transform_input_lyrics = tfidf[bow_input_lyrics]
    # get lyrics matrix
    index = similarities.SparseMatrixSimilarity(transformed_tfidf, num_features=len(lyrics_dictionary))
    # calc cosine similarity of input with every item in the dataset
    cos_sims = index[transform_input_lyrics]
#     print(cos_sims)
    # create new df based on matched topic
    topic_df = lyrics_df[lyrics_df.topic == input_lyrics_lda]
    # filter for matching topic
    topic_ind = lyrics_df[lyrics_df.topic == input_lyrics_lda].index.tolist()
    topic_song_list = [cos_sims[i] for i in topic_ind]
topic_song_list

  input_lyrics = input_lyrics.replace('b"','').replace('?',' ').replace('"','').replace('\\n', ' ').replace("b'",'').replace('instrumental','').replace('[\[],:*!?]','').replace('(','').replace(')','').replace('.','').replace(',','').replace('\\','').replace('verse','').replace('!','').replace('chorus','').replace('*','')


[0.03501813,
 0.06645417,
 0.03978243,
 0.025939327,
 0.07132574,
 0.026065914,
 0.021420369,
 0.026590252,
 0.006212875,
 0.034584146,
 0.0035622204,
 0.010615887,
 0.03389237,
 0.023528488,
 0.028069934,
 0.06637899,
 0.018568957,
 0.024908843,
 0.0063624918,
 0.012034316,
 0.024905061,
 0.024518078,
 0.07180401,
 0.76541173,
 0.017873397,
 0.045631953,
 0.036215514,
 0.012367448,
 0.04826665,
 0.08186865,
 0.0510107,
 0.034443963,
 0.04331077,
 0.1370999,
 0.01873298,
 0.023529768,
 0.0047773966,
 0.022020105,
 0.014437508,
 0.01957823,
 0.028643634,
 0.022114728,
 0.049270067,
 0.03009473,
 0.06610772,
 0.032294508,
 0.007107326,
 0.025363915,
 0.04251506,
 0.034715693,
 0.0066403225,
 0.038080174,
 0.043432698,
 0.09502719,
 0.040620025,
 0.021694792,
 0.0,
 0.036750868,
 0.006727691,
 0.008933695,
 0.023380073,
 0.018792018,
 0.00021312399,
 0.022110362,
 0.018421628,
 0.027948001,
 0.007253886,
 0.008775641,
 0.034582492,
 0.023219565,
 0.05633283,
 0.038499393,
 0.03183521,
 0.

In [51]:
# length check to make sure same size as df (when not filtering for topic)
# if filtering for topic, to see num of songs with same topic
# len(cos_sims)
len(topic_song_list)

157800

In [52]:
song_index = topic_song_list.index(max(topic_song_list))
song_index
rec_song = [topic_df['Artist'][song_index], topic_df['Title'][song_index], topic_df['Genre'][song_index]]
rec_song

['dua lipa', 'no goodbyes', 'Pop']