In [1]:
import pandas as pd
import glob

### Open all csv files, and create a dataframe containing only the lyrics 
path = "song-lyrics-dataset/csv/"
all_files = glob.glob(path + "*.csv")

data = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    data.append(df)  

df = pd.concat(data, axis=0, ignore_index=True)
data_text = df[['Lyric']]

data_text['index'] = data_text.index
documents = data_text
print(documents)


                                                  Lyric  index
0     thought i'd end up with sean but he wasn't a m...      0
1     yeah breakfast at tiffany's and bottles of bub...      1
2     you you love it how i move you you love it how...      2
3     ariana grande  nicki minaj i've been here all ...      3
4     right now i'm in a state of mind i wanna be in...      4
...                                                 ...    ...
5398  it's strange to think the songs we used to sin...   5398
5399  drew looks at me i fake a smile so he won't se...   5399
5400  to put it plainly we just couldnt stop writing...   5400
5401  turn wycd on you're on your grunwald back from...   5401
5402  trying just like they say just taking the step...   5402

[5403 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text['index'] = data_text.index


In [2]:
print(len(documents))
print(documents[:5])

5403
                                               Lyric  index
0  thought i'd end up with sean but he wasn't a m...      0
1  yeah breakfast at tiffany's and bottles of bub...      1
2  you you love it how i move you you love it how...      2
3  ariana grande  nicki minaj i've been here all ...      3
4  right now i'm in a state of mind i wanna be in...      4


In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
from nltk import pos_tag

lemmatizer = WordNetLemmatizer()
my_stop_words = STOPWORDS.union(set(['yeah', 'cause', 'wanna', 'gonna', 'nigga', 'fuckin', 'bitch', 'come']))


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gebruiker\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def lemmatize_stemming(text):
    return lemmatizer.lemmatize(text)
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in my_stop_words and len(token) > 3:
           # result.append(token)
            result.append(lemmatize_stemming(token))
    return result

In [5]:
### Tokenize and lemmatize the text for a specific song

doc_sample = documents[documents['index'] == 1].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['yeah', 'breakfast', 'at', "tiffany's", 'and', 'bottles', 'of', 'bubbles', 'girls', 'with', 'tattoos', 'who', 'like', 'getting', 'in', 'trouble', 'lashes', 'and', 'diamonds', 'atm', 'machines', 'buy', 'myself', 'all', 'of', 'my', 'favorite', 'things', 'yeah', 'been', 'through', 'some', 'bad', 'shit', 'i', 'should', 'be', 'a', 'sad', 'bitch', 'who', 'woulda', 'thought', "it'd", 'turn', 'me', 'to', 'a', 'savage', 'rather', 'be', 'tied', 'up', 'with', 'calls', 'and', 'not', 'strings', 'write', 'my', 'own', 'checks', 'like', 'i', 'write', 'what', 'i', 'sing', 'yeah', 'yeah', '', 'pre', 'my', 'wrist', 'stop', "watchin'", 'my', 'neck', 'is', 'flossy', 'make', 'big', 'deposits', 'my', 'gloss', 'is', "poppin'", 'you', 'like', 'my', 'hair', 'gee', 'thanks', 'just', 'bought', 'it', 'i', 'see', 'it', 'i', 'like', 'it', 'i', 'want', 'it', 'i', 'got', 'it', 'yeah', '', '', 'i', 'want', 'it', 'i', 'got', 'it', 'i', 'want', 'it', 'i', 'got', 'it', 'i', 'want', 'it', 'i', 'got', '

In [6]:
### Tokenize and lemmatize the text, making sure all NaN's are filled with an empty string

processed_docs = documents['Lyric'].fillna('').astype(str).map(preprocess)
### print the first 10 results
processed_docs[:10]

0    [thought, sean, wasn, match, wrote, song, rick...
1    [breakfast, tiffany, bottle, bubble, girl, tat...
2    [love, love, touch, said, believe, woman, feel...
3    [ariana, grande, nicki, minaj, night, ariana, ...
4    [right, state, mind, like, time, tear, left, p...
5    [lacigam, gnihtemos, thgin, laiceps, ruoy, thg...
6    [type, feelin, mmmmmm, know, know, shouldn, th...
7    [heaven, sent, hopin, repeat, history, tryna, ...
8    [think, crazy, cravin, plainly, gimme, baby, d...
9    [step, know, like, skrrt, staying, night, orde...
Name: Lyric, dtype: object

In [7]:
###  Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set 
###  We are going to use the Dictionary function to derive a dictionary with counts from the headlines

dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
        
print(dictionary)

0 aisle
1 amazing
2 angel
3 better
4 bout
5 discussion
6 drama
7 fast
8 forbid
9 friend
10 good
Dictionary(32225 unique tokens: ['aisle', 'amazing', 'angel', 'better', 'bout']...)


In [8]:
### Filter out tokens that appear in less than 10 documents

dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)
print(dictionary)

Dictionary(4554 unique tokens: ['aisle', 'amazing', 'angel', 'better', 'bout']...)


In [10]:
### For each document we create a dictionary reporting how many words and how many times those words appear 
### Gensim provides the *doc2bow* function to create a BoW vector representation for a document
### Save this to ‘bow_corpus’, then check a specific song

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]

[(0, 1),
 (1, 5),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 2),
 (10, 5),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 4),
 (21, 3),
 (22, 5),
 (23, 3),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 2),
 (29, 1),
 (30, 5),
 (31, 5),
 (32, 1),
 (33, 1),
 (34, 2),
 (35, 1),
 (36, 1),
 (37, 2),
 (38, 1),
 (39, 2),
 (40, 2),
 (41, 2),
 (42, 3),
 (43, 1),
 (44, 6),
 (45, 37),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 2),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1)]

In [11]:
### Preview Bag Of Words for a selected song

bow_doc_1 = bow_corpus[0]
for i in range(len(bow_doc_1)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_1[i][0], 
                                               dictionary[bow_doc_1[i][0]], 
bow_doc_1[i][1]))

Word 0 ("aisle") appears 1 time.
Word 1 ("amazing") appears 5 time.
Word 2 ("angel") appears 1 time.
Word 3 ("better") appears 1 time.
Word 4 ("bout") appears 1 time.
Word 5 ("discussion") appears 1 time.
Word 6 ("drama") appears 1 time.
Word 7 ("fast") appears 1 time.
Word 8 ("friend") appears 1 time.
Word 9 ("good") appears 2 time.
Word 10 ("grateful") appears 5 time.
Word 11 ("grew") appears 1 time.
Word 12 ("hand") appears 1 time.
Word 13 ("handle") appears 1 time.
Word 14 ("happens") appears 1 time.
Word 15 ("havin") appears 1 time.
Word 16 ("holding") appears 1 time.
Word 17 ("laugh") appears 1 time.
Word 18 ("learned") appears 1 time.
Word 19 ("listen") appears 1 time.
Word 20 ("look") appears 4 time.
Word 21 ("lost") appears 3 time.
Word 22 ("love") appears 5 time.
Word 23 ("loved") appears 3 time.
Word 24 ("malcolm") appears 1 time.
Word 25 ("mama") appears 1 time.
Word 26 ("married") appears 1 time.
Word 27 ("match") appears 1 time.
Word 28 ("need") appears 2 time.
Word 29 ("

In [12]:
### Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

In [13]:
### For each topic, we will explore the words occuring in that topic and its relative weight

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.031*"love" + 0.024*"want" + 0.020*"baby" + 0.016*"feel" + 0.016*"time" + 0.010*"right" + 0.010*"think" + 0.010*"thing" + 0.007*"tell" + 0.007*"heart"
Topic: 1 
Words: 0.050*"love" + 0.017*"need" + 0.015*"right" + 0.012*"night" + 0.012*"good" + 0.012*"diamond" + 0.011*"rihanna" + 0.011*"girl" + 0.010*"song" + 0.009*"heart"
Topic: 2 
Words: 0.012*"baby" + 0.011*"money" + 0.010*"shit" + 0.010*"want" + 0.010*"hard" + 0.009*"girl" + 0.009*"look" + 0.009*"tell" + 0.008*"fuck" + 0.008*"time"
Topic: 3 
Words: 0.016*"girl" + 0.014*"love" + 0.011*"baby" + 0.009*"need" + 0.009*"shit" + 0.009*"tonight" + 0.009*"time" + 0.008*"body" + 0.008*"shake" + 0.007*"want"
Topic: 4 
Words: 0.032*"baby" + 0.027*"work" + 0.014*"music" + 0.012*"night" + 0.011*"stop" + 0.011*"love" + 0.010*"rihanna" + 0.008*"light" + 0.007*"kiss" + 0.007*"time"


In [33]:
### Check where a specific song would be classified 

for index, score in sorted(lda_model[bow_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9937191605567932	 
Topic: 0.018*"girl" + 0.013*"love" + 0.011*"need" + 0.010*"tonight" + 0.010*"baby" + 0.009*"time" + 0.009*"shake" + 0.009*"body" + 0.009*"shit" + 0.007*"ohoh"


In [14]:
### Download the needed modules in order to visualize and analyze the predicted topics

%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
vis = gensimvis.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

  default_term_info = default_term_info.sort_values(


In [15]:
### This saves the above image 

vis_data = gensimvis.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.save_html(vis_data, 'topic_output.html')