# Python 2/3 compatibility imports (note: the f-strings used later in this notebook require Python 3.6+)
from __future__ import print_function, division, absolute_import 

# Remove warnings
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline



In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import pickle

In [3]:
import sys
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhair\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.tokenize import sent_tokenize # tokenizes sentences
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# NLTK's English stop words, kept at notebook level so they can be printed for inspection.
# lyrics_cleaner obtains its own stop-word collection and does not read this variable.
eng_stopwords = stopwords.words('english')

In [5]:
import pandas as pd
# Location of the lyrics dataset.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of lyrics_updated.csv before running the notebook.
LYRICS_CSV_PATH = 'C:/Users/bhair/Documents/GitHub/LyricsMatch_DataX1/data/lyrics_updated.csv'

data = pd.read_csv(LYRICS_CSV_PATH)
# Keep only the rows tagged as English. The explicit .copy() makes `data` an
# independent frame, so the column assignments done in later cells do not raise
# pandas' SettingWithCopyWarning (which the blanket warnings filter would hide).
data = data[data['lang'] == 'en'].copy()
# We only need the lyrics text column from the data
data_lyrics = data[['cleaned_lyrics']]


In [6]:
# Display the English stop-word list held in eng_stopwords.
print(eng_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
# Swap every newline inside the lyrics for a single space, then preview the first five rows
data['cleaned_lyrics'] = data['cleaned_lyrics'].replace(
    to_replace='\n',
    value=' ',
    regex=True,
)
print(data['cleaned_lyrics'].iloc[:5])

0    oh baby how you doing you know i m gonna cut r...
1    playin everything so easy it s like you seem s...
2    if you search for tenderness it isn t hard to ...
3    oh oh oh i oh oh oh i verse if i wrote a book ...
4    party the people the people the party it s pop...
Name: cleaned_lyrics, dtype: object


In [8]:
#Cleaning lyrics
# Cache for the default stop-word set so the NLTK corpus is read once, not once per song.
_STOPWORD_CACHE = {}


def _default_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def lyrics_cleaner(lyrics, stop_words=None):
    """Normalise one lyric string for bag-of-words modelling.

    Every character that is not an ASCII letter is replaced by a space, the text
    is lower-cased and split on whitespace, stop words are dropped, and the
    remaining words are joined back together with single spaces.

    Parameters
    ----------
    lyrics : str or None
        Raw lyric text. ``None`` and float NaN (pandas' missing-value markers)
        are treated as empty text.
    stop_words : container of str, optional
        Words to remove; anything supporting ``in`` works. Defaults to the NLTK
        English stop-word list, which is loaded once and reused across calls.

    Returns
    -------
    str
        The cleaned lyrics, or ``""`` when the input is missing or nothing survives.
    """
    # A missing value would otherwise make re.sub raise TypeError in the middle of
    # a Series.apply; `x != x` is the dependency-free NaN check.
    if lyrics is None or (isinstance(lyrics, float) and lyrics != lyrics):
        return ""

    if stop_words is None:
        stop_words = _default_stopwords()

    # Letters only: punctuation, digits and newlines all become word separators.
    lyrics = re.sub("[^a-zA-Z]", " ", lyrics)
    # Tokenize into words (all lower case)
    words = lyrics.lower().split()

    # Remove stop words and join the survivors back into one string
    return " ".join(word for word in words if word not in stop_words)

In [9]:
# Run lyrics_cleaner on every row and overwrite the column with the cleaned text.
data['cleaned_lyrics'] = data['cleaned_lyrics'].apply(lyrics_cleaner)

In [10]:
# Show the first cleaned lyric. Positional .iloc[0] is used instead of the label 0:
# `data` was filtered by language, so a row labelled 0 is not guaranteed to exist.
print(data['cleaned_lyrics'].iloc[0])

oh baby know gonna cut right chase women made like think created special purpose know special feel baby let get lost need call work cause boss real want show feel consider lucky big deal well got key heart gonna need rather open body show secrets know inside need lie big wide strong fit much tough talk like cause back got big ego huge ego love big ego much walk like cause back usually humble right choose leave could blues call arrogant call confident decide find working damn know killing legs better yet thighs matter fact smile maybe eyes boy site see kind something like big wide strong fit much tough talk like cause back got big ego huge ego love big ego much walk like cause back walk like cause back talk like cause back back back walk like cause back big wide strong fit much tough talk like cause back got big ego huge ego huge ego love big ego much walk like cause back ego big must admit got every reason feel like bitch ego strong know need beat sing piano


In [11]:
# Keep the cleaned lyrics as a Series (the input handed to the vectorizers below)
# and preview the rows at positions 4 through 9.
data_lyrics = data["cleaned_lyrics"]
data_lyrics.iloc[4:10]

4    party people people party popping sitting arou...
5    heard church bells ringing heard choir singing...
6    another day would spend waitin right one stari...
7    waiting waiting waiting waiting waiting waitin...
8    verse read magazines waiting around said wait ...
9    n n honey better sit look around cause must bu...
Name: cleaned_lyrics, dtype: object

In [12]:
# Build the two vectorizers: raw term counts (vect1) and TF-IDF weights (vect2),
# each capped at a 5000-term vocabulary.
vect1 = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    max_features=5000,
)
vect2 = TfidfVectorizer(max_features=5000)

In [13]:
# Fit the count vectorizer on the cleaned lyrics; the fitted estimator's repr is the cell output.
vect1.fit(data_lyrics)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=5000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [14]:
# Vocabulary learned by the count vectorizer, as a plain list of strings.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() when it exists and fall back on older releases.
# .tolist() keeps the result a list of Python str, exactly like the old method.
if hasattr(vect1, "get_feature_names_out"):
    vect1_feature_names = vect1.get_feature_names_out().tolist()
else:
    vect1_feature_names = vect1.get_feature_names()

In [15]:
# Fit the TF-IDF vectorizer and list part of the vocabulary it learned.
vect2.fit(data_lyrics)
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() when it exists and fall back on older releases.
# .tolist() keeps the result a list of Python str, exactly like the old method.
if hasattr(vect2, "get_feature_names_out"):
    vect2_feature_names = vect2.get_feature_names_out().tolist()
else:
    vect2_feature_names = vect2.get_feature_names()
print(vect2_feature_names[30:50])

['add', 'addict', 'addicted', 'addiction', 'address', 'admit', 'adore', 'advance', 'advantage', 'adventure', 'advice', 'afar', 'affair', 'affection', 'afford', 'afraid', 'africa', 'african', 'afternoon', 'age']


In [16]:
# Report how many terms the TF-IDF vocabulary holds, as a shape tuple
print(np.shape(vect2_feature_names))

(5000,)


In [17]:
# Print the same vocabulary slice from both vectorizers, one per line, for comparison
print(vect1_feature_names[20:40], vect2_feature_names[20:40], sep='\n')

['across', 'act', 'actin', 'acting', 'action', 'actions', 'acts', 'actually', 'ad', 'adam', 'add', 'addict', 'addicted', 'addiction', 'address', 'admit', 'adore', 'advance', 'advantage', 'adventure']
['across', 'act', 'actin', 'acting', 'action', 'actions', 'acts', 'actually', 'ad', 'adam', 'add', 'addict', 'addicted', 'addiction', 'address', 'admit', 'adore', 'advance', 'advantage', 'adventure']


In [18]:
# Encode the cleaned lyrics with each fitted vectorizer:
# bag1 comes from the count vectorizer, bag2 from the TF-IDF vectorizer.
bag1, bag2 = (vectorizer.transform(data_lyrics) for vectorizer in (vect1, vect2))

In [20]:
# Fit NMF with 25 components on each document-term matrix to get the decomposed matrices:
# W* is the fit_transform output and H* the fitted components_.
# Each matrix gets its own estimator so the count-based model is not overwritten
# by the TF-IDF fit and both stay available for later inspection.
N_TOPICS = 25

nmf_counts = decomposition.NMF(n_components=N_TOPICS, random_state=1)
W1 = nmf_counts.fit_transform(bag1)
H1 = nmf_counts.components_

nmf_tfidf = decomposition.NMF(n_components=N_TOPICS, random_state=1)
W2 = nmf_tfidf.fit_transform(bag2)
H2 = nmf_tfidf.components_

# Backward-compatible alias: `clf` previously ended up holding the TF-IDF model.
clf = nmf_tfidf

In [21]:
# Print the shapes of the TF-IDF factor matrices W2 and H2.
print(W2.shape, H2.shape)

(237947, 25) (25, 5000)


In [22]:
#Getting the top words for each topic such that the last word [-1 position] in the list is the top word
# H2 was fitted on the TF-IDF matrix, so its columns are looked up in the TF-IDF
# vectorizer's vocabulary (vect2_feature_names) rather than the count vectorizer's.
for topic_idx, topic in enumerate(H2):
    print(f'Top 3 words for topic #{topic_idx} :')
    # argsort is ascending, so the last three positions are the three largest weights
    print([vect2_feature_names[term_idx] for term_idx in topic.argsort()[-3:]])
    print('\n')

Top 3 words for topic #0 :
['could', 'way', 'say']


Top 3 words for topic #1 :
['true', 'heart', 'love']


Top 3 words for topic #2 :
['niggas', 'shit', 'nigga']


Top 3 words for topic #3 :
['ooh', 'whoa', 'oh']


Top 3 words for topic #4 :
['see', 'cause', 'know']


Top 3 words for topic #5 :
['ooh', 'girl', 'baby']


Top 3 words for topic #6 :
['show', 'go', 'let']


Top 3 words for topic #7 :
['see', 'tell', 'want']


Top 3 words for topic #8 :
['de', 'da', 'la']


Top 3 words for topic #9 :
['make', 'tonight', 'gonna']


Top 3 words for topic #10 :
['right', 'ooh', 'yeah']


Top 3 words for topic #11 :
['heart', 'make', 'feel']


Top 3 words for topic #12 :
['always', 'ever', 'never']


Top 3 words for topic #13 :
['two', 'day', 'one']


Top 3 words for topic #14 :
['feels', 'girl', 'like']


Top 3 words for topic #15 :
['good', 'man', 'got']


Top 3 words for topic #16 :
['eyes', 'world', 'life']


Top 3 words for topic #17 :
['mind', 'long', 'time']


Top 3 words for topic #18 

In [25]:
# Label each song with its dominant topic (the column of W2 holding the largest
# weight in that row), store it in a new "Topic" column, and show the last rows.
data['Topic'] = np.argmax(W2, axis=1)
data.tail()

Unnamed: 0,song,year,artist,genre,lyrics,cleaned_lyrics,length,lang,Topic
265282,who-am-i-drinking-tonight,2012,edens-edge,Country,"I gotta say\nBoy, after only just a couple of ...",gotta say boy couple dates hands outright blow...,1597,en,15
265283,liar,2012,edens-edge,Country,I helped you find her diamond ring\nYou made m...,helped find diamond ring made try everything t...,1009,en,12
265284,last-supper,2012,edens-edge,Country,Look at the couple in the corner booth\nLooks ...,look couple corner booth looks lot like lookin...,1171,en,14
265285,christ-alone-live-in-studio,2012,edens-edge,Country,When I fly off this mortal earth\nAnd I'm meas...,fly mortal earth measured depth girth father s...,850,en,16
265286,amen,2012,edens-edge,Country,I heard from a friend of a friend of a friend ...,heard friend friend friend finally got rid gir...,1532,en,24
