In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
data = pd.read_csv("lyrics.csv")

In [3]:
data = data.replace({'\[instrumental\]' : math.nan})
data = data.dropna()
data = data[data.genre != "Not Available"]
data = data.replace({'\n': ' '}, regex=True)

In [4]:
len(data)

242615

In [5]:
data.head(10)

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing? You know I'm gonna cut..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy, it's like you seem..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isn't hard to ...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I [Verse 1:] If I wrote a..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."
5,5,all-i-could-do-was-cry,2009,beyonce-knowles,Pop,I heard Church bells ringing I heard A choir s...
6,6,once-in-a-lifetime,2009,beyonce-knowles,Pop,This is just another day that I would spend Wa...
7,7,waiting,2009,beyonce-knowles,Pop,"Waiting, waiting, waiting, waiting Waiting, wa..."
8,8,slow-love,2009,beyonce-knowles,Pop,[Verse 1:] I read all of the magazines while w...
9,9,why-don-t-you-love-me,2009,beyonce-knowles,Pop,"N-n-now, honey You better sit down and look ar..."


In [6]:
training_50k = data.sample(n=50000, replace=True)
validation_50k = data.sample(n=50000, replace=True)
testing_50k = data.sample(n=50000, replace=True)

In [7]:
training_50k.head(10)

Unnamed: 0,index,song,year,artist,genre,lyrics
127097,127097,enemies-with-me,2006,2pac,Hip-Hop,Young Thugs in this motherfucker Don't break u...
198654,198654,sweetheart,2006,bee-gees,Pop,If your heart tells you so that you should lea...
3445,3445,fidarmi-delle-tue,2006,carmen-consoli,Rock,Sai come prendermi in giro le tue favole spave...
145022,145022,like-i-never-left,2010,craig-david,Pop,"[Verse 1] Girl, you've been looking for someth..."
156245,156245,salon-ghost-live,2012,devin-townsend-project,Rock,"How, everybody? How about a body at moonlight ..."
249389,249389,discipline,2008,gangstarr,Hip-Hop,Mmm You gotta hold your head up.. Oahh-eahh-oh...
3489,3489,i-will-always-be-your-friend,1982,george-duke,Jazz,"We loved each other for so long, Tell me what ..."
5224,5224,i-believed-in-god-reprise,2012,danko-jones,Rock,INSTRUMENTAL
250639,250639,apple,2003,atmosphere,Hip-Hop,"[Slug] Hi, my name is Sean Daley and you might..."
64343,64343,yeah-remix,2011,big-kuntry-king,Hip-Hop,[Intro - Big Kuntry King] P$C Block ENT You kn...


In [8]:
training_genre_lyrics = training_50k.filter([ 'genre', 'lyrics'])
validation_genre_lyrics = validation_50k.filter(['genre', 'lyrics'])
testing_genre_lyrics = testing_50k.filter(['genre', 'lyrics'])

In [9]:
testing_genre_lyrics.head()

Unnamed: 0,genre,lyrics
116723,Folk,"Salamat, tayo'y magkasamang muli Salamat at ma..."
123354,Rock,Get out Damn it Close the door Don't slam it L...
79170,Hip-Hop,"[Intro] Bobby, bitch, oh, you ain't know? Bobb..."
179593,Rock,Lightning burned the fields Black fire choked ...
306248,Indie,"Lost my faith, now I'm on my own Lost inside, ..."


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

cvec = CountVectorizer(min_df=1, max_df=.5, ngram_range=(1,3))
cvec

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [11]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
training_genre_lyrics['stemmed'] = training_genre_lyrics.lyrics.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
training_genre_lyrics.stemmed.head()

127097    young thug in this motherfuck don't break up t...
198654    if your heart tell you so that you should leav...
3445      sai come prendermi in giro le tue favol spaven...
145022    [vers 1] girl, you'v been look for someth new ...
156245    how, everybody? how about a bodi at moonlight ...
Name: stemmed, dtype: object

In [12]:
from itertools import islice
cvec.fit(training_genre_lyrics.stemmed)
list(islice(cvec.vocabulary_.items(), 20))

[('young', 6194358),
 ('thug', 5314755),
 ('this', 5258981),
 ('motherfuck', 3311637),
 ('don', 1333348),
 ('break', 693326),
 ('up', 5597938),
 ('fight', 1677468),
 ('let', 2824860),
 ('em', 1467446),
 ('rumbl', 4285864),
 ('make', 3050728),
 ('enemi', 1494047),
 ('with', 5964213),
 ('tri', 5492085),
 ('tell', 4937045),
 ('these', 5213352),
 ('they', 5220589),
 ('ain', 77036),
 ('see', 4391832)]

In [13]:
len(cvec.vocabulary_)

6271040

In [14]:
cvec_counts = cvec.transform(training_genre_lyrics.stemmed)
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (50000, 6271040)
nonzero count: 19501290
sparsity: 0.01%


In [15]:
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': cvec.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,occurrences,term
5785127,92945,we
470514,76953,be
807541,72682,can
2992171,67337,love
4608647,65935,so
1333348,64453,don
2720177,63413,know
5964213,62005,with
3530163,61992,no
749938,60984,but


In [16]:
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(cvec_counts)
transformed_weights

<50000x6271040 sparse matrix of type '<class 'numpy.float64'>'
	with 19501290 stored elements in Compressed Sparse Row format>

In [17]:
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': cvec.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(100)

Unnamed: 0,term,weight
5785127,we,0.011891
2992171,love,0.009675
470514,be,0.009138
807541,can,0.008822
2483710,instrument,0.008413
1333348,don,0.008353
2720177,know,0.008004
4608647,so,0.007980
3530163,no,0.007868
749938,but,0.007594


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvec = TfidfVectorizer(min_df=.0025, max_df=.1, ngram_range=(1,3))
tvec_weights = tvec.fit_transform(training_genre_lyrics.stemmed.dropna())
weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
top_100_terms = weights_df.sort_values(by='weight', ascending=False).head(100)

In [27]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [31]:
genres = np.array(training_genre_lyrics['genre'].value_counts().keys())

In [32]:
genres

array(['Rock', 'Pop', 'Hip-Hop', 'Metal', 'Country', 'Electronic', 'Jazz',
       'Other', 'R&B', 'Indie', 'Folk'], dtype=object)

In [33]:
le.fit(genres)

LabelEncoder()

In [34]:
le.classes_

array(['Country', 'Electronic', 'Folk', 'Hip-Hop', 'Indie', 'Jazz',
       'Metal', 'Other', 'Pop', 'R&B', 'Rock'], dtype=object)

In [40]:
genreId = le.transform(training_genre_lyrics['genre'])
genreId

array([ 3,  8, 10, ...,  8, 10, 10])

In [39]:
le.inverse_transform(genreId)

array(['Hip-Hop', 'Pop', 'Rock', ..., 'Pop', 'Rock', 'Rock'], dtype=object)

In [41]:
def feature(datum):
    # value = datum.parse()
    # 

Unnamed: 0,term,weight
0,10,0.000516
1,100,0.000337
2,12,0.000309
3,20,0.000341
4,24,0.000254
5,2x,0.000860
6,30,0.000291
7,40,0.000421
8,4x,0.000306
9,50,0.000361


In [61]:
from collections import defaultdict


TypeError: filter() got an unexpected keyword argument 'genre'