In [47]:
import pandas as pd
import numpy as np
import math
from nltk.stem.snowball import SnowballStemmer
from itertools import islice
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from collections import defaultdict
from sklearn.svm import LinearSVC

In [41]:
data = pd.read_csv("lyrics.csv")

In [42]:
data = data.replace({'[iI][Nn][sS][Tt][Rr][Uu][Mm][Ee][Nn][Tt]' : math.nan}, regex=True)
data = data.replace({'instrument': math.nan})
data = data.replace({'Instrument' : math.nan})
data = data.dropna()
data = data[data.genre != "Not Available"]
data = data.replace({'\n': ' '}, regex=True)

In [43]:
len(data)

237769

In [44]:
data = data.loc[(data['genre'] == 'Pop') | (data['genre'] == 'Rock') | (data['genre'] == 'Hip-Hop') | (data['genre'] == 'Metal') | (data['genre'] == 'Country')]
#x = training_genre_lyrics.loc[training_genre_lyrics['genre'] == 'Jazz']
len(data)
data.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing? You know I'm gonna cut..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy, it's like you seem..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isn't hard to ...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I [Verse 1:] If I wrote a..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [45]:
data_pop = data.loc[(data['genre'] == 'Pop')]
data_rock = data.loc[(data['genre'] == 'Rock')]
data_hiphop = data.loc[(data['genre'] == "Hip-Hop")]
data_metal = data.loc[(data['genre']== "Metal")]
data_country = data.loc[(data['genre'] == "Country")]


def get_sample(num_each):
    pop_df = data_pop.sample(n = num_each, replace = True)
    rock_df = (data_rock.sample(n = num_each, replace=True))
    hiphop_df = (data_hiphop.sample(n = num_each, replace=True))
    metal_df = (data_metal.sample(n = num_each, replace=True))
    country_df = (data_country.sample(n = num_each, replace=True))
    ret_df = pd.concat([pop_df, rock_df, hiphop_df, metal_df, country_df])
    ret_df = ret_df.sample(frac=1)
    return ret_df

training_set = get_sample(10000)
validation_set = get_sample(10000)
test_set = get_sample(10000)

#training_50k = data.sample(n=50000, replace=True)
#validation_50k = data.sample(n=50000, replace=True)
#testing_50k = data.sample(n=50000, replace=True)



In [46]:
training_test['genre'].value_counts()

Country    10000
Rock       10000
Pop        10000
Metal      10000
Hip-Hop    10000
Name: genre, dtype: int64

In [52]:
# Filter just genre and lyrics columns
training_genre_lyrics = training_set.filter([ 'genre', 'lyrics'])
validation_genre_lyrics = validation_set.filter(['genre', 'lyrics'])
testing_genre_lyrics = test_set.filter(['genre', 'lyrics'])

In [57]:
#goes thru training set and append genre pop's lyrics
pop_lyrics = []
for x in training_genre_lyrics.iterrows():
    if x[1][0] == 'Pop':
        pop_lyrics.append(x[1][1])
pop_lyrics = pd.DataFrame({"ly": pop_lyrics})
print (len(pop_lyrics))

10000


In [62]:
rock_lyrics = []
for x in training_genre_lyrics.iterrows():
    if x[1][0] == 'Rock':
        rock_lyrics.append(x[1][1])
print (len(rock_lyrics))
rock_lyrics = pd.DataFrame({"ly": rock_lyrics})

10000


In [59]:
hiphop_lyrics = []
for x in training_genre_lyrics.iterrows():
    if x[1][0] == 'Hip-Hop':
        hiphop_lyrics.append(x[1][1])
hiphop_lyrics = pd.DataFrame({"ly": hiphop_lyrics})
print (len(hiphop_lyrics))

10000


In [60]:
metal_lyrics = []
for x in training_genre_lyrics.iterrows():
    if x[1][0] == 'Metal':
        metal_lyrics.append(x[1][1])
metal_lyrics = pd.DataFrame({"ly": metal_lyrics})
print (len(metal_lyrics))

10000


In [61]:
country_lyrics = []
for x in training_genre_lyrics.iterrows():
    if x[1][0] == 'Country':
        country_lyrics.append(x[1][1])
country_lyrics = pd.DataFrame({"ly": country_lyrics})
print (len(country_lyrics))

10000


In [63]:
# Takes a dataframe/list containing lyrics and returns a tuple with the terms and weights 
def vectorize_feature(genre_lyrics, num_words):
    stemmer = SnowballStemmer("english")
    genre_lyrics['stemmed'] = genre_lyrics.ly.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
    #genre_lyrics.stemmed.head()
    
    cvec = CountVectorizer(min_df=1, max_df=.5, ngram_range=(1,3))
    cvec.fit(genre_lyrics.stemmed)
    
    # -- use for debugging -- x = list(islice(cvec.vocabulary_.items(), 20))
    
    cvec_counts = cvec.transform(genre_lyrics.stemmed)
    
    transformer = TfidfTransformer()
    transformed_weights = transformer.fit_transform(cvec_counts)
    
    weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({'term': cvec.get_feature_names(), 'weight': weights})
    weights_df = weights_df.sort_values(by='weight', ascending=False)[:num_words]
    
    #term_weight_list = []
    term_weight_dict = defaultdict(int)
    for _, row in weights_df.iterrows():
        # Returns a list of (term, weight) tuples
        #term_phrase_list.append((row['term'], row['weight']))
        term_weight_dict[row['term']] = row['weight']
    
    # return term_weight_list
    return term_weight_dict

In [73]:
rock_dicts = vectorize_feature(rock_lyrics, 500)
pop_dicts = vectorize_feature(pop_lyrics, 500)
hiphop_dicts = vectorize_feature(hiphop_lyrics, 500)
metal_dicts = vectorize_feature(metal_lyrics, 500)
country_dicts = vectorize_feature(country_lyrics, 500)

In [74]:
# Given new lyrics, return list of sums of weights
def find_weights(lyrics):
    weight_list = [0]*5
    stemmer = SnowballStemmer("english")
    stemmed_lyrics = ' '.join([stemmer.stem(y) for y in lyrics.split(' ')])
    for i in stemmed_lyrics.split():
        weight_list[0] += rock_dicts[i]
        weight_list[1] += pop_dicts[i]
        weight_list[2] += hiphop_dicts[i]
        weight_list[3] += metal_dicts[i]
        weight_list[4] += country_dicts[i]
    return weight_list

In [75]:
find_weights(pop_lyrics.ly[9])

[0.06618734711187454,
 0.1127357938314214,
 0.06984031249846831,
 0.035128957807037856,
 0.04818570913556183,
 0,
 0,
 0,
 0,
 0,
 0]