## Genius songs
### Multivariate Data Analysis project
Based on:<br>
Derek Lim, Austin Benson. <br> _"Expertise and Dynamics within Crowdsourced Musical Knowledge Curation: A Case Study of the Genius Platform."_ 2020.

In [1]:
import json
import pandas as pd
from tqdm import tqdm
import json_lines
import scipy
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import re
from nltk.tokenize import word_tokenize
import enchant
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [6]:
info_raw = [json.loads(line) for line in open('song_info.json', 'r', encoding='utf-8')]

In [7]:
song_info = pd.DataFrame(info_raw)

In [28]:
lyrics_raw = []
with open('lyrics.jl') as f:
    for line in f:
        lyrics_raw.append(json.loads(line))

In [29]:
lyrics_info = pd.DataFrame(lyrics_raw)

In [30]:
song_info.head()

Unnamed: 0,url_name,title,primary_artist,release_date,pyongs,contributors,has_bio,views,tags,annotations
0,Kendrick-lamar-swimming-pools-drank-lyrics,Swimming Pools (Drank),Kendrick-lamar,"July 31, 2012",894.0,403,True,5589280.0,"[Trap, Conscious Hip-Hop, Memes, West Coast, R...",
1,Kendrick-lamar-money-trees-lyrics,Money Trees,Kendrick-lamar,"October 22, 2012",880.0,394,True,4592003.0,"[Conscious Hip-Hop, West Coast, Rap, Producer]",
2,Kendrick-lamar-xxx-lyrics,XXX.,Kendrick-lamar,"April 14, 2017",188.0,389,True,4651514.0,"[Conscious Hip-Hop, Boom Bap, Pop, West Coast,...",
3,A-ap-rocky-fuckin-problems-lyrics,Fuckin’ Problems,A-ap-rocky,"October 24, 2012",706.0,437,True,7378309.0,"[Gangsta Rap, Dirty South, Atlanta, Posse Cut,...",
4,Kendrick-lamar-dna-lyrics,DNA.,Kendrick-lamar,"April 14, 2017",555.0,570,True,5113687.0,"[Politics, Producer, News, Conscious Hip-Hop, ...",


#### Merge the lyrics to the song info

In [31]:
lyrics_info = lyrics_info.rename(columns = {'song':'url_name'})

In [32]:
to_process = pd.merge(lyrics_info, song_info, how="left", on=["url_name"])['lyrics'].tolist()

### Preprocessing

_This chunk of code is from the paper <br>
(I modified it a bit to suit this task)_ <br>
Removes part indicators and whitespaces

In [33]:
parts = ["Intro", "Outro", "Chorus", "Hook",
            "Pre-Hook", "Bridge", "Verse", "Refrain",
            "Pre-Chorus", "Part", "Post-Chorus", 'Interlude']
re_parts = "|".join(map(lambda s: r"\b" + s + r"\b", parts))
regex_parts = re.compile(r"\[("+re_parts+").*\]")

In [34]:
lyrics = []
for song in tqdm(to_process):
    song = re.sub(regex_parts, '', song)
    song = re.sub('\n', ' ', song)
    song = re.sub('[\(\[].*?[\)\]]', '', song) # Remove adlibs and parts
    song = re.sub('   ', ' ', song)
    song = re.sub('  ', ' ', song)
    song = re.sub('—', ' ', song)
    song = song.replace('?', '')
    song = song.replace('!', '')
    song = song.replace("'", '')
    song = song.replace("'", '')
    song = song.replace(',' , '')
    song = song.replace('’', '')
    song = song.lower()
    lyrics.append(song)

100%|██████████| 37993/37993 [00:04<00:00, 9448.95it/s] 


#### Removing stopwords and going along with the pipeline

In [4]:
stop_words = set(stopwords.words('english'))

In [5]:
lyrics_clean = []
for lyric in tqdm(lyrics):
    split_lyric = lyric.split()
    resultwords = [word for word in split_lyric if word not in stop_words and len(word) > 2]
    result = ' '.join(resultwords)
    lyrics_clean.append(result)

NameError: name 'lyrics' is not defined

Define the lemmatizer class

Specify the vectorizer: maximal features is the number of words it will count, try with _20_ and continue with no limit (maybe overnight), max and min df will specify what the range of proportion the words that appear have to be in ngram_range could be 2 (?) <- to consider

Merge lyrics to dataframe

In [37]:
lyrics_info['lyrics'] = lyrics_clean

In [38]:
df = pd.merge(lyrics_info, song_info, how="left", on=["url_name"])

Remove non-English lyrics (or songs with less than 70% English words)

In [39]:
d = enchant.Dict("en_US")

In [40]:
enwords = []
nonenwords = []
for index, row in tqdm(df.iterrows()):
    split_lyric = row['lyrics'].split()
    n_enword = 0
    n_nonenword = 0
    for word in split_lyric:
        if d.check(str(word)) == True:
            n_enword += 1
        else: 
            n_nonenword += 1
    enwords.append(n_enword)
    nonenwords.append(n_nonenword)

37993it [01:13, 520.21it/s]


In [41]:
df['enwords'] = [enwords[i]/(enwords[i]+nonenwords[i]) 
                          if enwords[i] != 0 and nonenwords[i] !=0
                          else 0
                          for i in range(len(enwords))]

_Load and save_

In [70]:
# df.to_csv('unfiltered_dataframe.csv', index=False)
df = pd.read_csv('unfiltered_dataframe.csv')

The proportion of English words should be more than 70%

In [71]:
df = df[df['enwords'] > 0.7]
df = df[df['views'] != 0]

Fit the vectorizer, specify the number of features

In [72]:
class lemmatizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, df):
        return [self.wnl.lemmatize(word) for word in word_tokenize(df)]

In [87]:
vectorizer = CountVectorizer(max_features=1000, max_df=0.8, min_df=0.05,
                             tokenizer=lemmatizer(), lowercase=True, stop_words='english',
                             token_pattern = r'w+')

In [88]:
vectorizer.fit(df['lyrics'])

  'stop_words.' % sorted(inconsistent))


CountVectorizer(max_df=0.8, max_features=1000, min_df=0.05,
                stop_words='english', token_pattern='w+',
                tokenizer=<__main__.lemmatizer object at 0x7f87966c6910>)

In [63]:
df['lyrics']

'0        pour head shot sit stand pass wake faded faded...\n1        niggas tryna get bish hit house lick: tell wit...\n2        america god bless good america please take han...\n3        love bad bitches thats fuckin problem yeah lik...\n4        got got got got loyalty got royalty inside dna...\n                               ...                        \n37952    stone wall dog gaze duct taped ceiling stucco ...\n37953    headless skid like rita lifted ocean drift lik...\n37973    dont want club rather sit couch watch movie gi...\n37974    say something wrong hear thinking talk way lon...\n37992    ayy wild things youre doin night trips whereve...\nName: lyrics, Length: 34445, dtype: object'

### Create the sparse matrix

In [89]:
count_vector = vectorizer.transform(df['lyrics'])

In [90]:
bow = count_vector.toarray()
bow = pd.DataFrame(bow, columns=vectorizer.get_feature_names())
bow.head()

Unnamed: 0,'',.,...,:,;,``,act,aint,air,arm,....1,wrong,yall,yeah,year,yes,youll,young,youre,youve,‘
0,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,2,0,0
1,1,12,0,5,0,1,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,2,0,2,1,3,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,3,0,1,3,0,0,...,0,0,13,0,1,0,0,0,0,1
4,0,0,0,0,0,0,0,4,0,0,...,0,0,5,1,0,0,1,0,0,0


In [10]:
## Save the BOW df
#bow.to_csv('bag_of_words.csv', index=False)
bow = pd.read_csv('bag_of_words.csv')

Remove the weirdo words

In [11]:
bow_filtered = bow.iloc[:, 6:996]

In [12]:
unnecessary = ['aint', 'ayy', 'baby', 'yall', 'yeah', 'yes', 'youre', 'youve', 'youll', 'big', 'bitch', 'bos', 
               'caught', 'chick', 'comin', 'damn', 'doin', 'dont', 'dude', 'feelin', 'fuck', 'fucked', 
               'fuckin', 'fucking', 'gettin', 'getting', 'goin', 'going', 'gon', 'gone', 'hey', 'hoe', 
               'huh', 'ima', 'imma', 'ive', 'let', 'lookin', 'man', 'motherfucker', 'na', 'nah', 'nigga', 
               'nothin', 'okay', 'ooh', 'pas', 'playin', 'really', 'sayin', 'shes', 'smokin', 'somethin', 
               'son', 'ta', 'talkin', 'thats', 'theyre', 'thinkin', 'til', 'till', 'took', 'tryin', 'tryna', 
               'wan', 'want', 'wasnt', 'whats', 'wont', 'wouldnt', '‘']

for column_name in unnecessary:
    del bow_filtered[column_name]

### Sentiment Analysis


There are pre-trained language models, e.g. TextBlob for this purpose, however, since lyrics are very specific linguistically, I will train my own model on the given dataset

In [13]:
bow_filtered['no_of_views'] = song_info['views']

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
bow_filtered.head()

Unnamed: 0,act,air,arm,ask,away,bad,bag,ball,band,bank,...,word,work,world,worth,wrist,write,wrong,year,young,no_of_views
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5589280.0
1,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,4592003.0
2,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4651514.0
3,1,0,0,0,0,12,0,1,0,0,...,1,0,1,0,0,0,0,0,0,7378309.0
4,0,0,0,0,0,0,0,0,0,0,...,0,2,1,0,0,0,0,1,1,5113687.0


In [39]:
bow_filt['genre_of_song'] = song_info['tags']

In [27]:
main_tags = ['Country', 'R&B', 'Rap', 'Rock', 'Pop'] #Pop is the default main tag

In [30]:
tag_list = []
for tag in tqdm(main_tags):
        for index, row in bow_filt.iterrows():
            if tag in row['genre_of_song']:
                tag_list.append(tag)

100%|██████████| 5/5 [00:09<00:00,  1.82s/it]


In [35]:
main_tags_list = []
for index, row in bow_filt.iterrows():
    main = None
    for tag in main_tags:
        if tag in row['genre_of_song']:
            main = tag
    main_tags_list.append(main)
        

34445it [00:02, 14346.31it/s]


In [43]:
bow_filt['main_tag'] = main_tags_list

In [42]:
bow_filt.drop('genre_of_song', axis=1, inplace=True)

In [45]:
bow_filt.to_csv('filtered_dataset.csv', index=False)