In [2]:
from collections import Counter, defaultdict
from itertools import combinations
from spacy import displacy

import seaborn as sn
import pandas as pd
import numpy as np
import regex as re

import en_core_web_lg
import inflect
import pyphen
import pickle
import string
import random
import spacy
import nltk
import sys
import csv
import os


ModuleNotFoundError: No module named 'pyphen'

In [None]:
def get_nicknames():
    
    """ Returns a dictionairy of nicknames per artist, based on Wikipedia """

    return {"JAY-Z":["Jay-Z","Jay","Hova","HOV","hov","Hov","Jigga","Shawn Carter","Shawn","Carter"],
    "Eminem": ["Marshall Mathers","Marshall","Mathers","Slim Shady","Slim","Shady"],
    "Future": ["Nayvadius Wilburn","Neyvadius","Wiburn","Meathead","Caeser Lee","Ceaser","Lee"],
    "Ice Cube": ["Ice","Cube","O'Shea Jackson","O'Shea","Jackson"],
    "Lil’ Kim": ["Lil’","Kim","Kimberley Jones","Kimberley","Jones","Queen Bee","Queen","Bee", "Lil'","Lil' Kim","own_nameme", "own_name own_name"],
    "Machine Gun Kelly": ["Machine Gun","Gun Kelly","Kelly","Kells","Richard Baker","Richard","Baker"],
    "Nas": ["Nas","own_namety","Escobar", "Jones"],
    "Nicki Minaj": ["Nicki","Minaj","Onika Maraj","Onika","Maraj"],
    "50 Cent": ["fifty Cent","fifty","fiftycent","50","Cent","Ferrari F-50","Ferrari","F-50","Curtis Jackson","Curtis","Jackson"],
    "2Pac": ["twopac","Tupac Shakur","Tupac","Shakur","Makaveli","MC New York", "Pac"],
    "Lil Wayne": ["Wayne","Tunechi","Weezy F. Baby", "Weezy","President Carter","Dwayne Carter","Dwayne","Carter"],
    "Snoop Dogg": ["Snoop","Dogg","DJ Snoopadelic","Snoopadelic","Niggarachi","Snoopzilla","Nemo Hoes","Nemo"],
    "Damian Marley": ["Damian Marley","Damian","Robert","Nesta","Jr. Gong","Junior Gong","Gong","Junior","Jr."],
    "Kanye West": ["Kanye West","Kanye","West","Yeezy","\bYe\b", "Omari"]
    }

In the following cell functions are implemented to create (bleached) representations.

In [7]:
def create_length_representation(lyrics):
    
    """ Converts words to their length, e.g.: Hello PC --> 05 02 """
    
    length_repr = ''
    for sentence in lyrics.split('\n'):
        sentence_repr = ''
        for word in sentence.split():
            sentence_repr += '0' + str(len(word)) + ' '
        length_repr += sentence_repr.rstrip() + '\n' # add newline to preserve line structure
    
    return length_repr.rstrip()

def create_punctC_representation(lyrics):
    
    """ Creates a representation in which punctuation is preserved """
    
    punctC_repr = ""
    for sentence in lyrics.split('\n'):
        sentence_repr = ''
        for word in sentence.split():
            punctC = ""
            for char in word:
                if char not in string.punctuation:
                    punctC += 'W'
                else:
                    punctC += char
            punctC = re.sub("W+", "W", punctC) + ' '
            sentence_repr += punctC
        punctC_repr += sentence_repr.rstrip() + '\n'
        
    return punctC_repr.rstrip()

def create_shape_representation(lyrics):
    
    """ Creates a representation which is based on capitality of letters, and digits"""
    
    shape_repr = ''
    for sentence in lyrics.split('\n'):
        sentence_repr = ''
        for word in sentence.split():
            shape = ''
            for char in word:
                if char.isupper():
                    shape += 'U'
                elif char.islower():
                    shape += 'L'
                elif char.isdigit():
                    shape += 'D'
                else:
                    shape += 'X'
            for letter in 'ULDX':
                shape = diminish_duplicate_letters(shape,letter)
            sentence_repr += shape + ' '
        shape_repr += sentence_repr.rstrip() + '\n'
    return shape_repr.rstrip()
                
def diminish_duplicate_letters(chars,char): # converts a 3 or more idental consecutive letters to 2
    return re.sub(char +"{3,}",char+char,chars)

def create_vowel_representation(lyrics):
    
    """ Create a representation based on vowels """
    
    vowel_representations = ''
    for sentence in lyrics.split('\n'):
        sentence_repr = ''
        for word in sentence.split():
            vowel_repr = ''
            for char in word:
                if char.lower() in 'aeiou':
                    vowel_repr += 'V'
                elif char.lower() in 'bcdfghjklmnpqrstvwxyz':
                    vowel_repr += 'C'
                else:
                    vowel_repr += 'O'
            sentence_repr += vowel_repr + ' '
        vowel_representations += sentence_repr.rstrip() + '\n'
    return vowel_representations.rstrip()

def create_alliteration_representation(lyrics):
    first_letters = [word[0] for word in lyrics.split()]
    return "".join(first_letters)  

def create_syllable_representation(lyrics):
    lyrics.translate(str.maketrans('', '', string.punctuation)) # source: https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
    to_syllables = pyphen.Pyphen(lang='en')
    syllable_representation = ''
    for sentence in lyrics.split('\n'):
        words = sentence.split()
        for word in words:
            word = re.sub(r'([a-z])\1+', r'\1', word) # remove repeating consecutive letters as the it confuses the syllable generator
            syllables = to_syllables.inserted(word)
            syllable_representation += re.sub("\-", " ", syllables) + ' '
        syllable_representation += '\n'
    return syllable_representation.rstrip()

def create_NER_representation(lyrics, nlp):
    
    """ Creates a representation based on Named Entity Recognition"""
    
    NER_repr = ''
    for sentence in lyrics.split('\n'):
        NER_repr += ' '.join([X.label_ for X in nlp(sentence).ents]) + '\n'
    if NER_repr.strip() == '':
        NER_repr = 'None'
    return NER_repr.rstrip()

def create_POS_representation(lyrics):
    
    """ Creates a representation based on POS tagging """
    
    tokens = nltk.word_tokenize(lyrics)
    pos_tags = [output[1] for output in nltk.pos_tag(tokens)]
    return ' '.join(pos_tags)

In [8]:
create_syllable_representation("Hallo ik ven ... Tommem\nik houd van jullie")

'Ha lo ik ven ... Tomem \nik houd van julie'

In [9]:
def number_to_word(number): # converts a number to its word representation, e.g. 50 to fifty
    return inflect.engine().number_to_words(number.group(1))

In [10]:
create_POS_representation('I have .50 euros')

'PRP VBP VBN NNS'

In [11]:
def preprocess_lyrics(x,y):
    
    """ Returns a preprocessed x """
    
    inflect_engine = inflect.engine()
    nicknames = get_nicknames()
    filtered_x, length_repr, punctC_repr, shape_repr, vowel_repr, alliteration_repr, ner_repr, pos_repr = [],[],[],[],[],[],[],[]
    syllable_repr = []
    print("Total instances to preprocess: {}".format(len(x)))
    i = 0 # to track where the program is
    nlp = en_core_web_lg.load()
    for song,artist in zip(x,y):
        print(i,end=' ') # to track where the program is while running
        song = re.sub("\[.*\]", "", song) # removes info like [Intro: Eminem]
        song = re.sub("\*.*?\*", "", song) # text between *..* usually announces something
        song = re.sub("\n+","\n", song) # replaces multiple newlines by a single newline
        song = re.sub("\.([1-9])",r'\1',song) # convert .9 to 9
        pos_repr.append(create_POS_representation(song))
        song = re.sub("([1-9]+)",number_to_word,song) # convert numbers to words e.g. 50 to fifty
        alliteration_repr.append(create_alliteration_representation(song))
        song = re.sub(artist,"own_name",song) # replaces artists' own name with 'own_name'
        song = re.sub(artist.lower(),"own_name",song)
        song = re.sub(artist.upper(),"own_name",song)
        for nickname in nicknames[artist]: # replaces artists' nicknames with 'own_name'
            song = re.sub(nickname,"own_name",song)
            song = re.sub(nickname.lower(),"own_name",song)
            song = re.sub(nickname.upper(),"own_name",song)
        syllable_repr.append(create_syllable_representation(song))
        length_repr.append(create_length_representation(song))
        punctC_repr.append(create_punctC_representation(song))
        shape_repr.append(create_shape_representation(song))
        vowel_repr.append(create_vowel_representation(song))
        
        song = re.sub(" [\'\"\*\’\:\;\(\)]"," ",song) # removes specific punctuation after a space
        song = re.sub("[\'\"\*\’\:\;\(\)]([ \n])",r'\1',song) # removes specific punctuation before a space
        song = re.sub("([\.\,\!\?]) ", r' \1 ', song) # adds space between word and punct if last character of word is punct
        song = re.sub("([\.\,\!\?])\n", r' \1\n', song) # adds space between word and punct if last character of word is punct
        filtered_x.append(song)
        song = re.sub("own_name","John", song) # convert own_name to John for better NER_tagging
        ner_repr.append(create_NER_representation(song,nlp))
        
        i += 1 # to track where to program is while running
        
    return filtered_x, length_repr, punctC_repr, shape_repr, vowel_repr, alliteration_repr, ner_repr, pos_repr, syllable_repr

In [12]:
def import_data(path):
    
    """ Imports pickle files, and returns a shuffled x, and the according ys """
    
    artist_dict_list = []
    for filename in os.listdir(path):
        if str(filename)[-2:] == '.p':
            with open(path+filename, "rb") as f:
                artist_dict = pickle.load(f)
            artist_dict_list.append(artist_dict)
            
    # create a list of tuples with (lyrics, artist)
    x,y = [],[]
    for artist_dict in artist_dict_list:
        for song_title, song_info in artist_dict.items():
            #c = Counter(y)   # uncomment to set limit
                #if c[song_info[0]] < 99:    # uncomment to set limit
            x.append(song_info[3]) # song_info[3] = lyrics
            y.append(song_info[0]) # song_info[0] = artist
            
    x, y = shuffe_x_and_y(x,y)
    return x,y


In [13]:
def convert_to_verse_classification(x,y):
    
    """ Converts instances of songs to instances of verses """
    
    nicknames = get_nicknames()
    new_x, new_y = [],[]
    for lyrics,y in zip(x,y):
        lyrics = re.sub("\n","___",lyrics) # replace by ___ to preserse the location of the newline
        verses = re.findall("\[.+?\].+?\[",lyrics,overlapped=True) # [...] indicates the start of a new verse
        verses = [re.sub("___","\n",verse) for verse in verses] # reinsert the newlines
        verses = [re.sub("\n+\[","",verse) for verse in verses] # remove a remaining [
        for verse in verses:
            verse = re.sub("\[.+?\]","",verse) # remove the [...]
            if len(verse.split()) >= 20:
                new_y.append(y)
                new_x.append(verse)
    new_x, new_y = remove_duplicate_verses(new_x,new_y)
    return new_x, new_y

In [14]:
def convert_to_verse_classification_duo_artist(x,y):
    
    """ Convert to verse classification in a duo artist songs in which artist is set as e.g.: Jay-Z & Kanye West"""
    
    nicknames = get_nicknames()
    artist1, artist2 = y[0].split('&')[0].strip(), y[0].split('&')[1].strip()
    new_x, new_y = [],[]
    for lyrics,y in zip(x,y):
        lyrics = re.sub("\n","___",lyrics)
        verses = re.findall("\[.+?\].+?\[",lyrics,overlapped=True)
        verses = [re.sub("___","\n",verse) for verse in verses]
        verses = [re.sub("\n+\[","",verse) for verse in verses]
        for verse in verses:
            y_verse = "OTHER ARTIST" # in case more artists particiate than artist1 and artist2
            header = re.findall("\[.+?\]",verse)[0].lower() # header of a verse, as in [..]
            verse = re.sub("\[.+?\]","",verse)
            for name in nicknames[artist1]:
                if name.lower() in header: # set verse to artist1
                    y_verse = artist1
            for name in nicknames[artist2]: # set verse to artist2 of its not set to artist 1 or combined verse yet
                if name.lower() in header and y_verse != artist1 and y_verse != 'combined verse':
                    y_verse = artist2
                if name.lower() in header and y_verse == artist1: # set verse to combined if both artists are mentioned in the header
                    y_verse = 'combined verse'
            if y_verse == artist1 or y_verse == artist2:
                if len(verse.split()) >= 20:
                    new_y.append(y_verse)
                    new_x.append(verse)
    new_x, new_y = remove_duplicate_verses(new_x,new_y)
    return new_x, new_y

In [15]:
def remove_duplicate_verses(x,y):
    new_x, new_y = [],[]
    while len(x) > 0:
        tempx = x.pop(0)
        tempy = y.pop(0)
        if tempx not in x:
            new_x.append(tempx)
            new_y.append(tempy)
    return new_x, new_y

In [16]:
def shuffe_x_and_y(x,y):
    
    """ Returns a shuffled x and y """
    
    x_and_y = [(x,y) for x,y in zip(x,y)] # combine x and y to keep the y related to the right x
    random.seed(30)
    random.shuffle(x_and_y)
    new_x, new_y = [], []
    for x,y in x_and_y:
        new_x.append(x)
        new_y.append(y)
    return new_x, new_y

In [17]:
def get_word_count(x):
    x = [re.sub("['’]"," ",lyrics) for lyrics in x] # to convert e.g. I'm into I m
    x = [lyrics.translate(str.maketrans('','',string.punctuation)) for lyrics in x] # removes punctuation
    return [len(lyrics.split()) for lyrics in x]

def get_sentence_count(x):
    return [len(lyrics.split('\n')) for lyrics in x]

def get_avg_word_length(x):
    x = [lyrics.translate(str.maketrans('','',string.punctuation)) for lyrics in x] # removes punctuation
    avg_word_lengths = []
    for lyrics in x:
        word_lengths = [len(word) for word in lyrics.split()]
        avg_word_lengths.append(round(sum(word_lengths) / len(lyrics.split()),2))
    return avg_word_lengths
    
def get_exclam_mark_count(x):
    return [lyrics.count('!') for lyrics in x]

def get_question_mark_count(x):
    return [lyrics.count('?') for lyrics in x]

def get_comma_count(x):
    return [lyrics.count(',')for lyrics in x]

def get_comma_ratio(x):
    return [round(lyrics.count(',') / len(lyrics.split('\n')),2) for lyrics in x]

def get_unique_word_ratio(x):
    x = [re.sub("['’]"," ",lyrics) for lyrics in x] # to convert e.g. I'm into I m
    x = [re.sub("own_name","",lyrics) for lyrics in x] # remove own name tags
    x = [lyrics.translate(str.maketrans('','',string.punctuation)) for lyrics in x] # removes punctuation
    return [round(len(set(lyrics.split()))/len(lyrics.split()),2) for lyrics in x]


def get_repeated_sentence_ratios(x):
    repeated_sentence_count_ratios = [] # sum of sentences that are repeated / amount of sentences
    repeated_sentence_ratios = [] # sum of different sentences that are repeated / amount of different sentences
    for lyrics in x:
        sentence_counter = Counter(lyrics.split('\n'))
        total_sentences = len(lyrics.split('\n'))
        repeated_sentences_count = 0
        repeated_sentences = 0
        for sentence, instances in sentence_counter.items():
            if instances > 1:
                repeated_sentences_count += instances
                repeated_sentences += 1
        repeated_sentence_count_ratios.append(round(repeated_sentences_count/total_sentences,2))
        repeated_sentence_ratios.append((round(repeated_sentences/len(sentence_counter),2)))
    return repeated_sentence_count_ratios, repeated_sentence_ratios
        

In [None]:
def main():
    path = "../lyrics/afro_males/"
    path = "../lyrics/diverse/"
    path = "../lyrics/verse_classification/"
    filename = "afro_male_artist_even"
    filename = "vc_jayz_kanye"
    filename = "vc_jayz_kanye_songs"
    #filename = "diverse_artist"
    settype = "_train.csv"
    #settype = "_test.csv"
    print(path+filename+settype)
    x, y = import_data(path)
    print(Counter(y))
    x, y = convert_to_verse_classification(x,y)
    #x, y = convert_to_verse_classification_duo_artist(x,y)
    x, length_repr, punctC_repr, shape_repr, vowel_repr, alliteration_repr, ner_repr, pos_repr, syllable_repr = preprocess_lyrics(x,y)
    c = Counter(y).most_common()
    #print(c[-1][1])
    word_count = get_word_count(x)
    sentence_count = get_sentence_count(x)
    avg_word_length = get_avg_word_length(x)
    exclam_mark_count = get_exclam_mark_count(x)
    comma_count = get_comma_count(x)
    comma_ratio = get_comma_ratio(x)
    unique_word_ratio = get_unique_word_ratio(x)
    question_mark_count = get_question_mark_count(x)
    repeated_sentence_count_ratio, repeated_sentence_ratio = get_repeated_sentence_ratios(x)
    
    with open(path+filename+settype, mode='w') as csv_file:
        fieldnames = ['lyrics','artist','word_count','sentence_count','!_count','?_count',',_count',
                      'avg_word_length', ',_ratio', 'uniq_word_ratio','rep_sent_count_ratio','rep_sent_ratio',
                     'length_repr','punctC_repr','shape_repr','vowel_repr','alliteration_repr','ner_repr',
                      'pos_repr','syllable_repr']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        print("yeah")
        for i in range(len(x)):
            writer.writerow({'lyrics': x[i],\
                             'artist': y[i],\
                             'word_count':word_count[i],\
                             'sentence_count':sentence_count[i],\
                             '!_count':exclam_mark_count[i],\
                             '?_count':question_mark_count[i],\
                             ',_count':comma_count[i],\
                             'avg_word_length':avg_word_length[i],\
                             ',_ratio':comma_ratio[i],\
                            'uniq_word_ratio':unique_word_ratio[i],\
                             'rep_sent_count_ratio':repeated_sentence_count_ratio[i],\
                             'rep_sent_ratio':repeated_sentence_ratio[i],\
                            'length_repr':length_repr[i],\
                            'punctC_repr':punctC_repr[i],\
                            'shape_repr':shape_repr[i],\
                            'vowel_repr':vowel_repr[i],\
                           'alliteration_repr':alliteration_repr[i],\
                             'ner_repr':ner_repr[i],
                            'pos_repr':pos_repr[i],
                            'syllable_repr'})

    
    """i = 0
    
    print(unique_word_ratio[i])
    print(exclam_mark_count[i])
    print(question_mark_count[i])
    print(repeated_sentence_count_ratio[i])
    print(repeated_sentence_ratio[i])
    print(x[i])"""
#main()

Select import path and output filename

In [None]:
outpath = "../lyrics/afro_males/"
#path = "../lyrics/diverse/"
outpath = "../lyrics/verse_classification/"
#filename = "afro_male_artist_even"
filename = "vc_jayz_kanye"
filename = "vc_jayz_kanye_songs"
#filename = "vc_marley_nas"
#filename = "vc_marley_nas_songs"
#filename = "diverse_artist_verses"
settype = "_train.csv"
#settype = "_test.csv"
print(path+filename+settype)

In [27]:
outpath = "../lyrics/verse_classification/"
inpath = "../lyrics/jayz&kanye/"
filename = "vc_jayz_kanye_verses"
#filename = "vc_jayz_kanye_songs"
settype = "_train.csv"

In [47]:
outpath = "../lyrics/verse_classification/"
inpath = "../lyrics/marley&nas/"
filename = "vc_marley_nas_verses"
#filename = "vc_marley_nas_songs"
settype = "_train.csv"

In [34]:
outpath = "../lyrics/verse_classification/"
inpath = "../lyrics/jayz&kanye_combined/"
filename = "vc_jayz_kanye_verses"
settype = "_test.csv"

In [54]:
outpath = "../lyrics/verse_classification/"
inpath = "../lyrics/marley&nas_combined/"
filename = "vc_marley_nas_verses"
settype = "_test.csv"

In [55]:
print(inpath)
print(outpath+filename+settype)

../lyrics/marley&nas_combined/
../lyrics/verse_classification/vc_marley_nas_verses_test.csv


In [56]:
x, y = import_data(inpath)
Counter(y)

Counter({'Damian Marley & Nas': 14})

In [50]:
x, y = convert_to_verse_classification(x,y)
Counter(y)

Counter({'Nas': 598, 'Damian Marley': 166})

Get represtations of the lyrics and the preprocessed lyrics itself

In [57]:
x, y = convert_to_verse_classification_duo_artist(x,y)
Counter(y)

Counter({'Nas': 24, 'Damian Marley': 35})

In [58]:
x, length_repr, punctC_repr, shape_repr, vowel_repr, alliteration_repr, ner_repr, pos_repr, syllable_repr = preprocess_lyrics(x,y)

Total instances to preprocess: 59
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 

Get statistical features

In [59]:
word_count = get_word_count(x)
sentence_count = get_sentence_count(x)
avg_word_length = get_avg_word_length(x)
exclam_mark_count = get_exclam_mark_count(x)
comma_count = get_comma_count(x)
comma_ratio = get_comma_ratio(x)
unique_word_ratio = get_unique_word_ratio(x)
question_mark_count = get_question_mark_count(x)
repeated_sentence_count_ratio, repeated_sentence_ratio = get_repeated_sentence_ratios(x)

Write the data

In [60]:
with open(outpath+filename+settype, mode='w') as csv_file:
        fieldnames = ['lyrics','artist','word_count','sentence_count','!_count','?_count',',_count',
                      'avg_word_length', ',_ratio', 'uniq_word_ratio','rep_sent_count_ratio','rep_sent_ratio',
                     'length_repr','punctC_repr','shape_repr','vowel_repr','alliteration_repr','ner_repr',
                      'pos_repr','syllable_repr']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for i in range(len(x)):
        #for i in range(int(0.8*len(x))):
            writer.writerow({'lyrics': x[i],\
                             'artist': y[i],\
                             'word_count':word_count[i],\
                             'sentence_count':sentence_count[i],\
                             '!_count':exclam_mark_count[i],\
                             '?_count':question_mark_count[i],\
                             ',_count':comma_count[i],\
                             'avg_word_length':avg_word_length[i],\
                             ',_ratio':comma_ratio[i],\
                            'uniq_word_ratio':unique_word_ratio[i],\
                             'rep_sent_count_ratio':repeated_sentence_count_ratio[i],\
                             'rep_sent_ratio':repeated_sentence_ratio[i],\
                            'length_repr':length_repr[i],\
                            'punctC_repr':punctC_repr[i],\
                            'shape_repr':shape_repr[i],\
                            'vowel_repr':vowel_repr[i],\
                           'alliteration_repr':alliteration_repr[i],\
                             'ner_repr':ner_repr[i],
                            'pos_repr':pos_repr[i],
                           'syllable_repr':syllable_repr[i]})