In [4]:
from collections import Counter, defaultdict
from itertools import combinations
from spacy import displacy

import seaborn as sn
import pandas as pd
import numpy as np
import regex as re

import en_core_web_lg
import phonetics
import warnings
import inflect
import pyphen
import pickle
import string
import random
import spacy
import nltk
import math
import sys
import csv
import os

warnings.filterwarnings('ignore')

In [5]:
def get_nicknames():
    
    """ Returns a dictionairy of nicknames per artist, based on Wikipedia """

    return {"JAY-Z":["Jay-Z","Jay","Hova","HOV","hov","Hov","Jigga","Shawn Carter","Shawn","Carter"],
    "Eminem": ["Eminem","Marshall Mathers","Marshall","Mathers","Slim Shady","Slim","Shady"],
    "Future": ["Future","Nayvadius Wilburn","Neyvadius","Wiburn","Meathead","Caeser Lee","Ceaser","Lee"],
    "Ice Cube": ["Ice Cube","Ice","Cube","O'Shea Jackson","O'Shea","Jackson"],
    "Lil’ Kim": ["Lil’ Kim","Lil’","Kim","Kimberley Jones","Kimberley","Jones","Queen Bee","Queen","Bee", "Lil'","Lil' Kim","own_nameme", "own_name own_name"],
    "Machine Gun Kelly": ["Machine Gun Kelly","Machine Gun","Gun Kelly","Kelly","Kells","Richard Baker","Richard","Baker","MGK"],
    "Nas": ["Nasty Nas","Nasty","Nas","Escobar", "Jones"],
    "Nicki Minaj": ["Nicki Minaj","Nicki","Minaj","Onika Maraj","Onika","Maraj"],
    "50 Cent": ["50 Cent","Fifty Cent","fifty cent","fifty","fiftycent","50","Cent","Ferrari F-50","Ferrari","F-50","Curtis Jackson","Curtis","Jackson"],
    "2Pac": ["2Pac","twopac","Tupac Shakur","Tupac","Shakur","Makaveli","MC New York", "Pac"],
    "Lil Wayne": ["Lil Wayne","Wayne","Tunechi","Weezy F. Baby", "Weezy","President Carter","Dwayne Carter","Dwayne","Carter"],
    "Snoop Dogg": ["Snoop Dogg","Snoop","Doggy","Dogg","DJ Snoopadelic","Snoopadelic","Niggarachi","Snoopzilla","Nemo Hoes","Nemo"],
    "Damian Marley": ["Damian Marley","Damian","Robert","Nesta","Jr. Gong","Jr Gong","Junior Gong","Gong","Junior","Jr."],
    "Kanye West": ["Kanye West","Kanye","West","Yeezy","\bYe\b", "Omari"],
    "Cardi B": ['Cardi B','Cardi','\bB\b','Belcalis','Marlenis','Alamanzar'],
    "MC Lyte": ['MC Lyte','Lyte','Lana','Michelle','Moorer'],
    "Missy Elliott": ['Missy Elliot','Missy','Elliot','Misdemeanor','Melissa','Arnette'],
    "Iggy Azalea": ['Iggy Azalea','Iggy','Azalea','Amethyst','Amelia','Kelly'],
    "Queen Latifah": ['Queen Latifah','Queen','Latifah','Dana','Elaine','Owens']
    }

In the following cell functions are implemented to create (bleached) representations.

In [6]:
#def create_frequency_representation(lyrics):
    
def create_phonetic_representation(lyrics):
    to_phonetics = nltk.corpus.cmudict.dict()
    phonetics_repr = ''
    lyrics = lyrics.lower()
    lyrics = re.sub("' "," ",lyrics) # to convert words as runnin' to runnin
    lyrics = re.sub("\-"," ",lyrics) # convert words a four-door to four door
    for word in lyrics.lower().split():
        try:
            phonetics_repr += "".join(to_phonetics[word][0]) + ' '
        except:
            pass
    return phonetics_repr.rstrip()
    
    
def create_soundex_representation(lyrics):
    soundex_repr = ''
    for word in lyrics.split():
        try:
            soundex_repr += phonetics.soundex(word) + ' '
        except:
            word = re.sub("'","",word)
            words = re.sub("\-", " ",word)
            for word in words.split():
                try:
                    soundex_repr += phonetics.soundex(word) + ' '
                except:
                    pass
    return soundex_repr.rstrip()

def create_metaphone_representation(lyrics):
    metaphone_repr = ''
    for word in lyrics.split():
        try:
            metaphone_repr += phonetics.metaphone(word) + ' '
        except:
            print(word)
    return metaphone_repr.rstrip()


def create_frequency_representation(lyrics,c_all_words):
    return ' '.join([str(int(c_all_words[word]/20)) for word in lyrics.split()])

def create_length_representation(lyrics):
    
    """ Converts words to their length, e.g.: Hello PC --> 05 02 """
    
    length_repr = ''
    for sentence in lyrics.split('\n'):
        sentence_repr = ''
        for word in sentence.split():
            sentence_repr += '0' + str(len(word)) + ' '
        length_repr += sentence_repr.rstrip() + '\n' # add newline to preserve line structure
    
    return length_repr.rstrip()

def create_punctC_representation(lyrics):
    
    """ Creates a representation in which punctuation is preserved """
    
    punctC_repr = ""
    for sentence in lyrics.split('\n'):
        sentence_repr = ''
        for word in sentence.split():
            punctC = ""
            for char in word:
                if char not in string.punctuation:
                    punctC += 'W'
                else:
                    punctC += char
            punctC = re.sub("W+", "W", punctC) + ' '
            sentence_repr += punctC
        punctC_repr += sentence_repr.rstrip() + '\n'
        
    return punctC_repr.rstrip()

def create_shape_representation(lyrics):
    
    """ Creates a representation which is based on capitality of letters, and digits"""
    
    shape_repr = ''
    for sentence in lyrics.split('\n'):
        sentence_repr = ''
        for word in sentence.split():
            shape = ''
            for char in word:
                if char.isupper():
                    shape += 'U'
                elif char.islower():
                    shape += 'L'
                elif char.isdigit():
                    shape += 'D'
                else:
                    shape += 'X'
            for letter in 'ULDX':
                shape = diminish_duplicate_letters(shape,letter)
            sentence_repr += shape + ' '
        shape_repr += sentence_repr.rstrip() + '\n'
    return shape_repr.rstrip()
                
def diminish_duplicate_letters(chars,char): # converts a 3 or more idental consecutive letters to 2
    return re.sub(char +"{3,}",char+char,chars)

def create_vowel_representation(lyrics):
    
    """ Create a representation based on vowels """
    
    vowel_representations = ''
    for sentence in lyrics.split('\n'):
        sentence_repr = ''
        for word in sentence.split():
            vowel_repr = ''
            for char in word:
                if char.lower() in 'aeiou':
                    vowel_repr += 'V'
                elif char.lower() in 'bcdfghjklmnpqrstvwxyz':
                    vowel_repr += 'C'
                else:
                    vowel_repr += 'O'
            sentence_repr += vowel_repr + ' '
        vowel_representations += sentence_repr.rstrip() + '\n'
    return vowel_representations.rstrip()

def create_alliteration_representation(lyrics):
    first_letters = [word[0] for word in lyrics.split()]
    return "".join(first_letters)  

def create_syllable_representation(lyrics):
    lyrics.translate(str.maketrans('', '', string.punctuation)) # source: https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
    to_syllables = pyphen.Pyphen(lang='en')
    syllable_representation = ''
    for sentence in lyrics.split('\n'):
        words = sentence.split()
        for word in words:
            syllables = to_syllables.inserted(word)
            syllable_representation += re.sub("\-", " ", syllables) + ' '
        syllable_representation += '\n'
    return syllable_representation.rstrip()

def create_NER_representation(lyrics, nlp):
    
    """ Creates a representation based on Named Entity Recognition"""
    
    NER_repr = ''
    for sentence in lyrics.split('\n'):
        NER = [(X.text,X.label_) for X in nlp(sentence).ents]
        for word in sentence.split():
            added_NER = False
            for tupl in NER:
                if word == tupl[0]:
                    NER_repr += tupl[1]
                    added_NER = True
            if added_NER == False:
                NER_repr += word
            NER_repr += ' '
        NER_repr.rstrip()
        NER_repr += '\n'
            
    return NER_repr.rstrip()

def create_POS_representation(lyrics):
    
    """ Creates a representation based on POS tagging """
    
    tokens = nltk.word_tokenize(lyrics)
    pos_tags = [output[1] for output in nltk.pos_tag(tokens)]
    return ' '.join(pos_tags)

def number_to_word(number): # converts a number to its word representation, e.g. 50 to fifty
    return inflect.engine().number_to_words(number.group(1))

In [154]:
#nlp = en_core_web_lg.load()
#create_NER_representation('I am John\nNineteen twenty-one is what I represent',nlp)
create_phonetic_representation('5')

''

In [7]:
def preprocess_lyrics(data):
    
    """ Returns a preprocessed x """
    
    inflect_engine = inflect.engine()
    nicknames = get_nicknames()
    filtered_x, length_repr, punctC_repr, shape_repr, vowel_repr, alliteration_repr, ner_repr, pos_repr = [],[],[],[],[],[],[],[]
    syllable_repr = []
    print("Total instances to preprocess: {}".format(len(data)))
    i = 0 # to track where the program is
    #nlp = en_core_web_lg.load()
    new_data = []

    for dictio in data:
        lyrics = dictio['lyrics']
        artist = dictio['artist']
        lyrics = re.sub("\[.*\]", "", lyrics) # removes info like [Intro: Eminem]
        lyrics = re.sub("\*.*?\*", "", lyrics) # text between *..* usually announces something
        lyrics = re.sub("[wW]\/","", lyrics)
        lyrics = re.sub("[Cc]horus","", lyrics)
        lyrics = re.sub("[Vv]erse","",lyrics)
        lyrics = re.sub("[xX][1-9]","",lyrics)
        lyrics = re.sub("\n+","\n", lyrics) # replaces multiple newlines by a single newline
        lyrics = re.sub("\{\}\[\]\*\&", "", lyrics)
        for nickname in nicknames[dictio['artist']]: # replaces artists' nicknames with 'own_name'
            lyrics = re.sub(nickname,"own_name",lyrics)
            lyrics = re.sub(nickname.lower(),"own_name",lyrics)
            lyrics = re.sub(nickname.upper(),"own_name",lyrics)
        
        lyrics = re.sub(" .*?own_name.* "," own_name ",lyrics)
       
        dictio['shape_repr'] = create_shape_representation(lyrics)
        lyrics = re.sub("\.([1-9])",r'\1',lyrics) # convert .9 to 9
        dictio['pos_repr'] = create_POS_representation(lyrics)
        #lyrics = re.sub(" 911", " 9 1 1",lyrics)
        #lyrics = re.sub("19([0-9]{2})",r'19 \1',lyrics)
        #lyrics = re.sub("([0-9]+)",number_to_word,lyrics) # convert numbers to words e.g. 50 to fifty
        dictio['word_count'] = get_word_count(lyrics)
        dictio['sentence_count'] = get_sentence_count(lyrics)
        dictio['avg_word_length'] = get_avg_word_length(lyrics)
        dictio['unique_word_ratio'] = get_unique_word_ratio(lyrics)
        dictio['repeated_sentence_count_ratio'], dictio['repeated_sentence_ratio'] = get_repeated_sentence_ratios(lyrics)
        #dictio['alliter_repr'] = create_alliteration_representation(lyrics)
        
        
        
        #lyrics = re.sub(" [\'\"\*\’\:\;\(\)]"," ",lyrics) # removes specific punctuation after a space
        #lyrics = re.sub("[\'\"\*\’\:\;\(\)]([ \n])",r'\1',lyrics) # removes specific punctuation before a space
        #lyrics = re.sub("([\.\,\!\?]) ", r' \1 ', lyrics) # adds space between word and punct if last character of word is punct
        #lyrics = re.sub("([\.\,\!\?])\n", r' \1\n', lyrics) # adds space between word and punct if last character of word is punct
        #lyrics.translate(str.maketrans('', '', string.punctuation))

        
        dictio['lyrics'] = lyrics
        
        dictio['syllab_repr'] = create_syllable_representation(lyrics)
        dictio['length_repr'] = create_length_representation(lyrics)
        dictio['punctC_repr'] = create_punctC_representation(lyrics)
        dictio['vowel_repr'] = create_vowel_representation(lyrics)
        dictio['syllab_repr'] = create_syllable_representation(lyrics)
        dictio['metaphone_repr'] = create_metaphone_representation(lyrics)
        dictio['soundex_repr'] = create_soundex_representation(lyrics)
        dictio['phonetic_repr'] = create_phonetic_representation(lyrics)
        lyrics = re.sub("own_name","John", lyrics) # convert own_name to John for better NER_tagging
        dictio['ner_repr'] = create_NER_representation(lyrics,nlp)
        
        new_data.append(dictio)
        
        # to track where to program is while running
        i += 1 
        if i % 100 == 0:
            print(i,end=' ')
   # all_lyrics = [dictio['lyrics'].split() for dictio in new_data]
    #c_all_words = Counter([word for lyrics in all_lyrics for word in lyrics])
    
    #for dictio in new_data:
    #    dictio['frequency_repr'] = create_frequency_representation(dictio['lyrics'],c_all_words)
    return new_data

In [17]:
re.sub(".*own_name.* "," own_name "," own_nametradameus ")
re.sub("19([0-9]{2})",r'19 \1',"199666")

'19 9666'

In [8]:
def get_artist_list(data_set):
    return [dictio['artist'] for dictio in data_set]

In [9]:
def import_raw_data(path):
    songs_per_artist = []
    for filename in os.listdir(path):
        if filename[-4:] == ".csv":
            if 'dev' not in filename and 'train' not in filename and 'test' not in filename:
                songs_per_artist.append(pd.read_csv(path+filename))
    df = pd.concat(songs_per_artist, ignore_index = True)
    data = []
    for i,row in df.iterrows():
       data.append({"song_title":row["song_title"],"artist":row['artist'],"lyrics":row['lyrics'],"featuring":row['featuring']})
    return data

In [10]:
def import_one_csv_file(path):
    df = pd.read_csv(path)
    data = []
    for i,row in df.iterrows():
        data.append({"song_title":row["song_title"],"artist":row['artist'],"lyrics":row['lyrics'],"featuring":row['featuring']})
    return data

In [11]:
def form_x_y_including_verses(data):
    nicknames = get_nicknames()
    new_data = []
    for dictio in data:
        #print(dictio['song_title'])
        if isinstance(dictio['featuring'],float): # nan is a float, thus if no featuring artists it's a flaot
            dictio['type'] = 'song'
            new_data.append(dictio)
        else:
            #print(dictio.keys())
            lyrics = dictio['lyrics']
            lyrics = re.sub("\n","___",lyrics)
            verses = re.findall("\[.+?\].+?\[",lyrics,overlapped=True)
            verses = [re.sub("___","\n",verse) for verse in verses]
            verses = [re.sub("\n+\[","",verse) for verse in verses]
            combined_verses = []
            for verse in verses:
                header = re.findall("\[.+?\]",verse) # header of a verse, as in [..]
                if header != []:
                    header = header[0].lower()
                    header = header.split(':')
                    if len(header) > 1:
                        header = header[1].strip()[:-1]
                    for nickname in nicknames[dictio['artist']]:
                        if header == nickname.lower():
                            verse = re.sub("\[.+?\]","",verse)
                            combined_verses.append(verse)
                            break
            combined_verses = "\n".join(combined_verses)
            if len(combined_verses.split()) > 20:
                dictio['lyrics'] = combined_verses
                dictio['type'] = 'verses'
                new_data.append(dictio)
    return new_data

In [12]:
def convert_to_verse_classification(data):
    
    """ Converts instances of songs to instances of verses """
    
    nicknames = get_nicknames()
    new_data = []
    all_verses = []
    for dictio in data:
        artist = dictio['artist']
        lyrics = dictio['lyrics']
        lyrics = re.sub("\n","___",lyrics) # replace by ___ to preserse the location of the newline
        verses = re.findall("\[.+?\].+?\[",lyrics,overlapped=True) # [...] indicates the start of a new verse
        verses = [re.sub("___","\n",verse) for verse in verses] # reinsert the newlines
        verses = [re.sub("\n+\[","",verse) for verse in verses] # remove a remaining [
        for verse in verses:
            #print("\n\n",verse)
            if isinstance(dictio['featuring'],float): # if the entire song is by the same artist, simply add the verse to the data
                verse = re.sub("\[.+?\]","",verse)
                if len(verse.split()) > 20:
                    new_dictio = dictio.copy()
                    new_dictio['lyrics'] = verse.strip()
                    #print("\n\n",new_dictio['lyrics'],"\n\n")
                    if new_dictio not in new_data:
                        #print("appends verse")
                        new_data.append(new_dictio)
                        all_verses.append(verse.strip())
            else: # if the song in by multiple artists, check the artist of each verse
                header = re.findall("\[.+?\]",verse) # header of a verse, as in [..]
                if header != []:
                    header = header[0].lower()
                    header = header.split(':')
                    if len(header) > 1:
                        header = header[1].strip()[:-1]
                    for nickname in nicknames[dictio['artist']]:
                        if header == nickname.lower():
                            verse = re.sub("\[.+?\]","",verse)
                            if len(verse.split()) > 20:
                                new_dictio = dictio.copy()
                                new_dictio['lyrics'] = verse.strip()
                                print("\n\n",new_dictio['lyrics'])
                                if new_dictio not in new_data:
                                    new_data.append(new_dictio)
                                    all_verses.append(verse.strip())
    song_titles = [dictio['song_title'] for dictio in new_data]
    lyrics = [dictio['lyrics'] for dictio in new_data]
    #for l in all_verses:
     #   print(l,"\n\n")
    return new_data

In [96]:
inpath = "../lyrics/CADS/jayz&kanye/"
outpath = "../lyrics/CADS/jayz&kanye/jayz&kanye"
data = import_raw_data(inpath)
splitted_data = split_train_dev_test(data,True)
datas = [('train',splitted_data[0][1] + splitted_data[1][1] + splitted_data[2][1])] # combine train dev and test
datas = [(data_type,convert_to_verse_classification(data)) for data_type, data in datas[0:1]]



 Merry Christmas to all, and all a good night
Huh, now we all livin' the good life
Yeah, though it's forty below the wind chill
And we wipin' snow up off the windshield
It's still wonderful night to be alive, baby
And I'm so happy I'm with my baby
And we a little late with the Christmas gifts
Rushin' for the mall, don't trip, you know I drive crazy
The streets lit up, it feel like Christmas officially
Told her that "You the star at the top of my Christmas tree"
My only question is, "Where my presents?"
She said, "Shhh," she got a gift for me that ain't for the kids to see
Well, I like the way you think, mami
Now pour some more eggnog in your drink, mami
You've been a bad girl, give Santa three kisses
Gave her the hot chocolate, she said, "It's Dee-ricious"
Hahahaha, yeah


 You sweat her and I ain't talkin 'bout a Coogi
You a Big L and I ain't talkin 'bout Cool J
See me at the airport, at least twenty Louis
Treat me like the Prince and this my sweet brother Numpsay
(Brother Numpsay!)



 ("Breaking up") We can make it better
("Breaking up") We can make it better
("Breaking up") We can make it better
("Make It Up") We can make it


 Somebody told me Delta's brown-skinned, AKA's light-skinned
And they supposed to be bougie, so they got white friends
First day of school, I'ma take you sight seein'
Show you what we do on weekends for excitement
She said, "I know what you about to say, like your hype man"
Last year a nigga hit her, and now she only date white men
And if a nigga even wave at her, it's frightenin', but


 I wanted to walk just like you (remember?)
Wanted to talk just like you (word)
Often momma said I look too much
And I thought just like you (and I'd get happy)
Wanted to drink Miller nips and smoke Newports just like you
But you left me, now I'm going to court just like you
I would say "my daddy loves me and he'll never go away"
Bullshit, do you even remember December's my birthday?
Do you even remember the tender boy
You turned into a cold young man with

In [13]:
def convert_to_verse_classification_duo_artist(data):
    
    """ Convert to verse classification in a duo artist songs in which artist is set as e.g.: Jay-Z & Kanye West"""
    
    nicknames = get_nicknames()
    artist1, artist2 = data[0]['artist'].split('&')[0].strip(), data[0]['artist'].split('&')[1].strip()
    new_data = []
    for dictio in data:
        del dictio['featuring']
        lyrics = dictio['lyrics']
        lyrics = re.sub("\n","___",lyrics)
        verses = re.findall("\[.+?\].+?\[",lyrics,overlapped=True)
        verses = [re.sub("___","\n",verse) for verse in verses]
        verses = [re.sub("\n+\[","",verse) for verse in verses]
        for verse in verses:
            y_verse = "OTHER ARTIST" # in case more artists particiate than artist1 and artist2
            header = re.findall("\[.+?\]",verse)[0].lower() # header of a verse, as in [..]
            verse = re.sub("\[.+?\]","",verse)
            if header != []:
                header = header.split(':') # usually headers are like [verse1: artist]
                if len(header) > 1:
                    header = header[1].strip()[:-1]
                elif type(header) == list: # this means the header didn't have a :
                    header = header[0].split('-') # sometimes headers are like [verse1 - artist]
                    if len(header) > 1:
                        header = header[1].strip()[:-1]
                for nickname in nicknames[artist1]:
                    if header == nickname.lower():
                        y_verse = artist1
                for name in nicknames[artist2]: # set verse to artist2 of its not set to artist 1 or combined verse yet
                    if name.lower() == header and y_verse != artist1 and y_verse != 'combined verse':
                        y_verse = artist2
                if y_verse == artist1 or y_verse == artist2:
                    if len(verse.split()) >= 20:
                        #print("found")
                        new_dictio = dictio.copy()
                        new_dictio['artist'] = y_verse
                        new_dictio['lyrics'] = verse
                        new_data.append(new_dictio)
            #print(y_verse)
            
    return new_data

In [14]:
def remove_duplicates(x,y):
    new_x, new_y = [],[]
    while len(x) > 0:
        tempx = x.pop(0)
        tempy = y.pop(0)
        if tempx not in x:
            new_x.append(tempx)
            new_y.append(tempy)
    return new_x, new_y

In [15]:
def shuffe_x_and_y(x,y):
    
    """ Returns a shuffled x and y """
    
    x_and_y = [(x,y) for x,y in zip(x,y)] # combine x and y to keep the y related to the right x
    random.seed(50)
    random.shuffle(x_and_y)
    new_x, new_y = [], []
    for x,y in x_and_y:
        new_x.append(x)
        new_y.append(y)
    return new_x, new_y

In [30]:
def get_word_count(lyrics):
    lyrics = re.sub("['’]"," ",lyrics) # to convert e.g. I'm into I m
    lyrics = lyrics.translate(str.maketrans('','',string.punctuation))
    return len(lyrics.split())
    

def get_sentence_count(lyrics):
    return len(lyrics.split('\n'))

def get_avg_word_length(lyrics):
    lyrics = lyrics.translate(str.maketrans('','',string.punctuation))
    return round(sum([len(word) for word in lyrics.split()]) / len(lyrics.split()),2)
    
def get_exclam_mark_count(x):
    return [lyrics.count('!') for lyrics in x]

def get_question_mark_count(x):
    return [lyrics.count('?') for lyrics in x]

def get_comma_count(x):
    return [lyrics.count(',')for lyrics in x]

def get_comma_ratio(x):
    return [round(lyrics.count(',') / len(lyrics.split('\n')),2) for lyrics in x]

def get_unique_word_ratio(lyrics):
    lyrics = re.sub("['’]"," ",lyrics)
    lyrics =lyrics.translate(str.maketrans('','',string.punctuation))
    return round(len(set(lyrics.split())) / len(lyrics.split()),2)


def get_repeated_sentence_ratios(lyrics):
    repeated_sentence_count_ratios = [] # sum of sentences that are repeated / amount of sentences
    repeated_sentence_ratios = [] # sum of different sentences that are repeated / amount of different sentences
    sentence_counter = Counter(lyrics.split('\n'))
    total_sentences = len(lyrics.split('\n'))
    repeated_sentences_count = sum([instances for sentence,instances in sentence_counter.items()])
    repeated_sentences = sum([1 for sentence,instances in sentence_counter.items()])
    return round(repeated_sentences_count/total_sentences,2), round(repeated_sentences/len(sentence_counter),2)
                             

In [33]:
def split_train_dev_test(data,add_verses):
    songs = []
    verses = []
    for dictio in data:
        if isinstance(dictio['featuring'],float): # nan is a float, thus if no featuring artists it's a float
            songs.append(dictio)
        else:
            verses.append(dictio)
    random.seed(50)
    random.shuffle(songs)
    train = songs[:int(0.8*len(songs))]
    dev = songs[int(0.8*len(songs)):int(0.9*len(songs))]
    test = songs[int(0.9*len(songs)):]
    if add_verses == True:
        train = train + verses
        random.shuffle(train)
    return [["train",train],["dev",dev],["test",test]]

In [22]:
def remove_verses(data_set):
    return [dictio for dictio in data_set if isinstance(dictio['featuring'],float)]

def return_verses(data_set):
    return [dictio for dictio in data_set if not isinstance(dictio['featuring'],float)]

def balance_data_set(data_set_l, data_set_s,mc_artist_l,mc_artist_s):    
        
    #y_data_set_l = get_artist_list(data_set_l)
    #y_data_set_l = get_artist_list(data_set_to_sort_by)
    y_data_set_s = Counter(get_artist_list(data_set_s))
    
    #mc_artist_l = Counter(y_data_set_l).most_common()
    #mc_artist_s = Counter(y_data_set_s).most_common()
    new_data_set_l = []
    for l,s in zip(mc_artist_l,mc_artist_s):
        artist_l = l[0]
        songs_s = y_data_set_s[s[0]]
        new_data_set_l += [dictio for dictio in data_set_l if dictio['artist'] == artist_l][:songs_s]
    
    print(Counter(get_artist_list(new_data_set_l)))
    print(Counter(get_artist_list(data_set_s)))
    return new_data_set_l


"""data_set_s = import_raw_data("../lyrics/diverse/")
data_set_l = import_raw_data("../lyrics/afro_males/")
data_set_s = remove_verses(data_set_s)
data_set_l = remove_verses(data_set_l)



outpath = "../lyrics/experiments/AAMDS_to_DADS"
data = balance_data_set(data_set_l,data_set_s)
datas = split_train_dev_test(data,False)
datas = [(data_type,form_x_y_including_verses(data)) for data_type, data in datas]
datas = [(data_type,preprocess_lyrics(data)) for data_type, data in datas]
for data_type, data in datas:
    write_to_csv(data,outpath,data_type)"""

'data_set_s = import_raw_data("../lyrics/diverse/")\ndata_set_l = import_raw_data("../lyrics/afro_males/")\ndata_set_s = remove_verses(data_set_s)\ndata_set_l = remove_verses(data_set_l)\n\n\n\noutpath = "../lyrics/experiments/AAMDS_to_DADS"\ndata = balance_data_set(data_set_l,data_set_s)\ndatas = split_train_dev_test(data,False)\ndatas = [(data_type,form_x_y_including_verses(data)) for data_type, data in datas]\ndatas = [(data_type,preprocess_lyrics(data)) for data_type, data in datas]\nfor data_type, data in datas:\n    write_to_csv(data,outpath,data_type)'

In [17]:
def write_to_csv(data, path, data_type):
    keys = list(data[0].keys())
    with open(path + "_" + data_type + ".csv", 'w') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

In [59]:
def write_for_glad(train_data,test_data,path_to_glad_folder):
    """artists = list(set([dictio['artist'] for dictio in train_data]))
    artist1 = [dictio['lyrics'] for dictio in train_data if dictio['artist'] == artists[0]]
    artist2 = [dictio['lyrics'] for dictio in train_data if dictio['artist'] == artists[1]]
    random.seed(30)
    
    artist_names = list(set([dictio['artist'] for dictio in train_data]))
    data_per_artist = []
    for name in artist_names:
        data_per_artist.append((name,[dictio['lyrics'] for dictio in train_data if dictio['artist'] == name]))"""
    
    song_amount_per_artist = Counter([dictio['artist'] for dictio in train_data])
    song_amount_per_artist = song_amount_per_artist.most_common()
    print(song_amount_per_artist)
    known_data = [] # list with each item a list of the song dictios of one specific artist
    unknown_data = [] # list with each item a list of the song dictios of two artists
    for i in range(0,len(song_amount_per_artist),2):
        artist1 = song_amount_per_artist[i][0]
        artist2 = song_amount_per_artist[i+1][0]
        known_data_artist1 = [dictio for dictio in train_data if dictio['artist'] == artist1]
        known_data_artist2 = [dictio for dictio in train_data if dictio['artist'] == artist2]
        known_data.append(known_data_artist1)
        known_data.append(known_data_artist2)
        
        # unknown data must be of two artists, to make sure there are matching and non matching cases
        pair_unknown_data = [dictio for dictio in test_data if dictio['artist'] == artist1] + \
                            [dictio for dictio in test_data if dictio['artist'] == artist2]
        random.shuffle(pair_unknown_data) # to be unknown lyrics must be shuffled to have random order
        
        # make sure the amount of unknown songs is in proportion to the amount of known songs
        limit = int(len(known_data_artist1) / (len(known_data_artist1) + len(known_data_artist2)) * \
                    len(pair_unknown_data))
        unknown1 = pair_unknown_data[:limit]
        unknown2 = pair_unknown_data[limit:]
        
        unknown_data.append(unknown1)
        unknown_data.append(unknown2)
        
    #artists = [(artists[0],artist1), (artists[1],artist2)] # index 0 refers to artists1 index 1 to artists2
    train_docs_per_unknown = int(len(train_data)/len(test_data))
    print(train_docs_per_unknown)
    
    random.shuffle(test_data)
    problem_i = 0
    if not os.path.exists(path_to_glad_folder):
        os.mkdir(path_to_glad_folder)
    
    if not os.path.exists(path_to_glad_folder +"/truth"):
        os.mkdir(path_to_glad_folder + "/truth")
    with open(path_to_glad_folder + "truth/truth.txt", 'w') as truthtxt:
        for known_set, unknown_set in zip(known_data, unknown_data):
            train_docs_per_unknown = int(len(known_set)/len(unknown_set))
            start_i = 0
            end_i = start_i + train_docs_per_unknown
            artist_name = known_set[0]['artist']
            lyrics_list = [dictio['lyrics'] for dictio in known_set]
            print(len(lyrics_list))
            while start_i < len(lyrics_list) - train_docs_per_unknown and len(unknown_set) > 0:
                problem_i += 1
                problem_id = ('000' + str(problem_i))[-4:]
                if not os.path.exists(path_to_glad_folder + problem_id):
                    os.mkdir(path_to_glad_folder + problem_id)
                train_docs = lyrics_list[start_i:end_i]
                #print(len(train_docs))
                doc_i = 1
                doc_id = ('00'+ str(doc_i))[-2:]
                for lyrics in train_docs[:1]:
                    doc_id = ('00'+ str(doc_i))[-2:]
                    with open(path_to_glad_folder + problem_id + '/' + problem_id + '_known0' + str(doc_id)+ '.txt','w') as knownfile:
                        knownfile.write(lyrics)
                        doc_i += 1
                with open(path_to_glad_folder + problem_id + '/unknown.txt','w') as unknownfile:
                    test_instance = unknown_set.pop(0)
                    unknown_verse = test_instance['lyrics']
                    unknown_artist = test_instance['artist']
                    unknownfile.write(unknown_verse)
                    if unknown_artist == artist_name:
                        truthtxt.write(problem_id + ' Y\n')
                    else:
                        truthtxt.write(problem_id + ' N\n')
                start_i += train_docs_per_unknown
                end_i = start_i + train_docs_per_unknown
                    #print(start_i, len(lyrics_list),len(test_data))
                 
    

In [29]:
inpath = "../lyrics/gender/female/"
data_female = import_raw_data(inpath)
for dictio in data_female:
    dictio['gender'] = 'female'
datas_female = split_train_dev_test(data_female)



data_male = import_raw_data("../lyrics/gender/male/")
for dictio in data_male:
    dictio['gender'] = 'male'
    
verses_male = return_verses(data_male)
verses_female = return_verses(data_female)    
verses_male = form_x_y_including_verses(verses_male)
verses_female = form_x_y_including_verses(verses_female)
#added_verses_male = balance_data_set(verses_male,verses_female,data_male)

datas_male = split_train_dev_test(data_male,False)

mc_data_female = Counter(get_artist_list(datas_female[0][1])).most_common()
mc_data_male = Counter(get_artist_list(datas_male[0][1])).most_common()


added_verses_male = balance_data_set(verses_male,verses_female,mc_data_male,mc_data_female)
print("!!!!!!!!!!!!!!!!!!!!!",Counter(get_artist_list(added_verses_male)))
print("!!!!!!!!!!!!!!!!!!!!!",Counter(get_artist_list(verses_female)))
datas = []
for data_female, data_male in zip(datas_female,datas_male):
    #print(data_female[1])
    male_balanced_to_female = balance_data_set(data_male[1], data_female[1],mc_data_male,mc_data_female)
    datas.append([data_female[0],male_balanced_to_female+data_female[1]])

datas = [[data_type,form_x_y_including_verses(data)] for data_type, data in datas]
#datas[0][1] += added_verses_male + verses_female


datas = [(data_type,preprocess_lyrics(data)) for data_type, data in datas]
for data_type, data in datas:
    write_to_csv(data,'../lyrics/gender/GBDS_no_verses',data_type)

Counter({'50 Cent': 47, 'Snoop Dogg': 46, 'Nas': 32, 'Ice Cube': 19, '2Pac': 9, 'Eminem': 5, 'JAY-Z': 4})
Counter({'Nicki Minaj': 47, 'Lil’ Kim': 46, 'Iggy Azalea': 32, 'Missy Elliott': 19, 'Cardi B': 9, 'MC Lyte': 5, 'Queen Latifah': 4})
!!!!!!!!!!!!!!!!!!!!! Counter({'50 Cent': 47, 'Snoop Dogg': 46, 'Nas': 32, 'Ice Cube': 19, '2Pac': 9, 'Eminem': 5, 'JAY-Z': 4})
!!!!!!!!!!!!!!!!!!!!! Counter({'Nicki Minaj': 47, 'Lil’ Kim': 46, 'Iggy Azalea': 32, 'Missy Elliott': 19, 'Cardi B': 9, 'MC Lyte': 5, 'Queen Latifah': 4})
Counter({'50 Cent': 67, 'Snoop Dogg': 51, 'Eminem': 44, 'Nas': 39, 'Ice Cube': 39, 'JAY-Z': 26, '2Pac': 24})
Counter({'Nicki Minaj': 67, 'Lil’ Kim': 51, 'MC Lyte': 44, 'Iggy Azalea': 39, 'Missy Elliott': 39, 'Queen Latifah': 26, 'Cardi B': 24})
Counter({'Eminem': 10, 'Snoop Dogg': 6, '50 Cent': 5, 'Ice Cube': 5, 'JAY-Z': 5, 'Nas': 3, '2Pac': 2})
Counter({'MC Lyte': 10, 'Lil’ Kim': 6, 'Queen Latifah': 5, 'Nicki Minaj': 5, 'Missy Elliott': 5, 'Iggy Azalea': 3, 'Cardi B': 2})


## inpath = "../lyrics/afro_males/"
data = import_raw_data(inpath)
datas = split_train_dev_test(data,True)
datas = [form_x_y_including_verses(data) for data_type,data in datas]
songs_incl_verses = [song['artist'] for song in datas[0]]
data = import_raw_data(inpath)
datas = split_train_dev_test(data,False)
songs_train = [song['artist'] for song in datas[0][1]]
songs_dev = [song['artist'] for song in datas[1][1]]
songs_test = [song['artist'] for song in datas[2][1]]
c_total_songs_per_artist = Counter(songs_train+songs_dev+songs_test)
total_songs_per_artist = c_total_songs_per_artist.most_common()
print(total_songs_per_artist)
c_total = Counter(songs_incl_verses)
c_dev = Counter(songs_dev)
c_test = Counter(songs_test)
c_train = Counter(songs_train)
train_total, verse_total, dev_total, test_total = 0,0,0,0
print(c_total)
for artist, x in total_songs_per_artist:
    print("{}\t{}\t{}\t{}\t{}\t{}".format(artist,c_total_songs_per_artist[artist],c_train[artist],c_dev[artist],c_test[artist],c_total[artist]-c_train[artist]))
print("{}\t{}\t{}\t{}\t{}\t{}".format("TOTAL",len(songs_train)+len(songs_dev)+len(songs_test),len(songs_train),len(songs_dev), len(songs_test),len(songs_incl_verses)-len(songs_train)))

In [16]:
# run this cell to create the train dev and test set based on songs

inpath = "../lyrics/diverse/"
outpath = "../lyrics/diverse/diverse_artist_no_punct"
data = import_raw_data(inpath)
datas = split_train_dev_test(data,)
datas = [(data_type,form_x_y_including_verses(data)) for data_type, data in datas]
datas = [(data_type,preprocess_lyrics(data)) for data_type, data in datas]
#splitted_data = split_train_dev_test(data,False)
#for data_type, data in datas:
    #write_to_csv(data,outpath,data_type)

Total instances to preprocess: 830


KeyboardInterrupt: 

In [37]:
inpath = "../lyrics_old/afro_males/"
outpath = "../lyrics_old/experiments/AAMDS_no_verses_with_digits"
#inpath = "../lyrics/diverse/"
#outpath = "../lyrics/experiments/DADS_TRY_wv_nopunct"
data = import_raw_data(inpath)
#print(Counter(get_artist_list(data)))
datas = split_train_dev_test(data,False)
datas = [(data_type,form_x_y_including_verses(data)) for data_type, data in datas]
for data_type, data in datas:
    print(data_type,Counter(get_artist_list(data)))
#datas = [(data_type,preprocess_lyrics(data)) for data_type, data in datas]
#for data_type, data in datas:
    #write_to_csv(data,outpath,data_type)

Counter({'Lil Wayne': 641, 'Snoop Dogg': 428, 'Future': 325, '50 Cent': 284, '2Pac': 255, 'Nas': 241, 'JAY-Z': 228, 'Ice Cube': 183})
train Counter({'Lil Wayne': 269, 'Future': 182, '50 Cent': 137, 'Snoop Dogg': 129, 'Nas': 121, 'Ice Cube': 109, 'JAY-Z': 95, '2Pac': 80})
dev Counter({'Lil Wayne': 38, 'Future': 26, 'Ice Cube': 16, 'Nas': 14, '50 Cent': 13, 'JAY-Z': 13, 'Snoop Dogg': 12, '2Pac': 8})
test Counter({'Lil Wayne': 31, 'Future': 27, '50 Cent': 18, 'Snoop Dogg': 16, 'JAY-Z': 13, 'Ice Cube': 13, '2Pac': 12, 'Nas': 11})


In [34]:
# run this cell to create the train data with added verses

inpath = "../lyrics_old/afro_males/"
outpath = "../lyrics_old/afro_males/DADS_with_verses"
data = import_raw_data(inpath)
datas = split_train_dev_test(data,False)
#for data_type, data in datas:
#    print(data_type,Counter(get_artist_list(data)))
datas = [(data_type,form_x_y_including_verses(data)) for data_type, data in datas]
datas = [(data_type,preprocess_lyrics(data)) for data_type, data in datas]
for data_type, data in datas:
    print(data_type,Counter(get_artist_list(data)))
#for data_type, data in datas:
    #write_to_csv(data,outpath,data_type)

Total instances to preprocess: 1122


NameError: name 'nlp' is not defined

In [None]:
inpath = "../lyrics/marley&nas/"
outpath = "../lyrics/marley&nas/marley&nas"
data = import_raw_data(inpath)
data = form_x_y_including_verses(data)
data = preprocess_lyrics(data)
splitted_data = split_train_dev_test(data,True)

In [28]:
inpath = "../lyrics/marley&nas/"
outpath = "../lyrics/marley&nas/marley&nas"
data = import_raw_data(inpath)
splitted_data = split_train_dev_test(data,True)
datas = [('train',splitted_data[0][1] + splitted_data[1][1] + splitted_data[2][1])] # combine train dev and test
datas = [(data_type,convert_to_verse_classification(data)) for data_type, data in datas]
#datas = [(data_type,preprocess_lyrics(data)) for data_type, data in datas]
print(Counter(get_artist_list(datas[0][1])))
#splitted_data = split_train_dev_test(data,False)
for data_type, data in datas:
    write_to_csv(data,outpath,data_type)
    
# create test data for duo artist classification on Damian Marley and Nas    
data = import_one_csv_file("../lyrics/marley&nas_combined/marley&nas.csv")  
data = convert_to_verse_classification_duo_artist(data)
data = preprocess_lyrics(data)
write_to_csv(data,"../lyrics/marley&nas_combined/marely&nas","test")
print(Counter(get_artist_list(data)))

Counter({'Nas': 711, 'Damian Marley': 224})
Total instances to preprocess: 50


Counter({'Damian Marley': 30, 'Nas': 20})


In [98]:
inpath = "../lyrics/CADS/jayz&kanye/"
outpath = "../lyrics/CADS/jayz&kanye/jayz&kanye"
data = import_raw_data(inpath)
splitted_data = split_train_dev_test(data,True)
datas = [('train',splitted_data[0][1] + splitted_data[1][1] + splitted_data[2][1])] # combine train dev and test
#datas = [(data_type,convert_to_verse_classification(data)) for data_type, data in datas]
datas = [(data_type,form_x_y_including_verses(data)) for data_type, data in datas]
datas = [(data_type,preprocess_lyrics(data)) for data_type, data in datas]
print(Counter(get_artist_list(datas[0][1])))
#splitted_data = split_train_dev_test(data,False)
for data_type, data in datas:
    write_to_csv(data,outpath,data_type)
    
# create test data for duo artist classification on Damian Marley and Nas    
data = import_one_csv_file("../lyrics/CADS/jayz&kanye_combined/jayz&kanye.csv")  
data = convert_to_verse_classification_duo_artist(data)
data = preprocess_lyrics(data)
write_to_csv(data,"../lyrics/CADS/jayz&kanye_combined/jayz&kanye","test")
print(Counter(get_artist_list(data))) 

Total instances to preprocess: 354
100 200 300 Counter({'JAY-Z': 212, 'Kanye West': 142})
Total instances to preprocess: 30
Counter({'Kanye West': 19, 'JAY-Z': 11})


In [99]:

inpath = "../lyrics/CADS/marley&nas/"
outpath = "../lyrics/CADS/marley&nas/marley&nas"
data = import_raw_data(inpath)
splitted_data = split_train_dev_test(data,True)
datas = [('train',splitted_data[0][1] + splitted_data[1][1] + splitted_data[2][1])] # combine train dev and test
#datas = [(data_type,convert_to_verse_classification(data)) for data_type, data in datas]
datas = [(data_type,form_x_y_including_verses(data)) for data_type, data in datas]
datas = [(data_type,preprocess_lyrics(data)) for data_type, data in datas]
print(Counter(get_artist_list(datas[0][1])))
#splitted_data = split_train_dev_test(data,False)
for data_type, data in datas:
    write_to_csv(data,outpath,data_type)
    
# create test data for duo artist classification on Damian Marley and Nas    
data = import_one_csv_file("../lyrics/CADS/marley&nas_combined/marley&nas.csv")  
data = convert_to_verse_classification_duo_artist(data)
data = preprocess_lyrics(data)
write_to_csv(data,"../lyrics/CADS/marley&nas_combined/marley&nas","test")
print(Counter(get_artist_list(data))) 

Total instances to preprocess: 272
100 200 Counter({'Nas': 214, 'Damian Marley': 58})
Total instances to preprocess: 50
Counter({'Damian Marley': 30, 'Nas': 20})


In [None]:
### run this cell to create the test data for glad verse verification of Damian Marley & Nas

inpath = "../lyrics/marley&nas/"
known_data = import_raw_data(inpath)
known_data = convert_to_verse_classification(known_data)
known_data = preprocess_lyrics(known_data)
#splitted_data = split_train_dev_test(data,True)
#training_data = preprocess_lyrics(training_data)

unknown_data = import_one_csv_file("../lyrics/marley&nas_combined/marley&nas.csv")
unknown_data = convert_to_verse_classification_duo_artist(unknown_data)
unknown_data = preprocess_lyrics(unknown_data)
write_for_glad(known_data,unknown_data,"../lyrics/glad/test_data/marley&nas_single/")
#test_data = preprocess_lyrics(test_data)
#print("\n\n",len(training_data),len(test_data))

In [23]:
# run this cell to create the train data for glad verse classification

inpath = "../lyrics/glad/raw_train_data/"
outpath = "../lyrics/glad/train_data_single/"
data = import_raw_data(inpath)
data = convert_to_verse_classification(data)
data = preprocess_lyrics(data)
splitted_data = split_train_dev_test(data,True)
#training_data = preprocess_lyrics(training_data)
write_for_glad(splitted_data[0][1],splitted_data[1][1] + splitted_data[2][1],outpath)
#test_data = preprocess_lyrics(test_data)
#print("\n\n",len(training_data),len(test_data))

Total instances to preprocess: 8203
100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 [('Lil Wayne', 1454), ('Future', 1063), ('Snoop Dogg', 769), ('Eminem', 749), ('2Pac', 748), ('50 Cent', 685), ('Ice Cube', 527), ('Nicki Minaj', 364), ('Machine Gun Kelly', 360), ('Lil’ Kim', 267)]
5
1454
1063
769
749
748
685
527
364
360
267


In [None]:
inpath = "../lyrics/jayz&kanye/"
known_data = import_raw_data(inpath)
known_data = convert_to_verse_classification(known_data)
known_data = preprocess_lyrics(known_data)
#splitted_data = split_train_dev_test(data,True)
#training_data = preprocess_lyrics(training_data)

unknown_data = import_one_csv_file("../lyrics/marley&nas_combined/jayz&kanye.csv")
unknown_data = convert_to_verse_classification_duo_artist(unknown_data)
unknown_data = preprocess_lyrics(unknown_data)
write_for_glad(known_data,unknown_data,"../lyrics/glad/test_data/jayz&kanye_unpreproccesed/")

In [20]:
# run this cell to create the test data for glad verse verification of Damian Marley & Nas

inpath = "../lyrics/marley&nas/"
known_data = import_raw_data(inpath)
known_data = convert_to_verse_classification(known_data)
known_data = preprocess_lyrics(known_data)
splitted_data = split_train_dev_test(known_data,True)
#training_data = preprocess_lyrics(training_data)

#unknown_data = import_one_csv_file("../lyrics/marley&nas_combined/marley&nas.csv")
#unknown_data = convert_to_verse_classification_duo_artist(unknown_data)
#unknown_data = preprocess_lyrics(unknown_data)
write_for_glad(splitted_data[0][1],splitted_data[1][1] + splitted_data[2][1],"../lyrics/glad/train_data_marley&nas/")
#test_data = preprocess_lyrics(test_data)
#print("\n\n",len(training_data),len(test_data))

Total instances to preprocess: 935
100 200 300 400 500 600 700 800 900 [('Nas', 596), ('Damian Marley', 195)]
5
596
195
