# Emotion detection in song lyrics

In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

## Data Import

In [2]:
# Importing EmoBank datasets
path = "Data/EmoBank-master/corpus/"

raw = pd.read_csv(path+'raw.csv') # Text
reader = pd.read_csv(path+'reader.csv') # VAD values
meta = pd.read_csv(path+'meta.tsv', sep='\t') # Category

In [3]:
# Retrieving useful columns
reader = reader.iloc[:, :4]
meta = meta.iloc[:, :3]

In [4]:
from functools import reduce

# Merging dataframes
dfs = [raw, reader, meta]
data = reduce(lambda  left, right: pd.merge(left, right, on=['id']), dfs)

In [5]:
# Map VAD scores on the range [-1, 1]
from scipy.interpolate import interp1d

range_map = interp1d([1, 5], [-1, 1])
data.iloc[:, 2:5] = range_map(data.iloc[:, 2:5])

In [6]:
data.head()

Unnamed: 0,id,text,V,A,D,document,category
0,Acephalous-Cant-believe_4_47,I can't believe I wrote all that last year.,0.0,0.2,0.1,Acephalous-Cant-believe,blog
1,Acephalous-Cant-believe_83_354,Because I've been grading all damn day and am ...,-0.1,0.1,0.1,Acephalous-Cant-believe,blog
2,Acephalous-Cant-believe_355_499,"However, when I started looking through my arc...",0.2,0.0,0.1,Acephalous-Cant-believe,blog
3,Acephalous-Cant-believe_500_515,What do I mean?,0.0,0.0,0.0,Acephalous-Cant-believe,blog
4,Acephalous-Cant-believe_517_626,The posts I consider foundational to my curren...,0.0,0.0,0.0,Acephalous-Cant-believe,blog


## Text Preprocessing

In [7]:
import spacy
import string
import contractions
import nltk

nlp = spacy.load("en_core_web_sm")
punct = string.punctuation

def text_preproc(text):
    """
    text: string to tokenize
    
    Normalize the input, lower characters, extend contractions, 
    remove functional punctuation, retrieve lemmas.
    Return a list of words.
    """
    
    text = text.lower()
    text = contractions.fix(text)
    text = "".join([char for char in text if char not in punct])
    text = nlp(text)
    
    doc = [word.lemma_ for word in text]
        
    return doc

In [8]:
tqdm.pandas()
data['tokens'] = data['text'].progress_apply(lambda x: text_preproc(x))

  0%|          | 0/10325 [00:00<?, ?it/s]

In [9]:
data.iloc[:, [1, 7]].head()

Unnamed: 0,text,tokens
0,I can't believe I wrote all that last year.,"[I, can, not, believe, I, write, all, that, la..."
1,Because I've been grading all damn day and am ...,"[because, I, have, be, grade, all, damn, day, ..."
2,"However, when I started looking through my arc...","[however, when, I, start, look, through, my, a..."
3,What do I mean?,"[what, do, I, mean]"
4,The posts I consider foundational to my curren...,"[the, post, I, consider, foundational, to, my,..."


## Embedding

### Word2Vec

In [10]:
import gensim.downloader as api
w2v = api.load('word2vec-google-news-300')



In [11]:
def sent2vec(sent):
    
    vectors = []
    for word in sent:
        try:
            vectors.append(w2v.get_vector(word))
        except:
            pass
    
    if not vectors:
        return np.zeros(300)
        
    avg = np.mean(vectors, axis=0)
    
    return avg

In [12]:
data['word2vec'] = data['tokens'].apply(lambda x: sent2vec(x))

In [13]:
data['word2vec'].head()

0    [0.031823732, -0.0037963868, 0.06633453, 0.100...
1    [0.032586347, 0.014574503, 0.01281658, 0.12373...
2    [0.0410791, 0.04775322, 0.020323487, 0.1447412...
3    [0.070129395, -0.016685486, 0.11654663, 0.1768...
4    [0.022753906, 0.0005666097, 0.032589722, 0.099...
Name: word2vec, dtype: object

### Doc2Vec

In [14]:
from gensim.models import doc2vec

# Transform tokens into tagged documents, necessary for doc2vec model
tagDocs = list(data.apply(lambda x: doc2vec.TaggedDocument(x.tokens, [x.name]), axis=1))
tagDocs[0]

TaggedDocument(words=['I', 'can', 'not', 'believe', 'I', 'write', 'all', 'that', 'last', 'year'], tags=[0])

In [15]:
# Instantiate model and vocabulary
modelDoc2Vec = doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
modelDoc2Vec.build_vocab(tagDocs)

In [16]:
# Train the model
modelDoc2Vec.train(tagDocs, total_examples=modelDoc2Vec.corpus_count, epochs=modelDoc2Vec.epochs)

In [17]:
# Vector of the first document
modelDoc2Vec.infer_vector(tagDocs[0].words)

array([ 0.14607349, -0.45510152, -0.21096148,  0.08329249,  0.2612397 ,
       -0.6539716 , -0.06509317, -0.41590542, -0.4496931 ,  0.03339815,
       -0.10746129, -0.4153262 ,  0.15846336,  0.02890161, -0.5478532 ,
       -0.03179524,  0.66885114,  0.25283146, -0.6836037 , -0.37476912,
        0.31410566,  0.05275747,  0.34006548,  0.16123846,  0.2709097 ,
       -0.02648826,  0.04230324,  0.06991088, -0.4077144 , -0.08271628,
       -0.1367673 ,  0.49990827, -0.02510439,  0.08225597, -0.34561038,
       -0.08017121,  0.0610089 , -0.00568248, -0.05780919,  0.4120754 ,
        0.19155551, -0.20670773,  0.16153032, -0.40042785, -0.11737451,
        0.11770932,  0.16606946, -0.15631115,  0.220769  ,  0.10135162],
      dtype=float32)

In [18]:
data['doc2vec'] = data.apply(lambda x: modelDoc2Vec.dv[x.name], axis=1)
data['doc2vec'].head()

0    [0.13474539, -0.30156022, 0.014730556, -0.0655...
1    [0.26257446, -0.5316987, -0.023668356, -0.5169...
2    [0.12866974, -0.26300073, 0.6015844, -0.201791...
3    [0.042556852, 0.030388653, 0.0549632, -0.08883...
4    [-0.00035672277, 0.05945972, 0.08832626, -0.21...
Name: doc2vec, dtype: object

### Doc2Vec w/o neutral words

In [19]:
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

def wn_pos_tag(tag):
    """
    tag: string indicating the tag retrieved by nltk
    Map nltk tag into wordnet tag.
    """
    
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('V'):
        return wn.VERB
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    else:
        return ''

def polarity(token, tag):
    """
    token: word
    tag: wordnet position tag
    Check the polarity of a word according to SentiWordNet.
    """
    
    try:
        ss = swn.senti_synset(token.lower() + '.' + tag + '.01')
        pol_values = (ss.pos_score(), ss.neg_score(), ss.obj_score())
    except:
        pol_values = (0, 0, 0)
        
    return pol_values

In [20]:
def polarity_check(sentence):
    """
    sentence: list of strings (words)
    Given a sentence, check which of the words is neutral
    and discard it.
    """
    
    tokens = nltk.pos_tag(sentence)
    sent_words = []
    
    for token, tag in tokens:
        tag = wn_pos_tag(tag)
        pol_values = polarity(token, tag)
        
        # Check if the word has a greater objective score
        if (pol_values[2] >= pol_values[1]) & (pol_values[2] >= pol_values[0]):
            continue
        
        sent_words.append(token)
        
    return sent_words

In [21]:
data['sent_words'] = data['tokens'].progress_apply(lambda x: polarity_check(x))
data['sent_words'].head()

  0%|          | 0/10325 [00:00<?, ?it/s]

0                         [not]
1    [other, other, good, good]
2                     [however]
3                            []
4                            []
Name: sent_words, dtype: object

In [22]:
# Apply Doc2Vec to the sentences w/o neutral words
tagDocs2 = list(data.apply(lambda x: doc2vec.TaggedDocument(x.sent_words, [x.name]), axis=1))
modelDoc2Vec2 = doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
modelDoc2Vec2.build_vocab(tagDocs2)
modelDoc2Vec2.train(tagDocs2, total_examples=modelDoc2Vec2.corpus_count, epochs=modelDoc2Vec2.epochs)
data['doc2vec_no_neutral'] = data.apply(lambda x: modelDoc2Vec2.dv[x.name], axis=1)
data['doc2vec_no_neutral'].head()

0    [-0.012546806, -0.008351298, -0.021624334, 0.0...
1    [-0.003872214, 4.819216e-05, -0.013426666, -0....
2    [0.0068667415, -0.04351351, 0.00440728, 0.0674...
3    [-0.006571603, 0.019242125, 0.014944697, -0.01...
4    [-0.015749183, -0.01840879, 0.016828537, -0.00...
Name: doc2vec_no_neutral, dtype: object

## Regression Model

### Model 1 (Word2Vec)

In [23]:
import tensorflow
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LeakyReLU
from tensorflow.keras import initializers

# Define a baseline model in keras
def baseline_model():
    model = Sequential()
    model.add(Dense(16, input_dim=300, kernel_initializer='he_normal'))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(0.5))
    model.add(Dense(8, kernel_initializer='he_normal'))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='linear'))

    model.compile(loss='mse', optimizer='adam')
    
    return model

In [24]:
# Define variables
X = pd.DataFrame(data['word2vec'].values.tolist())
y = data[['V', 'A', 'D']]

In [25]:
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

model = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=10, verbose=0)
kfold = KFold(n_splits=5)
scores = cross_val_score(model, X, y, cv=kfold)

In [26]:
print('MSE:', -scores.mean())

MSE: 0.02651027888059616


In [27]:
model.fit(X, y)

<keras.callbacks.History at 0x7f7f5627b040>

### Model 2 (Doc2Vec)

In [28]:
def baseline_model2():
    model = Sequential()
    model.add(Dense(16, input_dim=50, kernel_initializer='he_normal'))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(0.5))
    model.add(Dense(8, kernel_initializer='he_normal'))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='linear'))

    model.compile(loss='mse', optimizer='adam')
    
    return model

X = pd.DataFrame(data['doc2vec'].values.tolist())
model2 = KerasRegressor(build_fn=baseline_model2, epochs=100, batch_size=10, verbose=0)
scores2 = cross_val_score(model2, X, y, cv=kfold)
print('MSE:', -scores2.mean())

MSE: 0.03020787611603737


In [29]:
model2.fit(X, y)

<keras.callbacks.History at 0x7f7f53c06670>

### Model 3 (Doc2Vec w/o neutral words)

In [30]:
X = pd.DataFrame(data['doc2vec_no_neutral'].values.tolist())
model3 = KerasRegressor(build_fn=baseline_model2, epochs=100, batch_size=10, verbose=0)
scores3 = cross_val_score(model3, X, y, cv=kfold)
print('MSE:', -scores3.mean())

MSE: 0.03117205761373043


In [31]:
model3.fit(X, y)

<keras.callbacks.History at 0x7f806bf6ad00>

## VAD to Categories

### 1st approach

In [32]:
# Importing SemEval datasets
path2 = 'Data/AffectiveText.Semeval.2007/'

sem_eval1 = pd.read_csv(path2+'AffectiveText.trial/affectivetext_trial.emotions.gold', 
                        header=None, sep=' ', index_col=0)
sem_eval2 = pd.read_csv(path2+'AffectiveText.test/affectivetext_test.emotions.gold', 
                        header=None, sep=' ', index_col=0)

In [33]:
# Renaming columns
sem_eval_em = pd.concat([sem_eval1, sem_eval2]).rename(columns={0:'id', 1:'anger', 2:'disgust', 
                                                                3:'fear', 4:'joy', 5:'sadness', 
                                                                6:'surprise'})
sem_eval_em.index.name = None

In [34]:
sem_eval_em.head()

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise
1,22,2,60,0,64,0
2,0,0,0,93,0,38
3,18,0,52,66,20,65
4,66,39,94,0,86,0
5,0,0,25,26,2,46


In [35]:
# Importing EmoBank dataset to perform analysis on mixed values
emobank = pd.read_csv(path+'emobank.csv') # EmoBank
emobank = emobank.iloc[:, [0, 2, 3, 4]]

# Merging dataframes
dfs = [raw, emobank, meta]
emobank = reduce(lambda  left, right: pd.merge(left, right, on=['id']), dfs)
emobank.head()

Unnamed: 0,id,text,V,A,D,document,category
0,Acephalous-Cant-believe_4_47,I can't believe I wrote all that last year.,3.1,3.4,3.1,Acephalous-Cant-believe,blog
1,Acephalous-Cant-believe_83_354,Because I've been grading all damn day and am ...,2.5,3.5,3.5,Acephalous-Cant-believe,blog
2,Acephalous-Cant-believe_355_499,"However, when I started looking through my arc...",3.5,3.2,3.5,Acephalous-Cant-believe,blog
3,Acephalous-Cant-believe_500_515,What do I mean?,3.0,3.1,2.9,Acephalous-Cant-believe,blog
4,Acephalous-Cant-believe_517_626,The posts I consider foundational to my curren...,3.0,3.0,3.1,Acephalous-Cant-believe,blog


In [36]:
# Retrieving SemEval data from EmoBank
sem_eval_vad = emobank[emobank['category']=='SemEval'].iloc[:, [0, 2, 3, 4]]

# Recreating same index as sem_eval_em
sem_eval_vad['id'] = sem_eval_vad['id'].apply(lambda x: x.replace('SemEval_', ''))
sem_eval_vad.set_index('id', inplace=True)
sem_eval_vad.index.name = None
sem_eval_vad.index = sem_eval_vad.index.astype('int64')

sem_eval_vad.head()

Unnamed: 0,V,A,D
1,2.29,3.29,2.86
2,3.7,2.9,3.1
3,3.5,3.4,3.3
4,2.11,3.11,3.0
5,3.22,3.44,3.44


In [37]:
# Merge VAD values with Emotions' ones
sem_eval = pd.merge(sem_eval_vad, sem_eval_em, left_index=True, right_index=True)
sem_eval.head()

Unnamed: 0,V,A,D,anger,disgust,fear,joy,sadness,surprise
1,2.29,3.29,2.86,22,2,60,0,64,0
2,3.7,2.9,3.1,0,0,0,93,0,38
3,3.5,3.4,3.3,18,0,52,66,20,65
4,2.11,3.11,3.0,66,39,94,0,86,0
5,3.22,3.44,3.44,0,0,25,26,2,46


In [38]:
# Train and test split for regression task
from sklearn.model_selection import train_test_split

X, y = np.split(sem_eval, [3], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=123)

In [39]:
# Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor

regTree = DecisionTreeRegressor().fit(X_train, y_train)
regTree.score(X_train, y_train), regTree.score(X_test, y_test)

(0.8226080089381597, -0.3403314852008215)

In [40]:
# MSE for Decision Tree Regression
from sklearn.metrics import mean_squared_error

mean_squared_error(y_train, regTree.predict(X_train)), mean_squared_error(y_test, regTree.predict(X_test))

(54.0302513782708, 383.05900725655215)

In [41]:
# One-hot encode continuous value according to a given threshold (presence/non-presence of emotion)
sem_eval_em_oh = sem_eval_em >= 50
sem_eval_em_oh = sem_eval_em_oh.astype('int')
sem_eval = pd.merge(sem_eval_vad, sem_eval_em_oh, left_index=True, right_index=True)
sem_eval.head()

Unnamed: 0,V,A,D,anger,disgust,fear,joy,sadness,surprise
1,2.29,3.29,2.86,0,0,1,0,1,0
2,3.7,2.9,3.1,0,0,0,1,0,0
3,3.5,3.4,3.3,0,0,1,1,0,1
4,2.11,3.11,3.0,1,0,1,0,1,0
5,3.22,3.44,3.44,0,0,0,0,0,0


In [42]:
# Train and test split for classification task
X, y = np.split(sem_eval, [3], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=123)

In [43]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='entropy', random_state=123).fit(X_train, y_train)
tree.score(X_train, y_train), tree.score(X_test, y_test)

(0.8982161594963274, 0.5606694560669456)

In [44]:
# Evaluation metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

y_pred = tree.predict(X_test)

accuracy_score(y_test, y_pred), \
recall_score(y_test, y_pred, average=None), \
precision_score(y_test, y_pred, average=None), \
f1_score(y_test, y_pred, average=None)

(0.5606694560669456,
 array([0.25      , 0.        , 0.08695652, 0.3       , 0.41935484,
        0.1       ]),
 array([0.11111111, 0.        , 0.08333333, 0.52941176, 0.65      ,
        0.09090909]),
 array([0.15384615, 0.        , 0.08510638, 0.38297872, 0.50980392,
        0.0952381 ]))

In [45]:
# Confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test.iloc[:, 0], y_pred[:, 0])

array([[227,   8],
       [  3,   1]])

### 2nd approach

In [46]:
# Define Ekman's emotions coordinates in VAD space according to Mehrabian and Russell (1977)
emo_coord = {
    'anger': (-0.51, 0.59, 0.25),
    'disgust': (-0.6, 0.35, 0.11),
    'fear': (-0.64, 0.60, -0.43),
    'joy': (0.74, 0.48, 0.35),
    'sadness': (-0.63, -0.27, -0.33),
    'surprise': (0.4, 0.67, -0.13)
}

In [89]:
# Plot EmoBank data in VAD space
%matplotlib notebook
#from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt

fig = plt.figure()
ax = plt.axes(projection='3d')
ax.scatter(data['V'], data['A'], data['D'], s=.1, c='black', alpha=.2)
ax.set_xlim(-1, 1), ax.set_ylim(-1, 1), ax.set_zlim(-1, 1)

colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown']

for idx, (key, value) in enumerate(emo_coord.items()):
    ax.scatter(value[0], value[1], value[2], s=10, c=colors[idx])
    ax.text(value[0], value[1], value[2], key, color=colors[idx])

plt.show()

<IPython.core.display.Javascript object>

In [48]:
from scipy.spatial import distance

# Distance of each emotion from the center
for emo, coord in emo_coord.items():
    print(emo, distance.euclidean((0, 0, 0), coord))

anger 0.8189627586160435
disgust 0.7032780389006896
fear 0.976985158536198
joy 0.9489467845985885
sadness 0.760723339986358
surprise 0.7910752176626443


In [49]:
def vad_to_em(x):
    
    """
    x: DataFrame entry of VAD values
    
    For each observation, compute the distances from the point
    to each of the different emotions. Assign each point to the
    closest one, unless it can be considered a neutral sentence
    being too close to the center.
    """
    
    x = x.to_numpy()
    emotion = ''
    dist = 1
    
    if distance.euclidean(x, (0, 0, 0)) < 0.4:
        return emotion
    else:
        for emo, coord in emo_coord.items():
            new_dist = distance.euclidean(x, coord)
            if new_dist < dist:
                emotion = emo
                dist = new_dist
    
    return emotion

In [50]:
# Generate a variable emotion using the function above
data['emotion'] = data.apply(lambda x: vad_to_em(x[2:5]), axis=1)

In [51]:
# Observations assigned to each emotion
data['emotion'].value_counts()

            8453
disgust      539
joy          510
sadness      419
surprise     230
fear          96
anger         78
Name: emotion, dtype: int64

## Lyrics Dataset

In [52]:
# Use Genius API and library 'lyricsgenius' to retrieve information about songs
import lyricsgenius as lg

genius = lg.Genius('E60-Z9CYOy7qOQ003Bgl8cuMgtzecGTYQLBOwb4r6MsQYRn6eenLoqqLNm_W4NiZ',
                             skip_non_songs=True, remove_section_headers=True, retries=3)



def get_songs(pages=1, period='all_time'):
    """
    pages: number of pages to consider (each page has 50 items)
    period: time period which the charts refer to ('all_time', 'month', 'week', 'day')
    
    For each genre, retrieve the charts based on the time period they refer to.
    Each chart is composed of items (songs), whose quantity is defined by pages*50.
    For each song, retrieve id, title, artist and genre.
    Return the list of songs.
    """
    
    genres = ['rap', 'pop', 'rb', 'rock', 'country']
    songs = []
    
    for genre in genres:
        for page in range(1, pages+1):
            items = genius.charts(time_period=period, chart_genre=genre, 
                                  per_page=50, page=page)['chart_items']
            for item in items:
                song = item['item']
                songs.append([song['id'], song['title'].replace('\u200b', ''), 
                              song['primary_artist']['name'], genre])
                
    return pd.DataFrame(songs, columns=['id', 'title', 'artist', 'genre'])

In [53]:
songs_db = get_songs(pages=2)
songs_db.head()

Unnamed: 0,id,title,artist,genre
0,235729,Rap God,Eminem,rap
1,5832126,WAP,Cardi B,rap
2,3039923,HUMBLE.,Kendrick Lamar,rap
3,2845980,Bad and Boujee,Migos,rap
4,3315890,God’s Plan,Drake,rap


In [54]:
# Given song's id, retrieve its lyrics 
songs_db['lyrics'] = songs_db['id'].progress_apply(lambda x: genius.lyrics(x))
songs_db['lyrics'].head()

  0%|          | 0/483 [00:00<?, ?it/s]

0    "Look, I was gonna go easy on you not to hurt ...
1    Whores in this house\nThere's some whores in t...
2    Nobody pray for me\nIt been that day for me\nW...
3    You know, young rich niggas\nYou know somethin...
4    And they wishin' and wishin' and wishin' and w...
Name: lyrics, dtype: object

In [55]:
def get_tokens(lyrics):
    """
    lyrics: text of the song
    Given the lyrics, split the text in sentences,
    removing unicode characters. Then, tokenize and 
    normalize each sentence.
    """
    
    text = lyrics.encode('ascii', 'ignore').decode().split('\n')
    tokens = [text_preproc(sent) for sent in text]
    
    return tokens

In [57]:
# Tokenize and vectorize lyrics
songs_db['tokens'] = songs_db['lyrics'].apply(lambda x: get_tokens(x))
# First model
songs_db['word2vec'] = songs_db['tokens'].apply(lambda x: [sent2vec(sent) for sent in x])
# Second model
songs_db['doc2vec'] = songs_db['tokens'].apply(lambda x: [modelDoc2Vec.infer_vector(sent) for sent in x])
# Third model
songs_db['sent_words'] = songs_db['tokens'].apply(lambda x: [polarity_check(sent) for sent in x])
songs_db['doc2vec_no_neutral'] = songs_db['sent_words'].apply(lambda x: [modelDoc2Vec2.infer_vector(sent) for sent in x])

songs_db.head()

Unnamed: 0,id,title,artist,genre,lyrics,tokens,word2vec,doc2vec,sent_words,doc2vec_no_neutral
0,235729,Rap God,Eminem,rap,"""Look, I was gonna go easy on you not to hurt ...","[[look, I, be, go, to, go, easy, on, you, not,...","[[0.045401257, 0.039497375, 0.012878418, 0.165...","[[0.13366325, -0.46054986, -0.02450592, -0.119...","[[easy, not], [], [wrong], [not], [], [not], [...","[[0.01580186, -0.019095812, 0.0011912873, 0.02..."
1,5832126,WAP,Cardi B,rap,Whores in this house\nThere's some whores in t...,"[[whore, in, this, house], [there, be, some, w...","[[0.12133789, -0.018920898, 0.03717041, 0.1058...","[[-0.0527356, -0.05202704, 0.047151618, 0.0016...","[[], [], [], [], [certified], [weak], [], [], ...","[[-0.0038090122, 0.0065576825, 0.0007762337, 0..."
2,3039923,HUMBLE.,Kendrick Lamar,rap,Nobody pray for me\nIt been that day for me\nW...,"[[nobody, pray, for, I], [it, be, that, day, f...","[[0.08091736, -0.051979065, 0.10723877, 0.1708...","[[0.22447364, -0.20586772, 0.043614, -0.120065...","[[], [], [], [], [], [], [], [], [], [not], [p...","[[0.00788563, -0.008240136, 0.0037215876, 0.00..."
3,2845980,Bad and Boujee,Migos,rap,"You know, young rich niggas\nYou know somethin...","[[you, know, young, rich, niggas], [you, know,...","[[0.0597168, 0.051171876, 0.025634766, 0.21888...","[[0.27976647, -0.026207156, 0.28834197, 0.3661...","[[], [not, really, never], [], [not], [], [], ...","[[0.007620679, 0.0036392964, 0.009436485, 0.00..."
4,3315890,God’s Plan,Drake,rap,And they wishin' and wishin' and wishin' and w...,"[[and, they, wishin, and, wishin, and, wishin,...","[[-0.059375, 0.0009765625, -0.0056640627, 0.19...","[[0.0016151808, 0.10187321, -0.05933735, -0.13...","[[], [], [], [calm, not], [], [not], [], [not]...","[[0.004585327, 0.0058101737, -0.008089632, 0.0..."


## Model Application

In [71]:
def song2emo(song, nn):
    """
    song: list of vector representation of each sentence in the song
    Apply the regression model to each sentence and assign an emotion
    to it. Return the occurrences of each emotion in the song.
    """
    
    song = pd.DataFrame(song)
    vad = pd.DataFrame(nn.predict(song))
    emotions = vad.apply(lambda x: vad_to_em(x), axis=1).value_counts()
    
    emo_occ = []
    
    for emo in emo_coord.keys():
        try:
            emo_occ.append(emotions[emo])
        except:
            emo_occ.append(np.nan)
            
    return emo_occ

def freq_emo(emo_occ):
    
    emo_df = pd.DataFrame(emo_occ.to_list(), columns=list(emo_coord.keys()))
    
    # Assign the emotion with highest occurrencies to the song
    emo_df['main_emotion'] = emo_df.apply(lambda x: x.idxmax(), axis=1)
    
    return emo_df

### Model 1 (Word2Vec)

In [73]:
# Apply the regression model in order to predict VAD values
emotions = songs_db['word2vec'].progress_apply(lambda x: song2emo(x, nn=model))
emotions = freq_emo(emotions)
emotions.head()

  0%|          | 0/483 [00:00<?, ?it/s]

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,main_emotion
0,,1.0,,,,,disgust
1,,,,,,,
2,,,,,,,
3,,4.0,,,,,disgust
4,,,,,,,


In [74]:
emotions['main_emotion'].value_counts()

disgust    47
joy        47
fear        1
Name: main_emotion, dtype: int64

### Model 2 (Doc2Vec)

In [75]:
emotions2 = songs_db['doc2vec'].progress_apply(lambda x: song2emo(x, nn=model2))
emotions2 = freq_emo(emotions2)
emotions2['main_emotion'].value_counts()

  0%|          | 0/483 [00:00<?, ?it/s]

disgust    66
sadness     9
joy         5
Name: main_emotion, dtype: int64

### Model 3 (Doc2Vec w/o neutral words)

In [76]:
emotions3 = songs_db['doc2vec_no_neutral'].progress_apply(lambda x: song2emo(x, nn=model3))
emotions3 = freq_emo(emotions3)
emotions3['main_emotion'].value_counts()

  0%|          | 0/483 [00:00<?, ?it/s]

Series([], Name: main_emotion, dtype: int64)

## Correlation Analysis

In [78]:
# Retrieve main emotions for each song and model
songs_db['emotion1'] = emotions['main_emotion']
songs_db['emotion2'] = emotions2['main_emotion']

In [79]:
# Check the distribution of genres for each emotion
songs_db.groupby('emotion1')['genre'].value_counts()

emotion1  genre  
disgust   rap        16
          rock       11
          pop         8
          rb          7
          country     5
fear      country     1
joy       pop        14
          rb         13
          rap         8
          country     6
          rock        6
Name: genre, dtype: int64

In [80]:
songs_db.groupby('emotion2')['genre'].value_counts()

emotion2  genre  
disgust   rap        21
          country    12
          pop        12
          rb         11
          rock       10
joy       pop         2
          rap         2
          rb          1
sadness   rap         5
          pop         2
          rock        2
Name: genre, dtype: int64

## Playlist Generation

In [88]:
# Generate playlist based on genre and emotion
genre = 'pop'
emotion = 'joy'
model = '1'

songs_db.loc[(songs_db['genre'] == genre) & (songs_db['emotion'+model] == emotion), ['id', 'title', 'artist']]

Unnamed: 0,id,title,artist
102,4063065,"thank u, next",Ariana Grande
105,299177,Drunk in Love,Beyoncé
130,2412669,"Father Stretch My Hands, Pt. 1",Kanye West
135,2413549,Ultralight Beam,Kanye West
143,3047141,LOVE.,Kendrick Lamar
158,2819412,Let Me Love You,DJ Snake
159,3754239,Girls Like You (Remix),Maroon 5
160,118812,Fly Me to the Moon,Frank Sinatra
169,2998843,Something Just Like This,The Chainsmokers & Coldplay
176,3114474,Wild Thoughts,DJ Khaled
