Inspired by [https://www.kaggle.com/kredy10/simple-lstm-for-text-classification](https://www.kaggle.com/kredy10/simple-lstm-for-text-classification)

Data augmentation based on: [https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/48038](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/48038)

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model, Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical, np_utils
from keras.callbacks import EarlyStopping
%matplotlib inline

import os

DATASET_PATH = './../src/datasets/MoodyLyrics4Q.csv'
LYRICS_DATASET_PATH = './ml4q_lyrics.csv'
AUGMENTED_LYRICS_DATASET_PATH = './ml4q_lyrics_augmented.csv'

# Dataset

The first thing this time will be to download lyrics and put them in a table form. Our dataset will have two columns: the lyrics and the releated emotion.

In [2]:
ml4q = pd.read_csv(DATASET_PATH)
ml4q.head()

Unnamed: 0,Index,Artist,Title,Mood
0,ML1,George Michael,I Want Your Sex,happy
1,ML2,Rob Zombie,Pussy Liquor,angry
2,ML3,Katatonia,12,sad
3,ML4,Bing Crosby,Swinging On A Star,happy
4,ML5,Ludacris,Get Back,angry


In [3]:
import lyricwikia

def download_lyric(song):
    try:
        return lyricwikia.get_lyrics(song['Artist'], song['Title'])
    except lyricwikia.LyricsNotFound:
        print('Could not download {}: {}, {}'.format(song['Index'], song['Artist'], song['Title']))
        return False

In [4]:
if os.path.exists(LYRICS_DATASET_PATH):
    ml4q_lyrics = pd.read_csv(LYRICS_DATASET_PATH)
else:
    lyrics_rows = list()

    for (i, row) in ml4q.iterrows():
        lyrics = download_lyric(row)
        if lyrics:
            lyrics_rows.append((
                row['Index'], lyrics, row['Mood']
            ))

    ml4q_lyrics = pd.DataFrame(lyrics_rows, columns=['ID', 'Lyrics', 'Emotion'])
    ml4q_lyrics.to_csv(LYRICSC_DATASET_PATH)

In [5]:
ml4q_lyrics.head()

Unnamed: 0.1,Unnamed: 0,ID,Lyrics,Emotion
0,0,ML1,There's things that you guess\nAnd things that...,happy
1,1,ML2,Baby:\nGimme a B\nGimme a A\nGimme a B\nGimme ...,angry
2,2,ML3,Black theatre of love\nViolet dancers cast the...,sad
3,3,ML4,Would you like to swing on a star\nCarry moonb...,happy
4,4,ML5,Hands up! Hands up\nHere's another one\nAnd a ...,angry


In [6]:
ml4q_lyrics.describe()

Unnamed: 0.1,Unnamed: 0
count,1948.0
mean,973.5
std,562.483481
min,0.0
25%,486.75
50%,973.5
75%,1460.25
max,1947.0


In [7]:
ml4q_lyrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1948 entries, 0 to 1947
Data columns (total 4 columns):
Unnamed: 0    1948 non-null int64
ID            1948 non-null object
Lyrics        1948 non-null object
Emotion       1948 non-null object
dtypes: int64(1), object(3)
memory usage: 61.0+ KB


## Data Agumentation

To improve the size of the dataset a peculiar data augmentation technique is emploied here. Specifically, while parsing each song, we add to the dataset the original lyrics and the lyrics obtained by applying the following translations:
- EN -> IT -> EN
- EN -> FR -> EN
- EN -> DE -> EN

In [10]:
from textblob import TextBlob
from textblob.translate import NotTranslated

from googletrans import Translator

trans = Translator()

def translate(lyrics, language):
    if hasattr(lyrics, "decode"):
        lyrics = lyrics.decode("utf-8")

    #text = TextBlob(lyrics)
    
    try:
        #text = text.translate(to=language)
        #text = text.translate(to="en")
        text = trans.translate(lyrics, dest=language).text
        text = trans.translate(text, dest='en').text
    except:#NotTranslated:
        return False

    return str(text)

In [11]:
languages = ['it', 'fr', 'de']

if os.path.exists(AUGMENTED_LYRICS_DATASET_PATH):
    ml4q_lyrics_agumented = pd.read_csv(AUGMENTED_LYRICS_DATASET_PATH)
else:
    lyrics_rows = list()

    for (i, row) in ml4q_lyrics.iterrows():
        lyrics = row['Lyrics']
        lyrics_rows.append((
            row['ID'], lyrics, row['Emotion']
        ))
        for l in languages:
            tr = translate(lyrics, l)
            if tr:
                lyrics_rows.append((
                    row['ID'] + l, tr, row['Emotion']
                ))

    ml4q_lyrics_augmented = pd.DataFrame(lyrics_rows, columns=['ID', 'Lyrics', 'Emotion'])
    ml4q_lyrics_augmented.to_csv(AUGMENTED_LYRICS_DATASET_PATH)

In [12]:
ml4q_lyrics_augmented.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7002 entries, 0 to 7001
Data columns (total 3 columns):
ID         7002 non-null object
Lyrics     7002 non-null object
Emotion    7002 non-null object
dtypes: object(3)
memory usage: 164.2+ KB


# Modeling

In [13]:
X = ml4q_lyrics['Lyrics']
Y = ml4q_lyrics['Emotion']

In [14]:
le = LabelEncoder()
Y = le.fit_transform(Y)
Y =  np_utils.to_categorical(Y)

In [15]:
X_train,X_test,Y_train,Y_test = train_test_split(X, Y, test_size=0.2)

## Data Preprocessing

- Tokenize the data and convert the text to sequences.
- Add padding to ensure that all the sequences have the same shape.
- There are many ways of taking the max_len and here an arbitrary length of 300 is chosen.

In [98]:
max_words = 150
max_len = 200
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

## Model training

### RNN

In [105]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    #layer = LSTM(128, return_sequences=True)(layer)
    layer = LSTM(32)(layer)
    layer = Dense(256, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.2)(layer)
    #layer = Dense(128, name='FC2')(layer)
    #layer = Activation('tanh')(layer)
    #layer = Dropout(0.5)(layer)
    layer = Dense(4,name='out_layer')(layer)
    layer = Activation('softmax')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [106]:
model = RNN()
model.summary()
model.compile(loss='categorical_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 200)               0         
_________________________________________________________________
embedding_13 (Embedding)     (None, 200, 50)           7500      
_________________________________________________________________
lstm_18 (LSTM)               (None, 32)                10624     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               8448      
_________________________________________________________________
activation_27 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_17 (Dropout)         (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 4)                 1028      
__________

In [107]:
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=50,
          validation_split=0.2)

Train on 1246 samples, validate on 312 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fdfc1b3ff60>

Evaluate model on test

In [108]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [109]:
accr = model.evaluate(test_sequences_matrix,Y_test)



In [110]:
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 1.423
  Accuracy: 0.438


### Feedforward NN

In [69]:
def feedforward_nn():
    classifier = Sequential()
    classifier = Sequential()
    classifier.add(Dense(units = 256, kernel_initializer = 'random_normal', 
                         activation = 'sigmoid', input_dim = max_len))
    classifier.add(Dropout(0.5))
    classifier.add(Dense(units = 4, kernel_initializer = 'glorot_normal', activation = 'softmax'))
    classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])
    return classifier

In [70]:
model = feedforward_nn()
model.summary()
model.compile(loss='categorical_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 256)               77056     
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 1028      
Total params: 78,084
Trainable params: 78,084
Non-trainable params: 0
_________________________________________________________________
