In [1]:
"""
Lyric Data Processing
CMPE 351 Group Project
Spring 2021
"""


#%% Import actual data

import pandas as pd

ld = pd.read_csv('./data/track_features.csv')
ld = ld[ld["lyrics"]!="''"]

#%% Encode labels as 0 or 1

ld.valence = round(ld.valence)
ld.danceability = round(ld.danceability)

#%% Language filter

import nltk
import os

nltk.download('words')
def eng_ratio(text):
    ''' Returns the ratio of non-English to English words from a text '''

    english_vocab = set(w.lower() for w in nltk.corpus.words.words()) 
    text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha()) 
    unusual = text_vocab.difference(english_vocab)
    diff = len(unusual)/len(text_vocab)
    return diff


before = ld.shape[0]
for row_id in ld.index:
    text = ld.loc[row_id]['lyrics']
    try:
        diff = eng_ratio(text)
    except:
        ld = ld[ld.index != row_id]
        print('row %s is causing problems' %row_id)
    if diff >= 0.5:
        ld = ld[ld.index != row_id]
after = ld.shape[0]
rem = before - after
print('%s have been removed.' %rem)
print('%s songs remain in the dataset.' %after)

dataPath1 = "/Users/Ryan/Documents/GitHub/351-lyric-analysis/data/filtered_data.csv"

# ld.to_csv(os.path.join(dataPath1), index=False)

#%% Split into training, test
import numpy as np

msk = np.random.rand(len(ld)) < 0.8

train = ld[msk]
test = ld[~msk]
                 

#%% Porter-Stemmer Tokenizer, suffix stripper

import nltk
import string
import re

porter_stemmer = nltk.stem.porter.PorterStemmer()

def porter_tokenizer(text, stemmer=porter_stemmer):
    """
    A Porter-Stemmer-Tokenizer hybrid to splits sentences into words (tokens) 
    and applies the porter stemming algorithm to each of the obtained token. 
    Tokens that are only consisting of punctuation characters are removed as well.
    Only tokens that consist of more than one letter are being kept.
    
    Parameters
    ----------
        
    text : `str`. 
      A sentence that is to split into words.
        
    Returns
    ----------
    
    no_punct : `str`. 
      A list of tokens after stemming and removing Sentence punctuation patterns.
    
    """
    lower_txt = text.lower()
    tokens = nltk.wordpunct_tokenize(lower_txt)
    stems = [porter_stemmer.stem(t) for t in tokens]
    no_punct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]
    return no_punct

#%% Stop words

# # One-time download of stop words file:
# nltk.download('stopwords')
# stp = nltk.corpus.stopwords.words('english')
# with open('./stopwords_eng.txt', 'w') as outfile:
#     outfile.write('\n'.join(stp))
    
    
with open('./stopwords_eng.txt', 'r') as infile:
    stop_words = infile.read().splitlines()
print('stop words %s ...' %stop_words[:5])

#%% Count Vectorizer

from sklearn.feature_extraction.text import CountVectorizer

# can try different values for ngram_range
countVec = CountVectorizer(
            encoding='utf-8',
            decode_error='replace',
            strip_accents='unicode',
            analyzer='word',
            binary=False,
            stop_words=stop_words,
            tokenizer=porter_tokenizer,
            ngram_range=(1,1)
    )

valenceTrain = train["valence"]
valenceTest = test["valence"]
danceTrain = train["danceability"]
danceTest = test["danceability"]
# print('Vocabulary size: %s' %len(countVecTrain.get_feature_names()))



[nltk_data] Downloading package words to
[nltk_data]     C:\Users\JTOCo\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


row 1076 is causing problems
row 2075 is causing problems
row 2533 is causing problems
row 3573 is causing problems
row 4250 is causing problems
row 4380 is causing problems
row 4725 is causing problems
row 4730 is causing problems
row 5592 is causing problems
row 5895 is causing problems
row 6149 is causing problems
row 6816 is causing problems
row 7510 is causing problems
row 7535 is causing problems
499 have been removed.
8424 songs remain in the dataset.
stop words ['i', 'me', 'my', 'myself', 'we'] ...


  'stop_words.' % sorted(inconsistent))


In [63]:
#Beginning of valence prediction
valenceTrain = train["valence"].astype(str)
valenceTest = test["valence"].astype(str)
danceTrain = train["danceability"].astype(str)
danceTest = test["danceability"].astype(str)

In [80]:
#valenceTrain.value_counts()
valenceTest.value_counts()

1.0    1057
0.0     626
Name: valence, dtype: int64

In [76]:
print(NB_V_predictions)
NB_V_predictions.value_counts()

['0.0' '0.0' '0.0' ... '0.0' '0.0' '0.0']


AttributeError: 'numpy.ndarray' object has no attribute 'value_counts'

In [77]:
unique_elements, counts_elements = np.unique(NB_V_predictions, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[['0.0' '1.0']
 ['1282' '401']]


In [81]:
countVec.fit(train["lyrics"].values.ravel())

CountVectorizer(analyzer='word', binary=False, decode_error='replace',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function porter_tokenizer at 0x0000021ADFF06438>,
                vocabulary=None)

In [82]:
countVecTrain = countVec.transform(train["lyrics"].values)
countVecTest = countVec.transform(test["lyrics"].values) 

In [83]:
#Naive Bayes Model 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

clf_NB = MultinomialNB()
clf_NB.fit(countVecTrain, valenceTrain)
NB_V_predictions = clf_NB.predict(countVecTest)
print('NB Valence Accuracy score:' , accuracy_score(valenceTest, NB_V_predictions))

NB Valence Accuracy score: 0.5008912655971479


In [86]:
#Random Forest Model
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

clf_RF=RandomForestClassifier(n_estimators=1)
clf_RF.fit(countVecTrain,valenceTrain)
RF_V_predictions = clf_RF.predict(countVecTest)
print("RF Valence Accuracy:", accuracy_score(valenceTest, RF_V_predictions))

RF Valence Accuracy: 0.5591206179441474


In [84]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

clf_LR = LogisticRegression()
clf_LR.fit(countVecTrain, valenceTrain)
NB_V_predictions = clf_LR.predict(countVecTest)
print("LR Valence Accuracy:", accuracy_score(valenceTest, NB_V_predictions))



LR Valence Accuracy: 0.6333927510398099




In [94]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string
nltk.download('punkt')
from nltk import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords

#put together dataset for CNN
df = pd.read_csv ('filtered_data.csv')
df1 = df[['lyrics', 'valence']]
df1.valence = pd.cut(df1.valence,bins=[0,0.5,1],labels=[0,1])
df1.lyrics = df1['lyrics'].astype(str)


#Tokens
tokens = [word_tokenize(sen) for sen in df1.lyrics]
def lower_token(tokens): 
    return [w.lower() for w in tokens]    
    
lower_tokens = [lower_token(token) for token in tokens]

#Pos and Neg
pos = []
neg = []
for l in df1.valence:
    if l == 0:
        pos.append(0)
        neg.append(1)
    elif l == 1:
        pos.append(1)
        neg.append(0)
df1['Pos']= pos
df1['Neg']= neg

data = df1[['lyrics', 'tokens', 'valence', 'Pos', 'Neg']]
data.head()

data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))
all_test_words = [word for tokens in data_test['tokens'] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test['tokens']]
TEST_VOCAB = sorted(list(set(all_test_words)))
print('%s words total, with a vocabulary size of %s' % (len(all_test_words), len(TEST_VOCAB)))
print('Max sentence length is %s' % max(test_sentence_lengths))

import gensim
from gensim import models
from gensim.models import Word2Vec

word2vec_path = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)
MS_LENGTH = 50
EMBEDDING_DIM = 300
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["lyrics"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["lyrics"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

tew = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    tew[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
    
test_sequences = tokenizer.texts_to_sequences(data_test["lyrics"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
label_names = ['Pos', 'Neg']

y_train = data_train[label_names].values

x_train = train_cnn_data
y_tr = y_train

embedding_layer = Embedding(len(train_word_index)+1,
                         300,
                         weights=[tew],
                         input_length=MS_LENGTH,
                          trainable=False)
    
sequence_input = Input(shape=(MS_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
epochs_count = 3
b_size = 36

convs = []
filter_sizes = [2,3,4,5,6]

for filter_size in filter_sizes:
    l_conv = Conv1D(filters=250, kernel_size=filter_size, activation='relu')(embedded_sequences)
    l_pool = GlobalMaxPooling1D()(l_conv)
    convs.append(l_pool)


lm = concatenate(convs, axis=1)

x = Dropout(0.1)(lm)  
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
preds = Dense(len(list(label_names)), activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.summary()

hist = model.fit(x_train, y_tr, epochs=epochs_count, validation_split=0.1, shuffle=True, batch_size=b_size)

predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)

labels = [1, 0]

sum(data_test.valence==prediction_labels)/len(prediction_labels)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JTOCo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JTOCo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyboardInterrupt: 

In [90]:
# Ensemble for Max voting for several models
"""
Addtional Models - SVM, DT
 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
 
svc = SVC(kernel='poly, probability=True)
dt = DecisionTreeClassifier()
 
"""
 
from sklearn.ensemble import VotingClassifier
lr = LogisticRegression()
nb = MultinomialNB()
rf = RandomForestClassifier(n_estimators=1)
 
classifiers = [('lr', lr), ('nb', nb), ('rf', rf)]
vc = VotingClassifier(estimators=classifiers, voting='hard')
 
# One method
# from - https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/ 
model = VotingClassifier(estimators=classifiers, voting='hard')
model.fit(countVecTrain, valenceTrain)
model.score(countVecTest, valenceTest)



0.5906120023767083

In [92]:
# Another method
# Taken from - https://medium.com/@sanchitamangale12/voting-classifier-1be10db6d7a5 
from sklearn.model_selection import cross_val_score
a = []
a.append(cross_val_score(lr, countVecTest, valenceTest, scoring='accuracy', cv=5).mean())
a.append(cross_val_score(nb, countVecTest, valenceTest, scoring='accuracy', cv=5).mean())
a.append(cross_val_score(rf, countVecTest, valenceTest, scoring='accuracy', cv=5).mean())
a.append(cross_val_score(vc, countVecTest, valenceTest, scoring='accuracy', cv=5).mean())
 
import numpy as np
print(np.array(a))



[0.56920692 0.579937   0.54366981 0.58467601]


In [None]:
# Things for CNN

"""
# Simple Bagging
from sklearn.ensemble import BaggingClassifier
# replace line of code that
# model = Model(sequence_input, preds)
model = BaggingClassifier(Model(sequence_input, preds))

# AdaBoostClassifier 
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(Model(sequence_input, preds))

"""
"""
CNN Model - Average voting ensemble
Adapted from https://machinelearningmastery.com/weighted-average-ensemble-for-deep-learning-neural-networks/

Josh's CNN model is used down below
"""
from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from matplotlib import pyplot
from numpy import mean
from numpy import std
import numpy
from numpy import array
from numpy import argmax

# define Model + fit on dataset
def fit_model(x_train, y_tr):
    embedding_layer = Embedding(len(train_word_index)+1, 300, weights=[tew], input_length=MS_LENGTH, trainable=False)
    
    sequence_input = Input(shape=(MS_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    epochs_count = 5
    b_size = 36
    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=250, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)

    lm = concatenate(convs, axis=1)

    x = Dropout(0.1)(lm)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(len(list(label_names)), activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.fit(x_train, y_tr, epochs=epochs_count, validation_split=0.1, shuffle=True, batch_size=b_size)
    return model

# Ensemble prediction
def ensemble_predictions(models, x_test):
    # Make predictions
    yhats = [model.predict(x_test) for model in models]
    yhats = array(yhats)
    # Sum ensemble members
    model_sum = numpy.sum(yhats, axis=0)
    result = argmax(model_sum, axis=1)
    return result

# Evaluate specifics
# Double check names of testing/training data
def evaluate_n_models(models, n_models, x_test, y_test):
    subset = models[:n_models]
    # Make a predicition
    yhat = ensemble_predictions(subset, x_test)
    return accuracy_score(test_y, yhat)

n_models = 5
models = [fit_model(train_x, train_y) for _ in range(n_models)]

single_scores, ensemble_scores = list(), list()
for i in range(1, len(models)+1):
	# evaluate model with i members
	ensemble_score = evaluate_n_models(members, i, test_x, test_y)
	# evaluate the i'th model standalone
	testy_enc = to_categorical(testy)
	_, single_score = members[i-1].evaluate(test_x, testy_enc, verbose=0)
	# summarize this step
	print('> %d: single=%.3f, ensemble=%.3f' % (i, single_score, ensemble_score))
	ensemble_scores.append(ensemble_score)
	single_scores.append(single_score)

# summarize average accuracy of a single final model
print('Accuracy %.3f (%.3f)' % (mean(single_scores), std(single_scores)))
# plot score vs number of ensemble members
x_axis = [i for i in range(1, len(models)+1)]
pyplot.plot(x_axis, single_scores, marker='o', linestyle='None')
pyplot.plot(x_axis, ensemble_scores, marker='o')
pyplot.show()

In [None]:
#Beginning of danceability prediction

In [74]:
#Naive Bayes Model 
clf_NB = MultinomialNB()
clf_NB.fit(countVecTrain, danceTrain)
NB_D_predictions = clf_NB.predict(countVecTest)
print('NB Danceability Accuracy score:' , accuracy_score(danceTest, NB_V_predictions))

NB Danceability Accuracy score: 0.477124183006536


In [None]:
#Random Forest Model
clf_RF=RandomForestClassifier(n_estimators=100)
clf_RF.fit(countVecTrain,danceTrain)
RF_D_predictions = clf.predict(countVecTest)
print("RF Danceability Accuracy:", accuracy_score(danceTest, RF_V_predictions))

In [None]:
#Logistic Regression
clf_LR = LogisticRegression()
clf_LR.fit(countVecTrain, danceTrain)
NB_D_predictions = clf.predict(countVecTest)
print("LR Danceability Accuracy:", accuracy_score(danceTest, NB_V_predictions))