In [2]:
"""
Lyric Data Processing
CMPE 351 Group Project
Spring 2021
"""
#bin names for genres rock, country, hip hop, pop

#%% Import actual data

import pandas as pd

ld = pd.read_csv('./data/track_features.csv')
ld = ld[ld["lyrics"]!="''"]

#%% Encode labels as 0 or 1

ld.valence = round(ld.valence)
ld.danceability = round(ld.danceability)

#%% Language filter

import nltk
import os

nltk.download('words')
def eng_ratio(text):
    ''' Returns the ratio of non-English to English words from a text '''

    english_vocab = set(w.lower() for w in nltk.corpus.words.words()) 
    text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha()) 
    unusual = text_vocab.difference(english_vocab)
    diff = len(unusual)/len(text_vocab)
    return diff


before = ld.shape[0]
for row_id in ld.index:
    text = ld.loc[row_id]['lyrics']
    try:
        diff = eng_ratio(text)
    except:
        ld = ld[ld.index != row_id]
        print('row %s is causing problems' %row_id)
    if diff >= 0.5:
        ld = ld[ld.index != row_id]
after = ld.shape[0]
rem = before - after
print('%s have been removed.' %rem)
print('%s songs remain in the dataset.' %after)

dataPath1 = "/Users/Ryan/Documents/GitHub/351-lyric-analysis/data/filtered_data.csv"

# ld.to_csv(os.path.join(dataPath1), index=False)

#%% Split into training, test
import numpy as np

msk = np.random.rand(len(ld)) < 0.8

train = ld[msk]
test = ld[~msk]
                 

#%% Porter-Stemmer Tokenizer, suffix stripper

import nltk
import string
import re

porter_stemmer = nltk.stem.porter.PorterStemmer()

def porter_tokenizer(text, stemmer=porter_stemmer):
    """
    A Porter-Stemmer-Tokenizer hybrid to splits sentences into words (tokens) 
    and applies the porter stemming algorithm to each of the obtained token. 
    Tokens that are only consisting of punctuation characters are removed as well.
    Only tokens that consist of more than one letter are being kept.
    
    Parameters
    ----------
        
    text : `str`. 
      A sentence that is to split into words.
        
    Returns
    ----------
    
    no_punct : `str`. 
      A list of tokens after stemming and removing Sentence punctuation patterns.
    
    """
    lower_txt = text.lower()
    tokens = nltk.wordpunct_tokenize(lower_txt)
    stems = [porter_stemmer.stem(t) for t in tokens]
    no_punct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]
    return no_punct

#%% Stop words

# # One-time download of stop words file:
# nltk.download('stopwords')
# stp = nltk.corpus.stopwords.words('english')
# with open('./stopwords_eng.txt', 'w') as outfile:
#     outfile.write('\n'.join(stp))
    
    
with open('./stopwords_eng.txt', 'r') as infile:
    stop_words = infile.read().splitlines()
print('stop words %s ...' %stop_words[:5])

#%% Count Vectorizer

from sklearn.feature_extraction.text import CountVectorizer

# can try different values for ngram_range
countVec = CountVectorizer(
            encoding='utf-8',
            decode_error='replace',
            strip_accents='unicode',
            analyzer='word',
            binary=False,
            stop_words=stop_words,
            tokenizer=porter_tokenizer,
            ngram_range=(1,1)
    )

valenceTrain = train["valence"]
valenceTest = test["valence"]
danceTrain = train["danceability"]
danceTest = test["danceability"]
# print('Vocabulary size: %s' %len(countVecTrain.get_feature_names()))



[nltk_data] Error loading words: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


row 1076 is causing problems
row 2075 is causing problems
row 2533 is causing problems
row 3573 is causing problems
row 4250 is causing problems
row 4380 is causing problems
row 4725 is causing problems
row 4730 is causing problems
row 5592 is causing problems
row 5895 is causing problems
row 6149 is causing problems
row 6816 is causing problems
row 7510 is causing problems
row 7535 is causing problems
499 have been removed.
8424 songs remain in the dataset.
stop words ['i', 'me', 'my', 'myself', 'we'] ...


In [1]:
#non genre specific
valenceTrain = train["valence"].astype(str)
valenceTest = test["valence"].astype(str)
danceTrain = train["danceability"].astype(str)
danceTest = test["danceability"].astype(str)

NameError: name 'train' is not defined

In [None]:
countVec.fit(train["lyrics"].values.ravel())
countVecTrain = countVec.transform(train["lyrics"].values)
countVecTest = countVec.transform(test["lyrics"].values) 
#end of non genre specific

In [78]:
#genre specific
train_rock = train[train['genre']=="rock"]
train_pop = train[train['genre']=="pop"]
train_hiphop = train[train['genre']=="hip hop"]
train_country = train[train['genre']=="country"]

test_rock = test[test['genre']=="rock"]
test_pop = test[test['genre']=="pop"]
test_hiphop = test[test['genre']=="hip hop"]
test_country = test[test['genre']=="country"]

#For the next sections, choose only one depending on the genre you want to inspect

In [121]:
#genre specific country
valenceTrain = train_country["valence"].astype(str)
valenceTest = test_country["valence"].astype(str)
danceTrain = train_country["danceability"].astype(str)
danceTest = test_country["danceability"].astype(str)

countVec.fit(train_country["lyrics"].values.ravel())
countVecTrain = countVec.transform(train_country["lyrics"].values)
countVecTest = countVec.transform(test_country["lyrics"].values) 

In [122]:
#genre specific pop
valenceTrain = train_pop["valence"].astype(str)
valenceTest = test_pop["valence"].astype(str)
danceTrain = train_pop["danceability"].astype(str)
danceTest = test_pop["danceability"].astype(str)

countVec.fit(train_pop["lyrics"].values.ravel())
countVecTrain = countVec.transform(train_pop["lyrics"].values)
countVecTest = countVec.transform(test_pop["lyrics"].values) 

In [None]:
#genre specific hiphop
valenceTrain = train_hiphop["valence"].astype(str)
valenceTest = test_hiphop["valence"].astype(str)
danceTrain = train_hiphop["danceability"].astype(str)
danceTest = test_hiphop["danceability"].astype(str)

countVec.fit(train_hiphop["lyrics"].values.ravel())
countVecTrain = countVec.transform(train_hiphop["lyrics"].values)
countVecTest = countVec.transform(test_hiphop["lyrics"].values) 

In [None]:
#genre specific rock
valenceTrain = train_rock["valence"].astype(str)
valenceTest = test_rock["valence"].astype(str)
danceTrain = train_rock["danceability"].astype(str)
danceTest = test_rock["danceability"].astype(str)

countVec.fit(train_rock["lyrics"].values.ravel())
countVecTrain = countVec.transform(train_rock["lyrics"].values)
countVecTest = countVec.transform(test_rock["lyrics"].values) 

In [None]:
#end of genre specific

In [141]:
#Naive Bayes Model valence
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

clf_NB = MultinomialNB()
clf_NB.fit(countVecTrain, valenceTrain)
NB_V_predictions = clf_NB.predict(countVecTest)
print('NB Valence Accuracy score:' , accuracy_score(valenceTest, NB_V_predictions))

NB Valence Accuracy score: 0.5833333333333334


In [129]:
#Random Forest Model valence
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

clf_RF=RandomForestClassifier(n_estimators=10)
clf_RF.fit(countVecTrain,valenceTrain)
RF_V_predictions = clf_RF.predict(countVecTest)
print("RF Valence Accuracy:", accuracy_score(valenceTest, RF_V_predictions))

RF Valence Accuracy: 0.6041666666666666


In [142]:
#Logistic Regression valence
from sklearn.linear_model import LogisticRegression

clf_LR = LogisticRegression()
clf_LR.fit(countVecTrain, valenceTrain)
LR_V_predictions = clf_LR.predict(countVecTest)
print("LR Valence Accuracy:", accuracy_score(valenceTest, LR_V_predictions))

LR Valence Accuracy: 0.5833333333333334




In [21]:
#Start of CNN valence
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string
nltk.download('punkt')
from nltk import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JTOCo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JTOCo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
df = pd.read_csv ('filtered_data.csv')
df1 = df[['lyrics', 'valence']]

In [26]:
#df1.valence = pd.cut(df1.valence,bins=[0,0.5,1],labels=[0,1])
df1.lyrics = df1.lyrics.replace(r'\\n',' ', regex=True) 
df1.lyrics = df1['lyrics'].astype(str)

print("getting tokens")

#Tokens
tokens = [word_tokenize(sen) for sen in df1.lyrics]
def lower_token(tokens): 
    return [w.lower() for w in tokens]    
    
lower_tokens = [lower_token(token) for token in tokens]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


getting tokens


In [27]:
#Pos and Neg
pos = []
neg = []
for l in df1.valence:
    if l == 0:
        pos.append(0)
        neg.append(1)
    elif l == 1:
        pos.append(1)
        neg.append(0)
df1['Pos']= pos
df1['Neg']= neg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [28]:

df1.head()

Unnamed: 0,lyrics,valence,Pos,Neg
0,'Once upon a time you dressed so fine Threw th...,1.0,1,0
1,"""Load up on guns, bring your friends It's fun ...",1.0,1,0
2,'CDsAC/DC - Back in Black (1980) - For Those A...,0.0,0,1
3,"'I, I love the colorful clothes she wears And ...",0.0,0,1
4,'Deep down in Louisiana close to New Orleans W...,1.0,1,0


In [29]:
stoplist = stopwords.words('english')
def removeStopWords(tokens): 
    return [w for w in tokens if w not in stoplist]
filtered_words = [removeStopWords(i) for i in lower_tokens]
df1['lyrics'] = [' '.join(i) for i in filtered_words]
df1['tokens'] = filtered_words

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [30]:
data = df1[['lyrics', 'tokens', 'valence', 'Pos', 'Neg']]
data.head()

data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))
all_test_words = [word for tokens in data_test['tokens'] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test['tokens']]
TEST_VOCAB = sorted(list(set(all_test_words)))
print('%s words total, with a vocabulary size of %s' % (len(all_test_words), len(TEST_VOCAB)))
print('Max sentence length is %s' % max(test_sentence_lengths))

15707217 words total, with a vocabulary size of 165789
Max sentence length is 146616
1481629 words total, with a vocabulary size of 72671
Max sentence length is 98008


In [31]:
import gensim
from gensim import models
from gensim.models import Word2Vec

word2vec_path = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [32]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)


In [33]:
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [36]:

MS_LENGTH = 50
EMBEDDING_DIM = 300
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["lyrics"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["lyrics"].tolist())

In [37]:
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 124932 unique tokens.


In [40]:


train_cnn_data = pad_sequences(training_sequences, maxlen=MS_LENGTH)

tew = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    tew[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
    
test_sequences = tokenizer.texts_to_sequences(data_test["lyrics"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MS_LENGTH)
label_names = ['Pos', 'Neg']

In [41]:
y_train = data_train[label_names].values

x_train = train_cnn_data
y_tr = y_train


In [42]:


embedding_layer = Embedding(len(train_word_index)+1,
                         300,
                         weights=[tew],
                         input_length=MS_LENGTH,
                          trainable=False)
    
sequence_input = Input(shape=(MS_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
epochs_count = 3
b_size = 36

convs = []
filter_sizes = [2,3,4,5,6]

for filter_size in filter_sizes:
    l_conv = Conv1D(filters=250, kernel_size=filter_size, activation='relu')(embedded_sequences)
    l_pool = GlobalMaxPooling1D()(l_conv)
    convs.append(l_pool)


lm = concatenate(convs, axis=1)

x = Dropout(0.1)(lm)  
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
preds = Dense(len(list(label_names)), activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 50, 300)      37479900    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 49, 250)      150250      embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 48, 250)      225250      embedding[0][0]                  
______________________________________________________________________________________________

In [48]:
hist = model.fit(x_train, y_tr, epochs=epochs_count, validation_split=0.1, shuffle=True, batch_size=b_size)

predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)

labels = [1, 0]



Epoch 1/3
Epoch 2/3
Epoch 3/3


In [47]:
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

sum(data_test.valence==prediction_labels)/len(prediction_labels)
#end of CNN valence

0.594306049822064

In [143]:
# Ensemble for Max voting for several models valence
"""
Addtional Models - SVM, DT
 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
 
svc = SVC(kernel='poly, probability=True)
dt = DecisionTreeClassifier()
 
"""
 
from sklearn.ensemble import VotingClassifier
 
classifiers = [('lr', clf_LR), ('nb', clf_NB), ('rf', clf_RF)]
vc = VotingClassifier(estimators=classifiers, voting='hard')
 
# One method
# from - https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/ 
model = VotingClassifier(estimators=classifiers, voting='hard')
model.fit(countVecTrain, valenceTrain)
model.score(countVecTest, valenceTest)

0.5833333333333334

In [92]:
# Another method  #DELETE DELETE DELETE
# Taken from - https://medium.com/@sanchitamangale12/voting-classifier-1be10db6d7a5 
from sklearn.model_selection import cross_val_score
a = []
a.append(cross_val_score(lr, countVecTest, valenceTest, scoring='accuracy', cv=5).mean())
a.append(cross_val_score(nb, countVecTest, valenceTest, scoring='accuracy', cv=5).mean())
a.append(cross_val_score(rf, countVecTest, valenceTest, scoring='accuracy', cv=5).mean())
a.append(cross_val_score(vc, countVecTest, valenceTest, scoring='accuracy', cv=5).mean())
 
import numpy as np
print(np.array(a))



[0.56920692 0.579937   0.54366981 0.58467601]


In [None]:
# Things for CNN #DELETE DELETE DELETE

"""
# Simple Bagging
from sklearn.ensemble import BaggingClassifier
# replace line of code that
# model = Model(sequence_input, preds)
model = BaggingClassifier(Model(sequence_input, preds))

# AdaBoostClassifier 
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(Model(sequence_input, preds))

"""
"""
CNN Model - Average voting ensemble
Adapted from https://machinelearningmastery.com/weighted-average-ensemble-for-deep-learning-neural-networks/

Josh's CNN model is used down below
"""
from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from matplotlib import pyplot
from numpy import mean
from numpy import std
import numpy
from numpy import array
from numpy import argmax

# define Model + fit on dataset
def fit_model(x_train, y_tr):
    embedding_layer = Embedding(len(train_word_index)+1, 300, weights=[tew], input_length=MS_LENGTH, trainable=False)
    
    sequence_input = Input(shape=(MS_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    epochs_count = 5
    b_size = 36
    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=250, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)

    lm = concatenate(convs, axis=1)

    x = Dropout(0.1)(lm)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(len(list(label_names)), activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.fit(x_train, y_tr, epochs=epochs_count, validation_split=0.1, shuffle=True, batch_size=b_size)
    return model

# Ensemble prediction
def ensemble_predictions(models, x_test):
    # Make predictions
    yhats = [model.predict(x_test) for model in models]
    yhats = array(yhats)
    # Sum ensemble members
    model_sum = numpy.sum(yhats, axis=0)
    result = argmax(model_sum, axis=1)
    return result

# Evaluate specifics
# Double check names of testing/training data
def evaluate_n_models(models, n_models, x_test, y_test):
    subset = models[:n_models]
    # Make a predicition
    yhat = ensemble_predictions(subset, x_test)
    return accuracy_score(test_y, yhat)

n_models = 5
models = [fit_model(train_x, train_y) for _ in range(n_models)]

single_scores, ensemble_scores = list(), list()
for i in range(1, len(models)+1):
	# evaluate model with i members
	ensemble_score = evaluate_n_models(members, i, test_x, test_y)
	# evaluate the i'th model standalone
	testy_enc = to_categorical(testy)
	_, single_score = members[i-1].evaluate(test_x, testy_enc, verbose=0)
	# summarize this step
	print('> %d: single=%.3f, ensemble=%.3f' % (i, single_score, ensemble_score))
	ensemble_scores.append(ensemble_score)
	single_scores.append(single_score)

# summarize average accuracy of a single final model
print('Accuracy %.3f (%.3f)' % (mean(single_scores), std(single_scores)))
# plot score vs number of ensemble members
x_axis = [i for i in range(1, len(models)+1)]
pyplot.plot(x_axis, single_scores, marker='o', linestyle='None')
pyplot.plot(x_axis, ensemble_scores, marker='o')
pyplot.show()

In [None]:
#Beginning of danceability prediction

In [155]:
#Naive Bayes Model  danceability
clf_NB = MultinomialNB()
clf_NB.fit(countVecTrain, danceTrain)
NB_D_predictions = clf_NB.predict(countVecTest)
print('NB Danceability Accuracy score:' , accuracy_score(danceTest, NB_D_predictions))

NB Danceability Accuracy score: 0.4166666666666667


In [145]:
#Random Forest Model danceability
clf_RF=RandomForestClassifier(n_estimators=10)
clf_RF.fit(countVecTrain,danceTrain)
RF_D_predictions = clf_RF.predict(countVecTest)
print("RF Danceability Accuracy:", accuracy_score(danceTest, RF_D_predictions))

RF Danceability Accuracy: 0.7083333333333334


In [146]:
#Logistic Regression danceability
clf_LR = LogisticRegression()
clf_LR.fit(countVecTrain, danceTrain)
LR_D_predictions = clf_LR.predict(countVecTest)
print("LR Danceability Accuracy:", accuracy_score(danceTest, LR_D_predictions))

LR Danceability Accuracy: 0.625




In [156]:
# Ensemble for Max voting for several models danceability
"""
Addtional Models - SVM, DT
 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
 
svc = SVC(kernel='poly, probability=True)
dt = DecisionTreeClassifier()
 
"""
 
from sklearn.ensemble import VotingClassifier
 
classifiers = [('lr', clf_LR), ('nb', clf_NB), ('rf', clf_RF)]
vc = VotingClassifier(estimators=classifiers, voting='hard')
 
# One method
# from - https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/ 
model = VotingClassifier(estimators=classifiers, voting='hard')
model.fit(countVecTrain, danceTrain)
model.score(countVecTest, danceTest)



0.6041666666666666

In [None]:
#CNN for danceability

In [53]:
df = pd.read_csv ('filtered_data.csv')
df1 = df[['lyrics', 'danceability']]

In [55]:
#df1.valence = pd.cut(df1.valence,bins=[0,0.5,1],labels=[0,1])
df1.lyrics = df1.lyrics.replace(r'\\n',' ', regex=True) 
df1.lyrics = df1['lyrics'].astype(str)

print("getting tokens")

#Tokens
tokens = [word_tokenize(sen) for sen in df1.lyrics]
def lower_token(tokens): 
    return [w.lower() for w in tokens]    
    
lower_tokens = [lower_token(token) for token in tokens]

getting tokens


In [56]:
#Pos and Neg
pos = []
neg = []
for l in df1.danceability:
    if l == 0:
        pos.append(0)
        neg.append(1)
    elif l == 1:
        pos.append(1)
        neg.append(0)
df1['Pos']= pos
df1['Neg']= neg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [58]:
stoplist = stopwords.words('english')
def removeStopWords(tokens): 
    return [w for w in tokens if w not in stoplist]
filtered_words = [removeStopWords(i) for i in lower_tokens]
df1['lyrics'] = [' '.join(i) for i in filtered_words]
df1['tokens'] = filtered_words

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [59]:
data = df1[['lyrics', 'tokens', 'danceability', 'Pos', 'Neg']]
data.head()

data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))
all_test_words = [word for tokens in data_test['tokens'] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test['tokens']]
TEST_VOCAB = sorted(list(set(all_test_words)))
print('%s words total, with a vocabulary size of %s' % (len(all_test_words), len(TEST_VOCAB)))
print('Max sentence length is %s' % max(test_sentence_lengths))

15707217 words total, with a vocabulary size of 165789
Max sentence length is 146616
1481629 words total, with a vocabulary size of 72671
Max sentence length is 98008


In [60]:
import gensim
from gensim import models
from gensim.models import Word2Vec

word2vec_path = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [61]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

In [63]:
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [64]:

MS_LENGTH = 50
EMBEDDING_DIM = 300
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["lyrics"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["lyrics"].tolist())

In [65]:
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 124932 unique tokens.


In [66]:
train_cnn_data = pad_sequences(training_sequences, maxlen=MS_LENGTH)

tew = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    tew[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
    
test_sequences = tokenizer.texts_to_sequences(data_test["lyrics"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MS_LENGTH)
label_names = ['Pos', 'Neg']

In [67]:
y_train = data_train[label_names].values

x_train = train_cnn_data
y_tr = y_train


In [74]:


embedding_layer = Embedding(len(train_word_index)+1,
                         300,
                         weights=[tew],
                         input_length=MS_LENGTH,
                          trainable=False)
    
sequence_input = Input(shape=(MS_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
epochs_count = 10
b_size = 36

convs = []
filter_sizes = [2,3,4,5,6]

for filter_size in filter_sizes:
    l_conv = Conv1D(filters=250, kernel_size=filter_size, activation='relu')(embedded_sequences)
    l_pool = GlobalMaxPooling1D()(l_conv)
    convs.append(l_pool)


lm = concatenate(convs, axis=1)

x = Dropout(0.1)(lm)  
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
preds = Dense(len(list(label_names)), activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 50, 300)      37479900    input_4[0][0]                    
__________________________________________________________________________________________________
conv1d_15 (Conv1D)              (None, 49, 250)      150250      embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_16 (Conv1D)              (None, 48, 250)      225250      embedding_3[0][0]                
____________________________________________________________________________________________

In [75]:
hist = model.fit(x_train, y_tr, epochs=epochs_count, validation_split=0.1, shuffle=True, batch_size=b_size)

predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)

labels = [1, 0]



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [76]:
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

sum(data_test.danceability==prediction_labels)/len(prediction_labels)
#end of CNN for danceability

0.5954922894424673