# NLP Challenge Identifying Adele & Bob Marley Lyrics #

## by Lorenz Madarang ##

## Data: https://www.kaggle.com/paultimothymooney/poetry/kernels ##

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

## Overview ##
Two .txt files that contain lyrics to Adele songs and Bob Marley songs are ingested.  Supervised BoW models are run on on the corpus of lyrics and an unsupervised vector space model is used to find features.  Natural Language Processing has been applied to extract context from the corpus and to remove stop words.   

In [3]:
# Read in the Adele Lyrics 
f=open('adele.txt','rU')
adele=f.read()

# Read in the Bob Marley Lyrics
f=open('bob-marley.txt','rU')
marley=f.read()

  
  


In [4]:
import spacy
nlp = spacy.load('en')

# Apply spaCy nlp to the lyric corpus
adele_doc = nlp(adele)
marley_doc = nlp(marley)

In [5]:
# Take a peak at the token information of the Adele Lyrics
print("The adele_doc object is a {} object.".format(type(adele_doc)))
print("It is {} tokens long".format(len(adele_doc)))
print("The first three tokens are '{}'".format(adele_doc[:3]))

The adele_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 23858 tokens long
The first three tokens are 'Looking for some'


### Word Frequencies ###
Word frequencies have been conducted on both lyrics.  Two versions of the word frequencies were created one that includes stop words and the other one that does not.  Newline characters were captured in both frequencies and weird contractions like "n't" and "'s".  In the word frequency that excludes stopwords more meaningful words were able to come through such as "love" and "You".  The unique words for each corpus are contractions.  It is also interesting to note that 'You' is unique to Adele but 'you' is used in Bob Marley songs.  It looks like the word frequencies are case sensitive.  

In [6]:
from collections import Counter

# Utility function to calculate how frequently words appear in the text.
def word_frequencies(text, include_stop=True):
    
    # Build a list of words.
    # Strip out punctuation and, optionally, stop words.
    words = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
            
    # Build and return a Counter object containing word counts.
    return Counter(words)
    
# The most frequent words:
adele_freq = word_frequencies(adele_doc).most_common(10)
marley_freq = word_frequencies(marley_doc).most_common(10)
print('Adele:', adele_freq)
print('Bob Marley:', marley_freq)

Adele: [('\n', 2399), ('I', 1249), ('you', 854), ('the', 590), ('me', 484), ('it', 429), ('to', 403), ("n't", 396), ('my', 358), ('your', 254)]
Bob Marley: [('\n', 2217), ('I', 596), ('the', 547), ('you', 511), ('a', 371), ('to', 295), ("n't", 275), ('it', 260), ('no', 238), ('be', 194)]


In [7]:
# Use our optional keyword argument to remove stop words.
adele_freq = word_frequencies(adele_doc, include_stop=False).most_common(10)
marley_freq = word_frequencies(marley_doc, include_stop=False).most_common(10)
print('Adele:', adele_freq)
print('Bob Marley:', marley_freq)

Adele: [('\n', 2399), ('I', 1249), ("n't", 396), ('love', 226), ('You', 197), ('And', 192), ("'s", 171), ("'m", 163), ("'re", 145), ("'ve", 129)]
Bob Marley: [('\n', 2217), ('I', 596), ("n't", 275), ("'s", 184), ('love', 164), ('yeah', 144), ('na', 142), ('We', 122), ('And', 119), ('oh', 117)]


In [8]:
# Pull out just the text from our frequency lists.
adele_common = [pair[0] for pair in adele_freq]
marley_common = [pair[0] for pair in marley_freq]

# Use sets to find the unique values in each top ten.
print('Unique to Adele:', set(adele_common) - set(marley_common))
print('Unique to Bob Marley:', set(marley_common) - set(adele_common))

Unique to Adele: {"'m", "'ve", 'You', "'re"}
Unique to Bob Marley: {'We', 'na', 'yeah', 'oh'}


### Lemma Frequencies ###
The lemma frequencies for each lyric corpus are pretty similar: "be," "not," and "love".  The unique lemmas for each corpus are interesting.  The lemmas might imply overall themes for their songs.  The most common and unique lemma to Adele lyrics is "know."  It can be taken that Adele wants her subject or audience to constantly 'know' something about her in her songs.  The most common and unique lemma to Bob Marley lyrics is "come." It can be taken that Bob Marley is either inviting his subjects in his songs or talking about a subject that has traversed.  

In [9]:
# Utility function to calculate how frequently lemmas appear in the text.
def lemma_frequencies(text, include_stop=True):
    
    # Build a list of lemas.
    # Strip out punctuation and, optionally, stop words.
    lemmas = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            lemmas.append(token.lemma_)
            
    # Build and return a Counter object containing word counts.
    return Counter(lemmas)

# Instantiate our list of most common lemmas.
adele_lemma_freq = lemma_frequencies(adele_doc, include_stop=False).most_common(10)
marley_lemma_freq = lemma_frequencies(marley_doc, include_stop=False).most_common(10)
print('\nAdele:', adele_lemma_freq)
print('Bob Marley:', marley_lemma_freq)

# Again, identify the lemmas common to one text but not the other.
adele_lemma_common = [pair[0] for pair in adele_lemma_freq]
marley_lemma_common = [pair[0] for pair in marley_lemma_freq]
print('Unique to Adele:', set(adele_lemma_common) - set(marley_lemma_common))
print('Unique to Bob Marley:', set(marley_lemma_common) - set(adele_lemma_common))


Adele: [('\n', 2399), ('-PRON-', 1711), ('be', 585), ('not', 396), ('love', 254), ('and', 192), ('know', 163), ('will', 151), ('have', 137), ('let', 133)]
Bob Marley: [('\n', 2217), ('-PRON-', 945), ('be', 375), ('not', 277), ('love', 230), ('oh', 204), ('yeah', 159), ('go', 154), ('come', 149), ('to', 138)]
Unique to Adele: {'know', 'let', 'have', 'will', 'and'}
Unique to Bob Marley: {'to', 'come', 'oh', 'yeah', 'go'}


### Exploration of Sentences ###
spaCy was able to identify sentences within the lyrics.  It was also able to properly identify the parts of speech and dependencies in a sentence.  

In [10]:
# Initial exploration of sentences.
sentences = list(adele_doc.sents)
print("Adele has {} sentences.".format(len(sentences)))

example_sentence = sentences[2]
print("Here is an example: \n{}\n".format(example_sentence))

Adele has 2222 sentences.
Here is an example: 
I won't bore you with the details, baby




In [11]:
sentences[0]

Looking for some education
Made my way into the night

In [12]:
# View the part of speech for some tokens in our sentence.
print('\nParts of speech:')
for token in example_sentence[:9]:
    print(token.orth_, token.pos_)


Parts of speech:
I PRON
wo VERB
n't ADV
bore VERB
you PRON
with ADP
the DET
details NOUN
, PUNCT


In [13]:
# View the dependencies for some tokens.
print('\nDependencies:')
for token in example_sentence[:9]:
    print(token.orth_, token.dep_, token.head.orth_)


Dependencies:
I nsubj bore
wo aux bore
n't neg bore
bore ROOT bore
you dobj bore
with prep bore
the det details
details pobj with
, punct bore


## Bag of Words Model ##
A dataframe was created of the lyric sentences and its associated singer.  A bag of words was created for each singer's lyric corpus and then a combined bag of words was created.  And then a dataframe was created with the combined bag of words as features.  

In [14]:
# Group into sentences.
adele_sents = [[sent, "Adele"] for sent in list(adele_doc.sents)]
marley_sents = [[sent, "Bob Marley"] for sent in list(marley_doc.sents)]

# Combine the sentences from the two lyric corpus into one data frame.
sentences = pd.DataFrame(adele_sents + marley_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Looking, for, some, education, \n, Made, my, ...",Adele
1,"(All, that, bullshit, conversation, \n, Baby, ...",Adele
2,"(I, wo, n't, bore, you, with, the, details, ,,...",Adele
3,"(I, do, n't, even, wanna, waste, your, time, \n)",Adele
4,"(Let, 's, just, say, that, maybe, \n, You, cou...",Adele


In [15]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
adelewords = bag_of_words(adele_doc)
marleywords = bag_of_words(marley_doc)

# Combine bags to create a set of unique words.
common_words = set(adelewords + marleywords)

In [16]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000


Unnamed: 0,tongue,belong,picture,lust,half,ashamed,reflexes,shoot,government,despair,...,almighty,thit,path,because,crows,bring,clearly,hearts,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Looking, for, some, education, \n, Made, my, ...",Adele
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(All, that, bullshit, conversation, \n, Baby, ...",Adele
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, wo, n't, bore, you, with, the, details, ,,...",Adele
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, do, n't, even, wanna, waste, your, time, \n)",Adele
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Let, 's, just, say, that, maybe, \n, You, cou...",Adele


### Supervised Models ###
Three models where ran on the Bag of Words.  The first model that was run was a Random Forest model, the second model was a Logistic Regression model, and the final model that was run was a Gradient Boosting model.  Most of the models are pretty accurate with accuracies above 75% on the test set.  But the best model is the logistic regression model.  The accuracy on the test data is 83%.  

### Random Forest ###

In [17]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

# Initiate Random Forest model and create dependent and independent features
rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

# Test and train split 
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

# Train Random Forest model on training set
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.98223733938

Test set score: 0.823129251701


### Logistic Regression ###

In [18]:
from sklearn.linear_model import LogisticRegression

# Initiate Logistic Regression model and then run Logistic Regression model on the training data
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(2646, 1896) (2646,)
Training set score: 0.908919123205

Test set score: 0.835600907029


### Gradient Boosting Model ###

In [19]:
# Initiate Gradient Boosting model and then run the Gradient Boosting model on the training data
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.793272864701

Test set score: 0.746598639456


### Closer Look at the Best Supervised Model ###
A Confusion matrix shows the number of correct and incorrect predictions on the test set.  Also, a cross-validation was run on the test set to make sure that no over-fitting was occuring.  The model was pretty consistent across the ten folds of cross validation with a variance of +/- 5%.  

In [20]:
print('\nTest set score:', lr.score(X_test, y_test))
lr_predicted = lr.predict(X_test)
pd.crosstab(y_test, lr_predicted)


Test set score: 0.835600907029


col_0,Adele,Bob Marley
text_source,Unnamed: 1_level_1,Unnamed: 2_level_1
Adele,769,137
Bob Marley,153,705


In [21]:
from sklearn.model_selection import cross_val_score
scores_LRtest = cross_val_score(lr, X_test, y_test, cv=10)
print('Cross Validation Accuracy Scores - Test Set: {:.5f}(+/- {:.2f})'.format(scores_LRtest.mean(), 
                                                                               scores_LRtest.std()*2))

Cross Validation Accuracy Scores - Test Set: 0.83280(+/- 0.05)


## Unsupervised Model ##
Lists of sentences were created were created for each corpus of lyrics.  A train/test split was conducted on each list of sentences and then each sentence was converted into a vector through the TfidfVectorizer package of sklearn.  Then a dataframe was created on the vectors of the lists of sentences.  Then the number of features was reduced to 130 features.  The vectorizer was done on the test data also to see if the vectorizer found the same sentences that were representative of the artist.  

In [22]:
adele_sents_list = list(adele_doc.sents)

In [23]:
adele_sents_list[0]

Looking for some education
Made my way into the night

In [24]:
adele_sentences = []
for sent in adele_sents_list:
    sentence = str(sent)
    sentence = [re.sub('(\\n)', ' ', word) for word in sentence]
    adele_sentences.append(''.join(sentence))

In [25]:
adele_sentences[:5]

['Looking for some education Made my way into the night ',
 "All that bullshit conversation Baby, can't you read the signs?",
 "I won't bore you with the details, baby ",
 "I don't even wanna waste your time ",
 "Let's just say that maybe You could help me ease my mind "]

In [26]:
adele_sentences[0]

'Looking for some education Made my way into the night '

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(adele_sentences, test_size=0.4, random_state=0)

In [28]:
vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

In [29]:
#Applying the vectorizer
adele_sents_tfidf=vectorizer.fit_transform(adele_sentences)
print("Number of features: %d" % adele_sents_tfidf.get_shape()[1])

Number of features: 707


In [30]:
#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(adele_sents_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[5])
print('Tf_idf vector:', tfidf_bypara[5])

Original sentence: I'm going back to where I started The morning rain, the morning rain 
Tf_idf vector: {'started': 0.39069846463908886, 'going': 0.37592494534712279, 'morning': 0.62318203367003577, 'rain': 0.56363046236457071}


In [35]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of sentences our solution considers similar, for the first five identified topics
adele_train=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(adele_train.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 68.0533549241
Component 0:
I'm the only one, in love                    0.94885
If I'm in love with you Should I give up     0.94885
I will always love you                       0.94885
I'm the only one in love                     0.94885
Why do you love me, do you love me?          0.94885
I will always love you                       0.94885
Why do you love me, do you love me?          0.94885
I'm the only one in love                     0.94885
Do you love me?                              0.94885
Love to me,                                  0.94885
Name: 0, dtype: float64
Component 1:
We've both know we ain't kids no more     0.590701
We've both know we ain't kids no more     0.590701
But some of us don't know why I           0.548503
Do you even know that I can't let go      0.535549
Do you even know that I can't let go      0.535549
I've been here before                     0.526121
Could've had it all...                    0.526121
For 

In [33]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))

# Remember, you will use the same model, only with the test set data.  Don't fit a new model by mistake!
# Run SVD on the training data, then project the training data.
X_test_lsa = lsa.fit_transform(X_test_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of sentences our solution considers similar, for the first five identified topics
adele_test=pd.DataFrame(X_test_lsa,index=X_test)
for i in range(5):
    print('Component {}:'.format(i))
    print(adele_test.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 68.7490463061
Component 0:
Why do you love me, do you love me?     0.946444
Why do you love me?                     0.946444
I will always love you                  0.946444
Love to you,                            0.946444
Why do you love me, do you love me?     0.946444
Love to you,                            0.946444
Love you                                0.946444
Love to you,                            0.946444
Do you love me?                         0.946444
I'm the only one in love                0.946444
Name: 0, dtype: float64
Component 1:
We both know we ain't kids no more     0.544262
We both know we ain't kids no more     0.544262
We both know we ain't kids no more     0.544262
We both know we ain't kids no more     0.544262
We both know we ain't kids no more     0.544262
We both know we ain't kids no more     0.544262
We both know we ain't kids no more     0.544262
Say it ain't so, say it ain't so       0.538024
, say it ain't so

In [36]:
adele_train['Singer'] = 'Adele'

In [37]:
adele_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,121,122,123,124,125,126,127,128,129,Singer
It's above and beyond me,0.024201,0.056962,-0.058908,-0.01332,-0.07764,0.021372,-0.020693,0.039779,0.03883,-0.092165,...,0.066965,0.06691,-0.002436,0.010367,0.010694,-0.000349,0.054896,0.056482,0.007767,Adele
is the end I've drowned and dreamt this moment,0.016981,0.220528,0.297538,-0.081648,-0.004936,-0.030301,0.040969,-0.076614,0.013669,0.04188,...,-0.003845,0.014246,-0.018926,-0.038748,-0.098381,0.04402,0.00089,0.084948,0.070834,Adele
But like everything I've ever known You disappear one day,0.022684,0.272189,0.314003,-0.088766,-0.03181,-0.016091,0.11007,-0.184393,0.072353,-0.00515,...,0.016655,-0.005175,-0.043007,0.003413,0.059052,0.019415,-0.021103,-0.007212,0.024681,Adele
"It illuminates all of my doubts Pull me in, hold me tight",0.002327,0.007205,-0.004754,-0.001776,-0.002344,0.007492,0.006161,-0.010512,0.008446,-0.012808,...,-0.002398,0.072589,-0.084735,0.035143,0.004223,0.042171,-0.019861,0.027534,-0.076979,Adele
And watched you wave,3.4e-05,0.000862,1.1e-05,-0.000688,-0.00012,0.000796,-0.000958,0.003879,0.000232,-0.005266,...,-0.027109,-0.074369,-0.043441,-0.027245,-0.009218,0.055866,0.107946,-0.004781,-0.054873,Adele


In [38]:
marley_sents_list = list(marley_doc.sents)

In [39]:
marley_sents_list[0]

"Don't worry about a thing,
'Cause every little thing gonna be all right.

In [40]:
marley_sentences = []
for sent in marley_sents_list:
    sentence = str(sent)
    sentence = [re.sub('(\\n)', ' ', word) for word in sentence]
    #sentence = [re.sub('(\\)', '', word) for word in sentence]
    marley_sentences.append(''.join(sentence))

In [41]:
X_train, X_test = train_test_split(marley_sentences, test_size=0.4, random_state=0)

In [42]:
#Applying the vectorizer
marley_sents_tfidf=vectorizer.fit_transform(marley_sentences)
print("Number of features: %d" % marley_sents_tfidf.get_shape()[1])

Number of features: 776


In [43]:
#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(marley_sents_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[6])
print('Tf_idf vector:', tfidf_bypara[6])

Original sentence: We're past the worse Hypocrites and parasites Will come up and take a bite 
Tf_idf vector: {'hypocrites': 0.66851004783734169, 'past': 0.61729668647951841, 'come': 0.41477598387795117}


In [44]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
marley_train=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(marley_train.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 69.7760318581
Component 0:
yeah:                     0.995375
yeah                      0.995375
-yeah,                    0.995375
yeah                      0.995375
yeah                      0.995375
Where we can be, yeah.    0.995375
yeah                      0.995375
yeah,                     0.995375
Yeah,                     0.995375
yeah                      0.995375
Name: 0, dtype: float64
Component 1:
Oh, no!     0.869609
Oh-oh!      0.869609
Oh I,       0.869609
oh no!      0.869609
Oh I,       0.869609
oh oh!      0.869609
Oh now!     0.869609
-oh!        0.869609
oh I,       0.869609
oh oh!      0.869609
Name: 1, dtype: float64
Component 2:
Everything's gonna be all right!     0.598775
Everything's gonna be all right!     0.598775
Everything's gonna be all right!     0.598775
Everything's gonna be all right!     0.598775
Everything's gonna be all right!     0.598775
Everything's gonna be all right!     0.598775
Everything's gonna

In [45]:
# Remember, you will use the same model, only with the test set data.  Don't fit a new model by mistake!
# Run SVD on the training data, then project the training data.
X_test_lsa = lsa.fit_transform(X_test_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
marley_test=pd.DataFrame(X_test_lsa,index=X_test)
for i in range(5):
    print('Component {}:'.format(i))
    print(marley_test.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 72.0644840204
Component 0:
yeah,        0.939341
yeah         0.939341
yeah         0.939341
yeah,        0.939341
yeah)        0.939341
yeah         0.939341
yeah:        0.939341
yeah,        0.939341
yeah-yeah    0.939341
yeah         0.939341
Name: 0, dtype: float64
Component 1:
oh I,         0.931365
oh I,         0.931365
O-oh!         0.931365
-oh!          0.931365
oh            0.931365
Oh (part),    0.931365
O-oh!         0.931365
oh,           0.931365
Oh, no!       0.931365
oh-oh,        0.931365
Name: 1, dtype: float64
Component 2:
Let's get together and feel all right                     0.776416
Let's get together and feel all right                     0.776416
Let's get together and feel all right                     0.776416
Let's get together and feel all right                     0.776416
Let's get together and feel all right                     0.776416
What about the, let's get together and feel all right     0.776416
an

In [46]:
marley_train['Singer'] = 'Bob Marley'

In [47]:
marley_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,121,122,123,124,125,126,127,128,129,Singer
"Uh, open your eyes and look within",0.002953,0.014961,0.025718,0.002279,0.009936,-0.005375,-0.003413,0.026742,-0.005507,-0.008019,...,0.082938,-0.065735,-0.110302,-0.108918,0.015582,0.083468,-0.035502,0.074635,0.033,Bob Marley
"Could you be, could you be loved?",0.000254,0.008223,0.01481,-0.007188,-0.009914,0.029387,-0.058984,0.341906,0.134186,-0.304527,...,-0.007373,-0.001196,0.002151,0.005227,0.004731,0.005259,-0.001507,0.012185,0.007304,Bob Marley
"This fire (fire), this fire (fire)",0.000397,-0.09107,0.062259,-0.04207,-0.16985,0.269276,-0.024397,0.217806,0.072186,-0.087453,...,0.047491,-0.007575,0.058156,-0.086841,0.005824,-0.017446,-0.116623,0.006207,-0.032898,Bob Marley
"solo/ So we know we can't take your slogans no more, can't take your slogans no more, can't take your slogans no more, no more sweet talk from-a pulpit, no more sweet talk from the pulpit.",0.002553,0.029477,0.031844,0.018572,0.065032,0.026903,0.156391,0.035779,-0.005263,0.000502,...,-0.019015,0.008133,0.02926,0.015358,0.029315,0.055681,0.004294,0.082943,0.003542,Bob Marley
"Every minute All you got to do, baby, (oh-oh-oh-oh) Is keep it in, and (Stir it up)",0.065974,0.846715,-0.3185,-0.066196,-0.112346,0.116945,-0.017961,0.041308,-0.107682,0.090868,...,-0.009377,-0.009207,-0.017066,0.000714,0.014579,0.005314,-0.006128,0.013422,-0.009485,Bob Marley


### Supervised Models on the DataFrame created by Unsupervised Modeling ###
The dataframes of the vectorized sentences of each lyric corpus was combined to create one dataframe.  Then three supervised models were run on the dataframe to predict the singer based on the vectorized sentence.  The best performing models were the tree based models.  Gradient Boosting model was the best with a 98% accuracy on the test set.  

In [48]:
combined_train = adele_train.append(marley_train)

In [49]:
rfc = ensemble.RandomForestClassifier()
Y = combined_train['Singer']
X = np.array(combined_train.drop(['Singer'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.985507246377

Test set score: 0.969754253308


In [50]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(1587, 130) (1587,)
Training set score: 0.742281033396

Test set score: 0.703213610586


In [51]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.985507246377

Test set score: 0.973534971645


#### Closer Look at the Best Supervised Model ###
A Confusion matrix shows the number of correct and incorrect predictions on the test set.  Also, a cross-validation was run on the test set to make sure that no over-fitting was occuring.  The model was consistent across the ten folds of cross validation with a variance of +/- 4%. 

In [52]:
print('\nTest set score:', clf.score(X_test, y_test))
clf_predicted = clf.predict(X_test)
pd.crosstab(y_test, clf_predicted)


Test set score: 0.973534971645


col_0,Adele,Bob Marley
Singer,Unnamed: 1_level_1,Unnamed: 2_level_1
Adele,521,6
Bob Marley,22,509


In [53]:
scores_CLFtest = cross_val_score(clf, X_test, y_test, cv=10)
print('Cross Validation Accuracy Scores - Test Set: {:.5f}(+/- {:.2f})'.format(scores_CLFtest.mean(), 
                                                                               scores_CLFtest.std()*2))

Cross Validation Accuracy Scores - Test Set: 0.97070(+/- 0.04)
