In [1]:
%load_ext watermark

In [2]:
%watermark -d -v -a 'Sebastian Raschka' -p scikit-learn,nltk,numpy

Sebastian Raschka 12/12/2014 

CPython 2.7.8
IPython 2.1.0

scikit-learn 0.15.2
nltk 3.0.0
numpy 1.9.1


<font size="1.5em">[More information](https://github.com/rasbt/watermark) about the `watermark` magic command extension.</font>

<br>
<br>

# Lyrics Mood Classification - Preparing Pickle Classifier

<br>
<br>

### Reading the dataset

[[back to top](#Sections)]

In [1]:
import pandas as pd

df = pd.read_csv('./train_lyrics_1000.csv')
df2 = pd.read_csv('./valid_lyrics_200.csv')
df = pd.concat([df, df2])
df.tail()

Unnamed: 0,artist,file,genre,lyrics,mood,title,year
195,Prince,TRAKQEA128F1495E21.h5,Rock,{B-side of Glam Slam}\nSnare drum pounds on th...,happy,Escape ( LP Version),
196,Cavo,TRAKQLN128F932AC25.h5,Rock,Well I will rise\nThe morning comes\nNothing e...,sad,Over Again (Album Version),
197,AFI,TRAKQXJ128F147A028.h5,Rock,"Listen when I say, when I say it's real\nReal ...",happy,Summer Shudder,
198,Vitamin C,TRAKRQW128F427D6E3.h5,Pop,"Imagine a world where the girls, girls rule th...",happy,Girls Against Boys (LP Version),
199,Richard Burton,TRAKSRQ128F4269AE8.h5,Jazz,"Each evening, from December to December\nBefor...",happy,Camelot,


<br>
<br>

### Label Encoder

[[back to top](#Sections)]

In [2]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

X_train = df['lyrics'].values 
y_train = df['mood'].values

print('before: %s ...' %y_train[:5])

le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

print('after: %s ...' %y_train[:5])

before: ['sad' 'happy' 'sad' 'happy' 'sad'] ...
after: [1 0 1 0 1] ...


<br>
<br>

### Porter Stemmer

[[back to top](#Sections)]

In [3]:
# Porter Stemmer

import nltk
import string
import re

porter_stemmer = nltk.stem.porter.PorterStemmer()

def porter_tokenizer(text, stemmer=porter_stemmer):
    """
    A Porter-Stemmer-Tokenizer hybrid to splits sentences into words (tokens) 
    and applies the porter stemming algorithm to each of the obtained token. 
    Tokens that are only consisting of punctuation characters are removed as well.
    Only tokens that consist of more than one letter are being kept.
    
    Parameters
    ----------
        
    text : `str`. 
      A sentence that is to split into words.
        
    Returns
    ----------
    
    no_punct : `str`. 
      A list of tokens after stemming and removing Sentence punctuation patterns.
    
    """
    lower_txt = text.lower()
    tokens = nltk.wordpunct_tokenize(lower_txt)
    stems = [porter_stemmer.stem(t) for t in tokens]
    no_punct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]
    return no_punct


In [4]:
# Commented out to prevent overwriting files:
#
# stp = nltk.corpus.stopwords.words('english')
# with open('./stopwords_eng.txt', 'w') as outfile:
#    outfile.write('\n'.join(stp))
    
    
with open('./stopwords_eng.txt', 'r') as infile:
    stop_words = infile.read().splitlines()
print('stop words %s ...' %stop_words[:5])

stop words ['i', 'me', 'my', 'myself', 'we'] ...


### Lyrics Downloader

In [5]:
import urllib, re
import bs4
          
def songlyrics(artist,title):
    artist = urllib.quote(artist.lower().replace(' ','-'))
    title = urllib.quote(title.lower().replace(' ','-'))

    try:
        lyrics = urllib.urlopen("http://www.songlyrics.com/%s/%s-lyrics/" % (artist,title))
    except:
        return "Could not connect to songlyrics.com Exiting..."
    text = lyrics.read()
    soup = bs4.BeautifulSoup(text)
    lyrics = soup.findAll(attrs= {"id" : "songLyricsDiv"})
    if not lyrics:
        return "Lyrics not found."
    else:
        if str(lyrics[0]).startswith('<p class="songLyricsV14 iComment-text" id="songLyricsDiv"></p>'):

            return "Lyrics not found."
        try:
            return re.sub('<[^<]+?>', '', "".join(str(lyrics[0])))
        except:
            return 'Error in parsing the lyrics'
        
test = songlyrics('Bob Dylan','Blowing in the wind')
test = songlyrics('Pharrell', 'happy')
test

AttributeError: module 'urllib' has no attribute 'quote'

## Vectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

import re
"""
countv = CountVectorizer(
                  binary=False,
                  decode_error="replace",
                  stop_words=stop_words,
                  preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                  ngram_range=(1,1),
                  tokenizer=lambda text: text.split()
                )
"""
countv = CountVectorizer(
                  binary=False,
                  decode_error="replace",
                  stop_words=stop_words,
                  ngram_range=(1,1),

                )


In [7]:
countv = countv.fit(X_train)
X_train_countv = countv.transform(X_train)

In [8]:
#joblib.dump(tfidf, './lyrics_tfidf_jb.pkl') 

In [9]:
from sklearn.naive_bayes import MultinomialNB

clf_countv = MultinomialNB(alpha=1.0, fit_prior=False)
clf_countv = clf_countv.fit(X_train_countv, y_train)

In [10]:
# Test
x = songlyrics('Pharrell', 'happy')
x_countv = countv.transform([x])

le.inverse_transform(clf_countv.predict(x_countv))
clf_countv.predict_proba(x_countv)

AttributeError: module 'urllib' has no attribute 'quote'

In [11]:
#%matplotlib inline
#import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn import metrics
import numpy as np
#import matplotlib as mpl

cm = metrics.confusion_matrix(y_train, clf_countv.predict(X_train_countv))

print(cm)


[[533  18]
 [ 37 612]]


In [12]:
import shelve
import dill

d = shelve.open('countv_clf')

d['label_encoder'] = le
d['lyrics_countv'] = countv
d['lyrics_clf'] = clf_countv
d.close()

ModuleNotFoundError: No module named 'dill'

In [13]:
import dill


try:
    d = open('label_encoder.p', 'wb')
    dill.dump(le, d)
finally:
    d.close()
    
    
try:
    d = open('countv.p', 'wb')
    dill.dump(countv, d)
finally:
    d.close()   

try:
    d = open('clf_countv.p', 'wb')
    dill.dump(clf_countv, d)
finally:
    d.close()  
    

ModuleNotFoundError: No module named 'dill'

In [12]:
import pickle


try:
    d = open('label_encoder.p', 'wb')
    pickle.dump(le, d)
finally:
    d.close()
    
    
try:
    d = open('countv.p', 'wb')
    pickle.dump(countv, d)
finally:
    d.close()   

try:
    d = open('clf_countv.p', 'wb')
    pickle.dump(clf_countv, d)
finally:
    d.close() 

# Testing

In [14]:
import shelve
import re


d = shelve.open('countv_clf.db', 'r')

def preprocess(text):
    return re.sub('[^a-zA-Z]', ' ', text.lower())

def tokenize(text):
    return text.split()

le = d['label_encoder']
countv = d['lyrics_countv']
clf = d['lyrics_clf'] 

d.close()

error: need 'c' or 'n' flag to open new db

In [15]:
# print number of mood labels
import dill

le = dill.load(open('label_encoder.p'))
countv = dill.load(open('countv.p'))
clf_countv = dill.load(open('clf_countv.p'))

ModuleNotFoundError: No module named 'dill'

In [16]:
x_countv = countv.transform([songlyrics('Bob Dylan', 'blowin in the wind')])
pred = clf.predict(x_countv)[0]
label = le.inverse_transform(pred)
label = 'Prediction: %s' % label
            
proba = clf.predict_proba(x_countv).ravel()[pred]
proba = round(proba*100)
proba = 'probability %.2f%% ' % (proba)
proba


AttributeError: module 'urllib' has no attribute 'quote'

In [18]:
import sys
import os
import pickle
import sqlite3

In [20]:
my_dir = os.path.dirname("/s1")
d = open('label_encoder.p', 'rb')
le = pickle.load(d)

d1 = open('countv.p','rb')
vect  = pickle.load(d1)

d = open('clf_countv.p', 'rb')
clf = pickle.load(d)



UnicodeDecodeError: 'ascii' codec can't decode byte 0xbe in position 0: ordinal not in range(128)