In [1]:
import os
import json
import string

import numpy as np

from nltk.tag import pos_tag

from sklearn_crfsuite import CRF, metrics
from sklearn.metrics import make_scorer,confusion_matrix
from sklearn.metrics import f1_score,classification_report
from sklearn.pipeline import Pipeline

from pprint import pprint

from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
train_loc = "snips/train_PlayMusic_full.json"
test_loc = "snips/validate_PlayMusic.json"

train_file = json.load(open(train_loc, encoding= "iso-8859-2"))
test_file = json.load(open(test_loc, encoding= "iso-8859-2"))

In [3]:
train_datafile = [i["data"] for i in train_file["PlayMusic"]]
test_datafile = [i["data"] for i in test_file["PlayMusic"]]

In [4]:
def convert_data(datalist):
    output = []
    for data in datalist:
        sent = []
        pos = []
        for phrase in data:
            words = phrase["text"].strip().split(" ")
            while "" in words:
                words.remove("")
            if "entity" in phrase.keys():
                label = phrase["entity"]
                labels = [label+"-{}".format(i+1) for i in range(len(words))]
            else:
                labels = ["O"] * len(words)
            sent.extend(words)
            pos.extend(labels)
        output.append([sent, pos])
        print(sent)
    return output

In [5]:
train_data = convert_data(train_datafile)
test_data = convert_data(test_datafile)

['I', 'need', 'to', 'hear', 'the', 'song', 'Aspro', 'Mavro', 'from', 'Bill', 'Szymczyk', 'on', 'Youtube']
['play', 'Yo', 'Ho', 'from', 'the', 'new', 'york', 'pops', 'on', 'Youtube']
['Play', 'some', 'seventies', 'music', 'by', 'Janne', 'Puurtinen', 'on', 'Youtube', '.']
['play', 'the', 'MĂşsica', 'Da', 'SĂŠrie', 'De', 'Filmes', 'O', 'Hobbit', 'album', 'by', 'Alex', 'Otaola']
['Play', 'Magic', 'Sam', 'from', 'the', 'thirties']
['Play', 'The', 'Soft', 'Parade', '.']
['Play', 'music', 'by', 'blowfly', 'from', 'the', 'seventies']
['play', 'Jeff', 'Pilson', 'on', 'Youtube']
['play', 'a', 'tune', 'by', 'Syreeta', 'Wright', 'from', 'twenties', 'from', 'the', 'top']
['Please', 'play', 'me', 'Jerry', 'Lee', 'Lewis', "'s", 'If', 'You', 'Say', 'So', 'track', '.']
['play', 'the', 'newest', 'by', 'Exuma']
['Play', 'the', 'album', 'alas', 'y', 'raĂ\xadces', 'by', 'Dave', 'Pybus', '.']
['play', 'music', 'by', 'Helen', 'Ward']
['Play', 'some', 'sixties', 'music.']
['Plan', 'an', 'album', 'by', 'Roni',

['Play', 'a', 'tune', 'or', 'two', 'from', 'Kansas', 'City,', 'Missouri']
['play', 'some', 'songs', 'from', 'the', 'fourties', 'by', 'yoshiki', 'fukuyama']
['play', 'Angela', 'Winbush', 'ep', 'that', 'is', 'popular']
['play', 'To', 'Be', 'Still']
['I', 'want', 'to', 'hear', 'Gloryhole', 'from', 'Mani', 'off', 'of', 'Lastfm']
['play', 'music', 'by', 'Charlie', 'Adams', 'from', '1954']
['play', 'the', 'It', 'Could', 'Only', 'Happen', 'With', 'You', 'album', 'by', 'Lawrence']
['Play', 'Moondog', "'s", 'Chupacabra', '.']
["I'd", 'like', 'to', 'hear', 'some', 'trip-hop']
['Play', 'a', '2001', 'tune']
['Can', 'you', 'pull', 'up', 'and', 'play', 'something', 'on', 'Itunes']
['I', 'want', 'to', 'hear', 'some', 'theme', 'music', 'by', 'Edsel', 'Dope']
['Play', 'some', 'music', 'from', '1993', 'on', 'Itunes', '.']
['I', 'want', 'to', 'hear', 'the', 'new', 'Vasilis', 'Tsitsanis', 'ep']
['play', 'a', 'sound', 'track', 'by', 'Vegard', 'Sverre', 'Tveitan']
['Play', 'Crucifixion', 'on', 'Deezer', '.'

In [6]:
BASE_DIR = 'D:/Programming/New Datasets/'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')

MAX_SEQUENCE_LENGTH = 300
MAX_NUM_WORDS = 20000 
EMBEDDING_DIM = 100 
VALIDATION_SPLIT = 0.3

In [7]:
print('Preparing embedding matrix.')

# first, build index mapping words in the embeddings set
# to their embedding vector

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))

def get_embeddings(word):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # words not found in embedding index will be all-zeros.
        embedding_vector = np.zeros(shape=(EMBEDDING_DIM, ))
    return embedding_vector

Preparing embedding matrix.
Found 400000 word vectors in Glove embeddings.


In [8]:
train_texts = [" ".join(i[0]) for i in train_data]
test_texts = [" ".join(i[0]) for i in test_data]

train_texts[0]

'I need to hear the song Aspro Mavro from Bill Szymczyk on Youtube'

In [9]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts) #Converting text to a vector of word indexes
test_sequences = tokenizer.texts_to_sequences(test_texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 2866 unique tokens.


In [10]:
"""
Get features for all words in the sentence
Features:
- word context: a window of 2 words on either side of the current word, and current word.
- POS context: a window of 2 POS tags on either side of the current word, and current tag. 
input: sentence as a list of tokens.
output: list of dictionaries. each dict represents features for that word.
"""
def sent2feats(sentence):
    feats = []
    sen_tags = pos_tag(sentence) #This format is specific to this POS tagger!
    for i in range(0,len(sentence)):
        word = sentence[i]
        wordfeats = {}
       #word features: word, prev 2 words, next 2 words in the sentence.
        wordfeats['word'] = word
        if i == 0:
            wordfeats["prevWord"] = wordfeats["prevSecondWord"] = "<S>"
        elif i==1:
            wordfeats["prevWord"] = sentence[0]
            wordfeats["prevSecondWord"] = "</S>"
        else:
            wordfeats["prevWord"] = sentence[i-1]
            wordfeats["prevSecondWord"] = sentence[i-2]
        #next two words as features
        if i == len(sentence)-2:
            wordfeats["nextWord"] = sentence[i+1]
            wordfeats["nextNextWord"] = "</S>"
        elif i==len(sentence)-1:
            wordfeats["nextWord"] = "</S>"
            wordfeats["nextNextWord"] = "</S>"
        else:
            wordfeats["nextWord"] = sentence[i+1]
            wordfeats["nextNextWord"] = sentence[i+2]
        
        #POS tag features: current tag, previous and next 2 tags.
        wordfeats['tag'] = sen_tags[i][1]
        if i == 0:
            wordfeats["prevTag"] = wordfeats["prevSecondTag"] = "<S>"
        elif i == 1:
            wordfeats["prevTag"] = sen_tags[0][1]
            wordfeats["prevSecondTag"] = "</S>"
        else:
            wordfeats["prevTag"] = sen_tags[i - 1][1]

            wordfeats["prevSecondTag"] = sen_tags[i - 2][1]
            # next two words as features
        if i == len(sentence) - 2:
            wordfeats["nextTag"] = sen_tags[i + 1][1]
            wordfeats["nextNextTag"] = "</S>"
        elif i == len(sentence) - 1:
            wordfeats["nextTag"] = "</S>"
            wordfeats["nextNextTag"] = "</S>"
        else:
            wordfeats["nextTag"] = sen_tags[i + 1][1]
            wordfeats["nextNextTag"] = sen_tags[i + 2][1]
            
        #Adding word vectors
        vector = get_embeddings(word)
        for iv,value in enumerate(vector):
            wordfeats['v{}'.format(iv)]=value
        
        
        feats.append(wordfeats)
    return feats

In [11]:
#Extract features from the conll data, after loading it.
def get_feats_conll(conll_data):
    feats = []
    labels = []
    for sentence in conll_data:
        feats.append(sent2feats(sentence[0]))
        labels.append(sentence[1])
    return feats, labels

In [12]:
#Train a sequence model
def train_seq(X_train,Y_train,X_dev,Y_dev):
    crf = CRF(algorithm='lbfgs', c1=0.1, c2=10, max_iterations=50)#, all_possible_states=True)
    #Just to fit on training data
    crf.fit(X_train, Y_train)
    labels = list(crf.classes_)
    #testing:
    y_pred = crf.predict(X_dev)
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(metrics.flat_f1_score(Y_dev, y_pred,average='weighted', labels=labels))
    print(metrics.flat_classification_report(Y_dev, y_pred, labels=sorted_labels, digits=3))
    #print(metrics.sequence_accuracy_score(Y_dev, y_pred))
    get_confusion_matrix(Y_dev, y_pred,labels=sorted_labels)


In [13]:
#source for this function: https://gist.github.com/zachguo/10296432
def print_cm(cm, labels):
    print("\n")
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    # Print header
    print("    " + empty_cell, end=" ")
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        sum = 0
        for j in range(len(labels)):
            cell = "%{0}.0f".format(columnwidth) % cm[i, j]
            sum =  sum + int(cell)
            print(cell, end=" ")
        print(sum) #Prints the total number of instances per cat at the end.


In [14]:
#python-crfsuite does not have a confusion matrix function, 
#so writing it using sklearn's confusion matrix and print_cm from github
def get_confusion_matrix(y_true,y_pred,labels):
    trues,preds = [], []
    for yseq_true, yseq_pred in zip(y_true, y_pred):
        trues.extend(yseq_true)
        preds.extend(yseq_pred)
    print_cm(confusion_matrix(trues,preds,labels),labels)


In [15]:
print("Training a Sequence classification model with CRF")
feats, labels = get_feats_conll(train_data)
devfeats, devlabels = get_feats_conll(test_data)
train_seq(feats, labels, devfeats, devlabels)
print("Done with sequence model")

Training a Sequence classification model with CRF
0.8560889758746073
              precision    recall  f1-score   support

           O      0.976     0.983     0.980       418
      year-1      0.960     0.960     0.960        25
     genre-1      1.000     0.333     0.500         3
     genre-2      1.000     0.333     0.500         3
     genre-3      0.000     0.000     0.000         0
     genre-4      0.000     0.000     0.000         0
     genre-5      0.000     0.000     0.000         0
     genre-6      0.000     0.000     0.000         0
   service-1      0.857     0.923     0.889        39
   service-2      1.000     0.818     0.900        11
  playlist-1      1.000     0.333     0.500         9
  playlist-2      0.333     0.167     0.222         6
  playlist-3      0.000     0.000     0.000         4
  playlist-4      0.000     0.000     0.000         4
  playlist-5      0.000     0.000     0.000         2
  playlist-6      0.000     0.000     0.000         0
     album-1

  average, "true nor predicted", 'F-score is', len(true_sum)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
