In [1]:
import requests
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from bs4 import BeautifulSoup
from dateutil.parser import parse
import re
import nltk
import pycrfsuite

In [2]:
# read in manually parsed data for training
# NOTE(review): this is an absolute path on one user's Windows desktop; anyone else running the
# notebook has to point it at their own copy of cervantesParses.csv
cervParse = pd.read_csv(r'C:\Users\jschlajo\Desktop\cervantesParses.csv', encoding='latin-1')

In [30]:
# in this example I would want 'Brightside, Dorfex Bos, GrymeTyme, Jordan Polovina' as artists
# many examples were made up by me, based on common trends, so I could train the model accordingly despite a lack of data
cervParse['EventTitle'][0]

'Re:Search Feat. Brightside and Dorfex Bos w/ GrymeTyme  ( Late Set )  ,  Jordan Polovina'

In [3]:
cervParse.head()

Unnamed: 0,EventTitle,manualParses
0,Re:Search Feat. Brightside and Dorfex Bos w/ G...,"Brightside, Dorfex Bos, GrymeTyme, Jordan Polo..."
1,Re:Search Feat. Ablaze and Clinton Sly w/ Hero...,"Ablaze, Clinton Sly, Herobust, Zempra"
2,Re:Search Feat. Adventure Club and Smokestax w...,"Adventure Club, Smokestax, Purge, Dyatic"
3,Re:Search Feat. Kursa and Fuski w/ Drakk (Late...,"Kursa, Fuski, Drakk, Robotic Pirate Monkey"
4,Magnolia North feat. Steve Foltz of Trout Stea...,"Magnolia North, Steve Foltz, Liver Down The River"


In [4]:
# pad commas and parentheses with spaces so that each of them is recognized as its own token
# (separate from the neighbouring word) when the title is later split on whitespace
for symbol in (',', '(', ')'):
    cervParse['EventTitle'] = cervParse['EventTitle'].map(
        lambda title, symbol=symbol: title.replace(symbol, ' ' + symbol + ' ')
    )

In [5]:
# Rows without a manual parse get an empty string so that later string operations work on every row.
# Assign the result back instead of calling fillna(inplace=True) on the column selection: the chained
# in-place form may act on a temporary object and leave cervParse unchanged (pandas Copy-on-Write).
cervParse['manualParses'] = cervParse['manualParses'].fillna('')

In [6]:
def labelTokens(row):
    """Label every whitespace-separated token of an event title for training.

    Labels
    ------
    'A' : the token is one of the words of the manually parsed artist names
    'S' : the token is a separator (',', 'and', 'w/', '/W', '+', 'b2b', 'x', 'with')
    'I' : anything else ('Irrelevant')

    Parameters
    ----------
    row : object with the attributes ``EventTitle`` (the title string) and
        ``manualParses`` (comma separated artist names; a non-string value,
        e.g. NaN, is treated as "no artists").

    Returns
    -------
    list of (token, label) tuples, in the order the tokens appear in the title.
    """
    separators = {'and', 'w/', '/W', '+', 'b2b', 'x', 'with'}

    # Compare against whole words of the manual parse. A plain `token in row.manualParses`
    # is a substring test on the string, which labels e.g. the separator 'x' as an artist
    # whenever any artist name merely contains the letter 'x'.
    parsed = row.manualParses if isinstance(row.manualParses, str) else ''
    artistWords = set(parsed.replace(',', ' ').split())

    tokenSet = []
    # split on every space to tokenize every word or character in the event name
    for token in row.EventTitle.split():
        if token == ',':
            # a comma always separates artists
            label = 'S'
        elif token in artistWords:
            # checked before the separator list, so 'and' / 'x' that are part of a parsed
            # artist name (e.g. "Thunder and Rain") stay part of the artist
            label = 'A'
        elif token in separators:
            label = 'S'
        else:
            label = 'I'
        tokenSet.append((token, label))
    return tokenSet

In [7]:
# label the tokens of every event title: one list of (token, label) tuples per row
cervParse['labeled'] = cervParse.apply(labelTokens, axis=1)

In [8]:
cervParse['labeled'][19]

[('RE:Search', 'I'),
 ('Feat.', 'I'),
 ('Homemade', 'A'),
 ('Spaceship', 'A'),
 ('w/', 'S'),
 ('Special', 'I'),
 ('Guest', 'I'),
 ('Pigeon', 'I'),
 ('Hole', 'A'),
 (',', 'S'),
 ('Kyral', 'A'),
 ('x', 'A'),
 ('Banko', 'A'),
 ('(', 'I'),
 ('Late', 'I'),
 ('Set', 'I'),
 (')', 'I'),
 (',', 'S'),
 ('Jordan', 'A'),
 ('Polovina', 'A')]

In [9]:
def POStags(labels):
    """Add an NLTK part-of-speech tag to every labelled token of one event title.

    labels : list of (token, label) tuples.

    Returns a one-element list; its single item is the list of
    (token, pos, label) triples, in the same order as ``labels``.
    """
    words = [word for word, _ in labels]
    # tag every token of the event title with a Part Of Speech label from the NLTK library
    pos_pairs = nltk.pos_tag(words)

    triples = []
    for (word, label), (_, pos) in zip(labels, pos_pairs):
        triples.append((word, pos, label))
    return [triples]

In [10]:
# POStags returns its triples wrapped in a one-element list, so keep only that first element
cervParse['POS'] = cervParse['labeled'].map(lambda labels: POStags(labels)[0])

In [11]:
cervParse['POS'][0]

[('Re:Search', 'NNP', 'I'),
 ('Feat.', 'NNP', 'I'),
 ('Brightside', 'NNP', 'A'),
 ('and', 'CC', 'S'),
 ('Dorfex', 'NNP', 'A'),
 ('Bos', 'NNP', 'A'),
 ('w/', 'NN', 'S'),
 ('GrymeTyme', 'NNP', 'A'),
 ('(', '(', 'I'),
 ('Late', 'NNP', 'I'),
 ('Set', 'NNP', 'I'),
 (')', ')', 'I'),
 (',', ',', 'S'),
 ('Jordan', 'NNP', 'A'),
 ('Polovina', 'NNP', 'A')]

In [12]:
# pos data into a list to create features from
data = cervParse['POS'].tolist()

In [13]:
def word2features(doc, i):
    """Build the list of feature strings for the token at position ``i`` of ``doc``.

    doc : sequence of entries whose first item is the word and second item its POS tag.
    i   : index of the token to describe.

    The features describe the word itself (lower-cased form, POS tag, casing) and the
    same characteristics of the previous and next token. 'BOS' / 'EOS' are added when
    the token is the first / last one of the document.
    """
    def context_features(offset):
        # features of the neighbour at doc[i + offset], prefixed with '-1:' or '+1:'
        neighbour, neighbour_pos = doc[i + offset][0], doc[i + offset][1]
        prefix = '%+d:' % offset
        return [
            prefix + 'word.lower=' + neighbour.lower(),
            prefix + 'postag=' + neighbour_pos,
            prefix + 'postag[:2]=' + neighbour_pos[:2],
            prefix + 'word.istitle=' + str(neighbour.istitle()),
            prefix + 'word.isupper=' + str(neighbour.isupper()),
        ]

    token, pos = doc[i][0], doc[i][1]
    features = [
        'word.lower=' + token.lower(),
        'bias',
        'postag=' + pos,
        'word.isupper=' + str(token.isupper()),
        'word.istitle=' + str(token.istitle()),
    ]

    # previous token, or the beginning-of-sequence marker
    features += context_features(-1) if i > 0 else ['BOS']
    # next token, or the end-of-sequence marker
    features += context_features(+1) if i < len(doc) - 1 else ['EOS']

    return features

In [14]:
from sklearn.model_selection import train_test_split

# Extract the features of every token in one document
def extract_features(doc):
    """Return one feature list (built by word2features) per position of ``doc``."""
    features = []
    for position in range(len(doc)):
        features.append(word2features(doc, position))
    return features

# Generate the list of labels for one document
def get_labels(doc):
    """Return the label of every (token, postag, label) triple in ``doc``, in order."""
    labels = []
    for _token, _postag, label in doc:
        labels.append(label)
    return labels

# one feature sequence and one label sequence per event title
X = list(map(extract_features, data))
y = list(map(get_labels, data))

In [15]:
# Hold out 20% of the event titles for testing. The fixed random_state makes the split
# (and therefore the trained model and the sample predictions) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import pycrfsuite

# Parameters of the model
crf_params = {
    'c1': 0.1,                             # coefficient for L1 penalty
    'c2': 0.01,                            # coefficient for L2 penalty
    'max_iterations': 200,                 # maximum number of iterations
    'feature.possible_transitions': True,  # include transitions that are possible, but not observed
}

trainer = pycrfsuite.Trainer(verbose=True)

# Submit every (feature sequence, label sequence) training pair to the trainer
for feature_seq, label_seq in zip(X_train, y_train):
    trainer.append(feature_seq, label_seq)

trainer.set_params(crf_params)

# The file name given to train() is where the model is saved when training is finished
trainer.train('crf.model')

In [17]:
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at the first two sequences of the testing set (fewer if the set is smaller).
# The first feature of every token is 'word.lower=<token>', so the lower-cased token is recovered
# by splitting on the first '=' only (a token may itself contain '=').
# The loop variables are deliberately not named x / y: `y` holds the label sequences built
# earlier in the notebook and must not be overwritten here.
for i in range(min(2, len(X_test))):
    print('-----')
    for predicted_tag, token_features in zip(y_pred[i], X_test[i]):
        token = token_features[0].split("=", 1)[1]
        print("%s (%s)" % (token, predicted_tag))

-----
fire (A)
& (A)
ice (A)
w/ (S)
johns (A)
band (A)
, (S)
the (A)
cattle (A)
( (I)
uk (I)
) (I)
-----
re:search (I)
feat. (I)
lord (A)
swan (A)
( (I)
blanke (A)
x (S)
louis (A)
futon (A)
x (S)
eprom (A)
) (I)
w/ (S)
vanic (A)
, (S)
audiowrx (A)
and (S)
special (I)
guests (I)


In [18]:
# extract features for every event title
cervParse['features'] = cervParse['POS'].map(extract_features)

In [19]:
# apply the trained model to the features of each event title
# (each row's predicted tag sequence is kept wrapped in a one-element list)
cervParse['preds'] = cervParse.features.map(lambda feats: [tagger.tag(feats)])
# get the word tokens into lists so they can be combined with the prediction tags
cervParse['toks'] = cervParse.POS.map(lambda triples: [triple[0] for triple in triples])

In [20]:
def zipper(row):
    """Pair each token of ``row.toks`` with its tag from ``row.preds[0]``.

    Only pairs tagged 'A' or 'S' are kept; the result is a list of (token, tag) tuples.
    """
    kept = []
    for token, tag in zip(row.toks, row.preds[0]):
        if tag in ['A', 'S']:
            kept.append((token, tag))
    return kept

In [21]:
# per row: the (token, predicted tag) pairs that zipper keeps (tags 'A' and 'S')
cervParse['artistPreds'] = cervParse.apply(zipper, axis=1)

In [22]:
def combine(row):
    """Turn the predicted tags of one row into a comma separated string of artist names.

    Tokens of ``row.toks`` whose tag in ``row.preds[0]`` is 'A' are kept; every other
    token acts as a boundary. Consecutive kept tokens are joined with spaces into one
    name, and the names are joined with ', '.
    """
    # replace every non-artist token by a comma, keep artist tokens as they are
    pieces = [token if tag == 'A' else ',' for token, tag in zip(row.toks, row.preds[0])]
    # re-split on the commas: every non-empty chunk is one artist name
    chunks = (chunk.strip() for chunk in ' '.join(pieces).split(','))
    return ', '.join(chunk for chunk in chunks if chunk != '')

In [23]:
# per row: the string that combine builds from the tokens tagged 'A'
cervParse['predictions'] = cervParse.apply(combine, axis=1)

In [28]:
pd.options.display.max_colwidth = 1000

In [29]:
cervParse[['EventTitle','predictions']]

Unnamed: 0,EventTitle,predictions
0,"Re:Search Feat. Brightside and Dorfex Bos w/ GrymeTyme ( Late Set ) , Jordan Polovina","Brightside, Dorfex Bos, GrymeTyme, Jordan Polovina"
1,"Re:Search Feat. Ablaze and Clinton Sly w/ Herobust ( Late Set ) , Zempra","Ablaze, Clinton Sly, Herobust, Zempra"
2,"Re:Search Feat. Adventure Club and Smokestax w/ Purge ( Late Set ) , Dyatic","Adventure Club, Smokestax, Purge, Dyatic"
3,"Re:Search Feat. Kursa and Fuski w/ Drakk ( Late Set ) , Robotic Pirate Monkey","Kursa, Fuski, Drakk, Robotic Pirate Monkey"
4,"Magnolia North feat. Steve Foltz of Trout Steak Revival and Special Guest Grace Clark w/ Liver Down The River ( Late Set ) , Thunder and Rain ( Patio Set ) , Jacob Moss & Matt Flaherty of Part & Parcel ( Patio Set )","Magnolia North, Steve Foltz, Liver Down The River"
5,"Wookiefoot and Mike Love w/ Yak Attack , Analog Son , A-Mac & The Height , Graham Good & The Painters , On the PATIO: Pick & Howl , Modern Whiskey Market - Silent Disco: Dozier , Oomah ( of Evanoff ) , Tropical Waffle","Wookiefoot, Mike Love, Yak Attack, Analog Son, A-Mac & The Height, Graham Good & The Painters, &, -"
6,Neetesh Jung Kunwar & Bartika Eam Rai with Jaanvi Gurung,"Neetesh Jung Kunwar & Bartika Eam, Jaanvi Gurung"
7,"Porter Neville Quartet feat. George Porter Jr ( The Meters ) , Ivan Neville ( Dumpstaphunk ) , Ian Neville ( Dumpstaphunk ) , Terrence Houston ( The Funky Meters ) w/ JoeBaby All-Star Jam ft. Jermal Watson ( Dirty Dozen ) , and More","Porter Neville Quartet, George Porter Jr, Ivan Neville, Ian Neville, Terrence Houston, JoeBaby All-Star Jam, Jermal Watson"
8,"Maydaze Music Fest ft. Stay Gypsy , Adamsite , Paradox , Feens! , Nothing's Permanent , Ain't From Here , Sunny Sideways , The Cereal Company , Stereo Indica Cinema , Messenger Music Collective","Maydaze Music Fest, Stay Gypsy, Adamsite, Paradox, Feens!, Nothing's Permanent, Ain't From Here, Sunny Sideways, The Cereal Company, Stereo Indica Cinema, Messenger Music Collective"
9,"The Elovaters w/ CollieRAD , MountainUs","The Elovaters, CollieRAD, MountainUs"


In [20]:
# Dump the trained CRF model.
# NOTE(review): 'stdout' is passed as a file name here; if the intent is to print to standard
# output, check the python-crfsuite Tagger.dump documentation - presumably this writes to a file
# literally named 'stdout' instead. TODO confirm
tagger.dump(filename='stdout')