In [244]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import nltk
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import re

from nltk.corpus import stopwords
import pickle
import string
from datetime import datetime

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [245]:
# Shared NLTK normalizers, instantiated once and reused by word2features.
wordnet_lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
lancaster=LancasterStemmer()


# Gazetteer (lexicon) files, grouped by the entity type they provide evidence for.
# NOTE(review): the paths hard-code the Windows '\\' separator, so they only
# resolve on Windows; confirm the target platform before reusing this cell.
persongazetteerfilenames = ['lexicon\\people.person.lastnames.modified', 'lexicon\\people.family_name', 'lexicon\\firstname.5000', 'lexicon\\lastname.5000'] 
companygazetteerfilenames = ['lexicon\\business.consumer_company', 'lexicon\\venture_capital.venture_funded_company', 'lexicon\\business.brand']
locationgazetteerfilenames = ['lexicon\\location.country','lexicon\\location','lexicon\\education.university','lexicon\\venues', 'lexicon\\architecture.museum']
productgazetteerfilenames =['lexicon\\product','lexicon\\business.consumer_product','lexicon\\automotive.model','lexicon\\automotive.make']
# 'lexicon\\base.events.festival_series' is listed for both the title and the
# "other" category below.
titlegazetteerfilenames = ['lexicon\\award.award','lexicon\\base.events.festival_series','lexicon\\book.newspaper', 'lexicon\\tv.tv_program']
groupgazetteerfilenames = ['lexicon\\sports.sports_team']
othergazetteerfilenames = ['lexicon\\time.holiday', 'lexicon\\time.recurring_event','lexicon\\base.events.festival_series','lexicon\\broadcast.tv_channel','lexicon\\cvg.cvg_platform','lexicon\\sports.sports_league', 'lexicon\\transportation.road', 'lexicon\\tv.tv_network']


def loadGazetteer(filenames):
    """Load one or more gazetteer files into a single set of entries.

    Each file is read as UTF-8, one entry per line. The trailing newline is
    stripped and the entry is lower-cased before it is added.

    Args:
        filenames: iterable of paths to gazetteer files.

    Returns:
        A set with the lower-cased entries of all files (an empty set when
        ``filenames`` is empty).

    Raises:
        OSError: if a file cannot be opened.
    """
    combined = set()
    for filename in filenames:
        # The context manager closes the handle as soon as the file is read.
        with open(filename, encoding="utf8") as handle:
            combined.update(line.rstrip('\n').lower() for line in handle)
    return combined

def isWordInGazette(gazetteer,word):
    """Return True when the lower-cased ``word`` is an entry of ``gazetteer``."""
    return word.lower() in gazetteer
    
def isWordGroupInGazette(gazetteer,entity,sentence):
    """Check whether ``entity`` is part of a gazetteer entry that occurs in ``sentence``.

    An entry matches when it contains ``entity`` as a substring and the entry
    itself is a substring of ``sentence``. All three sides are compared
    case-insensitively; the previous version lower-cased only ``entity``, so
    lower-cased gazetteer entries could never match capitalised sentence text.

    Args:
        gazetteer: iterable of entry strings.
        entity: the single word being looked up.
        sentence: the full post text the word came from.

    Returns:
        True if at least one entry matches, False otherwise.
    """
    # Loop-invariant normalisation is done once, not once per entry.
    needle = entity.lower()
    haystack = sentence.lower()
    for entry in gazetteer:
        candidate = entry.lower()
        if needle in candidate and candidate in haystack:
            return True
    return False

# Load every gazetteer once at notebook start-up; each name is bound to the
# collection returned by loadGazetteer for the matching file list.
persongazetteer = loadGazetteer(persongazetteerfilenames)
companygazetteer = loadGazetteer(companygazetteerfilenames)  
locationgazetteer = loadGazetteer(locationgazetteerfilenames)
productgazetteer = loadGazetteer(productgazetteerfilenames)
titlegazetteer = loadGazetteer(titlegazetteerfilenames)
groupgazetteer = loadGazetteer(groupgazetteerfilenames)
othergazetteer = loadGazetteer(othergazetteerfilenames)

In [246]:
def isURL(string):
    """Return True when ``string`` contains an http:// or https:// URL.

    The pattern is a raw string so ``\\(`` and ``\\)`` reach the regex engine
    as written, instead of being invalid string escapes. Stray spaces that had
    crept into the character classes were removed: they forced a literal space
    after characters such as ``.`` or ``_`` and allowed spaces inside a URL.
    """
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # search() stops at the first hit; only a yes/no answer is needed.
    return re.search(url_pattern, string) is not None

def isHashtagUserName(string):
    """Return True when ``string`` starts with '#' (hashtag) or '@' (user mention).

    An empty string returns False; the previous ``string[0]`` lookup raised
    IndexError for it.
    """
    return string.startswith(('#', '@'))

def isAnyDigit(s):
    """Return True if at least one character of ``s`` is a digit."""
    for character in s:
        if character.isdigit():
            return True
    return False

def shape(string):
    """Return the word shape of ``string``.

    ASCII upper-case letters become 'X', ASCII lower-case letters become 'x',
    ASCII digits become 'd'; every other character is kept unchanged.
    """
    def _mask(match):
        character = match.group(0)
        if 'A' <= character <= 'Z':
            return 'X'
        if 'a' <= character <= 'z':
            return 'x'
        return 'd'

    # One pass over the string; the class only matches the three ASCII ranges.
    return re.sub('[A-Za-z0-9]', _mask, string)

def isAbbr(string):
    """Return True when ``string`` contains an abbreviation-like run.

    A run is one ASCII upper-case letter followed by one or more characters
    that are each an upper-case letter, '.' or '&' (e.g. "USA", "U.S.", "AT&T").
    The pattern is a raw string so the regex is not affected by string-escape
    processing (the previous literal contained the invalid escape ``\\.``).
    """
    return re.search(r'[A-Z][A-Z.&]+', string) is not None

def isPostUpper(post):
    """Return True when the words of ``post`` joined together are all upper-case.

    ``post`` is a sequence of (word, label) pairs; the labels are ignored.
    """
    pieces = []
    for word, _label in post:
        pieces.append(word + " ")
    return "".join(pieces).isupper()

def isStopWord(string):
    """Return True when ``string`` is in NLTK's English stop-word list (exact match)."""
    return string in stopwords.words('english')

def poststring(post, separator=" "):
    """Concatenate the words of ``post``, appending ``separator`` after each one.

    ``post`` is a sequence of (word, label) pairs. The result therefore ends
    with ``separator`` unless the post is empty.
    """
    return "".join(word + separator for word, _label in post)

def poststringgazetteer(postwithgaz, separator=" "):
    """Concatenate the words of a gazetteer-annotated post.

    Every record must have exactly nine fields (word, label and seven
    gazetteer flags); only the word is used. ``separator`` is appended after
    each word, so a non-empty result ends with it.
    """
    pieces = []
    for record in postwithgaz:
        # Full unpacking keeps the nine-field requirement of the record format.
        (word, _label, _person, _location, _product,
         _title, _group, _other, _company) = record
        pieces.append(word + separator)
    return "".join(pieces)

def sentenceTag(post):
    """POS-tag the words of ``post`` with ``nltk.pos_tag``.

    ``post`` is a sequence of (word, label) pairs. The words are passed to the
    tagger as a token list, so the result always has one (word, tag) entry per
    input pair. The previous join-then-``split()`` round trip could change the
    token count when a word contained whitespace characters.
    """
    return nltk.pos_tag([word for word, _label in post])

def sentenceTagGazetteer(postwithgaz):
    """POS-tag the words of a gazetteer-annotated post with ``nltk.pos_tag``.

    The first field of every record is taken as the token. Tagging the token
    list directly guarantees one tag per record, which callers rely on when
    they index the result in parallel with ``postwithgaz``. The previous
    join-then-``split()`` round trip could change the token count when a word
    contained whitespace characters.
    """
    return nltk.pos_tag([record[0] for record in postwithgaz])


#gaz_wiki_place = open("gazetteer\\wikipedia_place_titles.pickle", 'rb')
#gaz_wiki_place_db = list(pickle.load(gaz_wiki_place, encoding='bytes'))
#def isPlace(key):
    #for keys in gaz_wiki_place_db:
    #    if key in keys:#
#    if key in gaz_wiki_place_db:
#        return True
#    return False

def isromannum(word):
    """Return True when every character of ``word`` is a Roman-numeral letter.

    The check is case-insensitive and purely character based (it does not
    validate numeral order). An empty string returns True.
    """
    numeral_letters = ("M", "D", "C", "L", "X", "V", "I")
    return all(character in numeral_letters for character in word.upper())

def haspunctuation(word):
    """Return True if any character of ``word`` is in ``string.punctuation``."""
    return any(character in string.punctuation for character in word)

def ispunctuation(word):
    """Return True if every character of ``word`` is in ``string.punctuation``.

    An empty string returns True.
    """
    return all(character in string.punctuation for character in word)

# Trigger words, one per line in triggerwordlist.txt, stored lower-cased.
# The context manager closes the file once it has been read.
with open("triggerwordlist.txt", encoding="utf8") as _triggerfile:
    triggerwordlist = [line.rstrip('\n').lower() for line in _triggerfile]

# Lookup set built once; the previous code rebuilt set(triggerwordlist) on
# every call. Changes made to triggerwordlist after this point are not
# reflected here.
_triggerwordset = frozenset(triggerwordlist)

def istriggerword(word):
    """Return True when the lower-cased ``word`` is one of the loaded trigger words."""
    return word.lower() in _triggerwordset

def wordtypepatterns(poststring):
    """Encode the casing pattern of a post as one character per word.

    The text is split on whitespace and each word is mapped to:
        'l'  all cased characters are lower-case
        'C'  all cased characters are upper-case
        'T'  title-cased
        '.'  made up only of punctuation characters
        'x'  anything else

    Args:
        poststring: the post text as a single string.

    Returns:
        The concatenated codes, in word order ('' for an empty post).
    """
    codes = []
    for word in poststring.split():
        if word.islower():
            codes.append("l")
        elif word.isupper():
            codes.append("C")
        elif word.istitle():
            codes.append("T")
        # Checked per character: the previous `word in string.punctuation` was
        # a substring test, so "!!" was classed "x" while "./" was classed ".".
        elif all(character in string.punctuation for character in word):
            codes.append(".")
        else:
            codes.append("x")

    return "".join(codes)

def addgazetteer(posts):
    """Attach gazetteer flags to every (word, label) pair of every post.

    The produced records have the nine-field layout the rest of the notebook
    unpacks (poststringgazetteer, word2features, post2labels):

        (word, label, persongaz, locationgaz, productgaz,
         titlegaz, groupgaz, othergaz, companygaz)

    The previous version emitted only eight fields (no company flag), so its
    output could not be consumed by those functions.

    Args:
        posts: list of posts, each a list of (word, label) pairs.

    Returns:
        A list with one list of nine-field tuples per post.
    """
    finalresult = []
    for post in posts:
        fullstring = poststring(post)
        result = []
        for word, label in post:
            result.append((
                word,
                label,
                # Person names are matched as single words ...
                isWordInGazette(persongazetteer, word),
                # ... every other category via entries found in the full post.
                isWordGroupInGazette(locationgazetteer, word, fullstring),
                isWordGroupInGazette(productgazetteer, word, fullstring),
                isWordGroupInGazette(titlegazetteer, word, fullstring),
                isWordGroupInGazette(groupgazetteer, word, fullstring),
                isWordGroupInGazette(othergazetteer, word, fullstring),
                # NOTE(review): assumes the company flag in the stored .data
                # files was computed like the other multi-word categories;
                # confirm before regenerating them.
                isWordGroupInGazette(companygazetteer, word, fullstring),
            ))
        finalresult.append(result)

    return finalresult
        

def preprocess(raw_data):
    """Parse tagged data into posts of (word, label) pairs.

    Posts are separated by a blank line ("\\n\\n"); inside a post every
    non-empty line holds a word and its label separated by a tab. A post
    without any non-empty line yields an empty list.

    Raises:
        ValueError: if a non-empty line does not split into exactly two fields.
    """
    parsed_posts = []
    for chunk in raw_data.split("\n\n"):
        pairs = []
        for row in chunk.split("\n"):
            if row == "":
                continue
            token, tag = row.split("\t")
            pairs.append((token, tag))
        parsed_posts.append(pairs)
    return parsed_posts

def preprocessnotag(raw_data):
    """Parse untagged data into posts of (word, " ") pairs.

    Posts are separated by a blank line ("\\n\\n"); every non-empty line is one
    word. Each word is paired with a single-space placeholder label.
    """
    return [
        [(token, " ") for token in chunk.split("\n") if token != ""]
        for chunk in raw_data.split("\n\n")
    ]



# REMOVE DUPLICATE POSTS

def removeDuplicate(postswithgaz):
    """Drop posts whose word sequence repeats an earlier post.

    Two posts count as duplicates when poststringgazetteer() gives the same
    string for both (labels and gazetteer flags are ignored). The first
    occurrence is kept and the original order is preserved.

    Args:
        postswithgaz: list of gazetteer-annotated posts.

    Returns:
        A new list without the duplicate posts.
    """
    # A set gives O(1) membership tests; the previous list made the whole
    # de-duplication quadratic in the number of posts.
    seen_texts = set()
    result = []
    for postwithgaz in postswithgaz:
        text = poststringgazetteer(postwithgaz)
        if text not in seen_texts:
            seen_texts.add(text)
            result.append(postwithgaz)

    return result


def postPunctuationAsNER(postwithgaz):
    """Return True when a punctuation token of the post carries a non-'O' label.

    Only the first two fields (word, label) of each record are read. Both
    checks are substring tests: the word must occur inside
    ``string.punctuation`` and the label must not occur inside 'O'.
    """
    return any(
        record[0] in string.punctuation and record[1] not in 'O'
        for record in postwithgaz
    )

def removePunctuationAsNER(postswithgaz):
    """Keep only the posts for which postPunctuationAsNER() is False."""
    return [
        candidate
        for candidate in postswithgaz
        if not postPunctuationAsNER(candidate)
    ]

In [247]:
"""train_raw_data = open("train.txt","r").read()
dev_raw_data = open("dev.txt","r").read()
test_raw_data = open("test_no_tag.txt","r",encoding="utf8").read()"""

'train_raw_data = open("train.txt","r").read()\ndev_raw_data = open("dev.txt","r").read()\ntest_raw_data = open("test_no_tag.txt","r",encoding="utf8").read()'

In [248]:
"""train_posts = preprocess(train_raw_data)
dev_posts = preprocess(dev_raw_data)
test_posts = preprocessnotag(test_raw_data)"""

'train_posts = preprocess(train_raw_data)\ndev_posts = preprocess(dev_raw_data)\ntest_posts = preprocessnotag(test_raw_data)'

In [249]:
# Load the pre-computed, gazetteer-annotated posts for the train, dev and test
# splits from pickle files in the working directory.
# NOTE: pickle.load can execute code embedded in the file; only load .data
# files that were produced by this project.
with open('train_posts_with_gazetteer.data', 'rb') as filehandle:
    # read the data as binary data stream
    train_posts_with_gazetteer = pickle.load(filehandle)   
with open('dev_posts_with_gazetteer.data', 'rb') as filehandle:
    # read the data as binary data stream
    dev_posts_with_gazetteer = pickle.load(filehandle)       
with open('test_posts_with_gazetteer.data', 'rb') as filehandle:
    # read the data as binary data stream
    test_posts_with_gazetteer = pickle.load(filehandle)

In [250]:
train_posts_with_gazetteer[0][0]

('@SammieLynnsMom', 'O', False, False, False, False, False, False, False)

In [251]:
# Clean the train and dev splits (the test split is left untouched):
# first drop posts whose text repeats an earlier post ...
train_posts_with_gazetteer = removeDuplicate(train_posts_with_gazetteer)   
dev_posts_with_gazetteer = removeDuplicate(dev_posts_with_gazetteer)

# ... then drop posts in which a punctuation token is labelled as an entity.
# NOTE(review): both steps rebind the loaded names, so the unfiltered data is
# only recoverable by re-running the loading cell.
train_posts_with_gazetteer =  removePunctuationAsNER(train_posts_with_gazetteer)  
dev_posts_with_gazetteer =  removePunctuationAsNER(dev_posts_with_gazetteer)    

In [252]:
"""%time
print(datetime.now().strftime("%Y%m%d_%H%M"))
train_posts_with_gazetteer = addgazetteer(train_posts)
with open('train_posts_with_gazetteer'+datetime.now().strftime("%Y%m%d_%H%M")+'.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(train_posts_with_gazetteer, filehandle)
print(datetime.now().strftime("%Y%m%d_%H%M"))"""

'%time\nprint(datetime.now().strftime("%Y%m%d_%H%M"))\ntrain_posts_with_gazetteer = addgazetteer(train_posts)\nwith open(\'train_posts_with_gazetteer\'+datetime.now().strftime("%Y%m%d_%H%M")+\'.data\', \'wb\') as filehandle:\n    # store the data as binary data stream\n    pickle.dump(train_posts_with_gazetteer, filehandle)\nprint(datetime.now().strftime("%Y%m%d_%H%M"))'

In [253]:
"""%time
print(datetime.now().strftime("%Y%m%d_%H%M"))
dev_posts_with_gazetteer = addgazetteer(dev_posts)
with open('dev_posts_with_gazetteer'+datetime.now().strftime("%Y%m%d_%H%M")+'.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(dev_posts_with_gazetteer, filehandle)
print(datetime.now().strftime("%Y%m%d_%H%M"))"""

'%time\nprint(datetime.now().strftime("%Y%m%d_%H%M"))\ndev_posts_with_gazetteer = addgazetteer(dev_posts)\nwith open(\'dev_posts_with_gazetteer\'+datetime.now().strftime("%Y%m%d_%H%M")+\'.data\', \'wb\') as filehandle:\n    # store the data as binary data stream\n    pickle.dump(dev_posts_with_gazetteer, filehandle)\nprint(datetime.now().strftime("%Y%m%d_%H%M"))'

In [254]:
"""%time
print(datetime.now().strftime("%Y%m%d_%H%M"))
test_posts_with_gazetteer = addgazetteer(test_posts)
with open('test_posts_with_gazetteer'+datetime.now().strftime("%Y%m%d_%H%M")+'.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(test_posts_with_gazetteer, filehandle)
print(datetime.now().strftime("%Y%m%d_%H%M"))"""

'%time\nprint(datetime.now().strftime("%Y%m%d_%H%M"))\ntest_posts_with_gazetteer = addgazetteer(test_posts)\nwith open(\'test_posts_with_gazetteer\'+datetime.now().strftime("%Y%m%d_%H%M")+\'.data\', \'wb\') as filehandle:\n    # store the data as binary data stream\n    pickle.dump(test_posts_with_gazetteer, filehandle)\nprint(datetime.now().strftime("%Y%m%d_%H%M"))'

In [316]:
def word2features(postwithgaz,i, postag, fullpost):
    """Build the CRF feature dict for the token at position ``i`` of one post.

    Args:
        postwithgaz: sequence of nine-field records
            (word, label, persongaz, locationgaz, productgaz, titlegaz,
            groupgaz, othergaz, companygaz).
        i: index of the token to describe.
        postag: sequence indexed in parallel with ``postwithgaz``; item ``k``
            must expose the POS tag of token ``k`` at position 1.
        fullpost: the post's words as one string; only used for the
            all-upper-case check.

    Returns:
        A dict mapping feature names to values. It always holds the features
        of the current token; features of the neighbours at -1/-2 and +1/+2
        are added when those positions exist, otherwise 'BOS' (first token)
        or 'EOS' (last token) is set.
    """
    # The label is unpacked but not used: features must not depend on it.
    word, label,persongaz, locationgaz, productgaz, titlegaz, groupgaz, othergaz,companygaz = postwithgaz[i]
    
    # Features of the current token.
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        #'word[-6:]': word[-6:],
        #'word[-5:]': word[-5:],
        'word[-4:]': word[-4:],
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[:4]': word[:4],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'len(word)': len(word),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.isalpha()': word.isalpha(),
        #'word.isalnum()': word.isalnum(),
        'isHashTagUserName(word)':isHashtagUserName(word),
        'istriggerword(word)':istriggerword(word),
        'isAnyDigit(word)':isAnyDigit(word),
        # Post-level flag: identical for every token of the same post.
        'isPostUpper(post)':fullpost.isupper(),
       # 'isStopWord(word)':isStopWord(word),
       # 'isAbbr(word)':isAbbr(word),
        'shape(word)':shape(word),
        'isURL(word)':isURL(word),
        'postag': postag[i][1],
        'postag[:2]': postag[i][1][:2],
       # 'isPlace(word)':isPlace(word),
        'lemma':wordnet_lemmatizer.lemmatize(word),
        # NOTE(review): 'portar' looks like a typo for 'porter'; renaming the
        # key would change the feature name seen by already-trained models.
        'stem.portar':porter.stem(word),
        'stem.lancaster':lancaster.stem(word),
        'word.isromannum':isromannum(word),
        #'word.haspunctuation':haspunctuation(word),
        'word.ispunctuation':ispunctuation(word),
        
        #'sentpattern':sentpattern,
        # Gazetteer flags are taken from the record as stored, not recomputed.
        'person.gazetteer':persongaz,
        'company.gazetteer':companygaz,
        'location.gazetteer':locationgaz,
        'product.gazetteer':productgaz,
        'title.gazetteer':titlegaz,
        'group.gazetteer':groupgaz,
        'other.gazetteer':othergaz,
    }
    # Left context: previous token, and the one before it when present.
    if i > 0:
        word1 = postwithgaz[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.isalpha()': word1.isalpha(),
            #'-1:isAnyDigit(word)':isAnyDigit(word1),
            '-1:istriggerword(word)':istriggerword(word1),
            '-1:shape(word)':shape(word1),
            '-1:isURL(word)':isURL(word1),
            '-1:word[-4:]': word1[-4:],
            '-1:word[-3:]': word1[-3:],
            '-1:word[-2:]': word1[-2:],
            '-1:word[:4]': word1[:4],
            '-1:word[:3]': word1[:3],
            '-1:word[:2]': word1[:2],
            '-1:len(word)': len(word1),
            #'-1:isHashTagUserName(word)':isHashtagUserName(word1),
            '-1:postag': postag[i-1][1],
            '-1:postag[:2]': postag[i-1][1][:2]
        })
        if i>1:
            word2 = postwithgaz[i-2][0]
            features.update({
                '-2:word.lower()': word2.lower(),
                '-2:word.istitle()': word2.istitle(),
                '-2:word.isupper()': word2.isupper(),
                '-2:word.isdigit()': word2.isdigit(),
                '-2:word.isalpha()': word2.isalpha(),
                '-2:istriggerword(word)':istriggerword(word2),
                '-2:postag': postag[i-2][1],
                '-2:postag[:2]': postag[i-2][1][:2],
                
            })
            # The string literal below is a disabled -3 context window.
            """if i>2:
                word3 = postwithgaz[i-3][0]
                features.update({
                '-3:word.lower()': word3.lower(),
                '-3:word.istitle()': word3.istitle(),
                '-3:word.isupper()': word3.isupper(),
                '-3:word.isdigit()': word3.isdigit(),
                '-3:word.isalpha()': word3.isalpha(),
                '-3:postag': postag[i-3][1],
                '-3:postag[:2]': postag[i-3][1][:2],
                })""" 
    else:
        # First token of the post.
        features['BOS'] = True

    # Right context: next token, and the one after it when present.
    # Note that word1/word2 are reused here for the following tokens.
    if i < len(postwithgaz)-1:
        word1 = postwithgaz[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.isalpha()': word1.isalpha(),
            #'+1:isAnyDigit(word)':isAnyDigit(word1),
            '+1:shape(word)':shape(word1),
            '+1:isURL(word)':isURL(word1),
            '+1:word[-4:]': word1[-4:],
            '+1:word[-3:]': word1[-3:],
            '+1:word[-2:]': word1[-2:],
            '+1:word[:4]': word1[:4],
            '+1:word[:3]': word1[:3],
            '+1:word[:2]': word1[:2],
            '+1:len(word)': len(word1),
            #'+1:isHashTagUserName(word)':isHashtagUserName(word1),
            '+1:postag': postag[i+1][1],
            '+1:postag[:2]': postag[i+1][1][:2]
        })
        if i < len(postwithgaz) - 2:
            word2 = postwithgaz[i+2][0]
            features.update({
                '+2:word.lower()': word2.lower(),
                '+2:word.istitle()': word2.istitle(),
                '+2:word.isupper()': word2.isupper(),
                '+2:word.isdigit()': word2.isdigit(),
                '+2:word.isalpha()': word2.isalpha(),
                '+2:postag': postag[i+2][1],
                '+2:postag[:2]': postag[i+2][1][:2],
            })
            # The string literal below is a disabled +3 context window.
            """if i < len(postwithgaz) - 3:
                word3= postwithgaz[i+3][0]
                features.update({
                '+3:word.lower()': word3.lower(),
                '+3:word.istitle()': word3.istitle(),
                '+3:word.isupper()': word3.isupper(),
                '+3:word.isdigit()': word3.isdigit(),
                '+3:word.isalpha()': word3.isalpha(),
                '+3:postag': postag[i+3][1],
                '+3:postag[:2]': postag[i+3][1][:2],
                })"""
    else:
        # Last token of the post.
        features['EOS'] = True

    return features


def post2features(postwithgaz):
    """Return one feature dict per token of a gazetteer-annotated post.

    The POS tags and the joined post text are computed once and shared by all
    word2features() calls for the post.
    """
    tags = sentenceTagGazetteer(postwithgaz)
    joined_text = poststringgazetteer(postwithgaz)
    feature_dicts = []
    for position in range(len(postwithgaz)):
        feature_dicts.append(word2features(postwithgaz, position, tags, joined_text))
    return feature_dicts

def post2labels(postwithgaz):
    """Return the label (second field) of every nine-field record in the post."""
    tags = []
    for record in postwithgaz:
        # Full unpacking keeps the nine-field requirement of the record format.
        (_word, tag, _person, _location, _product,
         _title, _group, _other, _company) = record
        tags.append(tag)
    return tags


In [317]:
%%time
post2features(train_posts_with_gazetteer[5])[5]

Wall time: 6 ms


{'bias': 1.0,
 'word.lower()': 'if',
 'word[-4:]': 'if',
 'word[-3:]': 'if',
 'word[-2:]': 'if',
 'word[:4]': 'if',
 'word[:3]': 'if',
 'word[:2]': 'if',
 'len(word)': 2,
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'word.isalpha()': True,
 'isHashTagUserName(word)': False,
 'istriggerword(word)': False,
 'isAnyDigit(word)': False,
 'isPostUpper(post)': False,
 'shape(word)': 'xx',
 'isURL(word)': False,
 'postag': 'IN',
 'postag[:2]': 'IN',
 'lemma': 'if',
 'stem.portar': 'if',
 'stem.lancaster': 'if',
 'word.isromannum': False,
 'word.ispunctuation': False,
 'person.gazetteer': False,
 'company.gazetteer': False,
 'location.gazetteer': False,
 'product.gazetteer': False,
 'title.gazetteer': False,
 'group.gazetteer': False,
 'other.gazetteer': False,
 '-1:word.lower()': 'this',
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:word.isdigit()': False,
 '-1:word.isalpha()': True,
 '-1:istriggerword(word)': False,
 '-1:shape(word)': 'xxxx',

In [318]:
%%time
post2labels(train_posts_with_gazetteer[5])[5]

Wall time: 0 ns


'O'

In [319]:
%%time
# Training split: one list of feature dicts and one list of labels per post.
X_train = [post2features(s) for s in train_posts_with_gazetteer]
y_train = [post2labels(s) for s in train_posts_with_gazetteer]

Wall time: 12.3 s


In [320]:
print(datetime.now().strftime("%Y%m%d_%H%M"))

20200208_1130


In [321]:
%%time
# Dev split: same feature/label extraction as for the training split.
X_dev = [post2features(s) for s in dev_posts_with_gazetteer]
y_dev = [post2labels(s) for s in dev_posts_with_gazetteer]

Wall time: 4.51 s


In [322]:
print(datetime.now().strftime("%Y%m%d_%H%M"))

20200208_1131


In [323]:
%%time
# Test split: features only; no labels are extracted for it.
X_test = [post2features(s) for s in test_posts_with_gazetteer]

Wall time: 13.1 s


In [324]:
print(datetime.now().strftime("%Y%m%d_%H%M"))

20200208_1131


In [325]:
%%time
##(0.15,0.2)
# Train the CRF on the training split with hand-picked regularisation
# (the pair noted above).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.15,  # L1 regularisation coefficient
    c2=0.2,  # L2 regularisation coefficient
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 43.5 s


CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.15, c2=0.2, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [326]:
# Persist the fitted model under a timestamped name (minute resolution).
filename = 'crf_lbfgs_gazetteer_'+datetime.now().strftime("%Y%m%d_%H%M")+'.sav'
# The context manager flushes and closes the file; the previous inline open()
# left the handle for the garbage collector to close.
with open(filename, 'wb') as model_file:
    pickle.dump(crf, model_file)

In [327]:
# Labels used for scoring: every class the model learned except 'O', so the
# metrics below are computed over the entity labels only.
# list.remove raises ValueError if 'O' is not among the classes.
labels = list(crf.classes_)
labels.remove('O')
#labels

In [328]:
#0.9397331037451572

# Predict the dev split and report the token-level, support-weighted F1 over
# the entity labels only ('O' was removed from `labels`).
y_pred = crf.predict(X_dev)
metrics.flat_f1_score(y_dev, y_pred,
                      average='weighted', labels=labels)
#metrics.flat_accuracy_score(y_dev, y_pred)

0.30853797234946345

In [329]:
# Predicted label sequences for the test split, one list per post in X_test.
test_pred = crf.predict(X_test)

In [330]:
def savepredictions(test_posts, test_pred, filename):
    """Write predicted labels to ``filename``, one "word label" line per token.

    Posts are separated by an empty line. The file is written as UTF-8.

    Args:
        test_posts: list of posts; each token record must have the word as its
            first field. Both (word, label) pairs and the nine-field gazetteer
            records are accepted (the previous version only accepted pairs).
        test_pred: predicted labels, one list per post, aligned with
            ``test_posts``.
        filename: path of the output file.

    Raises:
        IndexError: if ``test_pred`` has fewer posts, or a post has fewer
            labels, than ``test_posts``.
    """
    # The context manager closes the file even if writing fails half-way.
    with open(filename, "w", encoding="utf8") as file:
        for j, post in enumerate(test_posts):
            predpostlabel = test_pred[j]
            for i, record in enumerate(post):
                # Only the word is needed; any further fields are ignored.
                file.write(record[0] + " " + predpostlabel[i] + "\n")
            file.write("\n")
    
    
# test_pred was predicted from test_posts_with_gazetteer, so the words must
# come from that same list. `test_posts` is never defined on a fresh kernel
# (the cell that created it is disabled). Records are reduced to (word, label)
# pairs, the shape savepredictions expects.
savepredictions(
    [[(record[0], record[1]) for record in post] for post in test_posts_with_gazetteer],
    test_pred,
    filename="test_prediction_"+datetime.now().strftime("%Y%m%d_%H%M")+".txt",
)

In [331]:
"""precision    recall  f1-score   support

      person      0.588     0.429     0.496       266
       title      0.400     0.062     0.108        32
    location      0.616     0.430     0.506       235
     company      0.462     0.122     0.194        49
     product      0.350     0.044     0.079       158
       group      0.333     0.025     0.047       159
       other      0.214     0.183     0.198       229

   micro avg      0.457     0.245     0.319      1128
   macro avg      0.423     0.185     0.232      1128
weighted avg      0.438     0.245     0.292      1128"""
# group B and I results
# Sorting by (text after the first character, first character) places the
# B- and I- label of the same entity type next to each other in the report.
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

   B-company      0.636     0.179     0.280        39
   I-company      0.500     0.100     0.167        10
     B-group      1.000     0.020     0.039       100
     I-group      0.000     0.000     0.000        43
  B-location      0.609     0.397     0.481       141
  I-location      0.650     0.388     0.486        67
     B-other      0.353     0.137     0.198       131
     I-other      0.189     0.247     0.214        93
    B-person      0.622     0.418     0.500       165
    I-person      0.607     0.622     0.614        82
   B-product      0.800     0.129     0.222        31
   I-product      0.500     0.028     0.053        72
     B-title      0.200     0.062     0.095        16
     I-title      0.200     0.083     0.118        12

   micro avg      0.489     0.260     0.340      1002
   macro avg      0.490     0.201     0.248      1002
weighted avg      0.541     0.260     0.309      1002



In [332]:
import seqeval.metrics  as seqevalmetrics
seqevalmetrics.f1_score(y_dev, y_pred)

0.3303769401330377

In [333]:
seqevalmetrics.accuracy_score(y_dev, y_pred)

0.946106648111835

In [334]:
print(seqevalmetrics.classification_report(y_dev, y_pred, digits=3))

           precision    recall  f1-score   support

  product      0.400     0.065     0.111        31
   person      0.613     0.412     0.493       165
    group      1.000     0.020     0.039       100
    other      0.302     0.122     0.174       131
 location      0.576     0.376     0.455       141
  company      0.636     0.179     0.280        39
    title      0.200     0.062     0.095        16

micro avg      0.534     0.239     0.330       623
macro avg      0.582     0.239     0.302       623



In [None]:
# define fixed parameters and parameters to search
# NOTE(review): this rebinds `crf`, replacing the fitted model from the
# training cell with an unfitted estimator; later cells that read
# crf.state_features_ or pass `crf` to eli5 depend on which cell ran last.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
# c1 and c2 are sampled from exponential distributions with these scales.
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation //flat_f1_score, average='weighted', labels=labels
# NOTE(review): despite its name, f1_scorer wraps flat_accuracy_score, so the
# search optimises token accuracy (which includes 'O'), not the weighted F1
# reported elsewhere in this notebook. Confirm this is intended.
f1_scorer = make_scorer(metrics.flat_accuracy_score)

# search
# NOTE(review): no random_state is passed, so the 50 sampled candidates are
# presumably different on every run.
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

In [None]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
#filename = "crf_best_estimator_3_fold_"+datetime.now().strftime("%Y%m%d_%H%M")++".sav"
#pickle.dump(rs.best_estimator_, open(filename, 'wb'))



In [None]:
# Best model found by the randomized search.
crf_best = rs.best_estimator_

# sklearn.externals.joblib was removed in scikit-learn 0.23, so the standalone
# joblib package (a scikit-learn dependency) is imported directly. The old
# location is kept only as a fallback for legacy environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(crf_best, "crf_best_estimator_3_fold_"+datetime.now().strftime("%Y%m%d_%H%M")+'.pkl')

# Per-label report of the best model on the dev split (overwrites y_pred).
y_pred = crf_best.predict(X_dev)
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=sorted_labels, digits=3
))

In [None]:
# Predict the test split with the tuned model and save the result.
test_pred_best = crf_best.predict(X_test)
# X_test was built from test_posts_with_gazetteer, so the words must come from
# that same list. `test_posts` is never defined on a fresh kernel (the cell
# that created it is disabled). Records are reduced to (word, label) pairs,
# the shape savepredictions expects.
savepredictions(
    [[(record[0], record[1]) for record in post] for post in test_posts_with_gazetteer],
    test_pred_best,
    filename="test_prediction_3_gazetteer_best_"+datetime.now().strftime("%Y%m%d_%H%M")+".txt",
)

In [238]:
from collections import Counter
def print_state_features(state_features):
    """Print one "weight label attribute" line per state feature.

    ``state_features`` is an iterable of ((attribute, label), weight) items.
    The weight is printed with six decimals and the label is left-aligned in
    a field of eight characters.
    """
    for key, weight in state_features:
        attr, label = key
        print("{:0.6f} {!s:<8} {!s}".format(weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
4.742212 O        word.ispunctuation
3.054434 O        bias
2.868522 O        BOS
2.801316 O        isHashTagUserName(word)
2.502802 O        shape(word):xx
2.479192 O        EOS
2.367502 B-company company.gazetteer
2.147256 O        word[-3:]:day
2.127316 B-product word[:2]:iP
1.965681 O        shape(word):x
1.897500 O        postag[:2]:PR
1.860392 B-location word[-3:]:nia
1.858355 B-person person.gazetteer
1.857844 O        shape(word):xxx
1.851765 O        shape(word):xxxxx
1.829012 I-person person.gazetteer
1.795681 O        word[-2:]:ed
1.701880 B-person word[:2]:Je
1.682853 B-other  word[-2:]:GP
1.665031 B-company word.lower():twitter
1.665031 B-company stem.portar:twitter
1.609379 B-other  word[-3:]:mas
1.601490 O        shape(word):Xx
1.565082 B-person word[:2]:Jo
1.564179 O        shape(word):X
1.560749 I-location -2:word.lower():at
1.555447 B-location shape(word):XX
1.550178 B-location word[-2:]:ia
1.516298 B-location word[:2]:CA
1.496681 B-product product.gazet

In [240]:
import eli5

# Render the CRF's transition and per-label state-feature weights; `top` is
# given as a pair so the positive and negative limits are each five
# (see the eli5 documentation for the exact semantics of `top`).
eli5.show_weights(
    crf,
    top=(5, 5),
)



From \ To,O,B-company,I-company,B-group,I-group,B-location,I-location,B-other,I-other,B-person,I-person,B-product,I-product,B-title,I-title
O,2.024,0.574,-2.328,0.057,-2.908,0.291,-3.227,0.343,-4.044,1.096,-3.042,0.254,-2.866,0.305,-3.013
B-company,0.031,-0.135,3.844,0.0,-0.056,0.269,-0.282,0.0,-0.535,-0.163,-0.294,-0.003,-0.066,0.0,-0.037
I-company,-0.187,0.0,3.007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-group,-0.631,0.0,0.0,0.0,5.381,0.0,-0.006,0.0,-0.392,0.0,-0.056,0.0,0.0,0.0,0.0
I-group,-0.028,0.0,0.0,-0.168,4.051,0.0,-0.162,0.0,-0.278,-0.049,-0.091,0.0,0.0,0.0,0.0
B-location,-0.125,-0.102,-0.278,-0.21,-0.474,0.144,4.812,-0.315,-1.114,-0.118,-0.615,-0.009,-0.346,0.0,-0.213
I-location,-0.185,0.0,0.0,0.002,0.0,0.0,3.66,0.0,-0.68,0.0,-0.341,0.0,-0.0,0.0,-0.007
B-other,-0.855,-0.0,0.0,0.0,-0.0,0.0,-0.244,0.0,4.971,0.0,-0.306,0.0,-0.158,0.0,0.0
I-other,-0.464,-0.164,0.0,-0.436,-0.285,-0.481,-0.345,0.0,4.533,0.0,-0.295,-0.0,-0.21,0.0,-0.241
B-person,-0.396,-0.134,-0.518,-0.116,-0.457,-0.155,-0.887,0.0,-1.015,-0.673,5.102,-0.162,-0.607,0.0,-0.62

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14
+4.742,word.ispunctuation,,,,,,,,,,,,,
+3.054,bias,,,,,,,,,,,,,
+2.869,BOS,,,,,,,,,,,,,
+2.801,isHashTagUserName(word),,,,,,,,,,,,,
+2.503,shape(word):xx,,,,,,,,,,,,,
… 5825 more positive …,… 5825 more positive …,,,,,,,,,,,,,
… 4073 more negative …,… 4073 more negative …,,,,,,,,,,,,,
-1.453,word[:2]:Ki,,,,,,,,,,,,,
-1.757,word[:2]:iP,,,,,,,,,,,,,
-1.761,+2:word.lower():by,,,,,,,,,,,,,

Weight?,Feature
+4.742,word.ispunctuation
+3.054,bias
+2.869,BOS
+2.801,isHashTagUserName(word)
+2.503,shape(word):xx
… 5825 more positive …,… 5825 more positive …
… 4073 more negative …,… 4073 more negative …
-1.453,word[:2]:Ki
-1.757,word[:2]:iP
-1.761,+2:word.lower():by

Weight?,Feature
+2.368,company.gazetteer
+1.665,stem.portar:twitter
+1.665,word.lower():twitter
+1.381,stem.lancaster:facebook
+1.381,stem.portar:facebook
… 1883 more positive …,… 1883 more positive …
… 145 more negative …,… 145 more negative …
-0.735,location.gazetteer
-0.771,-2:postag:VBN
-0.807,person.gazetteer

Weight?,Feature
+0.966,-1:word.istitle()
+0.792,-1:word[:2]:Se
+0.759,shape(word):xxxx
+0.726,-1:shape(word):Xxxxx
+0.575,-2:word.lower():with
… 727 more positive …,… 727 more positive …
… 34 more negative …,… 34 more negative …
-0.166,+3:postag[:2]:IN
-0.278,+1:postag[:2]:VB
-0.300,+3:word.isupper()

Weight?,Feature
+1.370,word[-2:]:ks
+1.152,postag:NNS
+1.133,group.gazetteer
+1.002,word.isupper()
+0.932,+1:word[:3]:Bro
… 1658 more positive …,… 1658 more positive …
… 116 more negative …,… 116 more negative …
-0.393,postag:NN
-0.401,-2:postag[:2]:PR
-0.463,-1:word.isalpha()

Weight?,Feature
+1.154,word.istitle()
+1.088,-3:word.lower():vs
+0.986,-1:shape(word):xxxxx
+0.862,-1:word.lower():dj
+0.861,-1:shape(word):XX
… 1245 more positive …,… 1245 more positive …
… 39 more negative …,… 39 more negative …
-0.250,+2:postag:IN
-0.250,+2:postag[:2]:IN
-0.256,+3:word.isalpha()

Weight?,Feature
+1.860,word[-3:]:nia
+1.555,shape(word):XX
+1.550,word[-2:]:ia
+1.516,word[:2]:CA
+1.486,word.lower():uk
… 2953 more positive …,… 2953 more positive …
… 297 more negative …,… 297 more negative …
-0.878,shape(word):X
-0.881,-1:word.istitle()
-0.990,shape(word):Xxx

Weight?,Feature
+1.561,-2:word.lower():at
+1.149,word[:2]:Pa
+1.023,-3:word.lower():the
+0.990,-3:word.lower():at
+0.900,+1:shape(word):XXXX
… 1434 more positive …,… 1434 more positive …
… 119 more negative …,… 119 more negative …
-0.490,isAnyDigit(word)
-0.601,+3:postag:VBP
-0.645,-1:shape(word):Xxxxxxxxxx

Weight?,Feature
+1.683,word[-2:]:GP
+1.609,word[-3:]:mas
+1.223,shape(word):XXX
+1.093,shape(word):XxxxXxxxx
+1.030,-1:word[:2]:#n
… 2364 more positive …,… 2364 more positive …
… 176 more negative …,… 176 more negative …
-0.587,location.gazetteer
-0.605,-1:word.istitle()
-0.749,+1:shape(word):xxxxx

Weight?,Feature
+1.005,-1:word[-2:]:on
+0.965,word[-3:]:day
+0.931,stem.lancaster:fest
+0.870,-1:shape(word):xx
+0.856,-1:word[-2:]:st
… 2470 more positive …,… 2470 more positive …
… 176 more negative …,… 176 more negative …
-0.500,-1:word[:2]:Da
-0.654,word.ispunctuation
-0.783,-1:word[:3]:Day

Weight?,Feature
+1.858,person.gazetteer
+1.702,word[:2]:Je
+1.565,word[:2]:Jo
+1.440,word[-2:]:ie
+1.193,word[-2:]:en
… 3542 more positive …,… 3542 more positive …
… 350 more negative …,… 350 more negative …
-0.668,+1:shape(word):xxx
-0.810,-1:word.istitle()
-0.827,shape(word):Xx

Weight?,Feature
+1.829,person.gazetteer
+1.001,word[-2:]:on
+0.864,word[-2:]:ey
+0.842,-1:word[:2]:Do
+0.793,-1:word[:2]:An
… 1474 more positive …,… 1474 more positive …
… 166 more negative …,… 166 more negative …
-0.471,+1:shape(word):xxxx
-0.581,+2:postag:NN
-0.616,shape(word):xxxx

Weight?,Feature
+2.127,word[:2]:iP
+1.497,product.gazetteer
+1.027,-2:word.lower():antivirus
+0.996,+3:word.lower():buy
+0.959,word[:4]:iPho
… 1518 more positive …,… 1518 more positive …
… 101 more negative …,… 101 more negative …
-0.436,shape(word):Xxxxx
-0.464,-3:postag[:2]:PR
-0.712,location.gazetteer

Weight?,Feature
+0.909,word[-2:]:ld
+0.837,-1:word[-2:]:er
+0.735,-1:word[:2]:se
+0.727,word[-3:]:ter
+0.725,+2:word.lower():by
… 1125 more positive …,… 1125 more positive …
… 42 more negative …,… 42 more negative …
-0.200,-2:word.istitle()
-0.224,+2:postag:NN
-0.288,+2:word.isupper()

Weight?,Feature
+0.952,-1:word.lower():watch
+0.893,-2:word.lower():watching
+0.742,-2:word.lower():!
+0.733,+1:word[-2:]:re
+0.659,+1:word[-2:]:wn
… 1049 more positive …,… 1049 more positive …
… 70 more negative …,… 70 more negative …
-0.375,-1:word.isalpha()
-0.710,+1:shape(word):Xxxxxx
-0.712,-1:word.istitle()

Weight?,Feature
+1.191,other.gazetteer
+0.882,+1:shape(word):d
+0.795,-1:postag:NN
+0.705,-2:postag[:2]:VB
+0.688,+3:postag:VBP
… 1155 more positive …,… 1155 more positive …
… 53 more negative …,… 53 more negative …
-0.335,+3:postag:NNP
-0.386,-1:shape(word):Xxxxx
-0.481,postag:NNP


In [241]:
# Restrict the per-label tables to the boolean `word.is*` features.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped literal dot; in a normal string literal `\.` is an invalid escape
# sequence and triggers a DeprecationWarning/SyntaxWarning on recent Pythons.
eli5.show_weights(crf, top=(10,10), feature_re=r'^word\.is',
                  horizontal_layout=False, show=['targets'])



Weight?,Feature
4.742,word.ispunctuation
0.105,word.isromannum
0.04,word.isdigit()
-0.171,word.isalpha()
-0.314,word.isupper()
-0.635,word.istitle()

Weight?,Feature
0.098,word.isalpha()
-0.31,word.istitle()
-0.419,word.isupper()

Weight?,Feature
0.464,word.istitle()
-0.034,word.isalpha()
-0.149,word.isupper()

Weight?,Feature
1.002,word.isupper()
0.592,word.istitle()
-0.012,word.isalpha()
-0.098,word.isromannum

Weight?,Feature
1.154,word.istitle()
0.002,word.isalpha()
-0.263,word.isupper()

Weight?,Feature
0.606,word.isupper()
0.598,word.istitle()
0.42,word.isromannum
0.011,word.isalpha()
-0.358,word.isdigit()

Weight?,Feature
0.148,word.istitle()
0.021,word.isupper()
-0.041,word.isalpha()
-0.102,word.isdigit()
-0.303,word.ispunctuation

Weight?,Feature
0.851,word.isupper()
0.001,word.isalpha()
-0.0,word.isromannum
-0.026,word.istitle()

Weight?,Feature
0.207,word.isdigit()
0.149,word.isromannum
0.06,word.istitle()
0.058,word.isalpha()
-0.002,word.isupper()
-0.654,word.ispunctuation

Weight?,Feature
0.338,word.istitle()
0.006,word.isalpha()
-0.46,word.isupper()
-0.577,word.isromannum

Weight?,Feature
0.51,word.isromannum
0.253,word.isupper()
0.1,word.istitle()
-0.036,word.isalpha()
-0.219,word.ispunctuation

Weight?,Feature
0.005,word.isalpha()
-0.008,word.istitle()
-0.12,word.isdigit()
-0.122,word.isupper()

Weight?,Feature
0.17,word.isdigit()
0.107,word.isupper()
0.004,word.isalpha()
-0.182,word.istitle()

Weight?,Feature
0.251,word.istitle()
0.006,word.isupper()
-0.006,word.isalpha()

Weight?,Feature
0.21,word.isdigit()
0.012,word.istitle()
0.0,word.isupper()
-0.007,word.isalpha()
-0.323,word.ispunctuation


In [242]:
# Restrict the per-label tables to the lower-cased token features
# (`word.lower():...` and its `-N:` / `+N:` context variants).
# The dots and parentheses are escaped so the pattern matches the literal text
# ".lower()"; unescaped, '.' matches any character and would also select
# features whose value merely contains "lower" (e.g. a stem of "flowers").
eli5.show_weights(crf, top=(5,5), feature_re=r'\.lower\(\)',
                  horizontal_layout=False, show=['targets'])



Weight?,Feature
+1.272,-3:word.lower():get
+0.993,-2:word.lower():wintor
+0.980,-3:word.lower():anna
+0.979,-2:word.lower():&amp;
+0.872,+2:word.lower():of
… 1302 more positive …,… 1302 more positive …
… 962 more negative …,… 962 more negative …
-0.951,-3:word.lower():da
-0.957,-2:word.lower():big
-0.992,-2:word.lower():happy

Weight?,Feature
+1.665,word.lower():twitter
+1.381,word.lower():facebook
+1.056,-3:word.lower():i
+1.021,-2:word.lower():updates
+0.968,-2:word.lower():win
… 421 more positive …,… 421 more positive …
… 8 more negative …,… 8 more negative …
-0.093,+1:word.lower():.
-0.097,-1:word.lower():the
-0.114,+2:word.lower():the

Weight?,Feature
+0.575,-2:word.lower():with
+0.508,+3:word.lower():you
+0.312,word.lower():city
+0.290,-2:word.lower():.
+0.253,+1:word.lower():enlists
… 144 more positive …,… 144 more positive …

Weight?,Feature
+0.912,-1:word.lower():go
+0.831,-2:word.lower():vs
+0.823,+3:word.lower():can't
+0.820,+3:word.lower():only
+0.795,word.lower():dj
… 375 more positive …,… 375 more positive …
… 6 more negative …,… 6 more negative …
-0.095,"+3:word.lower():,"
-0.096,-1:word.lower():the
-0.115,-3:word.lower()::

Weight?,Feature
+1.088,-3:word.lower():vs
+0.862,-1:word.lower():dj
+0.722,-2:word.lower():kings
+0.562,-1:word.lower():green
+0.438,-2:word.lower():.
… 263 more positive …,… 263 more positive …
-0.053,-2:word.lower()::

Weight?,Feature
+1.486,word.lower():uk
+1.393,-2:word.lower():in
+1.091,-1:word.lower():at
+0.986,+3:word.lower():weekend
+0.973,+2:word.lower():where
… 703 more positive …,… 703 more positive …
… 39 more negative …,… 39 more negative …
-0.225,-2:word.lower():f
-0.234,+2:word.lower():is
-0.260,-3:word.lower():no

Weight?,Feature
+1.561,-2:word.lower():at
+1.023,-3:word.lower():the
+0.990,-3:word.lower():at
+0.896,-1:word.lower():new
+0.802,word.lower():lounge
… 309 more positive …,… 309 more positive …
… 2 more negative …,… 2 more negative …
-0.014,-3:word.lower():be
-0.050,+1:word.lower():(
-0.066,+1:word.lower()::

Weight?,Feature
+0.997,+3:word.lower():week
+0.933,+3:word.lower()::
+0.852,-2:word.lower():be
+0.828,word.lower():christmas
+0.745,+3:word.lower():tomorrow
… 556 more positive …,… 556 more positive …
… 16 more negative …,… 16 more negative …
-0.193,+2:word.lower():.
-0.224,-1:word.lower():of
-0.239,+2:word.lower():-

Weight?,Feature
+0.845,-3:word.lower():today
+0.811,"-2:word.lower():"""
+0.733,-2:word.lower():until
+0.719,-3:word.lower():-
+0.629,-3:word.lower():of
… 562 more positive …,… 562 more positive …
… 14 more negative …,… 14 more negative …
-0.068,-3:word.lower():the
-0.101,-2:word.lower()::
-0.116,-2:word.lower():from

Weight?,Feature
+1.187,word.lower():pope
+0.932,word.lower():taylor
+0.927,+2:word.lower():coming
+0.925,-3:word.lower():just
+0.884,+2:word.lower():had
… 845 more positive …,… 845 more positive …
… 36 more negative …,… 36 more negative …
-0.442,-1:word.lower():in
-0.461,+2:word.lower():.
-0.485,+3:word.lower():i

Weight?,Feature
+0.590,-3:word.lower():)
+0.480,word.lower():bieber
+0.461,-2:word.lower():by
+0.346,+2:word.lower():on
+0.346,+3:word.lower():gwen
… 308 more positive …,… 308 more positive …
… 4 more negative …,… 4 more negative …
-0.040,+1:word.lower():as
-0.044,+1:word.lower():.
-0.078,-2:word.lower():and

Weight?,Feature
+1.027,-2:word.lower():antivirus
+0.996,+3:word.lower():buy
+0.883,word.lower():ipad
+0.747,word.lower():xbox
+0.728,word.lower():ipod
… 328 more positive …,… 328 more positive …
-0.000,+3:word.lower():.
-0.001,+3:word.lower():...
-0.007,-1:word.lower():to
-0.033,"-1:word.lower():,"

Weight?,Feature
+0.725,+2:word.lower():by
+0.468,-3:word.lower():my
+0.445,word.lower():penguin
+0.444,+3:word.lower():i
+0.438,+3:word.lower():by
… 236 more positive …,… 236 more positive …
-0.007,+1:word.lower()::
-0.013,-2:word.lower():the
-0.089,+2:word.lower():in

Weight?,Feature
+0.952,-1:word.lower():watch
+0.893,-2:word.lower():watching
+0.742,-2:word.lower():!
+0.624,+3:word.lower():gulp
+0.607,-2:word.lower():story
… 226 more positive …,… 226 more positive …
-0.000,-1:word.lower()::
-0.001,+3:word.lower():.
-0.004,-1:word.lower():.
-0.094,"+2:word.lower():"""

Weight?,Feature
+0.617,-3:word.lower():!
+0.530,+2:word.lower():i
+0.515,+2:word.lower():week
+0.486,-3:word.lower():with
+0.435,-3:word.lower():video
… 237 more positive …,… 237 more positive …
-0.027,+2:word.lower():in
-0.042,-2:word.lower():the
-0.048,+1:word.lower():.


In [243]:
# Restrict the per-label tables to the gazetteer membership features
# (`person.gazetteer`, `company.gazetteer`, ...).
# The leading dot is escaped so it matches a literal '.' rather than acting as
# a regex wildcard that accepts any character before "gazetteer".
eli5.show_weights(crf, top=(5,5), feature_re=r'\.gazetteer',
                  horizontal_layout=False, show=['targets'])



Weight?,Feature
0.532,title.gazetteer
0.013,other.gazetteer
-0.221,location.gazetteer
-0.449,group.gazetteer
-0.651,person.gazetteer
-0.69,product.gazetteer
-0.791,company.gazetteer

Weight?,Feature
2.368,company.gazetteer
-0.735,location.gazetteer
-0.807,person.gazetteer

Weight?,Feature
0.053,other.gazetteer
0.039,company.gazetteer

Weight?,Feature
1.133,group.gazetteer
0.585,title.gazetteer
0.029,location.gazetteer
-0.008,person.gazetteer

Weight?,Feature
0.532,group.gazetteer
0.064,location.gazetteer
0.018,person.gazetteer
-0.092,company.gazetteer

Weight?,Feature
1.134,location.gazetteer
0.537,person.gazetteer
-0.013,group.gazetteer
-0.103,other.gazetteer
-0.134,company.gazetteer
-0.595,title.gazetteer

Weight?,Feature
0.622,person.gazetteer
0.441,location.gazetteer
-0.031,other.gazetteer
-0.043,title.gazetteer
-0.133,company.gazetteer
-0.194,product.gazetteer

Weight?,Feature
-0.047,person.gazetteer
-0.15,title.gazetteer
-0.308,product.gazetteer
-0.587,location.gazetteer

Weight?,Feature
0.69,title.gazetteer
0.227,location.gazetteer
0.215,company.gazetteer
0.135,person.gazetteer
0.011,product.gazetteer

Weight?,Feature
1.858,person.gazetteer
0.637,location.gazetteer
0.104,title.gazetteer
-0.152,other.gazetteer
-0.301,product.gazetteer

Weight?,Feature
1.829,person.gazetteer
0.378,location.gazetteer
0.31,group.gazetteer
0.21,product.gazetteer
-0.261,title.gazetteer

Weight?,Feature
1.497,product.gazetteer
0.016,other.gazetteer
-0.329,person.gazetteer
-0.712,location.gazetteer

Weight?,Feature
0.359,title.gazetteer
0.238,product.gazetteer
0.134,location.gazetteer
0.004,person.gazetteer
0.001,other.gazetteer

Weight?,Feature
0.0,person.gazetteer
0.0,title.gazetteer
-0.049,location.gazetteer

Weight?,Feature
1.191,other.gazetteer
0.458,person.gazetteer
0.18,location.gazetteer
