In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import nltk
from itertools import chain

import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import re

from nltk.corpus import stopwords
import pickle

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer\

import string
from datetime import datetime

In [3]:
# Shared NLTK normalisers (one lemmatizer, two stemmers).
wordnet_lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
lancaster = LancasterStemmer()

# Lexicon files grouped by the entity type they are meant to support.
persongazetteerfilenames = [
    'lexicon\\people.person.lastnames.modified',
    'lexicon\\people.family_name',
    'lexicon\\firstname.5000',
    'lexicon\\lastname.5000',
]
companygazetteerfilenames = [
    'lexicon\\business.consumer_company',
    'lexicon\\venture_capital.venture_funded_company',
    'lexicon\\business.brand',
]
locationgazetteerfilenames = [
    'lexicon\\location.country',
    'lexicon\\location',
    'lexicon\\education.university',
    'lexicon\\venues',
    'lexicon\\architecture.museum',
]
productgazetteerfilenames = [
    'lexicon\\product',
    'lexicon\\business.consumer_product',
]
#,'lexicon\\automotive.model','lexicon\\automotive.make'
titlegazetteerfilenames = [
    'lexicon\\award.award',
    'lexicon\\base.events.festival_series',
    'lexicon\\book.newspaper',
    'lexicon\\tv.tv_program',
]
groupgazetteerfilenames = [
    'lexicon\\sports.sports_team',
]
othergazetteerfilenames = [
    'lexicon\\time.holiday',
    'lexicon\\time.recurring_event',
]

# other - 'lexicon\\base.events.festival_series','lexicon\\broadcast.tv_channel','lexicon\\cvg.cvg_platform','lexicon\\sports.sports_league', 
#           , 'lexicon\\transportation.road', 'lexicon\\tv.tv_network'

def loadGazetteer(filenames):
    """Read several lexicon files into one set of lower-cased entries.

    Parameters
    ----------
    filenames : iterable of str
        Paths of UTF-8 text files holding one entry per line.

    Returns
    -------
    set of str
        Every distinct line of every file, with the trailing newline removed
        and lower-cased. An empty set when no filenames are given.

    Raises
    ------
    OSError
        If one of the files cannot be opened.
    """
    combinedlist = set()
    for filename in filenames:
        # The context manager closes each lexicon file as soon as it is read.
        with open(filename, encoding="utf8") as gazetteerfile:
            combinedlist.update(line.rstrip('\n').lower() for line in gazetteerfile)
    return combinedlist

def isWordInGazette(gazetteer,word):
    """Return True when the lower-cased word is an entry of the gazetteer."""
    return word.lower() in gazetteer
    
def isWordGroupInGazette(gazetteer,entity,sentence):
    """Return True when some gazetteer entry both contains the lower-cased
    entity and itself occurs inside the sentence."""
    return any(
        entity.lower() in entry and entry in sentence
        for entry in gazetteer
    )

# Gazetteer loading is currently disabled: the calls below sit inside a string
# literal, so none of the lexicon files are read.
"""persongazetteer = loadGazetteer(persongazetteerfilenames)
companygazetteer = loadGazetteer(companygazetteerfilenames)  
locationgazetteer = loadGazetteer(locationgazetteerfilenames)
productgazetteer = loadGazetteer(productgazetteerfilenames)
titlegazetteer = loadGazetteer(titlegazetteerfilenames)
groupgazetteer = loadGazetteer(groupgazetteerfilenames)
othergazetteer = loadGazetteer(othergazetteerfilenames)"""


# Trigger words, one lower-cased entry per line of the file. The context
# manager closes the file once the list is built.
with open("triggerwordlist.txt", encoding="utf8") as triggerwordfile:
    triggerwordlist = [line.rstrip('\n').lower() for line in triggerwordfile]

In [4]:
def isURL(string):
    """Return True when the text contains an http:// or https:// URL.

    The pattern is a raw string so that the escaped parentheses are not
    treated as (invalid) string escapes, and it carries no stray spaces, so a
    symbol such as '.' or '_' right after the scheme counts as part of the URL
    without having to be followed by a blank.
    """
    urlpattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # A single search is enough; there is no need to collect every match.
    return re.search(urlpattern, string) is not None

def isHashtagUserName(string):
    """Return True when the token starts with '#' (hashtag) or '@' (user name).

    An empty token yields False instead of raising IndexError.
    """
    return string.startswith(('#', '@'))

def isAnyDigit(s):
    """Return True when at least one character of s is a digit."""
    for character in s:
        if character.isdigit():
            return True
    return False

def shape(string):
    """Collapse a token to its character-class outline.

    ASCII upper-case letters become 'X', ASCII lower-case letters 'x' and
    ASCII digits 'd'; every other character is kept unchanged.
    """
    def mask(match):
        character = match.group(0)
        if character.isdigit():
            return 'd'
        return 'X' if character.isupper() else 'x'

    # One pass over the ASCII alphanumerics; the class itself restricts the
    # substitution to the ranges A-Z, a-z and 0-9.
    return re.sub('[A-Za-z0-9]', mask, string)

def isAbbr(string):
    """Return True when the text looks like an abbreviation.

    A match is an ASCII upper-case letter followed by one or more upper-case
    letters, dots or ampersands, anywhere in the text (e.g. "U.S.A", "AT&T").
    The pattern is a raw string so that the escaped dot is not an invalid
    string escape.
    """
    return re.search(r'[A-Z]([A-Z]|\.|&)+', string) is not None

def isPostUpper(post):
    """Return True when the words of the post, joined by spaces, are upper-case.

    post is a sequence of (word, label) pairs; the labels are ignored.
    """
    pieces = []
    for word, label in post:
        pieces.append(word + " ")
    return "".join(pieces).isupper()

# English stop words, loaded on first use so the corpus is read only once
# instead of once per token.
_englishstopwords = None

def isStopWord(string):
    """Return True when the token is in NLTK's English stop-word list.

    The comparison is case-sensitive, exactly like a direct membership test
    against stopwords.words('english').
    """
    global _englishstopwords
    if _englishstopwords is None:
        # Built lazily so defining the function does not touch the corpus.
        _englishstopwords = frozenset(stopwords.words('english'))
    return string in _englishstopwords

def poststring(post, separator=" "):
    """Concatenate the words of a post, each one followed by the separator.

    post is a sequence of (word, label) pairs. The result therefore ends with
    a trailing separator whenever the post is not empty.
    """
    return "".join(word + separator for word, label in post)

def sentenceTag(post):
    """POS-tag the words of a post with nltk.pos_tag.

    post is a sequence of (word, label) pairs. The words are passed to the
    tagger as they are, so the result has exactly one (word, tag) entry per
    token and stays index-aligned with the post. Joining the words into a
    string and splitting it again would change the token count for any word
    that contains whitespace characters.

    NOTE(review): behaviour of the tagger on empty-string tokens is not
    verified here; preprocess() can in principle produce them from a line
    that starts with a tab.
    """
    return nltk.pos_tag([word for word, label in post])

#gaz_wiki_place = open("gazetteer\\wikipedia_place_titles.pickle", 'rb')
#gaz_wiki_place_db = list(pickle.load(gaz_wiki_place, encoding='bytes'))
#def isPlace(key):
    #for keys in gaz_wiki_place_db:
    #    if key in keys:#
#    if key in gaz_wiki_place_db:
#        return True
#    return False

def isromannum(word):
    """Return True when every letter of the upper-cased word is a Roman
    numeral symbol (M, D, C, L, X, V or I).

    Only the alphabet is checked, not the numeral grammar, and an empty word
    yields True.
    """
    romansymbols = ("M", "D", "C", "L", "X", "V", "I")
    return all(letter in romansymbols for letter in word.upper())

def haspunctuation(word):
    """Return True when the word contains at least one character of
    string.punctuation."""
    return any(character in string.punctuation for character in word)

def ispunctuation(word):
    """Return True when every character of the word is in string.punctuation.

    An empty word yields True because there is no character to reject.
    """
    return all(character in string.punctuation for character in word)

# Trigger words, one lower-cased entry per line of the file. The context
# manager closes the file once the list is built.
with open("triggerwordlist.txt", encoding="utf8") as triggerwordfile:
    triggerwordlist = [line.rstrip('\n').lower() for line in triggerwordfile]

# Lookup set built once; istriggerword is called several times per token, so
# rebuilding the set on every call would dominate feature extraction.
triggerwordset = frozenset(triggerwordlist)

def istriggerword(word):
    """Return True when the lower-cased word is one of the trigger words."""
    return word.lower() in triggerwordset

def wordtypepatterns(poststring):
    """Encode a whitespace-separated post as one symbol per word.

    'l' = lower-case word, 'C' = upper-case word, 'T' = title-case word,
    '.' = word found inside string.punctuation, 'x' = anything else.
    The checks are applied in that order and the first hit wins.
    """
    def wordtype(word):
        if word.islower():
            return "l"
        if word.isupper():
            return "C"
        if word.istitle():
            return "T"
        if word in string.punctuation:
            return "."
        return "x"

    return "".join(wordtype(word) for word in poststring.split())

def preprocess(raw_data):
    """Parse labelled data into a list of posts.

    Posts are separated by a blank line; every non-empty line of a post must
    be "word<TAB>label". Each post becomes a list of (word, label) tuples.
    A line that does not split into exactly two tab-separated fields raises
    ValueError.
    """
    def parse(line):
        word, label = line.split("\t")
        return (word, label)

    return [
        [parse(line) for line in post.split("\n") if line != ""]
        for post in raw_data.split("\n\n")
    ]


def preprocessnotag(raw_data):
    """Parse unlabelled data into a list of posts.

    Posts are separated by a blank line and hold one word per line. Each post
    becomes a list of (word, " ") tuples, the single space standing in for
    the missing label; empty lines are skipped.
    """
    return [
        [(word, " ") for word in post.split("\n") if word != ""]
        for post in raw_data.split("\n\n")
    ]

# REMOVE DUPLICATE POSTS

def removeDuplicate(posts):
    """Drop posts whose word sequence was already seen, keeping the first.

    Two posts count as duplicates when poststring() gives the same text for
    both, so labels play no part in the comparison. The order of the
    surviving posts is preserved.
    """
    seenstrings = set()  # set membership keeps the scan linear in the number of posts
    result = []
    for post in posts:
        key = poststring(post)
        if key not in seenstrings:
            seenstrings.add(key)
            result.append(post)

    return result


def postPunctuationAsNER(post):
    """Return True when a punctuation token of the post carries an entity label.

    post is a sequence of (word, label) pairs. A token counts as punctuation
    when it occurs inside string.punctuation (a substring test), and as
    entity-labelled when its label is anything other than 'O'. The label is
    compared by equality: a substring test against 'O' would also accept the
    empty label.
    """
    punctuations = string.punctuation
    for word, label in post:
        if word in punctuations and label != 'O':
            return True
    return False

def removePunctuationAsNER(posts):
    """Keep only the posts for which postPunctuationAsNER() is False."""
    return [post for post in posts if not postPunctuationAsNER(post)]
        
  



In [5]:
# Read the three data files whole; each handle is closed as soon as its
# content is in memory.
# NOTE(review): train and dev are read with the platform default encoding
# while the test file is read as UTF-8 - confirm this difference is intended.
with open("train.txt","r") as trainfile:
    train_raw_data = trainfile.read()
with open("dev.txt","r") as devfile:
    dev_raw_data = devfile.read()
with open("test_no_tag.txt","r",encoding="utf8") as testfile:
    test_raw_data = testfile.read()

In [6]:
# Labelled splits: parse, drop repeated posts, then drop posts in which a
# punctuation token carries an entity label.
train_posts = removePunctuationAsNER(removeDuplicate(preprocess(train_raw_data)))
dev_posts = removePunctuationAsNER(removeDuplicate(preprocess(dev_raw_data)))

# The unlabelled test split is only parsed, never filtered.
test_posts = preprocessnotag(test_raw_data)

print(len(test_posts))

4271


In [7]:
def _adjacentfeatures(prefix, word, tag):
    """Feature group for a token directly next to the current one.

    prefix is the offset marker ('-1' or '+1') put in front of every key,
    word is the neighbouring token and tag its POS tag.
    """
    return {
        prefix + ':word.islower()': word.islower(),
        prefix + ':word.istitle()': word.istitle(),
        prefix + ':word.isupper()': word.isupper(),
        prefix + ':word.isdigit()': word.isdigit(),
        prefix + ':word.isalpha()': word.isalpha(),
        prefix + ':istriggerword(word)': istriggerword(word),
        prefix + ':shape(word)': shape(word),
        prefix + ':isURL(word)': isURL(word),
        prefix + ':word[-4:]': word[-4:],
        prefix + ':word[-3:]': word[-3:],
        prefix + ':word[-2:]': word[-2:],
        prefix + ':word[:4]': word[:4],
        prefix + ':word[:3]': word[:3],
        prefix + ':word[:2]': word[:2],
        prefix + ':len(word)': len(word),
        prefix + ':postag': tag,
        prefix + ':postag[:2]': tag[:2],
    }


def _distantfeatures(prefix, word, tag):
    """Smaller feature group for a token two positions away.

    prefix is the offset marker ('-2' or '+2'), word the token at that
    offset and tag its POS tag.
    """
    return {
        prefix + ':word.islower()': word.islower(),
        prefix + ':word.istitle()': word.istitle(),
        prefix + ':word.isupper()': word.isupper(),
        prefix + ':word.isdigit()': word.isdigit(),
        prefix + ':word.isalpha()': word.isalpha(),
        prefix + ':istriggerword(word)': istriggerword(word),
        prefix + ':postag': tag,
        prefix + ':postag[:2]': tag[:2],
    }


def word2features(post,i, postag):
    """Build the CRF feature dict for the token at index i of a post.

    Parameters
    ----------
    post : sequence of (word, label) pairs
        The whole post; labels are not used as features.
    i : int
        Index of the token to describe.
    postag : sequence of (word, tag) pairs
        POS tags index-aligned with post (see sentenceTag()).

    Returns
    -------
    dict
        Features of the token itself, of its neighbours at offsets -1, -2, +1
        and +2 where they exist, plus 'BOS' for the first token and 'EOS' for
        the last one.
    """
    word, label = post[i]

    # Features tried and switched off in earlier experiments: word[-6:],
    # word[-5:], isAnyDigit, isAbbr, isPlace, lemma, stem.portar,
    # stem.lancaster, word.haspunctuation, sentpattern, isAnyDigit and
    # isHashTagUserName on the +/-1 neighbours, and a +/-3 context window.
    # The sentence pattern is no longer computed because nothing consumed it.
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-4:]': word[-4:],
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[:4]': word[:4],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'len(word)': len(word),
        'word.islower()': word.islower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.isalpha()': word.isalpha(),
        'word.isalnum()': word.isalnum(),
        'isHashTagUserName(word)':isHashtagUserName(word),
        'istriggerword(word)':istriggerword(word),
        'isStopWord(word)':isStopWord(word),
        'shape(word)':shape(word),
        'isURL(word)':isURL(word),
        'postag': postag[i][1],
        'postag[:2]': postag[i][1][:2],
        'word.isromannum':isromannum(word),
        'word.ispunctuation':ispunctuation(word),
        'isPostUpper(post)':isPostUpper(post),
    }

    # Left context: one step back gets the full group, two steps the short one.
    if i > 0:
        prevword, prevlabel = post[i-1]
        features.update(_adjacentfeatures('-1', prevword, postag[i-1][1]))
        if i > 1:
            prevword2, prevlabel2 = post[i-2]
            features.update(_distantfeatures('-2', prevword2, postag[i-2][1]))
    else:
        features['BOS'] = True

    # Right context, mirrored.
    if i < len(post)-1:
        nextword, nextlabel = post[i+1]
        features.update(_adjacentfeatures('+1', nextword, postag[i+1][1]))
        if i < len(post) - 2:
            nextword2, nextlabel2 = post[i+2]
            features.update(_distantfeatures('+2', nextword2, postag[i+2][1]))
    else:
        features['EOS'] = True

    return features


def post2features(post):
    """Return one feature dict per token of the post.

    The post is POS-tagged once and the tags are shared by every token's
    word2features() call.
    """
    postag = sentenceTag(post)
    features = []
    for i in range(len(post)):
        features.append(word2features(post, i, postag))
    return features

def post2labels(post):
    """Return the labels of a post, in token order."""
    labels = []
    for word, label in post:
        labels.append(label)
    return labels


In [8]:
%%time
# Spot check: feature dict of the token at index 5 in the training post at index 5.
post2features(train_posts[5])[5]

Wall time: 93.8 ms


{'bias': 1.0,
 'word.lower()': 'if',
 'word[-4:]': 'if',
 'word[-3:]': 'if',
 'word[-2:]': 'if',
 'word[:4]': 'if',
 'word[:3]': 'if',
 'word[:2]': 'if',
 'len(word)': 2,
 'word.islower()': True,
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'word.isalpha()': True,
 'word.isalnum()': True,
 'isHashTagUserName(word)': False,
 'istriggerword(word)': False,
 'isStopWord(word)': True,
 'shape(word)': 'xx',
 'isURL(word)': False,
 'postag': 'IN',
 'postag[:2]': 'IN',
 'word.isromannum': False,
 'word.ispunctuation': False,
 'isPostUpper(post)': False,
 '-1:word.islower()': True,
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:word.isdigit()': False,
 '-1:word.isalpha()': True,
 '-1:istriggerword(word)': False,
 '-1:shape(word)': 'xxxx',
 '-1:isURL(word)': False,
 '-1:word[-4:]': 'this',
 '-1:word[-3:]': 'his',
 '-1:word[-2:]': 'is',
 '-1:word[:4]': 'this',
 '-1:word[:3]': 'thi',
 '-1:word[:2]': 'th',
 '-1:len(word)': 4,
 '-1:postag': 'DT',
 '-

In [9]:
%%time
# Spot check: gold label of the same token (post index 5, token index 5).
post2labels(train_posts[5])[5]

Wall time: 0 ns


'O'

In [10]:
%%time
# Per-token feature dicts and gold label sequences for every training post.
X_train = [post2features(s) for s in train_posts]
y_train = [post2labels(s) for s in train_posts]

Wall time: 13.7 s


In [11]:
%%time
# Same feature/label extraction for the development posts.
X_dev = [post2features(s) for s in dev_posts]
y_dev = [post2labels(s) for s in dev_posts]

Wall time: 5.84 s


In [12]:
%%time
# Features only for the test posts; they carry placeholder labels.
X_test = [post2features(s) for s in test_posts]

Wall time: 16 s


In [13]:
%%time
# Candidate (c1, c2) pairs noted from earlier runs; they are not passed to the
# model below, where both arguments are commented out.
#0.35000000000000003,0.05
#0.36, 0.08
# Train the tagger with algorithm='pa', 100 iterations, and
# all_possible_transitions switched on.
crf = sklearn_crfsuite.CRF(
    algorithm='pa',
    #c1=0.36,
    #c2=0.08,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 14.6 s


CRF(algorithm='pa', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [14]:
# Persist the fitted model under a time-stamped name. The algorithm part of
# the name is read from the model itself so it cannot disagree with how the
# model was trained; sklearn-crfsuite falls back to 'lbfgs' when no algorithm
# was given.
filename = 'crf_' + (crf.algorithm or 'lbfgs') + '_' + datetime.now().strftime("%Y%m%d_%H%M") + '.sav'
with open(filename, 'wb') as modelfile:
    pickle.dump(crf, modelfile)

In [15]:
# Entity labels to score: every class the model learned except the outside
# tag 'O'. Filtering does not fail when 'O' is missing from the classes.
labels = [label for label in crf.classes_ if label != 'O']
#labels

In [16]:
# NOTE(review): the number below is presumably an accuracy from an earlier run - confirm or remove.
#0.9397331037451572

# Token-level scores on the development set. The F1 scores are restricted to
# `labels`; the accuracy is computed over all tokens.
y_pred = crf.predict(X_dev)
print("f1 avg ",metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print("f1 micro ",metrics.flat_f1_score(y_dev, y_pred, average='micro', labels=labels))
print("accuracy",metrics.flat_accuracy_score(y_dev, y_pred))

f1 avg  0.2696552104648167
f1 micro  0.30240103828682674
accuracy 0.9428806376114871


In [17]:
# Predict one label sequence per test post.
test_pred = crf.predict(X_test)

In [18]:
# Spot check: tokens of the test post at index 1 (labels are the ' ' placeholder).
test_posts[1]

[('I', ' '),
 ('want', ' '),
 ('to', ' '),
 ('be', ' '),
 ('the', ' '),
 ('joy', ' '),
 ('to', ' '),
 ('people', ' '),
 ('through', ' '),
 ('my', ' '),
 ('work', ' '),
 ('.', ' '),
 ('-', ' '),
 ('Gianni', ' '),
 ('Versace', ' ')]

In [19]:
# Spot check: predicted labels for the same test post.
test_pred[1]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-person',
 'I-person']

In [20]:
def savepredictions(test_posts, test_pred, filename):
    """Write predicted labels to a UTF-8 text file.

    Each token becomes one "word label" line (single space between them) and
    every post is followed by an empty line.

    Parameters
    ----------
    test_posts : sequence of posts
        Each post is a sequence of (word, label) pairs; only the word is used.
    test_pred : sequence of label sequences
        test_pred[j][i] is the predicted label of token i of post j.
    filename : str
        Path of the file to create or overwrite.

    Raises
    ------
    IndexError
        If test_pred has fewer posts, or a post fewer labels, than test_posts.
    """
    # The context manager closes the file even when a write or lookup fails.
    with open(filename, "w", encoding="utf8") as file:
        for j, post in enumerate(test_posts):
            predpostlabel = test_pred[j]
            for i, (word, emptylabel) in enumerate(post):
                file.write(word + " " + predpostlabel[i] + "\n")
            file.write("\n")
    
    
# Write the test predictions to a text file whose name carries the current date and time.
savepredictions(test_posts, test_pred, filename="test prediction 1_"+datetime.now().strftime("%Y%m%d_%H%M")+".txt")        

In [21]:
# The string below is a classification report pasted in for reference; it is
# an unused expression and has no effect on the cell.
"""precision    recall  f1-score   support

      person      0.588     0.429     0.496       266
       title      0.400     0.062     0.108        32
    location      0.616     0.430     0.506       235
     company      0.462     0.122     0.194        49
     product      0.350     0.044     0.079       158
       group      0.333     0.025     0.047       159
       other      0.214     0.183     0.198       229

   micro avg      0.457     0.245     0.319      1128
   macro avg      0.423     0.185     0.232      1128
weighted avg      0.438     0.245     0.292      1128"""
# group B and I results
# Sort by the label without its first character, then by that character, so
# the B- and I- rows of one entity type sit next to each other.
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
# Token-level report on the development set for the sorted entity labels.
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

   B-company      0.467     0.179     0.259        39
   I-company      0.000     0.000     0.000        10
     B-group      0.273     0.030     0.054       100
     I-group      0.182     0.047     0.074        43
  B-location      0.500     0.418     0.456       141
  I-location      0.417     0.299     0.348        67
     B-other      0.375     0.137     0.201       131
     I-other      0.163     0.075     0.103        93
    B-person      0.512     0.400     0.449       165
    I-person      0.524     0.537     0.530        82
   B-product      0.400     0.129     0.195        31
   I-product      0.250     0.014     0.026        72
     B-title      0.143     0.062     0.087        16
     I-title      0.111     0.083     0.095        12

   micro avg      0.432     0.233     0.302      1002
   macro avg      0.308     0.172     0.206      1002
weighted avg      0.377     0.233     0.270      1002



In [22]:
# seqeval F1 on the development predictions, computed from the label
# sequences directly rather than through the flat_* helpers.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import seqeval.metrics  as seqevalmetrics
seqevalmetrics.f1_score(y_dev, y_pred)

0.2957600827300931

In [23]:
# seqeval accuracy on the development predictions.
seqevalmetrics.accuracy_score(y_dev, y_pred)

0.9428806376114871

In [24]:
# seqeval report per entity type, three decimals.
print(seqevalmetrics.classification_report(y_dev, y_pred, digits=3))

           precision    recall  f1-score   support

    other      0.312     0.115     0.168       131
    group      0.250     0.030     0.054       100
 location      0.445     0.376     0.408       141
   person      0.485     0.382     0.427       165
  company      0.375     0.154     0.218        39
    title      0.125     0.062     0.083        16
  product      0.182     0.065     0.095        31

micro avg      0.416     0.230     0.296       623
macro avg      0.371     0.230     0.270       623



In [52]:
# Same report returned as a raw string (no print, default precision).
# NOTE(review): this cell's execution count (52) is out of sequence and its
# saved output differs from the printed report above - re-run in order.
seqevalmetrics.classification_report(y_dev, y_pred)

'           precision    recall  f1-score   support\n\n   person       0.47      0.35      0.40       165\n location       0.56      0.36      0.44       141\n    group       0.33      0.02      0.04       100\n    title       0.00      0.00      0.00        16\n    other       0.28      0.14      0.18       131\n  company       0.64      0.18      0.28        39\n  product       0.33      0.06      0.11        31\n\nmicro avg       0.46      0.22      0.30       623\nmacro avg       0.42      0.22      0.27       623\n'

In [30]:
import numpy
# Regularisation grids for the search: 11 values 0.30..0.40 for c1 and 10
# values 0.01..0.10 for c2, both in steps of 0.01. linspace fixes the number
# of points and the rounding removes representation noise such as
# 0.060000000000000005, which a float-step arange produces.
#c1list =  numpy.append((numpy.arange(0.05, 0.51, 0.05)),[1,2,4,7,10])
c1list = numpy.round(numpy.linspace(0.30, 0.40, 11), 2)
#c2list = numpy.append((numpy.arange(0.05, 0.51, 0.05)),[1,2,4,7,10])
c2list = numpy.round(numpy.linspace(0.01, 0.10, 10), 2)

#0.35000000000000003,0.05
print(c1list)
print(c2list)

[0.3  0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39 0.4 ]
[0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1 ]


In [31]:
%%time
# Exhaustive search over the (c1, c2) grid: fit an L-BFGS CRF per pair, score
# it on the development set and keep every score in `result`.
# NOTE: the loop rebinds the notebook-level `crf`, `labels` and `y_pred`, so
# after this cell they describe the LAST pair tried, not the best one.
maxaccuracy = 0
maxf1avg = 0
maxf1micro = 0

result = []
for c1val in c1list:
    for c2val in c2list:
        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=c1val,
            c2=c2val,
            max_iterations=100,
            all_possible_transitions=False
        )
        crf.fit(X_train, y_train)

        # Score entity labels only; filtering does not fail if 'O' is absent.
        labels = [label for label in crf.classes_ if label != 'O']

        y_pred = crf.predict(X_dev)

        f1avg = metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels)
        f1micro = metrics.flat_f1_score(y_dev, y_pred, average='micro', labels=labels)
        accuracy = metrics.flat_accuracy_score(y_dev, y_pred)

        # Each maximum is tracked independently and may come from a different pair.
        maxf1avg = max(maxf1avg, f1avg)
        maxf1micro = max(maxf1micro, f1micro)
        maxaccuracy = max(maxaccuracy, accuracy)

        print("f1 avg ",f1avg)
        print("f1 micro ",f1micro)
        print("accuracy",accuracy)

        tempresult = c1val, c2val, f1avg, f1micro, accuracy
        result.append(tempresult)

        print(datetime.now().strftime("%Y%m%d_%H%M")+" completed ("+str(c1val)+","+str(c2val)+")")


print("max f1avg", maxf1avg, ", max f1micro", maxf1micro, ", max accuracy,", maxaccuracy)
if result:
    # Report which pair produced the best micro F1, not just the score itself.
    bestresult = max(result, key=lambda row: row[3])
    print("best f1micro at (c1, c2) =", (bestresult[0], bestresult[1]))

filename = 'crf_compare_'+datetime.now().strftime("%Y%m%d_%H%M")+'.pkl'
# The context manager closes the file so the pickle is flushed to disk.
with open(filename, 'wb') as resultfile:
    pickle.dump(result, resultfile)

f1 avg  0.27960768069322056
f1 micro  0.2981818181818182
accuracy 0.9392118413561895
20200208_0748 completed (0.3,0.01)
f1 avg  0.27710891467100346
f1 micro  0.2953020134228188
accuracy 0.9395913720032892
20200208_0749 completed (0.3,0.02)
f1 avg  0.2828630929320414
f1 micro  0.3007518796992481
accuracy 0.9411727496995382
20200208_0750 completed (0.3,0.03)
f1 avg  0.27652267259519897
f1 micro  0.29528535980148884
accuracy 0.9404136884053387
20200208_0750 completed (0.3,0.04)
f1 avg  0.2679076934122152
f1 micro  0.2835913312693498
accuracy 0.9395281168954394
20200208_0751 completed (0.3,0.05)
f1 avg  0.2823164656384054
f1 micro  0.3036053130929791
accuracy 0.9416787905623379
20200208_0751 completed (0.3,0.060000000000000005)
f1 avg  0.25439753971222917
f1 micro  0.27616645649432536
accuracy 0.9396546271111392
20200208_0752 completed (0.3,0.06999999999999999)
f1 avg  0.27470162568141654
f1 micro  0.29479034307496826
accuracy 0.941552280346638
20200208_0753 completed (0.3,0.08)
f1 avg  0.

In [32]:
# Best accuracy, weighted F1 and micro F1 seen during the grid search.
print(maxaccuracy, maxf1avg, maxf1micro)

0.9435764437978367 0.295614939444403 0.319693094629156


In [2]:
from collections import Counter
def print_state_features(state_features):
    """Print one "weight label attribute" line per state feature.

    state_features is an iterable of ((attr, label), weight) items. The
    weight is shown with six decimals and the label is left-aligned in a
    column of eight characters.
    """
    for key, weight in state_features:
        attr, label = key
        print("{:0.6f} {!s:<8} {!s}".format(weight, label, attr))

# The 30 state features with the largest weights, then the 30 with the smallest.
# Needs a fitted `crf` in the kernel; the saved output is a NameError because
# the cell was run before the model existed.
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:


NameError: name 'crf' is not defined

In [None]:
#loaded_model = pickle.load(open("crf_best_estimator_3_fold_20200201_0055.sav", 'rb'))
#result = loaded_model.score(X_dev, y_dev)
#print(result)

In [None]:
#loaded_model = pickle.load(open("crf_lbfgs_i100.sav", 'rb'))
#result = loaded_model.score(X_dev, y_dev)
#print(result)

In [25]:
import eli5
# Display the weights learned by `crf` through eli5, requesting the top 30 features.
# Left as the cell's last bare expression so the notebook displays the returned object.
eli5.show_weights(crf, top=30)

Using TensorFlow backend.


From \ To,O,B-company,I-company,B-group,I-group,B-location,I-location,B-other,I-other,B-person,I-person,B-product,I-product,B-title,I-title
O,0.324,0.179,-0.12,0.187,-0.199,0.154,-0.25,0.138,-0.268,0.255,-0.184,0.164,-0.147,0.165,-0.193
B-company,0.054,-0.126,0.279,-0.014,-0.031,0.045,-0.031,-0.011,-0.043,-0.04,-0.05,-0.015,-0.04,-0.011,-0.026
I-company,0.006,-0.021,0.112,-0.005,-0.004,-0.008,-0.02,-0.012,-0.014,-0.004,-0.01,-0.006,-0.01,0.0,-0.004
B-group,-0.004,-0.014,-0.01,-0.084,0.394,-0.011,-0.035,-0.019,-0.051,-0.022,-0.032,-0.005,-0.011,-0.01,-0.017
I-group,-0.04,-0.014,-0.019,-0.031,0.197,-0.025,-0.017,-0.008,-0.033,-0.015,-0.017,-0.01,-0.019,-0.001,-0.005
B-location,0.038,-0.028,-0.045,-0.031,-0.065,-0.067,0.419,-0.029,-0.1,-0.031,-0.061,-0.027,-0.049,-0.016,-0.041
I-location,-0.034,-0.008,-0.023,0.027,-0.022,-0.037,0.214,-0.011,-0.044,0.006,-0.047,-0.015,-0.037,-0.005,-0.038
B-other,-0.064,-0.021,-0.023,-0.013,-0.038,-0.016,-0.036,-0.075,0.483,0.003,-0.03,-0.013,-0.028,-0.005,-0.038
I-other,-0.03,-0.024,-0.035,-0.021,-0.042,-0.044,-0.05,-0.013,0.387,-0.001,-0.036,-0.031,-0.02,-0.009,-0.038
B-person,0.044,-0.023,-0.042,-0.021,-0.049,-0.021,-0.075,-0.028,-0.079,-0.144,0.48,-0.024,-0.036,-0.008,-0.052

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14
+0.451,bias,,,,,,,,,,,,,
+0.422,word.ispunctuation,,,,,,,,,,,,,
+0.333,isHashTagUserName(word),,,,,,,,,,,,,
+0.320,shape(word):Xx,,,,,,,,,,,,,
+0.297,word.islower(),,,,,,,,,,,,,
+0.276,isStopWord(word),,,,,,,,,,,,,
+0.260,word[-3:]:day,,,,,,,,,,,,,
+0.248,EOS,,,,,,,,,,,,,
+0.226,shape(word):X,,,,,,,,,,,,,
+0.225,shape(word):Xxx,,,,,,,,,,,,,

Weight?,Feature
+0.451,bias
+0.422,word.ispunctuation
+0.333,isHashTagUserName(word)
+0.320,shape(word):Xx
+0.297,word.islower()
+0.276,isStopWord(word)
+0.260,word[-3:]:day
+0.248,EOS
+0.226,shape(word):X
+0.225,shape(word):Xxx

Weight?,Feature
+0.307,word.lower():twitter
+0.196,word.lower():facebook
+0.181,word[-4:]:tter
+0.141,word[:4]:twit
+0.138,word[:3]:twi
+0.132,-2:postag:VBP
+0.129,word[:2]:tw
+0.115,word[-2:]:co
+0.113,word[-4:]:book
+0.113,word[-3:]:get

Weight?,Feature
+0.103,shape(word):xxxx
+0.089,-1:word.istitle()
+0.074,-1:word[:2]:Se
+0.064,word[-2:]:er
+0.060,word[:2]:In
+0.056,-1:word[:3]:Mar
+0.055,-1:shape(word):Xxxxx
+0.053,-1:word[:2]:Ga
+0.053,word.lower():city
+0.052,word[-3:]:ner

Weight?,Feature
+0.127,word[-2:]:ks
+0.119,postag:NNS
+0.109,word[:4]:Eagl
+0.109,word[:3]:Eag
+0.106,word.lower():eagles
+0.104,shape(word):#Xxxxxx
+0.103,word[:2]:Ea
+0.101,word.isupper()
+0.090,+1:postag:VBP
+0.089,word[-2:]:es

Weight?,Feature
+0.101,word.istitle()
+0.099,-1:word.isalpha()
+0.088,-1:shape(word):xxxxx
+0.080,-1:word[:2]:Br
+0.077,+1:word[:2]:le
+0.075,-1:shape(word):XX
+0.074,word[:3]:Bro
+0.072,+1:word[-2:]:ng
+0.071,postag:NNS
+0.068,-1:word[:4]:king

Weight?,Feature
+0.197,word[-2:]:ia
+0.176,shape(word):XX
+0.173,word.lower():uk
+0.166,word[-3:]:nia
+0.149,shape(word):Xxxxxxx
+0.147,shape(word):xx
+0.131,word[-2:]:as
+0.130,word[:2]:CA
+0.128,word[-3:]:ton
+0.127,word[:4]:lhs

Weight?,Feature
+0.106,word[-2:]:rk
+0.091,+2:postag:CD
+0.091,+2:postag[:2]:CD
+0.086,word[:2]:BL
+0.082,word.isalpha()
+0.080,+2:word.isupper()
+0.080,word[:2]:Pa
+0.080,word[:3]:Sta
+0.079,word.lower():lounge
+0.078,-1:shape(word):Xxxxxxxxx

Weight?,Feature
+0.211,word[-3:]:mas
+0.152,word[-2:]:as
+0.125,word[-2:]:GP
+0.112,word[-4:]:tmas
+0.112,word.lower():christmas
+0.109,word[:3]:dyn
+0.109,word.lower():dynamite
+0.109,word[-4:]:mite
+0.109,word[:4]:dyna
+0.109,word[:2]:dy

Weight?,Feature
+0.135,-1:word[-2:]:on
+0.121,word[-2:]:ay
+0.117,-1:word.istitle()
+0.111,-1:word[-3:]:ion
+0.097,word[-3:]:day
+0.097,word[:4]:Fest
+0.097,word[:3]:Fes
+0.096,-2:postag:DT
+0.096,-2:postag[:2]:DT
+0.091,+1:word[-2:]:ng

Weight?,Feature
+0.180,word[:2]:Je
+0.166,word.lower():pope
+0.150,word[:2]:Jo
+0.143,postag[:2]:VB
+0.142,word[-2:]:en
+0.135,word[-3:]:nce
+0.130,word[-2:]:ie
+0.129,word.lower():taylor
+0.121,word.lower():justin
+0.120,word[-4:]:stin

Weight?,Feature
+0.127,word[-2:]:on
+0.105,word[-2:]:ey
+0.097,word.lower():bieber
+0.097,word[-4:]:eber
+0.084,-1:word[-4:]:stin
+0.081,postag:NNP
+0.080,-1:word[:2]:Je
+0.079,-1:word[:2]:An
+0.079,-1:word[-2:]:in
+0.078,word[-3:]:ber

Weight?,Feature
+0.184,word[:2]:iP
+0.122,word[:4]:iPho
+0.122,word[:3]:iPh
+0.114,word.lower():ipod
+0.102,shape(word):xXxxxx
+0.098,shape(word):xxxxxxxxxx
+0.097,word[-3:]:Pad
+0.097,word.lower():ipad
+0.093,word.isalnum()
+0.091,word[-3:]:den

Weight?,Feature
+0.092,+2:postag:NNP
+0.077,-1:word[-2:]:er
+0.076,word[-2:]:ld
+0.069,+1:shape(word):xxxx
+0.065,+2:postag:VBN
+0.065,-1:word[:2]:se
+0.064,-1:shape(word):Xxxxxx
+0.061,postag:CD
+0.061,postag[:2]:CD
+0.060,word[-4:]:guin

Weight?,Feature
+0.078,+1:word[:3]:pre
+0.077,+1:word[-2:]:re
+0.075,word[:3]:#BB
+0.075,word[:4]:#BB1
+0.075,word[-3:]:B11
+0.075,word[-4:]:BB11
+0.075,word.lower():#bb11
+0.075,shape(word):#XXdd
+0.075,word[:2]:#B
+0.072,word[-2:]:11

Weight?,Feature
+0.097,-1:word.isalpha()
+0.073,-1:shape(word):Xxxxxx
+0.065,+1:shape(word):d
+0.064,-1:word[:2]:ho
+0.062,word[:2]:To
+0.062,-2:postag[:2]:VB
+0.061,-1:word[:2]:Pr
+0.061,+1:shape(word):xxxxxx
+0.059,word[:2]:Li
+0.059,-1:word.istitle()


In [26]:
# Restrict the weight tables to features whose name starts with "word.is"
# (word.isupper(), word.istitle(), ...) and show only the per-target tables.
# top=(10,10) asks eli5 for up to 10 positive and 10 negative features per target.
# The pattern is written as a raw string: in a normal literal '\.' is an invalid
# escape sequence that triggers a DeprecationWarning/SyntaxWarning on newer
# Python versions. The regex itself is unchanged.
eli5.show_weights(crf, top=(10,10), feature_re=r'^word\.is',
                  horizontal_layout=False, show=['targets'])

Weight?,Feature
0.422,word.ispunctuation
0.297,word.islower()
0.081,word.isdigit()
0.042,word.isromannum
0.013,word.isupper()
-0.037,word.isalpha()
-0.116,word.istitle()
-0.16,word.isalnum()

Weight?,Feature
0.04,word.islower()
0.031,word.isalpha()
0.009,word.isalnum()
0.004,word.isromannum
-0.015,word.isdigit()
-0.023,word.ispunctuation
-0.067,word.isupper()
-0.075,word.istitle()

Weight?,Feature
0.041,word.istitle()
0.027,word.isalnum()
0.005,word.islower()
0.005,word.isdigit()
0.004,word.isalpha()
-0.005,word.isromannum
-0.005,word.ispunctuation
-0.023,word.isupper()

Weight?,Feature
0.101,word.isupper()
0.025,word.istitle()
-0.014,word.ispunctuation
-0.018,word.isdigit()
-0.029,word.isromannum
-0.03,word.isalpha()
-0.052,word.isalnum()
-0.056,word.islower()

Weight?,Feature
0.101,word.istitle()
0.036,word.isalnum()
0.023,word.isdigit()
0.013,word.isromannum
0.007,word.isalpha()
-0.015,word.ispunctuation
-0.025,word.islower()
-0.034,word.isupper()

Weight?,Feature
0.121,word.istitle()
0.09,word.isupper()
0.086,word.isromannum
0.075,word.isalpha()
0.014,word.islower()
-0.016,word.ispunctuation
-0.056,word.isalnum()
-0.071,word.isdigit()

Weight?,Feature
0.082,word.isalpha()
0.037,word.istitle()
0.037,word.isalnum()
0.01,word.islower()
-0.019,word.isromannum
-0.02,word.isupper()
-0.027,word.isdigit()
-0.043,word.ispunctuation

Weight?,Feature
0.04,word.isupper()
0.005,word.isalpha()
-0.009,word.isromannum
-0.019,word.isalnum()
-0.02,word.ispunctuation
-0.035,word.isdigit()
-0.135,word.istitle()
-0.148,word.islower()

Weight?,Feature
0.046,word.isalnum()
0.038,word.isromannum
0.027,word.isdigit()
0.022,word.istitle()
0.019,word.isalpha()
-0.026,word.isupper()
-0.041,word.islower()
-0.097,word.ispunctuation

Weight?,Feature
0.063,word.istitle()
0.027,word.isalnum()
0.015,word.islower()
-0.014,word.isdigit()
-0.052,word.isalpha()
-0.079,word.isupper()
-0.099,word.ispunctuation
-0.11,word.isromannum

Weight?,Feature
0.037,word.isromannum
0.026,word.istitle()
0.014,word.isupper()
0.002,word.isalpha()
-0.015,word.isdigit()
-0.023,word.islower()
-0.024,word.isalnum()
-0.039,word.ispunctuation

Weight?,Feature
0.093,word.isalnum()
-0.014,word.isdigit()
-0.016,word.ispunctuation
-0.024,word.isromannum
-0.058,word.isalpha()
-0.081,word.isupper()
-0.081,word.islower()
-0.132,word.istitle()

Weight?,Feature
0.053,word.isdigit()
0.044,word.isalnum()
0.025,word.isupper()
0.007,word.isromannum
-0.006,word.islower()
-0.014,word.ispunctuation
-0.023,word.isalpha()
-0.037,word.istitle()

Weight?,Feature
0.052,word.isupper()
0.022,word.istitle()
-0.009,word.ispunctuation
-0.013,word.isromannum
-0.014,word.isdigit()
-0.024,word.isalpha()
-0.034,word.islower()
-0.055,word.isalnum()

Weight?,Feature
0.048,word.isalnum()
0.039,word.istitle()
0.036,word.isdigit()
0.034,word.islower()
-0.001,word.isalpha()
-0.006,word.isupper()
-0.012,word.ispunctuation
-0.019,word.isromannum


In [27]:
# Show, per target, only the features whose name matches the regex '.lower.'.
# NOTE(review): the dots are unescaped and match any character, so besides
# "word.lower():<token>" this also selects "word.islower()" (it appears in the
# output below) — confirm whether that is intended.
eli5.show_weights(crf, top=(10,10), feature_re='.lower.',
                  horizontal_layout=False, show=['targets'])

Weight?,Feature
+0.297,word.islower()
+0.116,word.lower():tonight
+0.105,word.lower():go
+0.101,word.lower():lol
+0.095,word.lower():rt
+0.093,word.lower():someone
+0.087,word.lower():free
+0.087,word.lower():scm
+0.086,word.lower():doctor
+0.081,word.lower():prison

Weight?,Feature
+0.307,word.lower():twitter
+0.196,word.lower():facebook
+0.108,word.lower():youtube
+0.107,word.lower():fb
+0.099,word.lower():ione
+0.092,word.lower():msn
+0.090,word.lower():yahoo
+0.090,word.lower():tumblr
+0.083,word.lower():walmart
+0.080,word.lower():ufc

Weight?,Feature
+0.053,word.lower():city
+0.044,word.lower():clair
+0.042,word.lower():navy
+0.042,word.lower():subaru
+0.042,word.lower():java
+0.039,word.lower():container
+0.038,word.lower():eats
+0.037,word.lower():intervention
+0.034,word.lower():studios
+0.033,word.lower():tube

Weight?,Feature
+0.106,word.lower():eagles
+0.076,word.lower():kings
+0.076,word.lower():#padres
+0.072,word.lower():dj
+0.070,word.lower():green
+0.056,word.lower():delirious
+0.053,word.lower():shpongle
+0.052,word.lower():ut
+0.052,word.lower():metallica
+0.051,word.lower():#astros

Weight?,Feature
+0.057,word.lower():soccer
+0.056,word.lower():lions
+0.053,word.lower():leon
+0.050,word.lower():maccabees
+0.049,word.lower():rza
+0.047,word.lower():day
+0.045,word.lower():tomorrow
+0.045,word.lower():wolves
+0.043,word.lower():entering
+0.042,word.lower():finese

Weight?,Feature
+0.173,word.lower():uk
+0.127,word.lower():lhs
+0.109,word.lower():london
+0.109,word.lower():america
+0.103,word.lower():jupiter
+0.096,word.lower():vancouver
+0.083,word.lower():corbin
+0.082,word.lower():miami
+0.080,word.lower():cwe
+0.080,word.lower():peppers

Weight?,Feature
+0.079,word.lower():lounge
+0.075,word.lower():park
+0.072,word.lower():building
+0.071,word.lower():york
+0.060,word.lower():bay
+0.060,word.lower():campus
+0.060,word.lower():cafe
+0.057,word.lower():blast
+0.056,word.lower():states
+0.053,word.lower():mountain

Weight?,Feature
+0.112,word.lower():christmas
+0.109,word.lower():dynamite
+0.103,word.lower():xmas
+0.093,word.lower():dems
+0.082,word.lower():treasury
+0.078,word.lower():imdb
+0.075,word.lower():yom
+0.073,word.lower():#vh1
+0.070,word.lower():doe
+0.069,word.lower():unsung

Weight?,Feature
+0.084,word.lower():week
+0.075,word.lower():day
+0.074,word.lower():kippur
+0.064,word.lower():festival
+0.063,word.lower():war
+0.062,-2:word.islower()
+0.057,word.lower():fashion
+0.053,word.lower():moon
+0.052,word.lower():blog
+0.047,word.lower():sacrifice

Weight?,Feature
+0.166,word.lower():pope
+0.129,word.lower():taylor
+0.121,word.lower():justin
+0.119,word.lower():john
+0.117,word.lower():steve
+0.115,word.lower():4dbling
+0.107,word.lower():jfk
+0.106,word.lower():lindsay
+0.103,word.lower():rashad
+0.101,word.lower():mike

Weight?,Feature
+0.097,word.lower():bieber
+0.049,word.lower():rice
+0.045,word.lower():holly
+0.044,word.lower():hilton
+0.044,word.lower():&quot;
+0.043,word.lower():lewis
+0.041,word.lower():lohan
+0.041,word.lower():blair
+0.041,word.lower():rodriguez
+0.040,word.lower():collins

Weight?,Feature
+0.114,word.lower():ipod
+0.097,word.lower():ipad
+0.089,word.lower():xbox
+0.082,word.lower():pringles
+0.080,word.lower():cachupas
+0.072,word.lower():iphone
+0.069,word.lower():halo
+0.069,word.lower():m&amp;m
+0.067,word.lower():dabigatran
+0.067,word.lower():coke

Weight?,Feature
+0.060,word.lower():penguin
+0.051,word.lower():replacement
+0.043,+2:word.islower()
+0.040,word.lower():jump
+0.040,word.lower():eventide
+0.036,word.lower():5800
+0.035,word.lower():music
+0.035,word.lower():nintendo
+0.033,word.lower():vodka
+0.033,word.lower():chard

Weight?,Feature
+0.075,word.lower():#bb11
+0.068,word.lower():tmz
+0.067,-2:word.islower()
+0.066,word.lower():gigli
+0.062,word.lower():kick-ass
+0.056,word.lower():k-on
+0.053,+1:word.islower()
+0.050,word.lower():flikken
+0.048,word.lower():eurovision
+0.045,word.lower():nightmare

Weight?,Feature
+0.046,word.lower():town
+0.043,word.lower():show
+0.043,word.lower():christmas
+0.041,word.lower():sunny
+0.041,word.lower():blood
+0.040,word.lower():pass
+0.039,word.lower():lights
+0.035,word.lower():practice
+0.034,word.islower()
+0.033,word.lower():always


In [None]:
# Seed for the randomized search, so the sampled (c1, c2) candidates are the same
# on every Restart & Run All.
SEARCH_SEED = 42

# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
# c1 and c2 are drawn from exponential distributions with the given scales.
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# NOTE: despite its name, this scorer measures flat token accuracy, not F1.
# The name is kept because other cells may refer to it. The F1 alternative
# noted by the original author is:
#   make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)
f1_scorer = make_scorer(metrics.flat_accuracy_score)

# search: 50 sampled candidates, 3-fold CV each, using all available cores
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=SEARCH_SEED)
rs.fit(X_train, y_train)

In [None]:
#crf = rs.best_estimator_
# Report the outcome of the randomized search `rs`: the winning parameter set,
# its cross-validation score, and the best estimator's size_ scaled down by 1e6.
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
# Save the best estimator found by the search under a timestamped file name.
filename = 'crf_best_estimator_3_fold_'+datetime.now().strftime("%Y%m%d_%H%M")+'.sav'
# The context manager flushes and closes the file even if pickling raises;
# the bare open() used previously left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(rs.best_estimator_, model_file)

In [None]:
# Evaluate the best estimator from the search on the dev set, one row per label.
crf_best = rs.best_estimator_
y_pred = crf_best.predict(X_dev)
dev_report = metrics.flat_classification_report(
    y_dev, y_pred, labels=sorted_labels, digits=3
)
print(dev_report)

In [None]:
# F1 score of the dev-set predictions against y_dev.
# `seqevalmetrics` is defined outside this section (presumably seqeval.metrics — confirm).
seqevalmetrics.f1_score(y_dev, y_pred)

In [None]:
# Accuracy of the dev-set predictions against y_dev.
# `seqevalmetrics` is defined outside this section (presumably seqeval.metrics — confirm).
seqevalmetrics.accuracy_score(y_dev, y_pred)

In [None]:
# Classification report for the dev-set predictions, printed with 3 decimal digits.
# `seqevalmetrics` is defined outside this section (presumably seqeval.metrics — confirm).
print(seqevalmetrics.classification_report(y_dev, y_pred, digits=3))

In [None]:
# Predict the test set with the tuned model, then hand the predictions to
# savepredictions together with a timestamped output file name.
test_pred_best = crf_best.predict(X_test)
best_pred_filename = "test prediction 1_best_" + datetime.now().strftime("%Y%m%d_%H%M") + ".txt"
savepredictions(test_posts, test_pred_best, filename=best_pred_filename)

In [None]:
def comparepredictions(pred1, pred2):
    """Report where two sets of predictions disagree.

    Parameters
    ----------
    pred1, pred2 : sequence of sequences
        Two collections of per-sequence labels to compare position by position.

    Prints
    ------
    - "error" (and nothing else) when the collections differ in length, or when
      any pair of corresponding sequences differs in length.
    - One line "<pred1 label> <pred2 label>" for every position that differs.
    - "All same" when no position differs.

    Returns None.
    """
    # Validate the shapes up front so ragged input is reported instead of raising
    # IndexError or silently ignoring trailing labels.
    if len(pred1) != len(pred2) or any(
        len(seq1) != len(seq2) for seq1, seq2 in zip(pred1, pred2)
    ):
        print("error")
        return

    match = True
    for seq1, seq2 in zip(pred1, pred2):
        # The earlier version read both labels from pred2, so it could never
        # detect a difference; each label now comes from its own argument.
        for label1, label2 in zip(seq1, seq2):
            if label1 != label2:
                print(label1, label2)
                match = False

    if match:
        print("All same")
        
# Compare the tuned model's test predictions (test_pred_best) with test_pred,
# which is defined outside this section.
comparepredictions(test_pred, test_pred_best)

In [None]:
from sklearn.model_selection import cross_val_predict
# Cross-validated predictions for the training sequences, using 5 folds of `crf`.
crosspred = cross_val_predict(crf, X_train, y_train, cv=5)

In [None]:
# crosspred holds the cross-validated predictions for X_train (see the
# cross_val_predict call above), so it must be scored against y_train.
# Scoring it against y_dev compared labels from two different sets of sequences.
print(seqevalmetrics.classification_report(y_train, crosspred, digits=3))

In [None]:
# Peek at the first element of the cross-validated predictions.
crosspred[0]

In [None]:
# Reload a previously pickled estimator and score it on the dev set.
# SECURITY: pickle.load can execute arbitrary code — only load files you created yourself.
# NOTE(review): the save cell in this notebook writes '.sav' files; confirm that this
# '.pkl' file actually exists.
# The context manager closes the file handle, which the bare open() used previously did not.
with open("crf_best_estimator_3_fold_20200207_1221.pkl", 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_dev, y_dev)
print(result)