In [1]:
import collections
from sklearn.metrics import classification_report, confusion_matrix
import pycrfsuite
import numpy as np
import joblib

In [2]:
# joblib files are pickled (binary) data, so the file is opened in binary
# mode: text mode can corrupt the stream on platforms that translate newlines.
# NOTE(security): joblib.load unpickles arbitrary objects -- only load files
# that come from a trusted source.
with open('data/ner.final.features', 'rb') as fp:
    uniqueTweets = joblib.load(fp)

# Data stats

In [3]:
# Total number of token records summed over every tweet.
sum(len(tweet) for tweet in uniqueTweets)

35374

In [4]:
# Language tag (field 1 of each token record) of every token, flattened
# across all tweets in corpus order.
lang = [tokenInfo[1] for tweet in uniqueTweets for tokenInfo in tweet]

In [5]:
# Number of tokens per language tag.
collections.Counter(lang)

Counter({'en': 13860, 'hi': 11391, 'rest': 10123})

In [6]:
# Number of tweets in the corpus.
len(uniqueTweets)

2079

# Feature functions

In [7]:
def asciiPercentage(s):
    """Return the fraction of characters in ``s`` that are ASCII.

    A character counts as ASCII when ``ord(char) < 128``.

    Args:
        s: string to inspect.

    Returns:
        float between 0.0 and 1.0. An empty string yields 0.0 instead of
        dividing by zero.
    """
    if not s:
        return 0.
    asciiCount = sum(1 for char in s if ord(char) < 128)
    # float() keeps this a true division under Python 2 integer semantics.
    return asciiCount / float(len(s))

def vowelPercentage(s):
    """Return the fraction of characters in ``s`` that are lowercase vowels.

    Only the characters ``a``, ``e``, ``i``, ``o`` and ``u`` are counted;
    uppercase vowels are not matched.

    Args:
        s: string to inspect.

    Returns:
        float between 0.0 and 1.0. An empty string yields 0.0 instead of
        dividing by zero.
    """
    if not s:
        return 0.
    vowels = "aeiou"
    vowelCount = sum(1 for char in s if char in vowels)
    # float() keeps this a true division under Python 2 integer semantics.
    return vowelCount / float(len(s))

def capPercentage(s):
    """Return the fraction of characters in ``s`` that are uppercase.

    Args:
        s: string to inspect.

    Returns:
        float between 0.0 and 1.0. An empty string yields 0.0 instead of
        dividing by zero.
    """
    if not s:
        return 0.
    upperCount = sum(1 for ch in s if ch.isupper())
    # float() keeps this a true division under Python 2 integer semantics.
    return upperCount / float(len(s))

# Feature extractor

In [8]:
# [TOKEN, LANG, EPOS, EPOSSCORE, CHUNK, POS, HPOS, NORM, LABEL]

for i in range(len(uniqueTweets[0][0])):
    print i, uniqueTweets[-1][0][i], uniqueTweets[0][0][i]

0 #MaheshBabu Gully
1 rest en
2 # A
3 0.0 0.6312
4 B-NP B-NP
5 X ADJ
6  UNK
7  Gully
8 O O


In [9]:
def getWordShape(token):
    """Return a character-class "shape" string for ``token``.

    Each character is mapped as follows:
      * uppercase letter      -> 'X'
      * any other letter      -> 'x'
      * anything int() parses -> 'O'
      * everything else       -> the character itself

    Args:
        token: string to describe.

    Returns:
        A string with exactly one output character per input character.
    """
    shape = []

    for ch in token:
        if ch.isalpha():
            # Letters with no case (e.g. caseless scripts) are neither upper
            # nor lower; map them to 'x' so they are not dropped from the shape.
            shape.append('X' if ch.isupper() else 'x')
            continue

        try:
            int(ch)
        except ValueError:
            # Not a digit: keep punctuation / symbols verbatim.
            shape.append(ch)
        else:
            shape.append('O')

    return ''.join(shape)

In [10]:
def word2features(sent, i):
    """Build the CRF feature dictionary for the token at index ``i`` of ``sent``.

    Each element of ``sent`` is a token record laid out as
    [TOKEN, LANG, EPOS, EPOSSCORE, CHUNK, POS, HPOS, NORM, LABEL]; only the
    TOKEN (0), LANG (1) and EPOS (2) fields are read here.

    Args:
        sent: sequence of token records for one tweet.
        i: index of the token to describe.

    Returns:
        dict mapping feature names to string, bool, int or float values:
        features of the token itself, tweet-level capitalisation ratios,
        prefixes/suffixes of length 1-5, and the lower-cased word, language
        and EPOS tag of the previous and next tokens. At the tweet boundaries
        the neighbour features are empty strings and '-1:BOS' / '+1:EOS' is
        True.
    """
    asciiLetters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

    # Tweet-level features: share of title-cased tokens and the mean
    # per-token uppercase ratio. Empty tokens contribute 0 to both.
    allTokens = [record[0] for record in sent]
    numTokens = float(len(allTokens))
    tweetTitlePer = sum(1 for tok in allTokens if tok.istitle()) / numTokens
    tweetCapPer = sum(capPercentage(tok) if tok else 0. for tok in allTokens) / numTokens

    # Token-level features.
    word = sent[i][0]
    wordShape = getWordShape(word)
    cleanWord = ''.join([ch for ch in word if ch in asciiLetters])
    normalizedWord = cleanWord.lower()

    lang = sent[i][1]
    epos = sent[i][2]

    anyCap = any(char.isupper() for char in word)
    # Code points 33..64: ASCII punctuation and symbols, digits included.
    hasSpecial = any(32 < ord(char) < 65 for char in word)

    # startswith() instead of word[0] so an empty token cannot raise.
    hashTag = word.startswith('#')
    mention = word.startswith('@')

    features = {
                'token' : word,
                'wordShape' : wordShape,
                'cleanWord' : cleanWord,
                'normalizedWord' : normalizedWord,

                'lang' : lang,
                'isTitle' : word.istitle(),
                'wordLength' : len(word),
                'anyCap' : anyCap,
                'allCap' : word.isupper(),
                'hasSpecial' : hasSpecial,
                'asciiPer' : asciiPercentage(word) if word else 0.,

                'epos' : epos,
                'hashtag' : hashTag,
                'mention' : mention,
                'tweetCapPer' : tweetCapPer,
                'tweetTitlePer' : tweetTitlePer,
               }

    # Character n-gram affixes of length 1..5.
    for n in range(5, 0, -1):
        features['suffix%d' % n] = word[-n:]
        features['prefix%d' % n] = word[:n]

    # Previous-token context. The key is '-1:BOS' in both branches so the
    # begin-of-sentence flag is a single feature.
    if i > 0:
        prevRecord = sent[i - 1]

        features['-1:word.lang'] = prevRecord[1]
        features['-1:word.lower'] = prevRecord[0].lower()
        features['-1:word.epos'] = prevRecord[2]
        features['-1:BOS'] = False
    else:
        features['-1:word.lang'] = ''
        features['-1:word.lower'] = ''
        features['-1:word.epos'] = ''
        features['-1:BOS'] = True

    # Next-token context.
    if i < len(sent) - 1:
        nextRecord = sent[i + 1]

        features['+1:word.lang'] = nextRecord[1]
        features['+1:word.lower'] = nextRecord[0].lower()
        features['+1:word.epos'] = nextRecord[2]
        features['+1:EOS'] = False
    else:
        features['+1:word.lang'] = ''
        features['+1:word.lower'] = ''
        features['+1:word.epos'] = ''
        features['+1:EOS'] = True

    return features


In [11]:
def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order.

    Each dict is produced by ``word2features(sent, position)``.
    """
    return [word2features(sent, position) for position in range(len(sent))]

def sent2labels(sent):
    """Return the label (last field) of every token record in ``sent``.

    The labels '@' and 'B-@' are replaced by 'O'; every other label is
    returned unchanged.
    """
    mappedToOutside = ('@', 'B-@')
    return ['O' if record[-1] in mappedToOutside else record[-1]
            for record in sent]

def sent2tokens(sent):
    """Return the surface token (first field) of every record in ``sent``."""
    return [record[0] for record in sent]

In [12]:
for c1 in [.01]:
    for c2 in [.01]:
    
        k = 5

        chunk = len(uniqueTweets)/k
        results = []

        allTestPredictions = []
        allTestGroundTruth = []
        allTokens = []
        
        for i in range(k):

            print "cross validation", i, 'for', c1, c2

            test_sents = uniqueTweets[i * chunk : (i + 1) * chunk]
            train_sents = uniqueTweets[:i * chunk] + uniqueTweets[(i + 1) * chunk:]

            X_train = [sent2features(s) for s in train_sents]
            y_train = [sent2labels(s) for s in train_sents]

            X_test = [sent2features(s) for s in test_sents]
            y_test = [sent2labels(s) for s in test_sents]
            X_test_tokens = []
            
            trainer = pycrfsuite.Trainer(verbose=False)

            for xseq, yseq in zip(X_train, y_train):
                trainer.append(xseq, yseq)

            trainer.set_params({
                'c1': c1,   # coefficient for L1 penalty
                'c2': c2,  # coefficient for L2 penalty
                'max_iterations': 10200,  # stop earlier

                # include transitions that are possible, but not observed
                'feature.possible_transitions': True,
                'feature.possible_states' : False
            })

            print "training"
            trainer.train('ner_t_f' + str(i))


            print "testing"
            tagger = pycrfsuite.Tagger()
            tagger.open('ner_t_f' + str(i))

            y_pred = []

            for xseq in X_test:
                y_pred.append(tagger.tag(xseq))


            """ CRF based classification """

            predictedLabels = []
            correctLabels = []
            xTokens = []
            
            for i in y_pred:
                for j in i:
                    predictedLabels.append(j)

            for i in y_test:
                for j in i:
                    correctLabels.append(j)

            for i in X_test:
                for j in i:
                    xTokens.append(j['token'])
            allTestPredictions += predictedLabels
            allTestGroundTruth += correctLabels
            print """ CRF Classification"""
            
        print c1, c2
        print classification_report(allTestGroundTruth, allTestPredictions)

cross validation 0 for 0.01 0.01
training
testing
 CRF Classification
cross validation 1 for 0.01 0.01
training
testing
 CRF Classification
cross validation 2 for 0.01 0.01
training
testing
 CRF Classification
cross validation 3 for 0.01 0.01
training
testing
 CRF Classification
cross validation 4 for 0.01 0.01
training
testing
 CRF Classification
0.01 0.01
                precision    recall  f1-score   support

B-ORGANISATION       0.89      0.62      0.73       375
      B-PERSON       0.81      0.70      0.75      1638
       B-PLACE       0.82      0.69      0.75       738
I-ORGANISATION       0.66      0.33      0.44       111
      I-PERSON       0.79      0.75      0.77       701
       I-PLACE       0.66      0.46      0.54       178
             O       0.97      0.99      0.98     31561

   avg / total       0.95      0.96      0.95     35302



# Classification result, running the conll eval script

In [13]:
fp = open('crf.classify.compare', 'w')

mark = 0
for i in range(k):

    test_sents = uniqueTweets[i * chunk : (i + 1) * chunk]
    
    for sent in test_sents:
        labels = [token[-1] for token in sent]
        for _ in range(len(labels) - 1):
            if labels[_] == 'O' and labels[_ + 1][:2] == 'I-':
                for tt in sent:
                    print tt
                print '\n'
                    
                
        for token in sent:
            fp.write('_' + ' ' + allTestGroundTruth[mark] + ' ' + allTestPredictions[mark] + '\n')
            mark += 1
            
fp.close()

In [14]:
# Feed the typed-entity comparison file to the ./conlleval script.
! ./conlleval < crf.classify.compare 

processed 35302 tokens with 2754 phrases; found: 2291 phrases; correct: 1799.
accuracy:  95.71%; precision:  78.52%; recall:  65.32%; FB1:  71.32
     ORGANISATION: precision:  87.12%; recall:  61.33%; FB1:  71.99  264
           PERSON: precision:  76.99%; recall:  66.14%; FB1:  71.15  1408
            PLACE: precision:  78.35%; recall:  65.54%; FB1:  71.38  619


# Segmentation results, running the conll eval script

In [15]:
for i in range(len(allTestGroundTruth)):
    if allTestGroundTruth[i] == 'O':
        pass
    else:
        allTestGroundTruth[i] = allTestGroundTruth[i][:2] + 'ENTITY'
    if allTestPredictions[i] == 'O':
        pass
    else:
        allTestPredictions[i] = allTestPredictions[i][:2] + 'ENTITY'
        
print classification_report(allTestGroundTruth, allTestPredictions)

             precision    recall  f1-score   support

   B-ENTITY       0.89      0.74      0.81      2751
   I-ENTITY       0.82      0.70      0.76       990
          O       0.97      0.99      0.98     31561

avg / total       0.96      0.96      0.96     35302



In [16]:
fp = open('crf.segment.compare', 'w')

mark = 0
for i in range(k):

    print "cross validation", i, 'for', c1, c2

    test_sents = uniqueTweets[i * chunk : (i + 1) * chunk]
    
    for sent in test_sents:
        for token in sent:
            fp.write(token[0] + ' ' + allTestGroundTruth[mark] + ' ' + allTestPredictions[mark] + '\n')
            mark += 1
            
fp.close()

cross validation 0 for 0.01 0.01
cross validation 1 for 0.01 0.01
cross validation 2 for 0.01 0.01
cross validation 3 for 0.01 0.01
cross validation 4 for 0.01 0.01


In [17]:
# Feed the segmentation comparison file to the ./conlleval script.
! ./conlleval < crf.segment.compare 

processed 35302 tokens with 2751 phrases; found: 2290 phrases; correct: 1924.
accuracy:  96.29%; precision:  84.02%; recall:  69.94%; FB1:  76.33
           ENTITY: precision:  84.02%; recall:  69.94%; FB1:  76.33  2290
