In [1]:
import pycrfsuite
import numpy as np
from sklearn.metrics import classification_report

# Prepare data

In [2]:
# Load the corpus as a list of raw lines (trailing newlines still attached).
with open("finalData.tsv", 'r') as fp:
    data = list(fp)

In [3]:
# Replace every raw line, in place, by its list of tab-separated columns.
# Re-running this cell fails, because the entries are lists by then.
for i, rawLine in enumerate(data):
    data[i] = rawLine.strip('\n').split('\t')

In [4]:
# Group the flat list of token rows into tweets. A row whose first column is
# empty (a blank line in the file) terminates the tweet being collected.
tweets = []
currPoint = []

for token in data:
    if token[0] == '':
        # Consecutive separator rows must not create empty tweets.
        if len(currPoint) > 0:
            tweets.append(currPoint)
            currPoint = []
    else:
        currPoint.append(token)

# Flush the tweet still being collected: without this, a file that does not
# end with a separator row silently loses its last tweet.
if currPoint:
    tweets.append(currPoint)
    currPoint = []

# Single-argument call form prints identically on Python 2 and Python 3.
print(len(tweets))

1489


In [5]:
# Shuffle the tweets with a fixed seed so the train/validation split below is
# reproducible. The list is shuffled directly: converting a ragged list of
# tweets to an ndarray first is rejected by recent NumPy releases and would
# produce a multi-dimensional array if every tweet had the same length.
# NOTE(review): the permutation is expected to match the earlier ndarray
# round-trip for the same seed -- confirm if the recorded outputs must be
# reproduced exactly.
# Re-running this cell shuffles the already-shuffled list again.
np.random.seed(52)
np.random.shuffle(tweets)

# Feature functions

In [6]:
def asciiPercentage(s):
    """Return the fraction of characters in ``s`` with a code point below 128.

    Args:
        s: String to inspect.

    Returns:
        A float between 0.0 and 1.0. An empty string yields 0.0 instead of
        raising ZeroDivisionError.
    """
    if not s:
        return 0.0
    asciiCount = sum(1 for char in s if ord(char) < 128)
    # float() keeps this a true division on Python 2 as well.
    return float(asciiCount) / len(s)

def vowelPercentage(s):
    """Return the fraction of characters in ``s`` that are lowercase vowels.

    Only the characters ``a``, ``e``, ``i``, ``o`` and ``u`` are counted;
    uppercase vowels are not.

    Args:
        s: String to inspect.

    Returns:
        A float between 0.0 and 1.0. An empty string yields 0.0 instead of
        raising ZeroDivisionError.
    """
    if not s:
        return 0.0
    vowels = "aeiou"
    vowelCount = sum(1 for char in s if char in vowels)
    # float() keeps this a true division on Python 2 as well.
    return float(vowelCount) / len(s)

# Feature extractor

In [8]:
def word2features(sent, i):
    """Build the feature dictionary for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of token rows; column 0 is read as the word and
            column 1 as the language tag.
        i: Index of the token inside ``sent``.

    Returns:
        dict mapping feature names to string, boolean or numeric values.
    """
    word = sent[i][0]
    lang = sent[i][1]

    # Keep only the ASCII letters of the word, lower-cased.
    wordClean = ''.join([ch for ch in word if ch in 'asdfghjklqwertyuiopzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM']).lower()
    # wordClean is already lower-cased, so this feature carries the same value;
    # it is kept so that the emitted feature set stays unchanged.
    normalizedWord = wordClean

    anyCap = any(char.isupper() for char in word)
    # True when some character has a code point strictly between 32 and 65.
    hasSpecial = any(32 < ord(char) < 65 for char in word)

    features = {
        'word': word,
        'wordClean': wordClean,
        'normalizedWord': normalizedWord,
        'lang': lang,
        'isTitle': word.istitle(),
        'wordLength': len(word),
        'anyCap': anyCap,
        'allCap': word.isupper(),
        'hasSpecial': hasSpecial,
        'asciiPer': asciiPercentage(word),
    }

    # Prefixes and suffixes of length 1 to 3 (length-4 and length-5 affixes
    # were previously present but disabled).
    for size in (3, 2, 1):
        features['suffix%d' % size] = word[-size:]
        features['prefix%d' % size] = word[:size]

    return features

# Extracting features from Sequences

In [9]:
def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    return [word2features(sent, position) for position in range(len(sent))]

def sent2labels(sent):
    """Return column 2 (the label) of every token row in ``sent``, in order."""
    return [row[2] for row in sent]

def sent2tokens(sent):
    """Return column 0 (the word) of every token row in ``sent``, in order."""
    return [row[0] for row in sent]

In [10]:
# Params; obtained from Grid Search

c1 = 1e-4  # passed to the trainer as the L1 penalty coefficient
c2 = 1e-1  # passed to the trainer as the L2 penalty coefficient

In [11]:
# Hold out the last 20% of the tweets for validation and keep the first 80%
# under the name `tweets`. Re-running this cell shrinks `tweets` again.
splitAt = int(len(tweets) * 0.8)
tweetsVal = tweets[splitAt:]
tweets = tweets[:splitAt]

In [12]:
# Display the number of held-out validation tweets and of remaining tweets.
len(tweetsVal), len(tweets)

(298, 1191)

# Training model

In [58]:
# k-fold cross validation over `tweets`: every fold trains a fresh CRF on the
# other folds, tags the held-out fold, and the token-level labels of all folds
# are pooled into a single classification report.
k = 5

# Floor division keeps the fold size an integer on Python 2 and Python 3
# (plain `/` gives a float on Python 3, which cannot be used in a slice).
# NOTE: the trailing len(tweets) % k tweets never end up in a test fold.
chunk = len(tweets) // k

allTestPredictions = []
allTestGroundTruth = []

# The fold counter has its own name so that no inner loop can clobber it.
for fold in range(k):

    print("cross validation %s for c1 : %s c2 : %s" % (fold, c1, c2))

    foldStart = fold * chunk
    foldEnd = foldStart + chunk
    test_sents = tweets[foldStart:foldEnd]
    train_sents = tweets[:foldStart] + tweets[foldEnd:]

    print("--> Extracting Train Set ...")
    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    print("--> Extracting Test Set ...")
    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    print("--> Loading CRF module ...")
    trainer = pycrfsuite.Trainer(verbose=False)

    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': c1,   # coefficient for L1 penalty
        'c2': c2,  # coefficient for L2 penalty
        'max_iterations': 1000,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
        'feature.possible_states' : True
    })

    # One model file per fold, written to the working directory.
    modelPath = 'pos_crf_' + str(fold)

    print("Training ...")
    trainer.train(modelPath)

    print("Testing ...")
    tagger = pycrfsuite.Tagger()
    tagger.open(modelPath)
    try:
        y_pred = [tagger.tag(xseq) for xseq in X_test]
    finally:
        # Release the model opened for this fold before the next one starts.
        tagger.close()

    # CRF based classification: pool the token-level labels of this fold.
    allTestPredictions.extend(label for seq in y_pred for label in seq)
    allTestGroundTruth.extend(label for seq in y_test for label in seq)

# Single-argument call form prints identically on Python 2 and Python 3.
print(" CRF Classification")
print("c1 : %s c2 : %s" % (c1, c2))
print(classification_report(allTestGroundTruth, allTestPredictions, digits = 4))

cross validation 0 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 1 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 2 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 3 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 4 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
 CRF Classification
c1 : 0.0001 c2 : 0.1
             precision    recall  f1-score   support

        ADJ     0.7218    0.6756    0.6979      1233
        ADP     0.9273    0.9273    0.9273      2379
        ADV     0.8472    0.7861  

# Validation

In [13]:
# Final evaluation: train one CRF on all of `tweets` and report token-level
# scores on the held-out `tweetsVal`.
# Single-argument call form of print behaves identically on Python 2 and 3.
print("--> Extracting Train Set ...")
X_train = [sent2features(s) for s in tweets]
y_train = [sent2labels(s) for s in tweets]

X_test = [sent2features(s) for s in tweetsVal]
y_test = [sent2labels(s) for s in tweetsVal]

print("--> Loading CRF module ...")
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': c1,   # coefficient for L1 penalty
    'c2': c2,  # coefficient for L2 penalty
    'max_iterations': 1000,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
    'feature.possible_states' : True
})

print("Training ...")
trainer.train('pos_crf')
print("Testing ...")
tagger = pycrfsuite.Tagger()
tagger.open('pos_crf')

# One predicted label sequence per validation tweet.
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Flatten the per-tweet sequences into token-level lists for the report.
allTestPredictions = [label for seq in y_pred for label in seq]
allTestGroundTruth = [label for seq in y_test for label in seq]

print(" CRF Classification")
print("c1 : %s c2 : %s" % (c1, c2))
print(classification_report(allTestGroundTruth, allTestPredictions, digits = 4))

--> Extracting Train Set ...
--> Loading CRF module ...
Training ...
Testing ...
 CRF Classification
c1 : 0.0001 c2 : 0.1
             precision    recall  f1-score   support

        ADJ     0.7492    0.7492    0.7492       303
        ADP     0.9283    0.9468    0.9375       602
        ADV     0.8653    0.7877    0.8247       212
       CONJ     0.9542    0.9182    0.9359       159
        DET     0.9091    0.9009    0.9050       222
       NOUN     0.8242    0.8581    0.8408       994
        NUM     0.9710    0.8481    0.9054        79
       PART     0.8249    0.7465    0.7837       284
   PART_NEG     0.9659    1.0000    0.9827        85
       PRON     0.9412    0.8481    0.8922       283
    PRON_WH     0.9500    0.9620    0.9560        79
      PROPN     0.9265    0.9180    0.9222       549
       VERB     0.8581    0.8997    0.8784      1156
          X     0.9930    0.9831    0.9880      1593

avg / total     0.9029    0.9020    0.9020      6600

