In [1]:
import pycrfsuite
import numpy as np
from sklearn.metrics import classification_report

# Prepare data

In [2]:
# Read the whole corpus file into memory, one list entry per line.
# Line terminators are still attached at this point.
with open("finalData.tsv", 'r') as fp:
	data = list(fp)

In [3]:
# Replace every raw line by its list of tab-separated fields, after removing
# the newline characters at either end of the line.
data = [rawLine.strip('\n').split('\t') for rawLine in data]

In [4]:
# Group the flat list of rows into tweets. A row whose first field is empty
# acts as a separator between two consecutive tweets.
tweets = []
currPoint = []

for token in data:
	if token[0] == '':
		# Separator row: close the tweet collected so far. Consecutive
		# separators are ignored because currPoint is empty then.
		if len(currPoint) > 0:
			tweets.append(currPoint)
			currPoint = []
	else:
		currPoint.append(token)

# Flush the trailing tweet: if the file does not end with a separator row,
# the rows collected last would otherwise be silently dropped.
if len(currPoint) > 0:
	tweets.append(currPoint)
	currPoint = []

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(len(tweets))

1489


In [5]:
# Shuffle the tweets reproducibly with a fixed seed.
# The list is shuffled in place: np.random.shuffle accepts any mutable
# sequence, so there is no need to round-trip through np.array(...).tolist().
# Building an array from tweets of different lengths is also rejected by
# recent NumPy releases unless dtype=object is given.
np.random.seed(52)
np.random.shuffle(tweets)

In [6]:
# Hold out the last 20% of the shuffled tweets for validation and keep the
# first 80% for training. The cut index is taken before `tweets` is shortened.
splitPoint = int(len(tweets) * 0.8)
tweets, tweetsVal = tweets[:splitPoint], tweets[splitPoint:]

In [7]:
# Sanity check of the split: (validation size, training size).
(len(tweetsVal), len(tweets))

(298, 1191)

# Baseline performance (token)

In [9]:
# Most-frequent-tag baseline keyed on token[0] alone.
# Every validation row is assigned the tag that its token[0] carried most
# often in the training tweets; token[2] is used as the label throughout.
print("--> Creating map ...")

# Fallback for keys never seen in training: the overall most frequent tag.
# np.unique returns the tags sorted, so np.argmax resolves ties in favour of
# the alphabetically first tag.
allTags = [token[2] for tweet in tweets for token in tweet]
tags, counts = np.unique(allTags, return_counts = True)
mostFrequentTag = tags[np.argmax(counts)]

# token[0] -> list of every tag observed for it in the training tweets.
tagDict = {}
for tweet in tweets:
    for token in tweet:
        tagDict.setdefault(token[0], []).append(token[2])

# Resolve the majority tag once per key instead of re-running np.unique for
# every validation row (same tie-breaking as above).
bestTag = {}
for key, seenTags in tagDict.items():
    tags, counts = np.unique(seenTags, return_counts = True)
    bestTag[key] = tags[np.argmax(counts)]

print("--> Testing ...")

predictedLabels = []
correctLabels = []
count = 0  # validation rows whose key was never seen in training

for tweet in tweetsVal:
    for token in tweet:
        correctLabels.append(token[2])

        if token[0] in bestTag:
            predictedLabels.append(bestTag[token[0]])
        else:
            count += 1
            predictedLabels.append(mostFrequentTag)

# NOTE(review): the header text says "CRF" although this cell evaluates the
# lookup baseline; the string is left unchanged so the recorded output matches.
print(""" CRF Classification""")
print(classification_report(y_true = correctLabels, y_pred = predictedLabels, digits = 4))

--> Creating map ...
--> Testing ...
 CRF Classification
             precision    recall  f1-score   support

        ADJ     0.8210    0.6205    0.7068       303
        ADP     0.9435    0.9153    0.9292       602
        ADV     0.9085    0.7028    0.7926       212
       CONJ     0.9603    0.9119    0.9355       159
        DET     0.9321    0.9279    0.9300       222
       NOUN     0.9221    0.5956    0.7237       994
        NUM     0.9841    0.7848    0.8732        79
       PART     0.8052    0.7570    0.7804       284
   PART_NEG     0.9659    1.0000    0.9827        85
       PRON     0.9542    0.8092    0.8757       283
    PRON_WH     0.9595    0.8987    0.9281        79
      PROPN     0.9515    0.7140    0.8158       549
       VERB     0.9518    0.7353    0.8297      1156
          X     0.6163    0.9950    0.7611      1593

avg / total     0.8522    0.8061    0.8077      6600



# Baseline performance (token + language)

In [10]:
# Most-frequent-tag baseline keyed on token[0] together with token[1].
# token[1] is compared against 'en', 'hi' and 'rest' below; token[2] is used
# as the label throughout.
print("--> Creating map ...")

# Single pass over the training tweets collecting:
#   tagDict    : "<token[0]>__<token[1]>" -> tags observed for that key
#   tagsByLang : token[1]                 -> tags observed for that language
#   allTags    : every training tag (used for the last-resort fallback)
tagDict = {}
tagsByLang = {}
allTags = []
for tweet in tweets:
    for token in tweet:
        tagDict.setdefault(token[0] + '__' + token[1], []).append(token[2])
        tagsByLang.setdefault(token[1], []).append(token[2])
        allTags.append(token[2])

# np.unique returns the tags sorted, so np.argmax resolves ties in favour of
# the alphabetically first tag.
tags, counts = np.unique(allTags, return_counts = True)
mostFrequentTagAll = tags[np.argmax(counts)]

# Majority tag per language value seen in training.
mostFrequentTagByLang = {}
for lang, langTags in tagsByLang.items():
    tags, counts = np.unique(langTags, return_counts = True)
    mostFrequentTagByLang[lang] = tags[np.argmax(counts)]

# Keep the original per-language names available for later cells.
mostFrequentTagEn = mostFrequentTagByLang.get('en', mostFrequentTagAll)
mostFrequentTagHi = mostFrequentTagByLang.get('hi', mostFrequentTagAll)
mostFrequentTagRest = mostFrequentTagByLang.get('rest', mostFrequentTagAll)

# Resolve the majority tag once per key instead of once per validation row.
bestTag = {}
for key, seenTags in tagDict.items():
    tags, counts = np.unique(seenTags, return_counts = True)
    bestTag[key] = tags[np.argmax(counts)]

print("--> Testing ...")

predictedLabels = []
correctLabels = []
count = 0  # validation rows whose key was never seen in training

for tweet in tweetsVal:
    for token in tweet:
        correctLabels.append(token[2])

        key = token[0] + '__' + token[1]
        if key in bestTag:
            predictedLabels.append(bestTag[key])
        else:
            count += 1
            # Exactly one prediction is appended for every row. Previously a
            # row whose token[1] was none of 'en'/'hi'/'rest' got no
            # prediction, leaving predictedLabels shorter than correctLabels.
            predictedLabels.append(mostFrequentTagByLang.get(token[1], mostFrequentTagAll))

# NOTE(review): the header text says "CRF" although this cell evaluates the
# lookup baseline; the string is left unchanged so the recorded output matches.
print(""" CRF Classification""")
print(classification_report(y_true = correctLabels, y_pred = predictedLabels, digits = 4))

--> Creating map ...
--> Testing ...
 CRF Classification
             precision    recall  f1-score   support

        ADJ     0.8186    0.6106    0.6994       303
        ADP     0.9635    0.9203    0.9414       602
        ADV     0.9255    0.7028    0.7989       212
       CONJ     0.9730    0.9057    0.9381       159
        DET     0.9631    0.9414    0.9522       222
       NOUN     0.7198    0.8038    0.7595       994
        NUM     0.9841    0.7848    0.8732        79
       PART     0.8139    0.7852    0.7993       284
   PART_NEG     0.9659    1.0000    0.9827        85
       PRON     0.9796    0.8481    0.9091       283
    PRON_WH     0.9600    0.9114    0.9351        79
      PROPN     0.9676    0.7067    0.8168       549
       VERB     0.7508    0.8443    0.7948      1156
          X     0.9144    0.9856    0.9486      1593

avg / total     0.8649    0.8570    0.8564      6600

