In [13]:
import os
import sys
import json
import re

from glob import glob
from logging import error, warning

from common import CLASS_MAP


def get_docid_class_map(path='../data/docid-label-map.tsv'):
    """Read the document-ID to class-label mapping from a two-column TSV file.

    Args:
        path: TSV file holding one ``docid<TAB>label`` pair per line.
            Defaults to the location this notebook reads from.

    Returns:
        dict mapping each docid string to its class label string.

    Raises:
        ValueError: if a non-blank line does not consist of exactly two
            tab-separated fields (the message carries the path and line
            number), or if a docid occurs more than once.
    """
    docid_class_map = {}
    with open(path) as f:
        for ln, l in enumerate(f, start=1):
            l = l.rstrip()
            if not l:
                # Tolerate blank lines, e.g. an extra newline at end of file.
                continue
            fields = l.split('\t')
            if len(fields) != 2:
                # Report where the problem is instead of failing on unpacking.
                raise ValueError(
                    '{}:{}: expected 2 tab-separated fields, got {}: {!r}'.format(
                        path, ln, len(fields), l))
            docid, class_ = fields
            if docid in docid_class_map:
                raise ValueError('duplicate docid: {}'.format(docid))
            docid_class_map[docid] = class_
    return docid_class_map


def load_biarc_data(dirpath):
    """Load per-document biarc lists and their class labels from a directory.

    Every ``*.txt`` file directly inside ``dirpath`` is read. The document ID
    is the part of the file name before ``-barcs.``; it is looked up in the
    docid-to-class map and the resulting class name is translated through
    ``CLASS_MAP``.

    Args:
        dirpath: directory containing the ``*.txt`` biarc files.

    Returns:
        (biarcs, classes): two parallel lists. ``biarcs[i]`` is the list of
        tab-separated fields read from the i-th file, ``classes[i]`` is the
        ``CLASS_MAP`` value for that document. Files are processed in sorted
        path order.

    Raises:
        ValueError: if a file name does not contain ``-barcs.`` and therefore
            no docid can be extracted from it.
        KeyError: if a docid is missing from the docid-to-class map, or its
            class is missing from ``CLASS_MAP``.
    """
    docid_class_map = get_docid_class_map()
    biarcs, classes = [], []
    # glob() returns paths in filesystem-dependent order; sort so the sample
    # order (and anything trained on it) is the same on every run and machine.
    for fn in sorted(glob(os.path.join(dirpath, '*.txt'))):
        m = re.match(r'^(.*?)-barcs\.', os.path.basename(fn))
        if m is None:
            # Previously this surfaced as AttributeError on None.group().
            raise ValueError(
                'cannot extract docid from file name: {}'.format(fn))
        docid = m.group(1)
        class_ = docid_class_map[docid]
        with open(fn) as f:
            data = f.read().strip().split('\t')
        biarcs.append(data)
        classes.append(CLASS_MAP[class_])
    return biarcs, classes


# Load the train and dev splits. Each call returns the per-document biarc
# lists together with the parallel list of CLASS_MAP-translated labels.
train_biarcs, train_classes = load_biarc_data('../data/split-biarcs/train/')
devel_biarcs, devel_classes = load_biarc_data('../data/split-biarcs/dev/')

In [14]:
from collections import Counter


MIN_EXAMPLES = 25    # filter classes with fewer


# Tally the training documents per class, then keep the classes that have at
# least MIN_EXAMPLES training documents.
class_count = Counter(label for label in train_classes)
target_class = {
    label
    for label, n_docs in class_count.items()
    if n_docs >= MIN_EXAMPLES
}


def filter_by_class(data, classes, targets):
    """Keep only the (item, class) pairs whose class is in ``targets``.

    ``data`` and ``classes`` are walked in parallel (pairing stops at the
    shorter of the two). Returns a tuple of two new lists: the retained items
    and their classes, in the original order.
    """
    kept = [(item, label) for item, label in zip(data, classes) if label in targets]
    filtered_data = [item for item, _ in kept]
    filtered_classes = [label for _, label in kept]
    return filtered_data, filtered_classes


# Drop documents whose class is not in target_class. The dev split is filtered
# with the same class set, which was derived from the training labels.
# NOTE: this rebinds the names produced when the data was loaded.
train_biarcs, train_classes = filter_by_class(train_biarcs, train_classes, target_class)
devel_biarcs, devel_classes = filter_by_class(devel_biarcs, devel_classes, target_class)

In [15]:
from pprint import pprint

def class_counts(classes):
    """Return a Counter with the number of occurrences of each item in ``classes``."""
    tally = Counter()
    # Feed a generator so every element is counted one by one.
    tally.update(label for label in classes)
    return tally


# Per-class document counts for the current training labels.
pprint(class_counts(train_classes))

Counter({'MT/Gen': 296,
         'D-Sell': 243,
         'News': 184,
         'B-Personal': 139,
         'D-Thing': 103,
         'How-to': 86,
         'Religious': 75,
         'B-Personal-Opinion': 70,
         'Forums': 65,
         'B-Community': 49,
         'Reviews': 44,
         'A-Encyclopedia': 35,
         'Sports': 30,
         'Editorials': 26})


In [17]:
def biarcs_to_text(biarcs):
    """Join each document's list of biarc strings into one tab-delimited string.

    Returns a list with one string per entry of ``biarcs``, in the same order.
    """
    texts = []
    for doc_biarcs in biarcs:
        texts.append('\t'.join(doc_biarcs))
    return texts


# One tab-delimited string per document, for both splits.
train_texts = biarcs_to_text(train_biarcs)
devel_texts = biarcs_to_text(devel_biarcs)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC


def biarc_tokenizer(text):
    """Split a document string on tab characters and return the list of pieces."""
    tokens = text.split('\t')
    return tokens


# Fixed seed so the fitted model, and therefore the reported score, is
# reproducible across runs. LinearSVC uses random_state for data shuffling in
# its dual coordinate-descent solver.
RANDOM_SEED = 0

# Tokens are the tab-separated pieces produced by biarc_tokenizer; they are
# used as-is (no lowercasing) and only as unigrams.
vectorizer = TfidfVectorizer(analyzer='word', tokenizer=biarc_tokenizer, lowercase=False, ngram_range=(1,1))

# Fit vocabulary and IDF weights on the training texts only, then apply the
# same fitted transform to the development texts.
train_X = vectorizer.fit_transform(train_texts)
devel_X = vectorizer.transform(devel_texts)

classifier = LinearSVC(C=1.0, random_state=RANDOM_SEED)
classifier.fit(train_X, train_classes)

# Mean accuracy on the development split (displayed as the cell output).
classifier.score(devel_X, devel_classes)

0.5436893203883495

In [30]:
import eli5


# Show the highest-weighted features of the fitted classifier (top=100), with
# the vectorizer supplied so features are shown by name.
eli5.show_weights(classifier, vec=vectorizer, top=100)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13
+1.941,flat:name/2 conj/0 flat:name/2,,,,,,,,,,,,
+0.816,appos/0 appos/1 flat:name/2,,,,,,,,,,,,
+0.776,nummod/0 flat:name/1 flat:name/1,,,,,,,,,,,,
+0.705,ROOT/0 flat:name/1 appos/1,,,,,,,,,,,,
+0.615,ROOT/0 conj/1 conj/1 case/1,,,,,,,,,,,,
+0.510,nmod/0 nmod:poss/1 flat:name/2,,,,,,,,,,,,
+0.505,flat:name/3 flat:name/3 conj/0,,,,,,,,,,,,
+0.494,ROOT/0 appos/1 flat:name/2,,,,,,,,,,,,
+0.469,compound/0 flat/1 flat/1,,,,,,,,,,,,
+0.468,nmod:poss/3 case/1 obj/0 conj/3,,,,,,,,,,,,

Weight?,Feature
+1.941,flat:name/2 conj/0 flat:name/2
+0.816,appos/0 appos/1 flat:name/2
+0.776,nummod/0 flat:name/1 flat:name/1
+0.705,ROOT/0 flat:name/1 appos/1
+0.615,ROOT/0 conj/1 conj/1 case/1
+0.510,nmod/0 nmod:poss/1 flat:name/2
+0.505,flat:name/3 flat:name/3 conj/0
+0.494,ROOT/0 appos/1 flat:name/2
+0.469,compound/0 flat/1 flat/1
+0.468,nmod:poss/3 case/1 obj/0 conj/3

Weight?,Feature
+0.773,acl/0 advcl/1 obl/2 case/3
+0.765,xcomp/0 advmod/1 obl/1 case/3
+0.739,parataxis/0 flat:name/1 flat:name/1
+0.623,nmod:gobj/2 nsubj:cop/0 conj/2
+0.604,nsubj/0 conj/1 flat:name/2
+0.604,nsubj/0 flat:name/1 conj/1
+0.582,xcomp:ds/0 xcomp/1 obl/2
+0.571,compound:nn/3 flat:name/1 compound:nn/0
+0.571,obl/0 acl/1 obl/2
+0.570,advmod/0 appos/1 conj/2

Weight?,Feature
+0.934,advmod/2 advcl/0 conj/2
+0.915,ROOT/0 advmod/3 acl:relcl/1
+0.848,advmod/3 advmod/3 conj/0
+0.845,advmod/2 ccomp/0 conj/2
+0.816,nsubj/0 conj/1 conj/1
+0.759,advmod/2 amod/3 ROOT/0
+0.746,cc/2 ROOT/0 conj/2 conj/2
+0.739,discourse/2 ROOT/0 conj/2
+0.731,ROOT/0 advmod/3 advmod/1
+0.722,ROOT/0 advmod/3 conj/1

Weight?,Feature
+0.783,flat:name/0 flat:name/1 flat:name/1
+0.771,advmod/3 nsubj/3 ROOT/0
+0.718,nmod/0 advmod/3 ccomp/1
+0.715,nsubj/2 parataxis/0 conj/2
+0.696,advmod/2 acl/3 obj/0
+0.685,ROOT/0 cop/3 ccomp/1
+0.641,compound:nn/2 nmod:poss/3 ccomp/0
+0.620,obl/2 ROOT/0 ccomp/2
+0.576,advcl/0 conj/1 conj/1
+0.556,appos/0 flat:foreign/1 flat:name/1

Weight?,Feature
+1.213,advmod/2 appos/0 conj/2
+1.194,ROOT/0 amod/3 obj/1
+1.116,amod/3 conj/1 obj/0
+1.106,obj/0 appos/1 flat:name/2
+1.060,xcomp/0 conj/1 conj/1
+1.059,nsubj/3 conj/1 ROOT/0
+1.028,ROOT/0 obl/1 conj/2
+0.936,parataxis/0 obl/1 conj/2
+0.924,appos/0 conj/1 flat:name/1
+0.892,obl/0 conj/1 conj/1 case/1

Weight?,Feature
+1.163,obl/0 appos/1 conj/2
+0.787,nsubj/0 nmod/1 conj/2
+0.787,nmod:poss/2 ROOT/0 cop/2
+0.779,obl/0 flat/1 flat/1
+0.771,nmod:gobj/2 obj/0 conj/2
+0.722,csubj:cop/0 obj/1 conj/1
+0.712,conj/0 obj/1 nummod/2
+0.706,acl/2 nmod:poss/3 ROOT/0
+0.702,obj/2 ROOT/0 obl/2
+0.685,nsubj:cop/0 conj/1 conj/1

Weight?,Feature
+1.059,parataxis/0 nummod/1 conj/1
+0.971,flat:name/2 flat:name/0 flat:name/2
+0.796,obl/0 flat:name/1 flat:name/1
+0.490,nmod:gobj/2 ccomp/0 conj/2
+0.466,nmod:poss/2 obl/3 ROOT/0
+0.432,acl/0 xcomp/1 conj/1
+0.386,csubj:cop/0 obl/1 obl/1
+0.383,amod/2 ROOT/0 parataxis/2
+0.372,amod/2 ccomp/0 conj/2
+0.370,parataxis/0 conj/1 flat:name/2

Weight?,Feature
+1.234,nummod/2 obl/4 case/2 ROOT/0
+0.912,advmod/2 nummod/3 obl/0 case/3
+0.869,obj/0 flat:name/1 flat:name/1
+0.841,obl/2 appos/0 conj/2
+0.827,parataxis/0 flat:name/1 conj/1
+0.701,appos/0 flat:foreign/1 flat:foreign/1
+0.632,nmod/3 case/1 ROOT/0 flat:name/3
+0.615,nsubj/0 flat:name/1 nmod/1
+0.607,nmod/2 ROOT/0 parataxis/2
+0.607,advcl/3 advmod/3 ROOT/0

Weight?,Feature
+1.165,advcl/2 ROOT/0 obl/2
+0.930,advcl/3 obj/1 ROOT/0
+0.873,advcl/2 ROOT/0 obj/2
+0.856,compound:nn/0 flat:foreign/1 flat:name/1
+0.763,nmod:poss/2 appos/0 conj/2
+0.651,obj/2 ROOT/0 obl/2
+0.649,advcl/2 ROOT/0 conj/2
+0.619,ROOT/0 obl/1 conj/1
+0.605,obj/2 ROOT/0 advmod/2
+0.585,acl:relcl/0 xcomp/1 obl/2

Weight?,Feature
+1.615,obj/2 ROOT/0 obj/2
+1.593,conj/0 flat:name/1 flat:name/1
+1.403,ROOT/0 obj/1 obj/1
+1.385,obj/2 conj/0 obj/2
+1.126,conj/0 obj/1 obj/1
+1.071,nsubj/2 acl:relcl/0 obj/2
+1.067,ROOT/0 obj/1 conj/1
+0.958,obj/2 acl:relcl/0 obj/2
+0.940,obj/0 acl/1 obj/2
+0.933,ROOT/0 obj/1 flat:name/2

Weight?,Feature
+1.501,ROOT/0 obl/1 flat/2
+1.345,ROOT/0 nsubj/3 parataxis/1
+1.168,compound:nn/3 flat:name/1 nsubj/0
+1.152,compound:nn/3 flat:name/1 obl/0
+1.138,obj/0 nmod:gobj/3 nmod/1
+1.095,obl/3 flat/1 ROOT/0
+1.029,compound:nn/2 nsubj/0 flat:name/2
+0.913,obl/2 acl/3 obl/0 case/3
+0.898,nummod/2 nmod:poss/3 obj/0
+0.897,compound:nn/2 nsubj/3 parataxis/0

Weight?,Feature
+0.939,ROOT/0 xcomp/1 nsubj/1
+0.929,nsubj/2 ROOT/0 parataxis/2
+0.883,ROOT/0 xcomp/1 obl/1
+0.823,ROOT/0 obl/1 nsubj/1
+0.792,obj/3 nsubj/3 acl:relcl/0
+0.772,ROOT/0 advmod/1 xcomp/1
+0.758,obj/0 conj/1 conj/1
+0.720,ROOT/0 ccomp/1 obl/2
+0.713,ROOT/0 nsubj/1 nmod/2
+0.703,advcl/3 nsubj/3 ROOT/0

Weight?,Feature
+1.044,nmod:poss/3 conj/1 nsubj/0
+1.041,acl/2 nmod:poss/0 conj/2
+0.936,advmod/2 nmod:poss/0 conj/2
+0.816,nmod:poss/0 nmod/3 conj/1
+0.742,nmod:poss/0 conj/1 nmod/2
+0.735,nmod:poss/2 obj/0 conj/1
+0.726,obl/0 appos/1 flat:name/2
+0.633,case/2 nmod/3 ROOT/0 flat:name/3
+0.622,amod/2 nsubj:cop/0 flat:name/2
+0.622,nmod/2 conj/0 nmod/2

Weight?,Feature
+0.969,nsubj/2 ROOT/0 obl/2
+0.882,nsubj/3 flat:name/1 ROOT/0
+0.633,ROOT/0 obl/1 obl/1
+0.631,amod/2 nsubj/0 flat:name/2
+0.611,appos/0 conj/1 flat:name/2
+0.568,nummod/2 nmod:poss/3 obl/0
+0.554,parataxis/0 orphan/1 conj/1
+0.510,nmod:poss/2 compound:nn/3 nsubj/0
+0.507,nsubj/0 appos/1 conj/2
+0.483,nmod:poss/2 amod/3 obl/0


In [29]:
import numpy as np

from sklearn.metrics import confusion_matrix
from pandas import DataFrame


pred_Y = classifier.predict(devel_X)

# Pin the matrix's label order to classifier.classes_. Without labels=,
# confusion_matrix only covers labels seen in devel_classes or pred_Y, so a
# class absent from both would make the matrix smaller than the index/columns
# used for the DataFrame below.
cm = confusion_matrix(devel_classes, pred_Y, labels=classifier.classes_)

# Row-normalize to fractions of each true class. Rows with no development
# examples have a zero total; leave them as all zeros instead of dividing by
# zero (which would give NaN and a RuntimeWarning).
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype='float'),
    where=row_totals != 0,
)

# Percentages: rows are true classes, columns are predicted classes.
df = DataFrame(cm * 100, index=classifier.classes_, columns=classifier.classes_)
df.round(2)

Unnamed: 0,A-Encyclopedia,B-Community,B-Personal,B-Personal-Opinion,D-Sell,D-Thing,Editorials,Forums,How-to,MT/Gen,News,Religious,Reviews,Sports
A-Encyclopedia,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,80.0,0.0,0.0,0.0
B-Community,0.0,0.0,25.0,0.0,37.5,12.5,0.0,0.0,0.0,0.0,25.0,0.0,0.0,0.0
B-Personal,5.0,0.0,70.0,0.0,5.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,0.0
B-Personal-Opinion,0.0,0.0,30.0,30.0,0.0,0.0,0.0,10.0,0.0,0.0,10.0,10.0,10.0,0.0
D-Sell,0.0,0.0,9.09,0.0,54.55,6.06,0.0,0.0,3.03,12.12,12.12,0.0,3.03,0.0
D-Thing,0.0,0.0,0.0,0.0,46.67,13.33,0.0,0.0,0.0,13.33,26.67,0.0,0.0,0.0
Editorials,0.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,25.0,0.0,0.0
Forums,0.0,0.0,44.44,11.11,11.11,0.0,0.0,22.22,0.0,11.11,0.0,0.0,0.0,0.0
How-to,0.0,0.0,16.67,0.0,16.67,8.33,0.0,0.0,33.33,8.33,16.67,0.0,0.0,0.0
MT/Gen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
