In [8]:
import os
import sys
import json
import re

from glob import glob
from logging import error, warning

from common import FINNISH_MAIN_REGISTER, CLASS_ABBREV_MAP


def get_docid_class_map(path='../data/docid-label-map.tsv'):
    """Read the document-id -> class-label mapping from a two-column TSV file.

    Each line must consist of exactly two tab-separated fields:
    ``<docid>\\t<class>``. Trailing whitespace on a line is stripped before
    splitting.

    Args:
        path: Location of the TSV file. Defaults to the project's
            ``docid-label-map.tsv`` so existing zero-argument calls keep
            working.

    Returns:
        dict mapping each docid (str) to its class label (str).

    Raises:
        ValueError: If a line does not have exactly two fields, or if the
            same docid appears more than once. The message includes the file
            name and 1-based line number.
    """
    docid_class_map = {}
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, encoding='utf-8') as f:
        for ln, l in enumerate(f, start=1):
            l = l.rstrip()
            fields = l.split('\t')
            if len(fields) != 2:
                # Report where the problem is instead of failing with an
                # opaque tuple-unpacking error.
                raise ValueError(
                    '{}:{}: expected 2 tab-separated fields, got {}: {!r}'
                    .format(path, ln, len(fields), l)
                )
            docid, class_ = fields
            if docid in docid_class_map:
                raise ValueError(
                    'duplicate docid: {} ({}:{})'.format(docid, path, ln)
                )
            docid_class_map[docid] = class_
    return docid_class_map


def load_biarc_data(dirpath):
    """Load biarc feature files and their class labels from a directory.

    Every ``*.txt`` file directly under ``dirpath`` whose basename matches
    ``<docid>-barcs.`` is considered. The docid is looked up in the
    docid -> class map, the class is translated through
    ``FINNISH_MAIN_REGISTER``, and documents whose translated class is
    ``None`` are skipped. The file content is stripped and split on tabs.

    Args:
        dirpath: Directory containing the ``*.txt`` files.

    Returns:
        Tuple ``(biarcs, classes)`` of parallel lists: ``biarcs[i]`` is the
        list of tab-separated items read from one file and ``classes[i]`` is
        the ``CLASS_ABBREV_MAP`` value for that document's class.

    Raises:
        KeyError: If a docid is missing from the docid -> class map, or a
            class is missing from ``FINNISH_MAIN_REGISTER`` /
            ``CLASS_ABBREV_MAP``.
    """
    docid_class_map = get_docid_class_map()
    biarcs, classes = [], []
    # glob() returns paths in arbitrary order; sort so the example order
    # (and anything downstream that depends on it) is reproducible.
    for fn in sorted(glob('{}/*.txt'.format(dirpath))):
        m = re.match(r'^(.*?)-barcs\.', os.path.basename(fn))
        if m is None:
            # A stray .txt file used to crash with AttributeError on m.group.
            warning('skipping {}: name does not match <docid>-barcs.*'.format(fn))
            continue
        docid = m.group(1)
        class_ = FINNISH_MAIN_REGISTER[docid_class_map[docid]]
        if class_ is None:
            continue    # class doesn't map across languages
        # Only read the file once we know the document is kept.
        with open(fn, encoding='utf-8') as f:
            data = f.read().strip().split('\t')
        biarcs.append(data)
        classes.append(CLASS_ABBREV_MAP[class_])
    return biarcs, classes


# Load the training and development splits as parallel (biarcs, classes) lists.
train_biarcs, train_classes = load_biarc_data('../data/split-biarcs/train/')
devel_biarcs, devel_classes = load_biarc_data('../data/split-biarcs/dev/')

In [9]:
from collections import Counter


# Classes with fewer training examples than this are filtered out.
MIN_EXAMPLES = 25


# Number of training examples per class label.
class_count = Counter(train_classes)

# Labels that occur at least MIN_EXAMPLES times in the training split.
target_class = {
    label for label, count in class_count.items() if count >= MIN_EXAMPLES
}


def filter_by_class(data, classes, targets):
    """Keep only the examples whose class label is in ``targets``.

    ``data`` and ``classes`` are walked in parallel (pairing stops at the
    shorter of the two). Returns a ``(filtered_data, filtered_classes)``
    tuple of new lists in the original order.
    """
    kept = [
        (item, label)
        for item, label in zip(data, classes)
        if label in targets
    ]
    return [item for item, _ in kept], [label for _, label in kept]


# Drop examples whose class is not in target_class from both splits.
# The unfiltered lists are overwritten by the filtered ones.
train_biarcs, train_classes = filter_by_class(train_biarcs, train_classes, target_class)
devel_biarcs, devel_classes = filter_by_class(devel_biarcs, devel_classes, target_class)

In [10]:
from pprint import pprint

def class_counts(classes):
    """Return a ``Counter`` with the number of occurrences of each label."""
    # Feed a generator so every element is counted once, whatever the
    # container type of ``classes`` is.
    return Counter(label for label in classes)


# Show the per-class example counts of the (filtered) training split.
pprint(class_counts(train_classes))

Counter({'Narrative': 422,
         'Info-Persuasion': 269,
         'D-Informational': 218,
         'Opinion': 199,
         'How-to': 86,
         'Discussion': 84})


In [11]:
def biarcs_to_text(biarcs):
    """Join each document's list of biarc strings into one tab-separated string."""
    return list(map('\t'.join, biarcs))


# One tab-separated string per document, as input for the vectorizer.
train_texts = biarcs_to_text(train_biarcs)
devel_texts = biarcs_to_text(devel_biarcs)

In [12]:
from itertools import combinations

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC


def biarc_tokenizer(text):
    """Generate features from a tab-separated string of biarcs.

    For every biarc (tab-separated item) this yields, in order:
      1. the whole biarc string,
      2. every unordered pair of its space-separated parts, joined by '---',
      3. each individual part.
    """
    for item in text.split('\t'):
        yield item
        pieces = item.split(' ')
        yield from ('---'.join(pair) for pair in combinations(pieces, 2))
        yield from pieces


# NOTE(review): 0.5582 is presumably the devel accuracy obtained when using
# the detailed classes instead of the main registers -- confirm.
# detailed classes: 0.5582

# Alternative featurization with raw counts instead of TF-IDF weights:
#vectorizer = CountVectorizer(analyzer='word', tokenizer=biarc_tokenizer, lowercase=False, ngram_range=(1,1))
vectorizer = TfidfVectorizer(analyzer='word', tokenizer=biarc_tokenizer, lowercase=False, ngram_range=(1,1))

# Fit the vectorizer on the training texts only, then reuse it to transform
# the development texts.
train_X = vectorizer.fit_transform(train_texts)
devel_X = vectorizer.transform(devel_texts)

# Fix the seed: LinearSVC's solver uses random numbers, so without it the
# score below may vary between runs.
classifier = LinearSVC(C=1, random_state=0)
classifier.fit(train_X, train_classes)

# Mean accuracy on the development split.
classifier.score(devel_X, devel_classes)

0.5248618784530387

In [13]:
import eli5


# Display the classifier's feature weights (top=100), using the
# vectorizer to map feature indices back to feature names.
eli5.show_weights(classifier, vec=vectorizer, top=100)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5
+1.080,csubj:cop/0---conj/1,,,,
+0.973,advcl/0---obl/1,,,,
+0.948,xcomp:ds/2,,,,
+0.936,ROOT/0---appos/1,,,,
+0.912,nmod:gobj/3,,,,
+0.878,obl/0 appos/1 conj/2,,,,
+0.803,advmod/3---appos/0,,,,
+0.763,flat:name/2---flat:name/2,,,,
+0.760,acl/3,,,,
+0.756,conj/0---flat:name/1,,,,

Weight?,Feature
+1.080,csubj:cop/0---conj/1
+0.973,advcl/0---obl/1
+0.948,xcomp:ds/2
+0.936,ROOT/0---appos/1
+0.912,nmod:gobj/3
+0.878,obl/0 appos/1 conj/2
+0.803,advmod/3---appos/0
+0.763,flat:name/2---flat:name/2
+0.760,acl/3
+0.756,conj/0---flat:name/1

Weight?,Feature
+0.984,obj/0---flat:name/1
+0.945,parataxis/0
+0.909,case/3
+0.760,conj/0
+0.751,nsubj:cop/1
+0.742,parataxis/0---flat:name/1
+0.732,advmod/2
+0.728,xcomp/0---ccomp/1
+0.720,nmod/3---ROOT/0
+0.701,ROOT/0---case/3

Weight?,Feature
+1.190,advcl/2---ROOT/0
+1.122,obj/2
+0.767,advcl/3
+0.753,advcl/2
+0.745,advcl/3---ROOT/0
+0.715,obl/2
+0.713,compound:nn/1
+0.677,ROOT/0---compound:nn/1
+0.671,obj/2---ROOT/0
+0.632,appos/1

Weight?,Feature
+1.439,amod/3
+1.124,ROOT/0---obj/1
+1.040,obj/0 appos/1 flat:name/2
+0.980,conj/2
+0.965,obj/1
+0.928,compound:nn/2---ROOT/0
+0.903,acl/3---ROOT/0
+0.879,amod/3---conj/1
+0.877,ROOT/0---acl:relcl/1
+0.813,amod/3 conj/1 obj/0

Weight?,Feature
+2.034,nummod/3
+1.597,discourse/3
+1.571,discourse/2
+1.466,obl/1
+1.347,flat/1
+1.073,ROOT/0---parataxis/1
+1.043,nsubj/3---parataxis/1
+0.971,parataxis/1
+0.934,ROOT/0 nsubj/3 parataxis/1
+0.932,conj/0

Weight?,Feature
+1.798,nsubj/2
+1.273,nsubj/3
+1.267,nmod:poss/0---conj/1
+1.197,cc/2---ROOT/0
+1.074,obl/3
+1.035,obl/0---case/1
+1.020,conj/2
+1.012,obj/0---conj/1
+0.957,advmod/3---ROOT/0
+0.937,ROOT/0---conj/2


In [15]:
import numpy as np

from sklearn.metrics import confusion_matrix
from pandas import DataFrame


pred_Y = classifier.predict(devel_X)

# Pass the label order explicitly: without it the matrix only covers labels
# that occur in devel_classes/pred_Y, which may not match classifier.classes_
# (used below for the row/column names) in size.
cm = confusion_matrix(devel_classes, pred_Y, labels=classifier.classes_)

# Row-normalize (each row = one true class). Rows with no devel examples
# are left at zero instead of producing NaN from a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# Percentages: rows are true classes, columns are predicted classes.
df = DataFrame(cm * 100, index=classifier.classes_, columns=classifier.classes_)
df.round(2)

Unnamed: 0,D-Informational,Discussion,How-to,Info-Persuasion,Narrative,Opinion
D-Informational,22.58,3.23,0.0,32.26,29.03,12.9
Discussion,0.0,9.09,0.0,0.0,63.64,27.27
How-to,8.33,0.0,16.67,25.0,41.67,8.33
Info-Persuasion,5.41,0.0,0.0,54.05,35.14,5.41
Narrative,8.2,0.0,0.0,6.56,73.77,11.48
Opinion,0.0,0.0,0.0,10.34,20.69,68.97
