In [19]:
import os
import sys
import json
import re

from glob import glob
from logging import error, warning

from common import CLASS_MAP, load_conllu


CLASS_COMMENT_RE = re.compile(r'^#.*?\bregister:(.*)')


def get_class_from_comments(comments):
    class_ = None
    for comment in comments:
        m = CLASS_COMMENT_RE.match(comment)
        if m:
            if class_:
                raise ValueError('duplicate class')
            class_ = m.group(1)
    return CLASS_MAP[class_]


def load_conllu_with_class(fn):
    sentences, class_ = [], None
    for comments, sentence in load_conllu(fn):
        c = get_class_from_comments(comments)
        if c is not None:
            if class_ is not None:
                raise ValueError('duplicate class')
            class_ = c
        sentences.append(sentence)
    if class_ is None:
        raise ValueError('missing class in {}'.format(fn))
    return sentences, class_


def load_parsed_data(dirpath):
    parses, classes = [], []
    for fn in glob('{}/*.conllu'.format(dirpath)):
        sentences, class_ = load_conllu_with_class(fn)
        parses.append(sentences)
        classes.append(class_)
    return parses, classes


train_parses, train_classes = load_parsed_data('../data/split-parsed/train/')
devel_parses, devel_classes = load_parsed_data('../data/split-parsed/dev/')

KeyError: None

In [5]:
from collections import Counter


MIN_EXAMPLES = 25    # filter classes with fewer


class_count = Counter()
for c in train_classes:
    class_count[c] += 1
target_class = set(c for c, v in class_count.items() if v >= MIN_EXAMPLES)


def filter_by_class(parses, classes, targets):
    filtered_parses, filtered_classes = [], []
    for t, c in zip(parses, classes):
        if c in targets:
            filtered_parses.append(t)
            filtered_classes.append(c)
    return filtered_parses, filtered_classes


train_parses, train_classes = filter_by_class(train_parses, train_classes, target_class)
devel_parses, devel_classes = filter_by_class(devel_parses, devel_classes, target_class)

In [9]:
from pprint import pprint

def class_counts(classes):
    counter = Counter()
    for c in classes:
        counter[c] += 1
    return counter


pprint(class_counts(train_classes))

Counter({'Machine-translated / generated texts': 296,
         'Description with intent to sell': 243,
         'Personal blog': 139,
         'Description of a thing': 103,
         'News reports / news blogs': 93,
         'News reports / News blogs': 91,
         'How-to/instructions': 86,
         'Religious blogs/sermons': 75,
         'Personal opinion blogs': 70,
         'Discussion forums': 65,
         'Reviews': 44,
         'Encyclopedia articles': 35,
         'Community blogs': 33,
         'Sports reports': 30,
         'News+Opinion blogs / Editorials': 26})


In [15]:
def words_from_sentences(sentences):
    words = []
    for sentence in sentences:
        for word in sentence:
            words.append(word.deprel)
    return words


def texts_from_parses(parses):
    texts = []
    for sentences in parses:
        texts.append(' '.join(words_from_sentences(sentences)))
    return texts


train_texts = texts_from_parses(train_parses)
devel_texts = texts_from_parses(devel_parses)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC


vectorizer = TfidfVectorizer(analyzer='word', lowercase=False, ngram_range=(1,3))
vectorizer.fit(train_texts)

train_X = vectorizer.transform(train_texts)
devel_X = vectorizer.transform(devel_texts)

classifier = LinearSVC(C=1.0)
classifier.fit(train_X, train_classes)

classifier.score(devel_X, devel_classes)

0.5615763546798029

In [17]:
import eli5


eli5.show_weights(classifier, vec=vectorizer, top=(100,100))

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14
+0.570,conj obj ccomp,,,,,,,,,,,,,
+0.481,obj ccomp,,,,,,,,,,,,,
+0.463,advmod root obj,,,,,,,,,,,,,
+0.458,obj compound nummod,,,,,,,,,,,,,
+0.452,advmod advmod advmod,,,,,,,,,,,,,
+0.429,case cop nmod,,,,,,,,,,,,,
+0.422,ccomp obl obj,,,,,,,,,,,,,
+0.421,ccomp,,,,,,,,,,,,,
+0.420,poss nmod poss,,,,,,,,,,,,,
+0.414,obj ccomp punct,,,,,,,,,,,,,

Weight?,Feature
+0.570,conj obj ccomp
+0.481,obj ccomp
+0.463,advmod root obj
+0.458,obj compound nummod
+0.452,advmod advmod advmod
+0.429,case cop nmod
+0.422,ccomp obl obj
+0.421,ccomp
+0.420,poss nmod poss
+0.414,obj ccomp punct

Weight?,Feature
+0.725,relcl obl cc
+0.719,punct nsubj cop
+0.713,poss obj acl
+0.687,poss root cop
+0.664,obj root
+0.660,obj root nmod
+0.648,obl root obj
+0.638,acl
+0.633,acl nmod punct
+0.632,obj punct obl

Weight?,Feature
+1.195,cop punct nmod
+1.149,cc conj
+0.948,obj advmod
+0.926,conj punct obl
+0.916,root
+0.891,cop amod
+0.872,amod cc conj
+0.846,amod cc
+0.803,parataxis punct advmod
+0.775,gobj nsubj

Weight?,Feature
+0.767,advcl punct advmod
+0.750,nsubj advmod
+0.746,advmod
+0.746,punct advmod
+0.742,det
+0.711,nummod obl case
+0.668,case nsubj cop
+0.619,case nsubj
+0.605,punct root
+0.603,advcl

Weight?,Feature
+1.419,punct punct punct
+1.055,punct punct appos
+1.022,punct punct
+0.932,punct flat name
+0.833,punct flat
+0.711,name punct flat
+0.633,punct obl punct
+0.608,punct
+0.521,punct appos
+0.516,appos punct punct

Weight?,Feature
+0.906,advcl
+0.679,advcl obj
+0.668,obj aux root
+0.648,obl punct obj
+0.647,cc conj obj
+0.647,obl
+0.638,obj aux
+0.591,advmod aux acl
+0.575,advcl obj punct
+0.565,obj

Weight?,Feature
+3.589,obj
+1.612,obj punct
+1.476,conj
+1.283,compound nn
+1.283,nn
+1.218,aux
+1.190,compound
+1.175,punct mark
+1.151,appos
+1.093,obj punct nsubj

Weight?,Feature
+0.901,nummod amod
+0.786,obl appos nmod
+0.730,flat name obl
+0.730,name obl
+0.725,obj obl compound
+0.712,nsubj parataxis obl
+0.690,obl
+0.665,obl flat
+0.643,obl compound
+0.622,flat

Weight?,Feature
+1.075,nsubj parataxis punct
+0.830,parataxis punct
+0.821,obj nmod gobj
+0.816,flat punct
+0.790,compound nummod nmod
+0.787,obl nummod nmod
+0.785,obl obl flat
+0.666,nummod nmod punct
+0.657,root cop csubj
+0.648,root cop advmod

Weight?,Feature
+0.463,aux
+0.412,nmod aux
+0.376,root xcomp punct
+0.364,root
+0.363,poss obl
+0.363,nmod poss obl
+0.361,nsubj root obj
+0.342,nummod conj
+0.340,amod
+0.335,nmod gsubj nmod

Weight?,Feature
+2.013,discourse
+1.416,advmod
+1.072,advmod advmod
+0.980,obl
+0.892,punct discourse
+0.861,obl punct discourse
+0.767,flat flat
+0.760,xcomp
+0.731,advmod conj
+0.662,advmod advmod punct

Weight?,Feature
+0.697,punct cop
+0.674,punct advmod
+0.669,acl punct
+0.647,punct aux
+0.603,aux
+0.597,ccomp advmod
+0.572,nmod punct mark
+0.562,ccomp
+0.562,advmod nsubj
+0.558,advmod acl obj

Weight?,Feature
+1.995,punct
+1.620,nsubj
+1.167,xcomp obl nsubj
+1.114,obl nsubj det
+0.877,punct nsubj
+0.828,nmod poss
+0.828,poss
+0.827,nsubj det nmod
+0.801,advmod xcomp obl
+0.750,punct obj nsubj

Weight?,Feature
+0.803,advcl root xcomp
+0.708,xcomp advmod nmod
+0.705,nmod advmod conj
+0.683,amod
+0.666,advmod det conj
+0.651,cc advmod det
+0.639,advmod nmod poss
+0.590,xcomp
+0.587,advmod amod obl
+0.578,poss punct conj

Weight?,Feature
+1.098,nummod
+0.711,nsubj flat name
+0.703,nsubj flat
+0.701,obl
+0.644,flat name root
+0.644,name root
+0.587,nummod obl
+0.556,nummod obj
+0.534,advmod nummod obj
+0.518,nummod nmod poss


In [18]:
import numpy as np

from sklearn.metrics import confusion_matrix
from pandas import DataFrame


pred_Y = classifier.predict(devel_X)
cm = confusion_matrix(devel_classes, pred_Y)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]    # normalize
df = DataFrame(cm * 100, index=classifier.classes_, columns=classifier.classes_)
df.round(2)

Unnamed: 0,Community blogs,Description of a thing,Description with intent to sell,Discussion forums,Encyclopedia articles,How-to/instructions,Machine-translated / generated texts,News reports / News blogs,News reports / news blogs,News+Opinion blogs / Editorials,Personal blog,Personal opinion blogs,Religious blogs/sermons,Reviews,Sports reports
Community blogs,0.0,20.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,20.0
Description of a thing,0.0,13.33,53.33,0.0,0.0,6.67,6.67,0.0,13.33,0.0,0.0,0.0,6.67,0.0,0.0
Description with intent to sell,0.0,3.03,75.76,0.0,0.0,6.06,3.03,3.03,3.03,0.0,3.03,0.0,3.03,0.0,0.0
Discussion forums,0.0,0.0,11.11,0.0,0.0,0.0,11.11,0.0,0.0,0.0,66.67,11.11,0.0,0.0,0.0
Encyclopedia articles,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0
How-to/instructions,0.0,8.33,33.33,0.0,0.0,8.33,16.67,8.33,0.0,0.0,25.0,0.0,0.0,0.0,0.0
Machine-translated / generated texts,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
News reports / News blogs,0.0,7.69,7.69,0.0,0.0,0.0,7.69,38.46,23.08,0.0,0.0,0.0,15.38,0.0,0.0
News reports / news blogs,0.0,7.69,23.08,0.0,0.0,15.38,7.69,7.69,23.08,0.0,15.38,0.0,0.0,0.0,0.0
News+Opinion blogs / Editorials,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,25.0,0.0,50.0,0.0,0.0
