In [1]:
import os
import sys
import json
import re

from glob import glob
from logging import error, warning

from common import CLASS_ABBREV_MAP, FINNISH_MAIN_REGISTER, load_conllu


# Matches a comment line (one starting with '#') that contains the word
# "register:" and captures everything after the colon up to the end of the line.
CLASS_COMMENT_RE = re.compile(r'^#.*?\bregister:(.*)')


def get_class_from_comments(comments):
    """Return the register annotation found among a group of comment lines.

    Args:
        comments: iterable of comment strings; each one is matched against
            CLASS_COMMENT_RE.

    Returns:
        The text captured after "register:" (returned as-is, not stripped),
        or None if no comment carries a register annotation.

    Raises:
        ValueError: if more than one comment carries a register annotation.
    """
    class_ = None
    for comment in comments:
        m = CLASS_COMMENT_RE.match(comment)
        if m is None:
            continue
        # Compare against None rather than relying on truthiness: an empty
        # annotation ("# register:") is still an annotation and must not be
        # silently overwritten by a later one.
        if class_ is not None:
            raise ValueError('duplicate class')
        class_ = m.group(1)
    return class_


def load_conllu_with_class(fn):
    """Read one CoNLL-U file together with its single register annotation.

    Every (comments, sentence) pair yielded by load_conllu(fn) is inspected:
    the sentence is collected and the comments are searched for a register
    annotation via get_class_from_comments.

    Args:
        fn: path of the file, passed straight to load_conllu.

    Returns:
        A (sentences, class) tuple. The class is the raw annotation looked up
        first in FINNISH_MAIN_REGISTER and then in CLASS_ABBREV_MAP; lookup
        errors from those two mappings propagate unchanged.

    Raises:
        ValueError: if the file carries more than one register annotation,
            or none at all.
    """
    collected = []
    register = None
    for comments, sentence in load_conllu(fn):
        found = get_class_from_comments(comments)
        # A second annotated sentence in the same file is an error.
        if found is not None and register is not None:
            raise ValueError('duplicate class')
        if found is not None:
            register = found
        collected.append(sentence)

    if register is None:
        raise ValueError('missing class in {}'.format(fn))

    main_register = FINNISH_MAIN_REGISTER[register]
    return collected, CLASS_ABBREV_MAP[main_register]


def load_parsed_data(dirpath):
    """Load every .conllu file in a directory along with its class label.

    Args:
        dirpath: directory that is searched (non-recursively) for files
            matching '*.conllu'.

    Returns:
        A (parses, classes) tuple of parallel lists: parses[i] is the list of
        sentences of the i-th file and classes[i] is its class label. Files
        whose class comes back as None are left out of both lists.
    """
    parses, classes = [], []
    # glob() returns matches in file-system-dependent order; sorting makes the
    # document order (and everything derived from it) reproducible across
    # machines and runs.
    for fn in sorted(glob('{}/*.conllu'.format(dirpath))):
        sentences, class_ = load_conllu_with_class(fn)
        if class_ is None:
            continue    # class doesn't map across languages
        parses.append(sentences)
        classes.append(class_)
    return parses, classes


# Load the per-document sentence lists and their class labels for the training
# and development splits from the corresponding directories of .conllu files.
train_parses, train_classes = load_parsed_data('../data/split-parsed/train/')
devel_parses, devel_classes = load_parsed_data('../data/split-parsed/dev/')

In [2]:
from collections import Counter


# Minimum number of training documents a class needs in order to be kept;
# classes with fewer are filtered out.
MIN_EXAMPLES = 25


# Counter(iterable) tallies the labels directly, so no manual counting loop
# (and no leaked loop variable) is needed.
class_count = Counter(train_classes)
# Classes that are frequent enough in the training data to be used as targets.
target_class = {
    label for label, count in class_count.items() if count >= MIN_EXAMPLES
}


def filter_by_class(parses, classes, targets):
    """Drop every (parse, class) pair whose class is not in `targets`.

    Args:
        parses: sequence of items, paired positionally with `classes`.
        classes: sequence of class labels, one per item in `parses`.
        targets: container of class labels to keep (membership-tested
            with `in`).

    Returns:
        A (filtered_parses, filtered_classes) tuple of two new parallel lists
        holding the surviving pairs in their original order.
    """
    kept = [
        (parse, label)
        for parse, label in zip(parses, classes)
        if label in targets
    ]
    kept_parses = [parse for parse, _ in kept]
    kept_labels = [label for _, label in kept]
    return kept_parses, kept_labels


# Restrict both splits to the classes in target_class (those with at least
# MIN_EXAMPLES training documents). Note that this rebinds the *_parses and
# *_classes names, so the unfiltered data is no longer reachable afterwards.
train_parses, train_classes = filter_by_class(train_parses, train_classes, target_class)
devel_parses, devel_classes = filter_by_class(devel_parses, devel_classes, target_class)

In [3]:
from pprint import pprint

def class_counts(classes):
    """Count how often each class label occurs.

    Args:
        classes: iterable of hashable class labels (e.g. a list of strings).

    Returns:
        A collections.Counter mapping each label to its number of occurrences.
    """
    # Counter tallies an iterable directly; no explicit loop is required.
    return Counter(classes)


# Show how many training documents each class has after the frequency filter.
pprint(class_counts(train_classes))

Counter({'Narrative': 422,
         'Info-Persuasion': 269,
         'D-Informational': 218,
         'Opinion': 199,
         'How-to': 86,
         'Discussion': 84})


In [16]:
def words_from_sentences(sentences):
    """Flatten a sequence of sentences into one list of word forms.

    Args:
        sentences: iterable of sentences, each an iterable of word objects
            exposing a `form` attribute.

    Returns:
        A list with the `form` of every word, in document order.
    """
    return [token.form for sent in sentences for token in sent]


def texts_from_parses(parses):
    """Turn each parsed document into a single space-separated string.

    Args:
        parses: iterable of documents, each being the sentence collection
            accepted by words_from_sentences.

    Returns:
        A list with one string per document: its word forms joined by a
        single space.
    """
    return [' '.join(words_from_sentences(document)) for document in parses]


# Flatten every parsed document of both splits into one space-joined string of
# word forms, the plain-text input used for feature extraction.
train_texts = texts_from_parses(train_parses)
devel_texts = texts_from_parses(devel_parses)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC


# Fixed seed for the LinearSVC solver so that the trained model, and therefore
# the score and weights shown below, are the same on every run.
RANDOM_SEED = 0

# Word-level TF-IDF features over case-sensitive 1- to 3-grams. The vectorizer
# is fitted on the training texts only, so the development texts are projected
# onto the training feature space.
vectorizer = TfidfVectorizer(analyzer='word', lowercase=False, ngram_range=(1,3))
vectorizer.fit(train_texts)

train_X = vectorizer.transform(train_texts)
devel_X = vectorizer.transform(devel_texts)

classifier = LinearSVC(C=1.0, random_state=RANDOM_SEED)
classifier.fit(train_X, train_classes)

# Score on the development set; kept as the last expression so that it is
# displayed as the cell output.
classifier.score(devel_X, devel_classes)

0.5966850828729282

In [18]:
import eli5


# Display the weights the fitted classifier assigns to individual features.
# NOTE(review): vec=vectorizer is presumably what lets eli5 label features with
# their n-gram text, and top=(100,100) presumably limits the listing to 100
# positive and 100 negative features per class — confirm against the eli5 docs.
eli5.show_weights(classifier, vec=vectorizer, top=(100,100))

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5
+0.828,tai,,,,
+0.581,McCartney,,,,
+0.551,lt,,,,
+0.542,sekä,,,,
+0.533,Healing,,,,
+0.496,2000,,,,
+0.462,eri,,,,
+0.452,lt ref,,,,
+0.452,ref,,,,
+0.452,Kazanin,,,,

Weight?,Feature
+0.828,tai
+0.581,McCartney
+0.551,lt
+0.542,sekä
+0.533,Healing
+0.496,2000
+0.462,eri
+0.452,lt ref
+0.452,ref
+0.452,Kazanin

Weight?,Feature
+0.904,sitten
+0.670,niin
+0.663,jos
+0.597,Melkein vuotta
+0.597,Melkein vuotta sitten
+0.575,en
+0.574,klo
+0.569,Vastaus
+0.532,kun
+0.491,Papu

Weight?,Feature
+0.725,Jos
+0.562,voi
+0.545,tai
+0.512,voit
+0.469,Santa
+0.450,Mikäli
+0.448,Jäsenen
+0.438,lähettää
+0.424,au
+0.406,olet

Weight?,Feature
+1.049,ja
+0.907,voit
+0.792,LR
+0.789,DVD
+0.764,Oy
+0.722,tarjoaa
+0.646,avulla
+0.632,sekä
+0.620,Hotel
+0.605,VIP

Weight?,Feature
+2.179,oli
+0.955,nyt
+0.865,kun
+0.769,Falun
+0.729,jo
+0.700,sai
+0.672,vielä
+0.657,vähän
+0.601,ihan
+0.590,mun

Weight?,Feature
+1.707,Jumalan
+1.388,ei
+0.920,Jeesus
+0.889,Jeesuksen
+0.852,että
+0.738,kuin
+0.703,ole
+0.684,Jumala
+0.659,on
+0.642,ja


In [20]:
import numpy as np

from sklearn.metrics import confusion_matrix
from pandas import DataFrame


pred_Y = classifier.predict(devel_X)
# Pin the row/column order to classifier.classes_. Without `labels`, the matrix
# only covers labels that occur in the gold or predicted development labels,
# which would no longer match the DataFrame index/columns below if some class
# is absent from both.
cm = confusion_matrix(devel_classes, pred_Y, labels=classifier.classes_)
# Normalize each row (true class) to fractions of that class's examples.
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]    # normalize
# Rows are true classes, columns are predicted classes, values are percentages.
df = DataFrame(cm * 100, index=classifier.classes_, columns=classifier.classes_)
df.round(2)

Unnamed: 0,D-Informational,Discussion,How-to,Info-Persuasion,Narrative,Opinion
D-Informational,16.13,0.0,0.0,29.03,38.71,16.13
Discussion,0.0,0.0,0.0,0.0,81.82,18.18
How-to,25.0,0.0,16.67,25.0,25.0,8.33
Info-Persuasion,2.7,0.0,0.0,72.97,16.22,8.11
Narrative,0.0,0.0,0.0,6.56,81.97,11.48
Opinion,0.0,0.0,0.0,3.45,13.79,82.76
