In [20]:
import os
import sys
import json

from logging import error, warning


# Maps the raw register labels found in the annotation data (the 'register-1'
# field read by load_data) to the short class names used in the rest of the
# notebook. Labels missing from this table are kept as-is by load_data, which
# logs a warning for them.
CLASS_MAP = {
    'Machine-translated / generated texts': 'MT/Gen',
    'Description with intent to sell': 'D-Sell',
    'Personal blog': 'B-Personal',
    'Description of a thing': 'D-Thing',
    'News reports / News blogs': 'News',
    'News reports / news blogs': 'News',    # typo fix: lowercase variant of the label above
    'How-to/instructions': 'How-to',
    'Religious blogs/sermons': 'Religious',
    'Personal opinion blogs': 'B-Personal-Opinion',
    'Discussion forums': 'Forums',
    'Reviews': 'Reviews',
    'Encyclopedia articles': 'A-Encyclopedia',
    'Community blogs': 'B-Community',
    'Community blog': 'B-Community',    # typo fix: singular variant of the label above
    'Sports reports': 'Sports',
    'News+Opinion blogs / Editorials': 'Editorials',
    'Description of a person': 'D-Person',
    'Information blogs': 'B-Information',
    'Online article': 'A-Online',
    'Research articles': 'A-Research',
    'Historical article': 'A-Historical',
    'Question-Answer forums': 'QA-forums',
    'Advice': 'Advice',
    'Travel blog': 'B-Travel',
    'Narrative general': 'Narrative',
    'Interactive discussion general': 'Discussion',
    'Fiction': 'Fiction',
    'FAQs': 'FAQs',
    'Legal terms / conditions': 'Legal',
    'Informational Description general': 'D-Informational',
    'Course materials': 'Course',
    'Interviews': 'Interviews',
    'Report': 'Report',
    'Formal speeches': 'Speeches',
    'Recipes': 'Recipes',
    'Informational Persuasion general': 'Info-Persuasion',
}


def load_data(fn, class_map=None):
    """Load document texts and their register classes from a JSON file.

    The file must contain a JSON object mapping document ids to per-document
    dicts that carry a 'text' entry and a 'register-1' entry.

    Args:
        fn: Path of the JSON annotation file.
        class_map: Optional mapping from raw 'register-1' labels to short
            class names. Defaults to the module-level CLASS_MAP.

    Returns:
        A (texts, classes) tuple of two equal-length lists in file order.
        Documents lacking 'text' or 'register-1' are skipped with a warning.
        Labels that have no entry in the mapping are kept unchanged, also
        with a warning.
    """
    if class_map is None:
        class_map = CLASS_MAP
    # JSON is UTF-8 encoded; do not depend on the platform's default encoding.
    with open(fn, encoding='utf-8') as f:
        data = json.load(f)
    texts, classes = [], []
    for doc_id, doc_data in data.items():
        try:
            text = doc_data['text']
        except KeyError:
            warning('no text for {}, skipping'.format(doc_id))
            continue
        try:
            class_ = doc_data['register-1']
        except KeyError:
            # An unlabeled document cannot be used; skip it instead of
            # aborting the whole load.
            warning('no register-1 for {}, skipping'.format(doc_id))
            continue
        try:
            class_ = class_map[class_]
        except KeyError:
            # Keep the raw label so the document is not silently lost.
            warning('no mapping for class {}'.format(class_))
        texts.append(text)
        classes.append(class_)
    return texts, classes


# Read the training and development annotation files
(train_texts, train_classes), (devel_texts, devel_classes) = (
    load_data('../data/annotations-train.json'),
    load_data('../data/annotations-dev.json'),
)



In [24]:
from collections import Counter


# Classes with fewer training examples than this are filtered out
MIN_EXAMPLES = 25


# Number of training documents per class
class_count = Counter(train_classes)
# Classes frequent enough in the training data to be kept as targets
target_class = {
    label for label, count in class_count.items() if count >= MIN_EXAMPLES
}


def filter_by_class(texts, classes, targets):
    """Keep only the (text, class) pairs whose class is in `targets`.

    Returns a (texts, classes) tuple of two parallel lists that preserve the
    input order.
    """
    kept = [
        (text, label)
        for text, label in zip(texts, classes)
        if label in targets
    ]
    kept_texts = [text for text, _ in kept]
    kept_labels = [label for _, label in kept]
    return kept_texts, kept_labels


# Restrict both splits to the classes selected from the training-set counts
(train_texts, train_classes), (devel_texts, devel_classes) = (
    filter_by_class(train_texts, train_classes, target_class),
    filter_by_class(devel_texts, devel_classes, target_class),
)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC


# Word-level TF-IDF over uni-, bi- and trigrams; case is preserved
vectorizer = TfidfVectorizer(analyzer='word', lowercase=False, ngram_range=(1,3))

# fit_transform learns the vocabulary and vectorizes the training texts in one
# pass instead of tokenizing them twice (fit, then transform)
train_X = vectorizer.fit_transform(train_texts)
devel_X = vectorizer.transform(devel_texts)

# Fixed random_state so the reported score is repeatable across runs
classifier = LinearSVC(C=1.0, random_state=0)
classifier.fit(train_X, train_classes)

# Mean accuracy on the development set (displayed as the cell output)
classifier.score(devel_X, devel_classes)

0.6116504854368932

In [26]:
import eli5


# Display the highest-weighted features of the linear classifier for each class,
# using the vectorizer to turn feature indices back into n-grams.
# NOTE(review): top=(100,100) presumably means up to 100 positive and 100
# negative features per class — confirm against the eli5 documentation.
eli5.show_weights(classifier, vec=vectorizer, top=(100,100))

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13
+0.772,lt,,,,,,,,,,,,
+0.645,gt,,,,,,,,,,,,
+0.529,lt ref,,,,,,,,,,,,
+0.529,ref,,,,,,,,,,,,
+0.516,Kazanin,,,,,,,,,,,,
+0.428,Luokka,,,,,,,,,,,,
+0.395,data rte,,,,,,,,,,,,
+0.395,rte,,,,,,,,,,,,
+0.382,Pietarin,,,,,,,,,,,,
+0.381,kirkko,,,,,,,,,,,,

Weight?,Feature
+0.772,lt
+0.645,gt
+0.529,lt ref
+0.529,ref
+0.516,Kazanin
+0.428,Luokka
+0.395,data rte
+0.395,rte
+0.382,Pietarin
+0.381,kirkko

Weight?,Feature
+0.673,Perttu
+0.365,Vilkkilä
+0.348,Nokia
+0.343,WordPress
+0.341,offline
+0.337,Shatillan
+0.302,Haapasaaren
+0.268,Salla
+0.268,My Genius
+0.253,Sallan

Weight?,Feature
+1.444,oli
+1.023,ihan
+0.968,kun
+0.869,kyllä
+0.859,en
+0.803,mun
+0.799,En
+0.793,mutta
+0.744,vähän
+0.732,olen

Weight?,Feature
+0.736,ei
+0.547,että
+0.523,Venäjä
+0.409,on
+0.399,OLL
+0.378,SDP
+0.365,kuin
+0.347,Pirkkala
+0.333,ole
+0.312,Jokipuiston

Weight?,Feature
+0.840,sekä
+0.812,LR
+0.714,Oy
+0.588,Hotel
+0.572,VIP
+0.559,asiakkaan
+0.558,lennot
+0.539,Ladbrokes
+0.517,Online
+0.490,ja

Weight?,Feature
+0.592,Healing
+0.493,Tuoteväylä
+0.450,sekä
+0.429,Yhdistyksen
+0.421,Työpajan
+0.372,yhdistyksen
+0.364,Rex
+0.351,Haifan
+0.345,EY
+0.336,esimerkiksi

Weight?,Feature
+0.487,Somalian
+0.438,DVD
+0.384,TE
+0.265,kirjallisuuden
+0.256,TE keskus
+0.247,työpaikkoja
+0.246,Oy hallituksen
+0.241,TP
+0.238,Harjamäki
+0.232,Kiven

Weight?,Feature
+0.803,sitten
+0.633,niin
+0.621,Melkein vuotta sitten
+0.621,Melkein vuotta
+0.614,en
+0.570,klo
+0.533,kun
+0.497,Papu
+0.491,Melkein
+0.479,Kajo

Weight?,Feature
+0.586,Jos
+0.526,voi
+0.525,tai
+0.506,Mikäli
+0.451,Jäsenen
+0.447,Santa
+0.434,lähettää
+0.427,au
+0.412,tavaran
+0.408,MX

Weight?,Feature
+1.984,Jatkaa
+1.984,Jatkaa lukemista
+1.942,lukemista
+1.496,voit
+1.105,jotka
+1.093,tehdä
+1.052,online
+0.915,dollaria
+0.910,ja
+0.891,Casino

Weight?,Feature
+1.071,Falun
+0.796,Suomen
+0.704,000
+0.687,2011
+0.655,sanoo
+0.645,Oy
+0.600,kertoo
+0.599,Kouvolan
+0.584,mukaan
+0.582,Etelä

Weight?,Feature
+2.460,Jumalan
+1.162,Jeesus
+1.132,Jeesuksen
+1.073,Jumala
+1.033,hän
+0.673,Kristuksen
+0.639,hänen
+0.604,Hän
+0.548,Herran
+0.505,Kristus

Weight?,Feature
+0.548,tekijänoikeudet
+0.415,kuuluvat
+0.411,Girl
+0.337,hahmojen
+0.336,Kanerva
+0.336,Bjork
+0.336,lasit
+0.327,ei
+0.310,mutta
+0.299,Brant Bjork

Weight?,Feature
+0.500,SM
+0.470,joukkue
+0.435,joukkueen
+0.398,ja kapteeni
+0.383,pelannut
+0.351,Mika
+0.349,Vestan
+0.344,HIFK
+0.333,kapteeni
+0.331,on pelannut


In [27]:
import numpy as np

from sklearn.metrics import confusion_matrix
from pandas import DataFrame


pred_Y = classifier.predict(devel_X)
# Pin the label order to classifier.classes_ so the matrix always lines up with
# the DataFrame index/columns, even when a class is absent from both the
# development labels and the predictions
cm = confusion_matrix(devel_classes, pred_Y, labels=classifier.classes_)
# Normalize each row (true class) to sum to 1; a row with no true examples is
# all zeros, so divide it by 1 to keep it zero instead of producing 0/0 = NaN
row_totals = cm.sum(axis=1)[:, np.newaxis]
row_totals[row_totals == 0] = 1
cm = cm.astype('float') / row_totals
# Percentages: rows are true classes, columns are predicted classes
df = DataFrame(cm * 100, index=classifier.classes_, columns=classifier.classes_)
df.round(2)

Unnamed: 0,A-Encyclopedia,B-Community,B-Personal,B-Personal-Opinion,D-Sell,D-Thing,Editorials,Forums,How-to,MT/Gen,News,Religious,Reviews,Sports
A-Encyclopedia,20.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,20.0,40.0,0.0,0.0
B-Community,0.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,37.5,25.0,0.0,0.0,0.0
B-Personal,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
B-Personal-Opinion,0.0,0.0,20.0,50.0,0.0,0.0,0.0,0.0,0.0,20.0,10.0,0.0,0.0,0.0
D-Sell,0.0,0.0,0.0,3.03,81.82,0.0,0.0,0.0,0.0,9.09,6.06,0.0,0.0,0.0
D-Thing,0.0,0.0,0.0,0.0,60.0,6.67,0.0,0.0,0.0,20.0,6.67,6.67,0.0,0.0
Editorials,0.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,25.0,0.0,0.0,0.0
Forums,0.0,0.0,77.78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.11,11.11,0.0,0.0
How-to,0.0,0.0,8.33,0.0,16.67,0.0,0.0,0.0,25.0,41.67,8.33,0.0,0.0,0.0
MT/Gen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
