In [1]:
from __future__ import division, print_function

import json
import logging
import os
import re

import fasttext
import matplotlib.pyplot as plt
import numpy as np
import pymorphy2
import scipy.sparse
from pandas.io.json import json_normalize
from scipy.interpolate import interp2d
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from tqdm import tqdm_notebook

from fasttext_loader import FasttextLoader
from preprocessing_tools import inds_texts_labels, MultiLabelEncoder, MultilabelStratifiedKFold, DictTransformer
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Configure the root logger once for the whole notebook: INFO and above,
# each record prefixed with its timestamp and level name.
log_format = '%(asctime)s %(levelname)s %(message)s'
logging.basicConfig(format=log_format, level=logging.INFO)

In [3]:
# Seed NumPy's global random generator with a fixed value so that anything
# drawing from np.random produces the same numbers on every run.
seed = 7
np.random.seed(seed)

### Data loading

In [4]:
# Path to the fastText model file. Set the FASTTEXT_MODEL_PATH environment
# variable to point at your own copy; the fallback is the original
# machine-specific location, kept so existing setups continue to work.
fasttext_model_path = os.environ.get(
    "FASTTEXT_MODEL_PATH", "/Users/Ruslan/Downloads/fasttext.bin"
)
w2v_loader = FasttextLoader()
w2v_loader.load(fasttext_model_path)

In [5]:
# Read the labelled corpus. The first returned element is discarded here;
# the remaining two are the inputs `X` and the not-yet-encoded labels.
_, X, raw_labels = inds_texts_labels("../data/all_xeno_labeled.json", w2v_loader)

# Encode the raw labels with MultiLabelEncoder; `mlb` is kept around so the
# fitted encoder stays available after this cell.
mlb = MultiLabelEncoder()
y = mlb.fit_transform(raw_labels)

Num of data < min_len: 0
Num of multilabel data: 457
Num of not_xeno or unknown: 368
Num of duplicates: 0


In [6]:
# Dimensions of the encoded label matrix (rows, label columns).
np.shape(y)

(3113, 7)

In [7]:
# Load the mapping stored in nations.json (presumably label index -> nation
# name, judging by the variable name; verify against the file).
# The file is opened in binary mode so that `json` decodes the bytes itself
# (UTF-8/16/32) instead of relying on the platform's default text encoding,
# which would mis-decode non-ASCII values on non-UTF-8 locales.
with open('nations.json', 'rb') as fp:
    nations_by_str_key = json.load(fp)

# JSON object keys are always strings; turn them back into integers.
label_to_nation = {int(key): nation for key, nation in nations_by_str_key.items()}

### SVM

In [8]:
class MultilabelLinearSVC(BaseEstimator, ClassifierMixin):
    """Multilabel classifier made of one independent ``LinearSVC`` per label.

    ``fit`` trains a separate binary ``LinearSVC`` on every column of the
    label matrix; ``predict`` stacks the per-column predictions back into a
    matrix with one column per label.

    Attributes
    ----------
    svms : dict
        Maps a label column index to the ``LinearSVC`` fitted on that column.
        Empty until ``fit`` has been called.
    """

    def __init__(self):
        self.svms = {}

    def fit(self, X, y):
        """Fit one ``LinearSVC`` per column of ``y``.

        Parameters
        ----------
        X : feature matrix accepted by ``LinearSVC.fit``.
        y : 2-D array indexed as ``y[:, i]``; column ``i`` holds the targets
            for label ``i``.

        Returns
        -------
        self
        """
        # Build into a fresh dict rather than updating self.svms in place:
        # refitting on a label matrix with fewer columns must not leave
        # classifiers from a previous fit behind, because predict() iterates
        # over len(self.svms).
        fitted = {}
        for i in range(len(y.T)):
            fitted[i] = LinearSVC()
            fitted[i].fit(X, y[:, i])
        self.svms = fitted
        return self

    def predict(self, X, y=None):
        """Predict every label for every sample.

        Parameters
        ----------
        X : feature matrix accepted by ``LinearSVC.predict``.
        y : ignored; accepted only for signature compatibility.

        Returns
        -------
        numpy.ndarray
            Row ``j`` holds the per-label predictions for sample ``j``;
            column ``i`` comes from ``self.svms[i]``.
        """
        # Each per-label prediction is a 1-D vector over samples; stacking
        # gives (labels, samples), hence the transpose.
        return np.array([self.svms[i].predict(X) for i in range(len(self.svms))]).T

    def score(self, X, y):
        """Return the samples-averaged F1 score of ``predict(X)`` against ``y``."""
        return f1_score(y, self.predict(X), average="samples")

In [9]:
# Two-stage model: the DictTransformer output feeds the per-label linear SVMs.
steps = [
    ('vect', DictTransformer("cleaned.csv")),
    ('clf', MultilabelLinearSVC()),
]
pipeline = Pipeline(steps)

# Cross-validate with the project's multilabel-aware splitter (the second
# argument is presumably the number of folds - confirm in preprocessing_tools).
# No `scoring` is passed, so the estimator's own score() method is used.
folds = MultilabelStratifiedKFold(y, 3)
scores = cross_val_score(pipeline, X, y, cv=folds)

2018-04-24 15:33:56,043 INFO Loading dictionaries from /Users/ruslan/miniconda3/lib/python3.6/site-packages/pymorphy2_dicts/data
2018-04-24 15:33:56,125 INFO format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
2018-04-24 15:33:56,195 INFO Loading dictionaries from /Users/ruslan/miniconda3/lib/python3.6/site-packages/pymorphy2_dicts/data
2018-04-24 15:33:56,256 INFO format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


Fold distributions are
[[170.  26. 141. 538. 164.  88.  59.]
 [174.  26. 145. 552. 168.  91.  60.]
 [174.  26. 144. 550. 168.  90.  60.]]


  'precision', 'predicted', average, warn_for)
2018-04-24 15:34:15,948 INFO Loading dictionaries from /Users/ruslan/miniconda3/lib/python3.6/site-packages/pymorphy2_dicts/data
2018-04-24 15:34:16,014 INFO format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
  'precision', 'predicted', average, warn_for)
2018-04-24 15:34:36,237 INFO Loading dictionaries from /Users/ruslan/miniconda3/lib/python3.6/site-packages/pymorphy2_dicts/data
2018-04-24 15:34:36,298 INFO format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
  'precision', 'predicted', average, warn_for)


In [10]:
# Average cross-validation score over all folds.
np.mean(scores)

0.7880045045345367

In [11]:
# Spread (population standard deviation) of the per-fold scores.
np.std(scores)

0.006026617347027513

In [12]:
# Spot-check the dictionary-based transformer on a single hand-written
# sentence; the resulting feature row is displayed as the cell output.
dict_featurizer = DictTransformer("cleaned.csv")
dict_featurizer.transform(["хохол устал за день"])

2018-04-24 15:34:55,922 INFO Loading dictionaries from /Users/ruslan/miniconda3/lib/python3.6/site-packages/pymorphy2_dicts/data
2018-04-24 15:34:56,001 INFO format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


array([[0.  , 0.  , 0.  , 0.25, 0.  , 0.  , 0.  ]])