In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import codecs, collections, csv, pymorphy2, re

from time import gmtime, strftime, time
from bs4 import BeautifulSoup
from scipy.spatial import distance
from sklearn import linear_model, metrics, model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
class lib:
    def process_titles():
        data = {}
        with open('docs_titles.tsv') as f:
            for i, line in enumerate(f):
                if i == 0:
                    continue
                content = line.strip().split('\t', 1)
                doc_id = int(content[0])
                if len(content) == 1:
                    title = ''
                else:
                    title = content[1]
                data[doc_id] = title
        return pd.Series(data)
    
    train_data = pd.read_csv('train_groups.csv')
    test_data = pd.read_csv('test_groups.csv')
    titles = process_titles()
    g_arange = {
        'train': np.arange(1, 130),
        'test': np.arange(130, 310),
    }
    
    log_path = 'data/'
    features_path = lambda group_id: 'data/f_{}.npy'.format(group_id)
    target_path = lambda group_id: 'data/t_{}.npy'.format(group_id)
    lxml_str = lambda lxml: ''.join(map(lambda x: ' ' + x.text, lxml.find_all(re.compile('^h[1-6]$'))))

In [23]:
class consts:
    max_features = 25
    title_weight = 5
    
    function_words = {'INTJ', 'PRCL', 'CONJ', 'PREP'}
    morph = pymorphy2.MorphAnalyzer()
    separators = [';', ':', '-', '_', '|', ',',
                  '.', '!', '?', '«', '»', '"',
                  '(', ')', '[', ']', '–', '”',
                  '*', '�', '^', '=', '©', '“',
                  '’', '+', '…', '&', '—', '$',
                  '/', '→', '←', '{', '}', '`',
                  '°', '@', '#',
                  '\n', '\t', '\r', '\'', '\\']


In [24]:
def split(text, separators = consts.separators):
    text = text.lower()
    for sep in separators:
        text = text.replace(sep, ' ')
    return text.split()

def normalizer(text):
    text = split(text)
    text = list(filter(lambda x: consts.morph.parse(x)[0].tag.POS not in consts.function_words, text))
    text = map(lambda x: consts.morph.parse(x)[0].normal_form, text)
    text = map(lambda x: x.replace('ё', 'е'), text)
    text = list(filter(lambda x: not re.search('\d+', x) and len(x) > 1, text))
    return ' '.join(text)

def doc_handler(doc_id):
    path = 'content/{}.dat'.format(doc_id)
    with codecs.open(path, 'r', 'utf-8') as f:
        url = f.readline().strip()
        text = normalizer(lib.lxml_str(BeautifulSoup(f, 'lxml')) + consts.title_weight * (' ' + lib.titles[doc_id]))
    return text

In [29]:
def logger(func):
    def wrapper(mode):
        timer = time()
        result = func(mode)
        with open(lib.log_path + 'report.logs', 'w') as f:
            f.write('mode: {}\n'.format(mode))
            f.write('total time: {}\n'.format(strftime('%H:%M:%S', gmtime(np.round(time() - timer)))))
            f.write('max_features = {}\n'.format(consts.max_features))
            f.write('title_weight = {}\n'.format(consts.title_weight))
        return result
    return wrapper

def export_data(mode, group_id):
    if mode == 'train':
        docs = [doc_handler(doc) for doc in 
                         lib.train_data[lib.train_data.group_id == group_id].doc_id]
        data = TfidfVectorizer().fit_transform(docs).todense()
        
    elif mode == 'test':
        docs = [doc_handler(doc) for doc in 
                         lib.test_data[lib.test_data.group_id == group_id].doc_id]
        data = TfidfVectorizer().fit_transform(docs).todense()
        
    dist = distance.pdist(data, metric = 'cosine')
    dist[np.where(np.isnan(dist))[0]] = 1.0
    dist = distance.squareform(dist)
    
    np.save(lib.features_path(group_id), np.sort(dist)[:, 1:1 + consts.max_features:])
    
    if mode == 'train':
        targets = np.array(lib.train_data[lib.train_data.group_id == group_id].target)
        np.save(lib.target_path(group_id), targets)

In [33]:
@logger
def process_data(mode):
    if mode == 'train':
        arange = lib.g_arange['train']
        for group_id in arange:
            export_data(mode, group_id)
            
    elif mode == 'test':
        arange = lib.g_arange['test']
        for group_id in arange:
            export_data(mode, group_id)
            
    elif mode == 'all':
        process_data('train')
        process_data('test')
        
    else:
        raise ValueError('mode matches neither \'train\' nor \'test\' nor \'all\'')

In [42]:
@logger
def collect_data(mode):
    if mode == 'train':
        arange = lib.g_arange['train']
        targets = np.load(lib.target_path(arange[0]))
        features = np.load(lib.features_path(arange[0]))
        
        for group_id in arange[1::]:
            group_targets = np.load(lib.target_path(group_id))
            group_features = np.load(lib.features_path(group_id))

            targets = np.concatenate((targets, group_targets))
            features = np.concatenate((features, group_features))
            
        return features, targets
    
    elif mode == 'test':
        arange = lib.g_arange['test']
        features = np.load(lib.features_path(arange[0]))
        
        for group_id in arange[1::]:
            group_features = np.load(lib.features_path(group_id))
            features = np.concatenate((features, group_features))
            
        return features
    
    elif mode == 'all':
        train_data = collect_data('train')
        test_data = collect_data('test')
        return train_data[0], train_data[1], test_data
    
    else:
        raise ValueError('mode matches neither \'train\' nor \'test\' nor \'all\'')

In [35]:
%time process_data('test')

CPU times: user 27min 7s, sys: 21.6 s, total: 27min 29s
Wall time: 28min 24s
cat: /data/report.logs: No such file or directory


In [43]:
!cat data/report.logs
%time X_train, y_train, X_test = collect_data('all')

mode: test
total time: 00:28:25
max_features = 25
title_weight = 5CPU times: user 432 ms, sys: 314 ms, total: 746 ms
Wall time: 1.03 s


In [44]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
data_train, data_test, label_train, label_test = model_selection.train_test_split(X_train,
                                                                                  y_train,
                                                                                  test_size=0.1,
                                                                                  stratify=y_train)
data_train = scaler.transform(data_train)
data_test = scaler.transform(data_test)

In [54]:
model = linear_model.SGDClassifier(loss='log',
                                   penalty='l1',
                                   alpha=0.00002,
                                   l1_ratio=0.01,
                                   max_iter=10000,
                                   n_jobs=-1,
                                   n_iter_no_change=1,
                                   class_weight='balanced')
model.fit(data_train, label_train)
metrics.f1_score(label_test, model.predict(data_test))

0.7263157894736841

In [56]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_test)
X_test = scaler.transform(X_test)
pred = model.predict(X_test)
with open('submission.csv', 'w') as f:
    writer = csv.DictWriter(f, fieldnames=['pair_id', 'target'])
    writer.writeheader()
    i = 11691
    for elem in pred:
        writer.writerow({'pair_id': str(i), 'target': str(elem)})
        i += 1