In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import codecs, collections, csv, pymorphy2, re

from time import gmtime, strftime, time
from bs4 import BeautifulSoup
from scipy.spatial import distance
from sklearn import linear_model, metrics, model_selection, preprocessing, svm, ensemble, neighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from functools import wraps
from stop_words import get_stop_words

In [4]:
class lib:
    def process_titles():
        data = {}
        with open('docs_titles.tsv') as f:
            for i, line in enumerate(f):
                if i == 0:
                    continue
                content = line.strip().split('\t', 1)
                doc_id = int(content[0])
                if len(content) == 1:
                    title = ''
                else:
                    title = content[1]
                data[doc_id] = title
        return pd.Series(data)
    
    train_data = pd.read_csv('train_groups.csv')
    test_data = pd.read_csv('test_groups.csv')
    titles = process_titles()
    g_arange = {
        'train': np.arange(1, 130),
        'test': np.arange(130, 310),
    }
    
    log_path = 'data/'
    features_path = lambda group_id: 'data/f_{}.npy'.format(group_id)
    target_path = lambda group_id: 'data/t_{}.npy'.format(group_id)
    lxml_str = lambda lxml: ''.join(map(lambda x: ' ' + x.text, lxml.find_all(re.compile('^h[1-6]$'))))

In [210]:
class consts:
    max_features = 20
    title_weight = 3
    threshold = 0.35
    
    function_words = {'INTJ', 'PRCL', 'CONJ', 'PREP'}
    morph = pymorphy2.MorphAnalyzer()
    separators = [';', ':', '-', '_', '|', ',',
                  '.', '!', '?', '«', '»', '"',
                  '(', ')', '[', ']', '–', '”',
                  '*', '�', '^', '=', '©', '“',
                  '’', '+', '…', '&', '—', '$',
                  '/', '→', '←', '{', '}', '`',
                  '°', '@', '#',
                  '\n', '\t', '\r', '\'', '\\']


In [215]:
def split(text, separators = consts.separators):
    text = text.lower()
    for sep in separators:
        text = text.replace(sep, ' ')
    return text.split()

def normalizer(text):
    text = split(text)
    text = list(filter(lambda x: consts.morph.parse(x)[0].tag.POS not in consts.function_words, text))
    text = list(filter(lambda x: x not in get_stop_words('russian'), text))
    text = list(filter(lambda x: x not in get_stop_words('english'), text))
    text = list(map(lambda x: consts.morph.parse(x)[0].normal_form, text))
    text = list(map(lambda x: x.replace('ё', 'е'), text))
    text = list(filter(lambda x: not re.search('\d+', x) and len(x) > 1, text))
    return ' '.join(text)

def doc_handler(doc_id):
    path = 'content/{}.dat'.format(doc_id)
    with codecs.open(path, 'r', 'utf-8') as f:
        url = f.readline().strip()
        text = normalizer(lib.lxml_str(BeautifulSoup(f, 'lxml')) + consts.title_weight * (' ' + lib.titles[doc_id]))
    return text

In [17]:
def logger(func):
    f_log = open(lib.log_path + 'report.logs', 'w')
    @wraps(func)
    def wrapper(mode):
        timer = time()
        result = func(mode)
        print('mode:', mode, file=f_log)
        print('time:', strftime('%H:%M:%S', gmtime(np.round(time() - timer))), file=f_log)
        print('max_features =', consts.max_features, file=f_log)
        print('title_weigh =', consts.title_weight, file=f_log)
        print('\n', file=f_log)
        return result
    return wrapper

def export_data(mode, group_id):
    if mode == 'train':
        docs = [doc_handler(doc) for doc in 
                         lib.train_data[lib.train_data.group_id == group_id].doc_id]
        data = TfidfVectorizer().fit_transform(docs).todense()
        
    elif mode == 'test':
        docs = [doc_handler(doc) for doc in 
                         lib.test_data[lib.test_data.group_id == group_id].doc_id]
        data = TfidfVectorizer().fit_transform(docs).todense()
        
    dist = distance.pdist(data, metric = 'cosine')
    dist[np.where(np.isnan(dist))[0]] = 1.0
    dist = distance.squareform(dist)
    
    np.save(lib.features_path(group_id), 
            np.sort(np.partition(dist, consts.max_features + 1, axis=1)[:, :consts.max_features + 1:], axis=1)[:, 1::])
    
    if mode == 'train':
        targets = np.array(lib.train_data[lib.train_data.group_id == group_id].target)
        np.save(lib.target_path(group_id), targets)

In [18]:
@logger
def process_data(mode):
    if mode == 'train':
        arange = lib.g_arange['train']
        for group_id in arange:
            export_data(mode, group_id)
            
    elif mode == 'test':
        arange = lib.g_arange['test']
        for group_id in arange:
            export_data(mode, group_id)
            
    elif mode == 'all':
        process_data('train')
        process_data('test')
        
    else:
        raise ValueError('wrong mode')

In [19]:
@logger
def import_data(mode):
    if mode == 'train':
        arange = lib.g_arange['train']
        targets = np.load(lib.target_path(arange[0]))
        features = np.load(lib.features_path(arange[0]))
        
        for group_id in arange[1::]:
            group_targets = np.load(lib.target_path(group_id))
            group_features = np.load(lib.features_path(group_id))

            targets = np.concatenate((targets, group_targets))
            features = np.concatenate((features, group_features))
            
        return features, targets
    
    elif mode == 'test':
        arange = lib.g_arange['test']
        features = np.load(lib.features_path(arange[0]))
        
        for group_id in arange[1::]:
            group_features = np.load(lib.features_path(group_id))
            features = np.concatenate((features, group_features))
            
        return features
    
    elif mode == 'all':
        train_data = import_data('train')
        test_data = import_data('test')
        return train_data[0], train_data[1], test_data
    
    else:
        raise ValueError('wrong mode')

In [20]:
%time process_data('all')

CPU times: user 49min 1s, sys: 1min, total: 50min 2s
Wall time: 53min 5s


In [153]:
%time X_train, y_train, X_test = import_data('all')
!cat data/report.logs

CPU times: user 363 ms, sys: 239 ms, total: 602 ms
Wall time: 888 ms


In [211]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
data_train, data_test, label_train, label_test = model_selection.train_test_split(X_train,
                                                                                  y_train,
                                                                                  test_size=0.1,
                                                                                  stratify=y_train)
data_train = scaler.transform(data_train)
data_test = scaler.transform(data_test)

In [212]:
model = linear_model.SGDClassifier(loss='log',
                                   penalty='l1',
                                   alpha=0.005,
                                   l1_ratio=0.01,
                                   max_iter=10000,
                                   n_jobs=-1,
                                   n_iter_no_change=1,
                                   class_weight='balanced')
model.fit(data_train, label_train)
metrics.f1_score(label_test, model.predict(data_test))

0.7264833574529667

In [213]:
from sklearn.svm import SVC

model = SVC(kernel='rbf',
            max_iter=-1,
            gamma='scale',
            probability=True,
            tol=1e-3,
            class_weight='balanced')
model.fit(data_train, label_train)

pred_proba = model.predict_proba(data_test)
prediction = [int(x[1] > consts.threshold) for x in pred_proba]

metrics.f1_score(label_test, prediction)

0.7336010709504684

In [214]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_test)
X_test = scaler.transform(X_test)

pred_proba = model.predict_proba(X_test)
prediction = [int(x[1] > consts.threshold) for x in pred_proba]

# prediction = model.predict(X_test)

with open('submission.csv', 'w') as f:
    writer = csv.DictWriter(f, fieldnames=['pair_id', 'target'])
    writer.writeheader()
    i = 11691
    for elem in prediction:
        writer.writerow({'pair_id': str(i), 'target': str(elem)})
        i += 1