In [147]:
import requests
import logging
import gzip
import json
import codecs
import re
import regex
import tarfile
import os.path

import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from tqdm import tqdm
from multiprocessing.dummy import Pool, Queue
from bs4 import BeautifulSoup
from time import sleep
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer 
from bs4.element import Comment
from sklearn.ensemble import GradientBoostingClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\selez\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]', 'option']:
        return False
    if isinstance(element, Comment):
        return False
    return True

In [146]:
def get_title_and_text(doc_id):
    """
        Get set of words from title
        and most common words from text
    """
    page = open("content/" + str(doc_id) + ".dat", 'r', encoding='utf-8').read()
    soup = BeautifulSoup(page, 'html.parser')

    title = soup.title.text

    txt = soup.findAll(text=True)
    visible = filter(tag_visible, txt)
    txt = u" ".join(t.strip() for t in visible)

    reg = regex.compile('[^a-zA-Zа-яА-Я ]')
    txt = reg.sub('', txt)
    #if not hasattr(get_title_and_text, "stemmer"):
    stemmer = SnowballStemmer("russian")
    stop = set(stopwords.words('russian'))
    words = [stemmer.stem(word) for word in txt.split() if (not (word in stop)) and len(word) > 1]
    #words = [get_title_and_text.stemmer.stem(word) for word in txt.split()
    #         if len(word) > 3 or len(word) == 3 and word.isupper()]
    cnt = Counter(words)

    return (set(title.strip().split()), set(dict(cnt.most_common(50)).keys()))

In [145]:
def process_page(pair_id):
    """
        Get information about pair (doc, group):
        - doc_id - номер документа,
        - title - множество слов из заголовка страницы,
        - text - множество из 50 самых частых слов текста страницы,
        - target.
    """
    new_doc = train_data.loc[pair_id]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title, text = get_title_and_text(doc_id)
    #title = doc_to_title[doc_id]
    if doc_group not in traingroups_data:
        traingroups_data[doc_group] = []
    traingroups_data[doc_group].append((doc_id, title, text, target))
    return (doc_id, title, text, target)

In [None]:
queue_groups = Queue()   # очередь ссылок на группы
train_data = pd.read_csv('train_groups.csv')
traingroups_data = {}
train_data.index = train_data.pair_id

groups_to_process = [2, 3, 4, 5]

# Create queue of groups and n_groups queues of pair_ids in group
queue = {i: Queue() for i in groups_to_process}# gtrain_data.group_id.unique()}
for group in groups_to_process:# train_data.group_id.unique():
    queue_groups.put(group)
    for pair in train_data[train_data.group_id == group].itertuples():
        queue[group].put(pair.pair_id)
#for pair in train_data[train_data.group_id == 1].itertuples():
#    queue.put(pair.pair_id)

pages_data = [[] for i in groups_to_process]# train_data.group_id.unique()]

zipfile = 'content.tar.gz'
tar = tarfile.open(zipfile, "r:gz") # Считывает архив с данными

def extract_pages(group):
    """
        Extract pages data from .gz archive if not extracted
    """
    for pair in train_data[train_data.group_id == group].itertuples():
        if not os.path.exists("content/" + str(pair.doc_id) + ".dat"):
            tar.extract("content/" + str(pair.doc_id) + ".dat")

def process_page_wrapper(i):
    while not queue_groups.empty():
        group = int(queue_groups.get())
        extract_pages(group)
        with gzip.open('data/group_{:03d}.jsonl.gz'.format(group), mode='wb') as f_json:
            f_json = codecs.getwriter('utf_16')(f_json)
            record = {}
            while not queue[group].empty():
                pair_id = queue[group].get()
                doc_id, title, text, target = process_page(pair_id)
                record[pair_id] = {'doc_id': int(doc_id), 'title': list(title), 'text': list(text), 'target': int(target)}
            record_str = json.dumps(record, indent=4, ensure_ascii=False)
            print(record_str, file=f_json)
            pages_data.append((doc_id, title, text, target))
            with lock:
                pbar.update(1)


with Pool(processes=4) as pool, tqdm(total=queue_groups.qsize()) as pbar:
    lock = pbar.get_lock()
    pool.map(process_page_wrapper, range(pool._processes))
    pool.close()
    pool.join()

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

In [136]:
# Не нужно запускать!
with gzip.open('data/part_{:05d}.jsonl.gz'.format(0), mode='rb') as f_json:
    f_json = codecs.getreader('utf_16')(f_json)
    #data = [{} for i in range(17)]
    #data = json.loads(json.dumps(data))
    #data = f_json.read()
    data = [json.loads(line) for line in f_json]#open('data.json', 'r')]
data

JSONDecodeError: Expecting property name enclosed in double quotes: line 2 column 1 (char 3)

In [118]:
def group_parser(group_data, group_id):
    """
        Create features for documents in group_id
    """
    for k, (doc_id, title, text, target_id) in enumerate(group_data):
        y_train.append(target_id)
        groups_train.append(group_id)
        all_dist_title = []
        all_dist_text = []
        #words = set(title.strip().split())
        for j in range(0, len(group_data)):
            if k == j:
                continue
            doc_id_j, title_j, text_j, target_j = group[j]
            #words_j = set(title_j.strip().split())
            all_dist_title.append(len(title.intersection(title_j)))
            all_dist_text.append(len(text.intersection(text_j)))
        X_train.append(sorted(all_dist_title, reverse=True)[0:15] + sorted(all_dist_text, reverse=True)[0:15])

In [119]:
# Запуск создания фичей для одной группы -> переписать в цикл / многопроцессный код

y_train = []
X_train = []
groups_train = []
group_id = 1
group_parser(pages_data, group_id)
X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
print (X_train.shape, y_train.shape, groups_train.shape)

(102, 30) (102,) (102,)


In [139]:
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
accuracy_score(clf.predict(X_train), y_train)

1.0