In [242]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests
from multiprocessing.dummy import Pool as ThreadPool
import time
import re
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.metrics import f1_score

import os

from tqdm import tqdm

import random

from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/bulatral42/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [245]:
def get_topk_inter_dist(texts, k=25):
    X = np.zeros(shape=(len(texts), len(texts)), dtype=float)
    for i in range(len(texts)):
        words_i = set(texts[i].lower().strip().split())
        for j in range(i + 1, len(texts)):
            words_j = set(texts[j].lower().strip().split())
            X[i, j] = X[j, i] = len(words_i & words_j) / (1 + len(words_i | words_j))
    return np.sort(X, axis=1)[:, :-k-1:-1]

In [246]:
def get_topk_tfidf_cosine_dist(texts, ngrams=(1, 1), k=25):
    vectorizer = TfidfVectorizer(ngram_range=ngrams)
    X = vectorizer.fit_transform(texts).toarray()
    return np.sort(pairwise_distances(X, X, metric='cosine'), axis=1)[:, 1:k+1]

In [247]:
def get_topk_d2v_sim(texts, k=25):
    tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) 
                   for i, doc in enumerate(texts)]
    
    model_d2v = Doc2Vec(vector_size=50, alpha=0.025, min_count=2)
    model_d2v.build_vocab(tagged_data)

    for epoch in range(10):
        model_d2v.train(tagged_data,
                        total_examples=model_d2v.corpus_count,
                        epochs=model_d2v.epochs)

    X = np.zeros((len(texts), 50))

    for i in range(X.shape[0]):
        X[i] = model_d2v.docvecs[i]
    
    return np.sort(pairwise_distances(X, X, metric='cosine'), axis=1)[:, 1:k+1]

In [353]:
#pair_id, group_id, doc_id, target
def get_features(data, k=25):
    X = np.ndarray(shape=(0, 6 * k))
    y = []
    for gr_id in tqdm(data.group_id.unique()[:]):    
        gr_len = data[data.group_id == gr_id].doc_id.values.size
        
        raw_texts = []
        for doc_id in data[data.group_id == gr_id].doc_id.values:
            with open('data_title_h16/' + str(doc_id) + '.dat.txt', mode='r') as doc:
                raw_texts.append(doc.read())
                    
        with open('data/title_h16_' + str(gr_id) + '.txt', mode='r') as doc:
            th = doc.read().split(';\n')[:gr_len]
        with open('data/paragraps_' + str(gr_id) + '.txt', mode='r') as doc:
            par = doc.read().split(';\n')[:gr_len]
        # print(gr_id, len(th), len(par))
        if 'target' in data.columns:
            y += list(data[data.group_id == gr_id].target.values)
        feat_gr = np.hstack((get_topk_inter_dist(raw_texts, k=k), 
                             get_topk_tfidf_cosine_dist(raw_texts, k=k), 
                             get_topk_d2v_sim(raw_texts, k=k),
                             get_topk_inter_dist(par, k=k), 
                             get_topk_tfidf_cosine_dist(par, k=k), 
                             get_topk_d2v_sim(par, k=k)))
        X = np.vstack((X, feat_gr))
    if 'target' in data.columns:
        return X, np.asarray(y)
    else:
        return X

In [354]:
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import sys

In [355]:
doc_to_title = {}
with open('docs_titles.tsv') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
print(len(doc_to_title))

28026


In [356]:
import pandas as pd
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = doc_to_title[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))
type(train_data)

pandas.core.frame.DataFrame

In [357]:
test_data = pd.read_csv('test_groups.csv')
testgroups_titledata = {}
for i in range(len(test_data)):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    title = doc_to_title[doc_id]
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, title))
test_data.head()

Unnamed: 0,pair_id,group_id,doc_id
0,11691,130,6710
1,11692,130,4030
2,11693,130,5561
3,11694,130,4055
4,11695,130,4247


In [358]:
test_data

Unnamed: 0,pair_id,group_id,doc_id
0,11691,130,6710
1,11692,130,4030
2,11693,130,5561
3,11694,130,4055
4,11695,130,4247
...,...,...,...
16622,28313,309,16637
16623,28314,309,16759
16624,28315,309,15358
16625,28316,309,17287


In [359]:
X_train, y_train = get_features(train_data, k=25)
X_test = get_features(test_data, k=25)

100%|██████████| 129/129 [24:57<00:00, 11.61s/it]
100%|██████████| 180/180 [37:19<00:00, 12.44s/it]


In [335]:
print(X_train.shape, y_train.shape, X_test.shape)

(11690, 125) (11690,) (16627, 125)


In [360]:
X_val_train, y_val_train = X_train[:9000], y_train[:9000]
X_val_test, y_val_test = X_train[9000:], y_train[9000:]

In [361]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto', class_weight='balanced')) # lgb.LGBMClassifier() # 
clf.fit(X_val_train, y_val_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(class_weight='balanced', gamma='auto'))])

In [362]:
y_val_pred = clf.predict(X_val_test)
y_val_pred.shape

(2690,)

In [363]:
print(X_val_train.shape, y_val_train.shape, X_val_test.shape)

(9000, 150) (9000,) (2690, 150)


In [364]:
print('Validation score: {}'.format(f1_score(y_val_test, y_val_pred)))

Validation score: 0.7843784378437844


In [365]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto', class_weight='balanced'))# lgb.LGBMClassifier()
clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(class_weight='balanced', gamma='auto'))])

In [366]:
y_pred = clf.predict(X_test)

In [367]:
data = { 'pair_id': np.asarray(test_data.pair_id), 'target': y_pred }
df = pd.DataFrame(data=data)
df = df.set_index(keys=['pair_id'])
df.to_csv('submitBoostHeaders.csv')
df.head()

Unnamed: 0_level_0,target
pair_id,Unnamed: 1_level_1
11691,1
11692,0
11693,0
11694,1
11695,0


In [368]:
print((y_pred == 1).astype(int).sum())
print((y_pred == 0).astype(int).sum())

print((y_train == 1).astype(int).sum())
print((y_train == 0).astype(int).sum())

5792
10835
3361
8329


скор на паблик лидерборде: 0.73223