In [1]:
from multiprocessing.dummy import Pool, Queue
import lxml
import tldextract
from tqdm import tqdm
from time import sleep
from bs4 import BeautifulSoup
from bs4.element import Comment
from contextlib import ExitStack
from typing import Generator, Dict, Any
import gzip
import pandas as pd
import codecs
import sys
import os
import json
import re
import numpy as np
from os import listdir
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import GermanStemmer, EnglishStemmer, RussianStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from time import time
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
regex_symbols = re.compile(r'[^a-zA-Z0-9а-яА-Я\u00E4\u00F6\u00FC\u00C4\u00D6\u00DC\u00df]')
stopwords_new = ['http', 'url', 'img','html', 'https', 'org', 'www', 'jpg', 'png', 'net','com','php', 'uid','src', 'ahttp', 'index', 'htm']
pattern = re.compile(r'\b(' + r'|'.join(stopwords_new) + r')\b\s*')
shortword = re.compile(r'\W*\b\w{1,2}\b')
musor = re.compile(r'\d{1}(\w{2})\d{2,4}')

In [3]:
stemmer_ru = RussianStemmer()
stemmer_eng = EnglishStemmer()
stemmer_ger = GermanStemmer()

In [4]:
stop_words = set(stopwords.words(['english', 'russian', 'german']))

In [5]:
my_position_start = 1
my_position_end = 28027
queue = Queue() # очередь ссылок на книги
for i in range(my_position_start, my_position_end):
    queue.put(i)

In [6]:
#split title to words
def split_title(title):
    words = nltk.word_tokenize(title)
    without_extra_words = [stemmer_ru.stem(stemmer_eng.stem(stemmer_ger.stem(word))) for word in words if word not in stop_words]
    return without_extra_words

In [7]:
#split text to words
def split_text(text):
    words = nltk.word_tokenize(text)
    without_extra_words = [stemmer_ru.stem(stemmer_eng.stem(stemmer_ger.stem(word))) for word in words if word not in stop_words]
    return without_extra_words

In [8]:
#returns words which title contains and text contains
def get_content(title, text, n=None):
    title_words = split_title(title)
    text_words = split_text(text)
    if n is not None:
        vectorizer = CountVectorizer().fit(text_words)
        tmp_words_matrix = vectorizer.transform(text_words)
        tmp_words_count = np.sum(tmp_words_matrix, axis=0)
        tmp_words = [(word, tmp_words_count[0, ind]) for word, ind in vectorizer.vocabulary_.items() if len(word) > 2]
        tmp_words = sorted(tmp_words, reverse=True, key=lambda x: x[1])
        tmp_words = [word[0] for word in tmp_words[:n]]
        doc_words = title_words + tmp_words
    else:
        doc_words = title_words + text_words
    return doc_words

In [10]:
def process_doc(number_of_doc):
    with open('./content/{:d}.dat'.format(number_of_doc), encoding='utf-8') as file:
        text_checker = {}
        html_test = file.read()
        soup = BeautifulSoup(html_test, 'html.parser')
        if soup.title:
            title_name = soup.title.text
            title_name = regex_symbols.sub(" ", title_name)
            title_name = re.sub("\s\s+" , " ", title_name)
        else:
            title_name = ' '
        url = soup.text[:soup.text.index('\n')]
        url = tldextract.extract(url)
        url = url.domain + '.' + url.suffix
        text = soup.text[soup.text.index('\n'):].lower()
        text = regex_symbols.sub(" ", text)
        text = shortword.sub(" ", text)
        text = pattern.sub(" ", text)
        text = musor.sub(' ', text)
        text = re.sub("\s\s+" , " ", text)
        text = re.sub('\xa0|\xad', ' ', text)
        content = get_content(title_name, text, n=10)
        text_checker[number_of_doc] = [url] + content
        return text_checker

In [11]:
def process_all_docs(i):
    with gzip.open('data3/part_{:05d}.jsonl.gz'.format(i), mode='wb') as f_json:
        f_json = codecs.getwriter('utf8')(f_json)

        while not queue.empty():
            try:
                id_new = queue.get()
                record = process_doc(id_new)
            except Exception as e:
                print(id_new, file=sys.stderr)
                print(e, file=sys.stderr)
            record_str = json.dumps(record, ensure_ascii=False)
            print(record_str, file=f_json)

            # счетчик должен атомарно обновиться
            with lock:
                pbar.update(1)


with Pool(processes=8) as pool, tqdm(total=queue.qsize()) as pbar:
    lock = pbar.get_lock()
    pool.map(process_all_docs, range(pool._processes))

  7%|█████▌                                                                     | 2067/28026 [19:48<1:32:50,  4.66it/s]2073
empty vocabulary; perhaps the documents only contain stop words
 12%|████████▉                                                                  | 3337/28026 [30:48<3:27:22,  1.98it/s]3345
[Errno 2] No such file or directory: './content/3345.dat'
 86%|███████████████████████████████████████████████████████████████▉          | 24237/28026 [3:48:31<26:15,  2.41it/s]24245
[Errno 2] No such file or directory: './content/24245.dat'
100%|██████████████████████████████████████████████████████████████████████████| 28026/28026 [4:22:24<00:00,  1.78it/s]


In [84]:
def records_reader(dirname: str) -> Generator[Dict[str, Any], None, None]:
    with ExitStack() as stack:
        files = [stack.enter_context(gzip.open(dirname + '/' + i, mode='rb')) for i in tqdm(listdir(dirname))]
        for j in files:
            d = codecs.getreader('utf8')(j)
            for k in d:
                yield json.loads(k)

In [85]:
df = pd.DataFrame(records_reader('data3'))
#df.to_csv('prom_res.csv', index=False)

100%|███████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 170.90it/s]


In [86]:
set([i for i in range(1, 28027)]) - set(np.unique(df.columns.values.astype('int')))

{2073, 3345, 24245}

In [22]:
df1 = pd.DataFrame(columns=['id', 'words'])

In [23]:
df1

Unnamed: 0,id,words


In [24]:
dicter = {}

In [25]:
for i in range(df.shape[0]):
    for j, k in df.loc[i][~df.loc[i].isna()].items():
        dicter[str(j)] = ' '.join(k)

In [None]:
dicter

In [275]:
df1.loc[28024] = [int(list(dicter.keys())[4])] + [list(dicter.values())[4]]

In [29]:
for j, k in enumerate(dicter.items()):
    df1.loc[j] = [int(k[0])] + [k[1]]

In [30]:
df1

Unnamed: 0,id,words
0,1,zrenielib.ru м б аншин центр репродукц генетик...
1,14,zashita-pravo.ru экстернат нов закон образован...
2,17,nashizubki.ru современ стоматолог кто так стом...
3,19,pspx.ru faq ps3 ftp serv установк настройк арх...
4,25,tks.ru медицинск издел 2013 формулировк нов ру...
...,...,...
28020,3800,mysonce.ru виде майнкрафт сталкер пут мечт игр...
28021,6085,mysonce.ru скача карт спавн сервер minecraft к...
28022,24245,sci-article.ru анализ нарушен иммунологическ р...
28023,1393,codegeassalternativ.ru 1 июн 2016 г операц кас...


In [55]:
df2 = pd.read_csv('test_groups.csv')

In [58]:
featss = [i for i in df2.columns.values][3:] #in case features are in test/train

In [60]:
df2.drop(featss,axis=1, inplace=True)

In [61]:
df2

Unnamed: 0,pair_id,group_id,doc_id
0,11691,130,6710
1,11692,130,4030
2,11693,130,5561
3,11694,130,4055
4,11695,130,4247
...,...,...,...
16622,28313,309,16637
16623,28314,309,16759
16624,28315,309,15358
16625,28316,309,17287


In [63]:
df1

Unnamed: 0,doc_id,words
0,1,zrenielib.ru м б аншин центр репродукц генетик...
1,14,zashita-pravo.ru экстернат нов закон образован...
2,17,nashizubki.ru современ стоматолог кто так стом...
3,19,pspx.ru faq ps3 ftp serv установк настройк арх...
4,25,tks.ru медицинск издел 2013 формулировк нов ру...
...,...,...
28020,3800,mysonce.ru виде майнкрафт сталкер пут мечт игр...
28021,6085,mysonce.ru скача карт спавн сервер minecraft к...
28022,24245,sci-article.ru анализ нарушен иммунологическ р...
28023,1393,codegeassalternativ.ru 1 июн 2016 г операц кас...


In [32]:
df1 = df1.rename(columns={'id': 'doc_id'})

In [64]:
df3 = pd.merge(df2, df1, how='left', on='doc_id')

In [65]:
df3

Unnamed: 0,pair_id,group_id,doc_id,words
0,11691,130,6710,youtube.com как прописа админк в кс 1 6 себ ил...
1,11692,130,4030,v-sampe.ru скача sgl rp доработк слив мод mysq...
2,11693,130,5561,dream-x.ru как прописа админк кс 1 6 count str...
3,11694,130,4055,net.ru как прописа прост админк кс 1 6 админк ...
4,11695,130,4247,o3one.ru подбор админ сервер код 4 арх форум o...
...,...,...,...,...
16622,28313,309,16637,mail.ru ответ mail ru полезн куша творог утр х...
16623,28314,309,16759,inmoment.ru творог полезн свойств лечен творог...
16624,28315,309,15358,edaplus.info творог полезн опасн свойств творо...
16625,28316,309,17287,mail.ru ответ mail ru чем полез творог творог ...


In [66]:
df3.to_csv('test_groups.csv', index=False)

In [51]:
df_train = pd.read_csv('./train_groups2.csv')

In [52]:
df_test = pd.read_csv('./test_groups2.csv')

In [53]:
featss = [i for i in df_test.columns.values][4:] #in case features are in test/train

In [54]:
featss

['fit0',
 'fit1',
 'fit2',
 'fit3',
 'fit4',
 'fit5',
 'fit6',
 'fit7',
 'fit8',
 'fit9',
 'fit10',
 'fit11',
 'fit12',
 'fit13',
 'fit14',
 'fit15',
 'fit16',
 'fit17',
 'fit18',
 'fit19',
 'fit20',
 'fit21',
 'fit22',
 'fit23',
 'fit24',
 'fit25',
 'fit26',
 'fit27',
 'fit28',
 'fit29',
 'fit30',
 'fit31',
 'fit32',
 'fit33',
 'fit34',
 'fit35',
 'fit36',
 'fit37',
 'fit38',
 'fit39',
 'fit40',
 'fit41',
 'fit42',
 'fit43',
 'fit44',
 'fit45',
 'fit46',
 'fit47',
 'fit48',
 'fit49',
 'fit50',
 'fit51',
 'fit52',
 'fit53',
 'fit54',
 'fit55',
 'fit56',
 'fit57',
 'fit58',
 'fit59',
 'fit60',
 'fit61',
 'fit62',
 'fit63',
 'fit64',
 'fit65',
 'fit66',
 'fit67',
 'fit68',
 'fit69',
 'fit70',
 'fit71',
 'fit72',
 'fit73',
 'fit74',
 'fit75',
 'fit76',
 'fit77',
 'fit78',
 'fit79',
 'fit80',
 'fit81',
 'fit82',
 'fit83',
 'fit84',
 'fit85',
 'fit86',
 'fit87',
 'fit88',
 'fit89',
 'fit90',
 'fit91',
 'fit92',
 'fit93',
 'fit94',
 'fit95',
 'fit96',
 'fit97',
 'fit98',
 'fit99',
 'fit100',

In [55]:
df_train.drop(featss,axis=1, inplace=True)

In [56]:
df_test.drop(featss,axis=1, inplace=True)

In [277]:
#df_merged = pd.concat([df_train, df_test])

In [22]:
tf_vect = TfidfVectorizer()

In [59]:
train_mas = tf_vect.fit_transform(df_train.words.values)

In [280]:
#merged_mas = tf_vect.fit_transform(df_merged.words.values)

In [60]:
help_mas = np.empty((df_train.shape[0], 25*5))

In [281]:
#help_mas_merged = np.empty((df_merged.shape[0], 25*5)) # for merged

In [61]:
for i in df_train.groupby('group_id'):
    ind = i[1].index.values
    values = train_mas[ind, :]
    #distances = np.partition(pairwise_distances(values, metric='cosine'),25, axis=1)[:,:25]
    distances = np.sort(pairwise_distances(values, metric='cosine'), axis=1)[:,1:26]
    meds = np.tile(np.median(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    maxs = np.tile(np.max(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    disps = np.tile(np.std(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    means = np.tile(np.mean(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    help_mas[ind, :] = np.concatenate((distances, meds, maxs,disps, means), axis=1)

In [282]:
#for merged
'''
for i in df_merged.groupby('group_id'):
    ind = i[1].index.values
    values = merged_mas[ind, :]
    distances = np.partition(pairwise_distances(values, metric='cosine'),25, axis=1)[:,:25]
    meds = np.tile(np.median(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    maxs = np.tile(np.max(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    disps = np.tile(np.std(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    means = np.tile(np.mean(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    help_mas_merged[ind, :] = np.concatenate((distances, meds, maxs,disps, means), axis=1)
'''

In [283]:
#df_help_merged = pd.DataFrame(help_mas_merged)

In [285]:
#df_help_merged.columns = ['fit' + str(i) for i in df_help_merged.columns.values]

In [62]:
df_help = pd.DataFrame(help_mas)

In [295]:
#df_train = pd.merge(df_train, df_help_merged.loc[df_train.index.values] ,how='left', left_index=True, right_index=True)

In [297]:
#df_test = pd.merge(df_test, df_help_merged.loc[df_test.index.values] ,how='left', left_index=True, right_index=True)

In [299]:
#df_test.to_csv('test_groups1.csv', index=False)
#df_train.to_csv('train_groups1.csv', index=False)

In [63]:
df_help.columns = ['fit' + str(i) for i in df_help.columns.values]

In [64]:
df_train = pd.merge(df_train, df_help ,how='left', left_index=True, right_index=True)

In [65]:
df_train.to_csv('train_groups_rew1.csv', index=False)

In [83]:
df_test = pd.read_csv('./test_groups.csv')

In [66]:
test_mas = tf_vect.transform(df_test.words.values)
help_mas = np.empty((df_test.shape[0], 25*5))
for i in df_test.groupby('group_id'):
    ind = i[1].index.values
    values = test_mas[ind, :]
    #distances = np.partition(pairwise_distances(values, metric='cosine'),25, axis=1)[:,:25] #change to sort and [:, 1:26]
    distances = np.sort(pairwise_distances(values, metric='cosine'), axis=1)[:,1:26]
    meds = np.tile(np.median(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    maxs = np.tile(np.max(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    disps = np.tile(np.std(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    means = np.tile(np.mean(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    help_mas[ind, :] = np.concatenate((distances, meds, maxs,disps, means), axis=1)
df_help = pd.DataFrame(help_mas)
df_help.columns = ['fit' + str(i) for i in df_help.columns.values]
df_test = pd.merge(df_test, df_help ,how='left', left_index=True, right_index=True)

In [67]:
df_test.to_csv('test_groups_rew1.csv', index=False)

In [68]:
df_train = pd.read_csv('train_groups_rew1.csv')
traingroups_titledata = {}
my_feat = [i for i in df_train.columns.values][5:]
for i in range(len(df_train)):
    new_doc = df_train.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    features = new_doc[my_feat]
    title = new_doc['words']
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, features, target))

In [69]:
y_train = []
X_train = []
groups_train = []
for new_group in traingroups_titledata:
    docs = traingroups_titledata[new_group]
    for k, (doc_id, title, features, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)
        all_dist = []
        words = set(title.strip().split())
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j, feat_j, target_j = docs[j]
            words_j = set(title_j.strip().split())
            all_dist.append(len(words.intersection(words_j)))
        X_train.append(sorted(all_dist, reverse=True)[0:25] + list(features)    )
X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
print (X_train.shape, y_train.shape, groups_train.shape)

(11689, 150) (11689,) (11689,)


In [70]:
df_test = pd.read_csv('test_groups_rew1.csv')
testgroups_titledata = {}
for i in range(len(df_test)):
    new_doc = df_test.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    features = new_doc[my_feat]
    title = new_doc['words']
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, title, features))

In [71]:
X_test = []
groups_test = []
for new_group in testgroups_titledata:
    docs = testgroups_titledata[new_group]
    for k, (doc_id, title, features) in enumerate(docs):
        groups_test.append(new_group)
        all_dist = []
        words = set(title.strip().split())
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j, feat_j = docs[j]
            words_j = set(title_j.strip().split())
            all_dist.append(len(words.intersection(words_j)))
        X_test.append(sorted(all_dist, reverse=True)[0:25] + list(features)   )
X_test = np.array(X_test)
groups_test = np.array(groups_test)
print (X_test.shape, groups_test.shape)

(16627, 150) (16627,)


In [72]:
warnings.filterwarnings("ignore")

In [73]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
kf = StratifiedKFold(n_splits=5, shuffle=True)

In [74]:
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.1)

In [75]:
kf = StratifiedKFold(n_splits=5, shuffle=True)

def validate_model(estimator, params, validate_param, vals):
    score = 0
    for t in tqdm(vals):
        params[validate_param] = t
        tempScore = 0
        for train_split, test_split in kf.split(X_train, y_train):
            X_tr = X_train[train_split]
            y_tr = y_train[train_split]
            X_tst = X_train[test_split]
            y_tst = y_train[test_split]
            clf = estimator(**params)
            clf.fit(X_tr, y_tr)
            my_clf_pred = clf.predict(X_tst)
            tempScore += f1_score(y_tst, my_clf_pred)
        if (tempScore > score):
            score = tempScore
            result = t
    print(validate_param, ' = ', result, ' score= ', score / 5)
    return result

In [76]:
params = {'objective': 'binary:logistic'}
params['base_score'] = validate_model(xgb.XGBClassifier, params, 'base_score', np.linspace(0.0001,0.99999,10))
params['n_estimators'] = validate_model(xgb.XGBClassifier, params, 'n_estimators', 
                                            range(20,100,5))
params['colsample_bytree'] = validate_model(xgb.XGBClassifier, params, 'colsample_bytree', 
                                            np.linspace(0.0001,0.999,10))
params['scale_pos_weight'] = validate_model(xgb.XGBClassifier, params, 'scale_pos_weight', 
                                            np.linspace(2.0,3.0,10))
params['alpha'] = validate_model(xgb.XGBClassifier, params, 'alpha', 
                                            np.linspace(0.0001,1.0,10))
params['min_child_weight'] = validate_model(xgb.XGBClassifier, params, 'min_child_weight', 
                                            np.linspace(0.0001,1.0,10))
paramsLogistic = params

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]



 10%|████████▎                                                                          | 1/10 [00:01<00:10,  1.21s/it]



 20%|████████████████▌                                                                  | 2/10 [00:09<00:27,  3.40s/it]



 30%|████████████████████████▉                                                          | 3/10 [00:18<00:35,  5.11s/it]



 40%|█████████████████████████████████▏                                                 | 4/10 [00:27<00:37,  6.32s/it]



 50%|█████████████████████████████████████████▌                                         | 5/10 [00:37<00:35,  7.19s/it]



 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:46<00:31,  7.81s/it]



 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:55<00:24,  8.25s/it]



 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [01:04<00:17,  8.55s/it]



 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [01:14<00:08,  8.76s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:15<00:00,  7.56s/it]
  0%|                                                                                           | 0/16 [00:00<?, ?it/s]

base_score  =  0.5555944444444445  score=  0.798201096860881


  6%|█████▏                                                                             | 1/16 [00:02<00:31,  2.12s/it]



 12%|██████████▍                                                                        | 2/16 [00:04<00:31,  2.25s/it]



 19%|███████████████▌                                                                   | 3/16 [00:07<00:32,  2.49s/it]



 25%|████████████████████▊                                                              | 4/16 [00:11<00:33,  2.79s/it]



 31%|█████████████████████████▉                                                         | 5/16 [00:15<00:35,  3.23s/it]



 38%|███████████████████████████████▏                                                   | 6/16 [00:20<00:37,  3.70s/it]



 44%|████████████████████████████████████▎                                              | 7/16 [00:25<00:37,  4.17s/it]



 50%|█████████████████████████████████████████▌                                         | 8/16 [00:31<00:36,  4.62s/it]



 56%|██████████████████████████████████████████████▋                                    | 9/16 [00:37<00:35,  5.10s/it]



 62%|███████████████████████████████████████████████████▎                              | 10/16 [00:44<00:34,  5.67s/it]



 69%|████████████████████████████████████████████████████████▍                         | 11/16 [00:51<00:30,  6.00s/it]



 75%|█████████████████████████████████████████████████████████████▌                    | 12/16 [00:58<00:25,  6.40s/it]



 81%|██████████████████████████████████████████████████████████████████▋               | 13/16 [01:06<00:20,  6.90s/it]



 88%|███████████████████████████████████████████████████████████████████████▊          | 14/16 [01:14<00:14,  7.30s/it]



 94%|████████████████████████████████████████████████████████████████████████████▉     | 15/16 [01:23<00:07,  7.75s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [01:33<00:00,  5.83s/it]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

n_estimators  =  95  score=  0.7982710429363014


 10%|████████▎                                                                          | 1/10 [00:01<00:13,  1.46s/it]



 20%|████████████████▌                                                                  | 2/10 [00:04<00:14,  1.80s/it]



 30%|████████████████████████▉                                                          | 3/10 [00:07<00:15,  2.21s/it]



 40%|█████████████████████████████████▏                                                 | 4/10 [00:11<00:16,  2.80s/it]



 50%|█████████████████████████████████████████▌                                         | 5/10 [00:16<00:17,  3.50s/it]



 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:22<00:17,  4.25s/it]



 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:29<00:15,  5.01s/it]



 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:37<00:11,  5.90s/it]



 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:46<00:06,  6.88s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:55<00:00,  5.59s/it]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

colsample_bytree  =  0.44405555555555554  score=  0.7954074834712632


 10%|████████▎                                                                          | 1/10 [00:05<00:45,  5.04s/it]



 20%|████████████████▌                                                                  | 2/10 [00:09<00:40,  5.00s/it]



 30%|████████████████████████▉                                                          | 3/10 [00:14<00:34,  4.98s/it]



 40%|█████████████████████████████████▏                                                 | 4/10 [00:19<00:29,  4.99s/it]



 50%|█████████████████████████████████████████▌                                         | 5/10 [00:25<00:25,  5.05s/it]



 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:30<00:20,  5.11s/it]



 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:35<00:15,  5.12s/it]



 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:40<00:10,  5.12s/it]



 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:45<00:05,  5.19s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:51<00:00,  5.11s/it]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

scale_pos_weight  =  2.6666666666666665  score=  0.8061751708457316


 10%|████████▎                                                                          | 1/10 [00:05<00:46,  5.17s/it]



 20%|████████████████▌                                                                  | 2/10 [00:10<00:41,  5.15s/it]



 30%|████████████████████████▉                                                          | 3/10 [00:15<00:36,  5.16s/it]



 40%|█████████████████████████████████▏                                                 | 4/10 [00:20<00:30,  5.13s/it]



 50%|█████████████████████████████████████████▌                                         | 5/10 [00:25<00:25,  5.12s/it]



 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:30<00:20,  5.18s/it]



 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:36<00:15,  5.20s/it]



 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:41<00:10,  5.20s/it]



 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:46<00:05,  5.23s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:52<00:00,  5.22s/it]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

alpha  =  0.33340000000000003  score=  0.8083654884195173


 10%|████████▎                                                                          | 1/10 [00:05<00:50,  5.61s/it]



 20%|████████████████▌                                                                  | 2/10 [00:11<00:44,  5.62s/it]



 30%|████████████████████████▉                                                          | 3/10 [00:17<00:40,  5.85s/it]



 40%|█████████████████████████████████▏                                                 | 4/10 [00:24<00:36,  6.01s/it]



 50%|█████████████████████████████████████████▌                                         | 5/10 [00:31<00:31,  6.40s/it]



 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:38<00:26,  6.50s/it]



 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:44<00:19,  6.44s/it]



 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:50<00:12,  6.24s/it]



 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:55<00:06,  6.06s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:01<00:00,  6.14s/it]

min_child_weight  =  0.33340000000000003  score=  0.8056099193900039





In [81]:
params = {'booster': 'gbtree'}
params['n_estimators'] = validate_model(xgb.XGBClassifier, params, 'n_estimators', 
                                            range(20,100,5))
params['colsample_bytree'] = validate_model(xgb.XGBClassifier, params, 'colsample_bytree', 
                                            np.linspace(0.0001,0.999,10))
params['scale_pos_weight'] = validate_model(xgb.XGBClassifier, params, 'scale_pos_weight', 
                                            np.linspace(2.0,3.0,10))
params['alpha'] = validate_model(xgb.XGBClassifier, params, 'alpha', 
                                            np.linspace(0.0001,1.0,10))
params['min_child_weight'] = validate_model(xgb.XGBClassifier, params, 'min_child_weight', 
                                            np.linspace(0.0001,1.0,10))
paramsGbTree = params

  0%|                                                                                           | 0/16 [00:00<?, ?it/s]



  6%|█████▏                                                                             | 1/16 [00:02<00:31,  2.10s/it]



 12%|██████████▍                                                                        | 2/16 [00:04<00:31,  2.24s/it]



 19%|███████████████▌                                                                   | 3/16 [00:07<00:32,  2.53s/it]



 25%|████████████████████▊                                                              | 4/16 [00:11<00:34,  2.84s/it]



 31%|█████████████████████████▉                                                         | 5/16 [00:15<00:36,  3.28s/it]



 38%|███████████████████████████████▏                                                   | 6/16 [00:20<00:37,  3.75s/it]



 44%|████████████████████████████████████▎                                              | 7/16 [00:26<00:39,  4.40s/it]



 50%|█████████████████████████████████████████▌                                         | 8/16 [00:33<00:40,  5.08s/it]



 56%|██████████████████████████████████████████████▋                                    | 9/16 [00:40<00:39,  5.70s/it]



 62%|███████████████████████████████████████████████████▎                              | 10/16 [00:48<00:38,  6.42s/it]



 69%|████████████████████████████████████████████████████████▍                         | 11/16 [00:56<00:34,  6.81s/it]



 75%|█████████████████████████████████████████████████████████████▌                    | 12/16 [01:04<00:28,  7.23s/it]



 81%|██████████████████████████████████████████████████████████████████▋               | 13/16 [01:13<00:23,  7.69s/it]



 88%|███████████████████████████████████████████████████████████████████████▊          | 14/16 [01:22<00:16,  8.19s/it]



 94%|████████████████████████████████████████████████████████████████████████████▉     | 15/16 [01:32<00:08,  8.76s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [01:43<00:00,  6.47s/it]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

n_estimators  =  35  score=  0.7996091443506894


 10%|████████▎                                                                          | 1/10 [00:00<00:06,  1.29it/s]



 20%|████████████████▌                                                                  | 2/10 [00:02<00:07,  1.06it/s]



 30%|████████████████████████▉                                                          | 3/10 [00:03<00:08,  1.15s/it]



 40%|█████████████████████████████████▏                                                 | 4/10 [00:05<00:08,  1.43s/it]



 50%|█████████████████████████████████████████▌                                         | 5/10 [00:08<00:09,  1.80s/it]



 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:11<00:08,  2.16s/it]



 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:14<00:07,  2.50s/it]



 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:18<00:05,  2.79s/it]



 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:22<00:03,  3.14s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.66s/it]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

colsample_bytree  =  0.5550444444444445  score=  0.7983434858880595


 10%|████████▎                                                                          | 1/10 [00:02<00:25,  2.82s/it]



 20%|████████████████▌                                                                  | 2/10 [00:05<00:22,  2.77s/it]



 30%|████████████████████████▉                                                          | 3/10 [00:08<00:19,  2.75s/it]



 40%|█████████████████████████████████▏                                                 | 4/10 [00:10<00:16,  2.76s/it]



 50%|█████████████████████████████████████████▌                                         | 5/10 [00:13<00:13,  2.75s/it]



 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:16<00:10,  2.75s/it]



 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:19<00:08,  2.84s/it]



 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:22<00:05,  2.81s/it]



 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:24<00:02,  2.79s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.80s/it]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

scale_pos_weight  =  2.0  score=  0.8064949829172725


 10%|████████▎                                                                          | 1/10 [00:02<00:26,  2.92s/it]



 20%|████████████████▌                                                                  | 2/10 [00:05<00:23,  2.90s/it]



 30%|████████████████████████▉                                                          | 3/10 [00:08<00:20,  2.87s/it]



 40%|█████████████████████████████████▏                                                 | 4/10 [00:11<00:17,  2.88s/it]



 50%|█████████████████████████████████████████▌                                         | 5/10 [00:14<00:14,  2.87s/it]



 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:17<00:11,  2.84s/it]



 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:20<00:08,  2.89s/it]



 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:23<00:05,  2.91s/it]



 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:25<00:02,  2.92s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.88s/it]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

alpha  =  0.33340000000000003  score=  0.808411954851638


 10%|████████▎                                                                          | 1/10 [00:03<00:27,  3.02s/it]



 20%|████████████████▌                                                                  | 2/10 [00:06<00:24,  3.03s/it]



 30%|████████████████████████▉                                                          | 3/10 [00:09<00:21,  3.03s/it]



 40%|█████████████████████████████████▏                                                 | 4/10 [00:12<00:18,  3.03s/it]



 50%|█████████████████████████████████████████▌                                         | 5/10 [00:15<00:15,  3.03s/it]



 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:18<00:12,  3.04s/it]



 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:21<00:09,  3.04s/it]



 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:24<00:06,  3.04s/it]



 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:27<00:03,  3.02s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:30<00:00,  3.04s/it]

min_child_weight  =  0.11120000000000001  score=  0.8057805588969629





In [82]:
clf = xgb.XGBClassifier(**paramsGbTree)
clf.fit(X_train, y_train)



XGBClassifier(alpha=0.33340000000000003, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.5550444444444445, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=0.11120000000000001, missing=nan,
              monotone_constraints='()', n_estimators=35, n_jobs=8,
              num_parallel_tree=1, random_state=0, reg_alpha=0.333400011,
              reg_lambda=1, scale_pos_weight=2.0, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [83]:
f1_score(clf.predict(X_test), y_test)

0.8179190751445086

In [102]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [103]:
clf = xgb.XGBClassifier(**paramsLogistic)
clf.fit(X_train, y_train)



XGBClassifier(alpha=0.2223, base_score=0.7777922222222223, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.999,
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6,
              min_child_weight=0.11120000000000001, missing=nan,
              monotone_constraints='()', n_estimators=40, n_jobs=8,
              num_parallel_tree=1, random_state=0, reg_alpha=0.222299993,
              reg_lambda=1, scale_pos_weight=2.7777777777777777, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [104]:
predict = clf.predict(X_test)

In [105]:
df_test['target'] = predict

In [108]:
df_test[['pair_id', 'target']].to_csv('new_prediction.csv', index=False)

In [None]:
#try some another models

In [127]:
from sklearn.svm import OneClassSVM

In [210]:
X_train = np.concatenate((X_train, groups_train.reshape(-1,1)), axis=1)

In [248]:
kf = StratifiedKFold(n_splits=5, shuffle=True)

def validate_model_SVM(estimator, params, validate_param, vals):
    score = 0
    for t in tqdm(vals):
        params[validate_param] = t
        tempScore = 0
        for train_split, test_split in kf.split(X_train, y_train):
            X_tr = X_train[train_split]
            y_tr = y_train[train_split]
            #y_tr = ((y_tr * 2) - 1) * (-1)
            X_tst = X_train[test_split]
            y_tst = y_train[test_split]
            #y_tst = ((y_tst * 2) - 1) * (-1)
            clf = estimator(**params)
            clf.fit(X_tr, y_tr)
            my_clf_pred = clf.predict(X_tst)
            my_clf_pred = (my_clf_pred * (-1) + 1) // 2
            tempScore += f1_score(y_tst, my_clf_pred)
        if (tempScore > score):
            score = tempScore
            result = t
    print(validate_param, ' = ', result, ' score= ', score / 5)
    return result
'''
def validate_model_SVM(estimator, params, validate_param, vals):
    score = 0
    for t in tqdm(vals):
        params[validate_param] = t
        tempScore = 0
        for train_split, test_split in kf.split(X_train, y_train):
            X_tr = X_train[train_split]
            y_tr = y_train[train_split]
            X_tst = X_train[test_split]
            y_tst = y_train[test_split]
            clf = estimator(**params)
            res = np.zeros((X_tst.shape[0],))
            for i in np.unique(X_tr[:, 150]):
                ind = np.where(X_tr[:, 150] == i)
                clf.fit(X_tr[ind, :150][0], y_tr[ind])
                ind_test = np.where(X_tst[:, 150] == i)
                res[ind_test] = (clf.predict(X_tst[ind_test, :150][0])[:] * (-1) + 1) // 2
            #clf.fit(X_tr, y_tr)
            res = res.reshape((-1,1))
            #res = clf.predict(X_tst)
            tempScore += f1_score(y_tst, res)
        if (tempScore > score):
            score = tempScore
            result = t
    print(validate_param, ' = ', result, ' score= ', score / 5)
    return result
'''

"\ndef validate_model_SVM(estimator, params, validate_param, vals):\n    score = 0\n    for t in tqdm(vals):\n        params[validate_param] = t\n        tempScore = 0\n        for train_split, test_split in kf.split(X_train, y_train):\n            X_tr = X_train[train_split]\n            y_tr = y_train[train_split]\n            X_tst = X_train[test_split]\n            y_tst = y_train[test_split]\n            clf = estimator(**params)\n            res = np.zeros((X_tst.shape[0],))\n            for i in np.unique(X_tr[:, 150]):\n                ind = np.where(X_tr[:, 150] == i)\n                clf.fit(X_tr[ind, :150][0], y_tr[ind])\n                ind_test = np.where(X_tst[:, 150] == i)\n                res[ind_test] = (clf.predict(X_tst[ind_test, :150][0])[:] * (-1) + 1) // 2\n            #clf.fit(X_tr, y_tr)\n            res = res.reshape((-1,1))\n            #res = clf.predict(X_tst)\n            tempScore += f1_score(y_tst, res)\n        if (tempScore > score):\n            score 

In [249]:
clf = OneClassSVM()

In [250]:
#SVM has our '0' as '1' and '1' as '-1'
params = {'kernel': 'rbf'}
params['nu'] = validate_model_SVM(OneClassSVM, params, 'nu', np.linspace(0.00001,0.99999,10))
params['gamma'] = validate_model_SVM(OneClassSVM, params, 'gamma', ['scale', 'auto'])
paramsSVM = params

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [07:34<00:00, 45.47s/it]
  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

nu  =  0.5555544444444445  score=  0.505334953864634


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:52<00:00, 56.33s/it]

gamma  =  scale  score=  0.505343511446647





In [251]:
clf = OneClassSVM(**paramsSVM)
clf.fit(X_train, y_train)

OneClassSVM(nu=0.5555544444444445)

In [252]:
#np.abs(predict - 1) // 2 for good result
f1_score((np.abs(clf.predict(X_test) - 1) // 2), y_test)

0.4544502617801046

In [220]:
from sklearn.ensemble import IsolationForest

In [228]:
clf = IsolationForest()

In [237]:
def validate_model_Is_For(estimator, params, validate_param, vals):
    score = 0
    for t in tqdm(vals):
        params[validate_param] = t
        tempScore = 0
        for train_split, test_split in kf.split(X_train, y_train):
            X_tr = X_train[train_split]
            y_tr = y_train[train_split]
            X_tst = X_train[test_split]
            y_tst = y_train[test_split]
            clf = estimator(**params)
            clf.fit(X_tr, y_tr)
            my_clf_pred = clf.predict(X_tst)
            my_clf_pred = (my_clf_pred * (-1)  + 1) // 2
            tempScore += f1_score(y_tst, my_clf_pred)
        if (tempScore > score):
            score = tempScore
            result = t
    print(validate_param, ' = ', result, ' score= ', score / 5)
    return result

In [238]:
params = {'bootstrap': 'true', 'n_jobs': -1}
params['n_estimators'] = validate_model_Is_For(IsolationForest, params, 'n_estimators', 
                                            range(100,200,10))
params['max_samples'] = validate_model_Is_For(IsolationForest, params, 'max_samples', 
                                            np.linspace(256,1200,10).astype('int'))
params['contamination'] = validate_model_Is_For(IsolationForest, params, 'contamination', 
                                            np.linspace(0.0001,0.1000,10))
params['max_features'] = validate_model_Is_For(IsolationForest, params, 'max_features', 
                                            [np.sqrt(150).astype('int'), 1.0])
paramsIsol = params

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:52<00:00,  5.23s/it]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

n_estimators  =  160  score=  0.47172296880880404


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:00<00:00,  6.08s/it]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

max_samples  =  256  score=  0.4664162137075231


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [02:04<00:00, 12.49s/it]
  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

contamination  =  0.1  score=  0.35822085138489923


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:16<00:00,  8.43s/it]

max_features  =  1.0  score=  0.36853681570956104





In [239]:
clf = IsolationForest(**paramsIsol)
clf.fit(X_train, y_train)

IsolationForest(bootstrap='true', contamination=0.1, max_samples=256,
                n_estimators=160, n_jobs=-1)

In [243]:
#np.abs(predict * (-1) + 1) // 2 for good result
f1_score(((clf.predict(X_test) * (-1) + 1) // 2), y_test)

0.34101382488479265

In [257]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [258]:
clf1 = IsolationForest(bootstrap=True, n_jobs=-1, n_estimators=160)
clf1.fit(X_train, y_train)
pred1 = (clf1.predict(X_test) * (-1) + 1) // 2

In [260]:
clf2 = OneClassSVM(**paramsSVM)
clf2.fit(X_train, y_train)
pred2 = (clf2.predict(X_test) * (-1) + 1) // 2

In [265]:
clf3 = xgb.XGBClassifier(**paramsLogistic)
clf3.fit(X_train, y_train)
pred3 = clf3.predict(X_test)



In [266]:
clf4 = xgb.XGBClassifier(**paramsGbTree)
clf4.fit(X_train, y_train)
pred4 = clf4.predict(X_test)



In [287]:
vote = (0.4 * pred3 + 0.6 * pred4)#(0.2 * pred1 + 0.1 * pred2 + 0.35 * pred3 + 0.35 * pred4)

In [288]:
np.unique(vote)

array([0. , 0.4, 0.6, 1. ])

In [295]:
vote = vote > 0.4

In [297]:
df_test['target'] = vote.astype('int')

In [298]:
df_test[['pair_id', 'target']].to_csv('new_prediction3.csv', index=False)