In [None]:
import sys

# Project folder on the mounted Google Drive; data and model paths in later
# cells are built from this prefix (note the trailing slash).
FOLDER_PATH = '/content/drive/MyDrive/Colab Notebooks/project/'

# Make modules stored in the project folder importable. The membership check
# keeps a re-run of this cell from appending the same entry to sys.path again.
if FOLDER_PATH not in sys.path:
    sys.path.append(FOLDER_PATH)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project folder under /content/drive/MyDrive/... becomes readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -r drive/MyDrive/Colab\ Notebooks/project/requirements

In [None]:
import re
import imp
import time
import pickle
import math

import numpy as np
import pandas as pd
import scipy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce

from pandarallel import pandarallel
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
pandarallel.initialize(progress_bar=False)

from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression

  import pandas.util.testing as tm


INFO: Pandarallel will run on 1 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
def auc_group(df, pred_col='pred'):
    """Return the ROC AUC of one group of rows.

    Intended for ``DataFrame.groupby(...).apply(auc_group)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single group; must contain the ``is_bad`` target column and
        the score column named by ``pred_col``.
    pred_col : str, default 'pred'
        Name of the column holding the predicted scores. The default keeps the
        column this function originally read.

    Returns
    -------
    float
        ``roc_auc_score(df['is_bad'], df[pred_col])``, or 0.5 when
        ``roc_auc_score`` raises ``ValueError``.
    """
    try:
        return roc_auc_score(df['is_bad'], df[pred_col])
    except ValueError:
        # AUC is undefined for this group (for instance when it holds a single
        # class); fall back to the chance-level value instead of failing.
        return 0.5

def loguniform(low=0, high=1, size=None):
    """Draw samples whose natural logarithm is uniform on [low, high).

    ``low`` and ``high`` are bounds on the exponent, not on the returned
    value: the result lies between ``exp(low)`` and ``exp(high)``. ``size`` is
    forwarded to ``np.random.uniform`` (``None`` yields a single scalar).
    Samples come from NumPy's global random state.
    """
    exponents = np.random.uniform(low=low, high=high, size=size)
    return np.exp(exponents)

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of a scalar.

    Uses the numerically stable two-branch form: ``math.exp`` is only ever
    called with a non-positive argument, so large-magnitude negative inputs
    return 0.0 instead of raising ``OverflowError``.

    Parameters
    ----------
    x : float
        A real-valued score (logit).

    Returns
    -------
    float
        Value in the closed interval [0, 1].
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For x < 0 rewrite as exp(x) / (1 + exp(x)); exp(x) underflows to 0.0
    # for very negative x rather than overflowing.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

In [None]:
def _read_pipe_csv(path):
    """Read a pipe-delimited CSV file into a DataFrame."""
    return pd.read_csv(path, sep='|')


# Prepared splits: keep only the row id, the category and the target.
# NOTE(review): these three paths resolve to a `_data/` folder (leading
# underscore) inside the project folder — confirm that is the intended name.
label_columns = ['id', 'category', 'is_bad']
df_train = _read_pipe_csv('{}_data/train_prep.csv'.format(FOLDER_PATH))[label_columns]
df_val = _read_pipe_csv('{}_data/val_prep.csv'.format(FOLDER_PATH))[label_columns]
df_test = _read_pipe_csv('{}_data/test_prep.csv'.format(FOLDER_PATH))[label_columns]

# Per-row scores previously exported by the CatBoost model.
cat_train = _read_pipe_csv('{}catboost_scores/X_train.csv'.format(FOLDER_PATH))
cat_val = _read_pipe_csv('{}catboost_scores/X_val.csv'.format(FOLDER_PATH))
cat_test = _read_pipe_csv('{}catboost_scores/X_test.csv'.format(FOLDER_PATH))

# Per-row scores previously exported by the BERT model (files under epoch_2/).
bert_train = _read_pipe_csv('{}bert_scores/epoch_2/X_train.csv'.format(FOLDER_PATH))
bert_val = _read_pipe_csv('{}bert_scores/epoch_2/X_val.csv'.format(FOLDER_PATH))
bert_test = _read_pipe_csv('{}bert_scores/epoch_2/X_test.csv'.format(FOLDER_PATH))

In [None]:
# The score files carry a leftover 'Unnamed: 0' column; remove it from all six
# frames. errors='ignore' makes the cell safe to re-run: a frame that has
# already lost the column is left alone instead of raising KeyError.
for score_frame in (cat_train, cat_val, cat_test,
                    bert_train, bert_val, bert_test):
    score_frame.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [None]:
# Preview the first rows of both score tables (CatBoost first, then BERT).
display(cat_train.head(), bert_train.head())

Unnamed: 0,id,predict_catboost
0,id_0336875,9.2e-05
1,id_0602099,0.007664
2,id_0681935,0.937569
3,id_0092414,0.013666
4,id_0642364,0.000728


Unnamed: 0,id,predict_bert
0,id_0336875,-3.11343
1,id_0602099,-2.889626
2,id_0681935,2.650685
3,id_0092414,-1.899409
4,id_0642364,-3.113426


In [None]:
def _join_scores(labels, catboost_scores, bert_scores):
    """Inner-join a label frame with both models' score frames on `id`."""
    with_catboost = labels.merge(catboost_scores, how='inner', on=['id'])
    return with_catboost.merge(bert_scores, how='inner', on=['id'])


# One blended table per split: labels + CatBoost score + BERT score.
train = _join_scores(df_train, cat_train, bert_train)
val = _join_scores(df_val, cat_val, bert_val)
test = _join_scores(df_test, cat_test, bert_test)

train.head()

Unnamed: 0,id,category,is_bad,predict_catboost,predict_bert
0,id_0336875,Транспорт,0,9.2e-05,-3.11343
1,id_0602099,Транспорт,0,0.007664,-2.889626
2,id_0681935,Для дома и дачи,1,0.937569,2.650685
3,id_0092414,Транспорт,0,0.013666,-1.899409
4,id_0642364,Личные вещи,0,0.000728,-3.113426


In [None]:
# Squash the BERT scores through the sigmoid, overwriting the column in place.
# Running this cell a second time would apply the sigmoid twice.
for split_frame in (train, val, test):
    split_frame['predict_bert'] = split_frame['predict_bert'].parallel_apply(sigmoid)

In [None]:
# Check the validation table after the sigmoid step: predict_bert should now
# lie between 0 and 1, like predict_catboost.
val.head()

Unnamed: 0,id,category,is_bad,predict_catboost,predict_bert
0,id_0638258,Бытовая электроника,0,0.006692,0.043005
1,id_0497348,Личные вещи,1,0.988122,0.945179
2,id_0759873,Транспорт,0,0.091023,0.045253
3,id_0772139,Для дома и дачи,0,0.027595,0.046107
4,id_0248619,Услуги,1,0.368987,0.935957


# VAL DATA: blending model tuned on the validation split

In [None]:
# `loguniform` samples from NumPy's global RNG, so seed it first: otherwise the
# 1000 candidate values change on every run and the search below cannot be
# reproduced even though RandomizedSearchCV itself is given random_state=42.
np.random.seed(42)

# Candidate values for the inverse regularisation strength C, log-uniformly
# spread between exp(-4.7) (about 0.009) and exp(0) = 1.
param_grid = {
    'C': loguniform(-4.7, 0, 1000)}

# Logistic regression used to blend the two base-model scores.
model = LogisticRegression(
    random_state=42,
    n_jobs=-1,
    solver='saga')

In [None]:
%%time
# Randomized search over C: 20 candidates sampled from param_grid, each scored
# by ROC AUC with 5-fold cross-validation. refit=True retrains the best
# candidate on all rows passed to fit(), exposing it as best_estimator_.
randomized_search = RandomizedSearchCV(
    model, 
    param_grid, 
    n_iter=20,
    n_jobs=-1, 
    verbose=4, 
    cv=5,
    scoring='roc_auc', 
    refit=True, 
    random_state=42)

# The search is fitted on the validation split, with the two base-model scores
# as the only features.
randomized_search.fit(val[['predict_catboost', 'predict_bert']], val['is_bad'])

Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: user 2.08 s, sys: 176 ms, total: 2.26 s
Wall time: 28.7 s


In [None]:
# Mean cross-validated ROC AUC of the best candidate found by the search.
randomized_search.best_score_

0.991231477110478

In [None]:
# Hyper-parameters (here only C) of the best candidate.
randomized_search.best_params_

{'C': 0.7812318666281485}

In [None]:
# Score every split with the refitted best estimator; column 1 of
# predict_proba is the probability of the positive class (is_bad == 1).
blend_features = ['predict_catboost', 'predict_bert']
best_blender = randomized_search.best_estimator_
for split_frame in (train, val, test):
    split_frame['pred_val'] = best_blender.predict_proba(
        split_frame[blend_features])[:, 1]

In [None]:
def auc_group(df, pred_col='pred_val'):
    """Return the ROC AUC of one group of rows.

    Intended for ``DataFrame.groupby(...).apply(auc_group)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single group; must contain the ``is_bad`` target column and
        the score column named by ``pred_col``.
    pred_col : str, default 'pred_val'
        Name of the column holding the predicted scores. Exposed as a
        parameter so the helper does not have to be redefined for each
        prediction column.

    Returns
    -------
    float
        ``roc_auc_score(df['is_bad'], df[pred_col])``, or 0.5 when
        ``roc_auc_score`` raises ``ValueError``.
    """
    try:
        return roc_auc_score(df['is_bad'], df[pred_col])
    except ValueError:
        # AUC is undefined for this group (for instance when it holds a single
        # class); fall back to the chance-level value instead of failing.
        return 0.5

In [None]:
# Report the blend's quality on each split: overall ROC AUC, ROC AUC per
# category, and the unweighted mean of the per-category values.
#
# The overall score reads `pred_val`, the only blend column created in this
# notebook and the same one `auc_group` uses; the former `pred_full` column is
# not produced by any cell, so it failed on a fresh kernel. The per-category
# series is computed once per split and reused for the printout and the table.
splits = {'TRAIN': train, 'VAL': val, 'TEST': test}
per_category_auc = {}

for split_name, split_frame in splits.items():
    category_auc = split_frame.groupby(['category']).apply(auc_group)
    per_category_auc[split_name] = category_auc

    print(split_name)
    print(roc_auc_score(split_frame['is_bad'], split_frame['pred_val']))
    print(category_auc)
    print(category_auc.mean())
    print('=' * 20)

# Side-by-side table of per-category AUC, one column per split.
result_auc = pd.concat(
    [per_category_auc['TRAIN'], per_category_auc['VAL'], per_category_auc['TEST']],
    axis=1)
result_auc.columns = ['train', 'val', 'test']
display(result_auc)

TRAIN
0.996127188119837
category
Бытовая электроника    0.994837
Для бизнеса            0.988542
Для дома и дачи        0.993017
Животные               0.997646
Личные вещи            0.996194
Недвижимость           0.997115
Работа                 0.992695
Транспорт              0.997307
Услуги                 0.994496
Хобби и отдых          0.992380
dtype: float64
0.9944229679208245
VAL
0.990969615063672
category
Бытовая электроника    0.986084
Для бизнеса            0.934890
Для дома и дачи        0.986871
Животные               0.994485
Личные вещи            0.992047
Недвижимость           0.993377
Работа                 0.976112
Транспорт              0.994555
Услуги                 0.985620
Хобби и отдых          0.980866
dtype: float64
0.9824908648858228
TEST
0.9819889211023903
category
Бытовая электроника    0.973086
Для бизнеса            0.927591
Для дома и дачи        0.968632
Животные               0.966984
Личные вещи            0.859753
Недвижимость           0.987294
Раб

Unnamed: 0_level_0,train,val,test
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Бытовая электроника,0.994837,0.986084,0.973086
Для бизнеса,0.988542,0.93489,0.927591
Для дома и дачи,0.993017,0.986871,0.968632
Животные,0.997646,0.994485,0.966984
Личные вещи,0.996194,0.992047,0.859753
Недвижимость,0.997115,0.993377,0.987294
Работа,0.992695,0.976112,0.977224
Транспорт,0.997307,0.994555,0.995588
Услуги,0.994496,0.98562,0.950744
Хобби и отдых,0.99238,0.980866,0.956208


In [None]:
# C is fixed to the best value reported by the randomized search above.
params = dict(C=0.7812318666281485)

# Fresh, unfitted blender with the same settings as the one used in the search.
model = LogisticRegression(solver='saga', n_jobs=-1, random_state=42, **params)

In [None]:
# Stack the three splits once and take features and target from that single
# frame, instead of concatenating the same frames twice.
all_splits = pd.concat([train, val, test], axis=0)
X = all_splits[['predict_catboost', 'predict_bert']]
y = all_splits['is_bad']

In [None]:
# Fit the final blender on train + val + test combined. No rows are held out
# at this point, so this model has no unbiased score of its own.
model.fit(X, y)

LogisticRegression(C=0.7812318666281485, n_jobs=-1, random_state=42,
                   solver='saga')

In [None]:
# Persist the fitted blender inside the project folder.
blend_model_path = '{}blend_models/log_reg.pickle'.format(FOLDER_PATH)
with open(blend_model_path, 'wb') as model_file:
    pickle.dump(model, model_file)