In [1]:
import pandas as pd
import numpy as np
import string
import re
from vowpalwabbit import pyvw
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

In [2]:
phone = r'((8\D{0,2}9|7\D{0,2}9|9)\D{0,2}(\d\D{0,2}){9,9})'
nik = r'(@\S*)'
vk = r'(vk\.com\/\w*)'
i_d = r'(id\/\d*)'
site = r'(\w*\.(ru|com|net|org|biz|edu|gov|info|by|рф|бел|ua|укр))'
dis = r'(\w*#\d*)'
CONTACT = '|'.join([phone, nik, vk, i_d, site, dis])

In [3]:
def fill_nan(df):
    median_by_subcat = df.groupby('subcategory').median()
    for sbct in set(df['subcategory']):
        df.loc[(df['subcategory'] == sbct) & (df['price'].isna()), 'price'] = median_by_subcat.loc[sbct]['price']
    return df

In [4]:
def del_out_lognorm(df):
    df = df[df['price'] < 10**7]
    df['price'] = np.log1p(df['price'])
    return df

In [5]:
def get_hour(df):
    df['datetime_submitted'] = pd.to_datetime(df['datetime_submitted'], yearfirst=True)
    df['hour'] = [d.hour for d in df['datetime_submitted']]
    return df.drop('datetime_submitted', axis=1)

In [6]:
def make_logistic(df):
    df['is_bad'] = df['is_bad'].map({1: 1, 0: -1})
    return df

In [7]:
def to_vw_format(text, subcat, cat, price, region, city, hour, label=None):
    text = text.lower()
    table = str.maketrans({key: ' ' for key in string.punctuation + '\n'})

    text = text.translate(table)
    text = re.sub(CONTACT, ' контакт ', text, 0)

    subcat = subcat.replace(' ', '')
    cat = cat.replace(' ', '')
    region = region.replace(' ', '')
    city = city.replace(' ', '')
    return str(label or '') + ' |t ' + text + ' |price:' + str(price) + ' |s ' + subcat \
                            + ' |c ' + cat + ' |r ' + region + ' |ct ' + city + ' |h ' + str(hour) + '\n'

In [8]:
train = pd.read_csv('../tmp/data/train.csv')
val = pd.read_csv('../tmp/data/val.csv')

In [9]:
train = get_hour(train)
train = fill_nan(train)
train = del_out_lognorm(train)
#train = make_logistic(train)
val = get_hour(val)
val = fill_nan(val)
val = del_out_lognorm(val)
#val = make_logistic(val)

In [10]:
val1, test = train_test_split(val, test_size=0.5, random_state = 42)
y_train = train.is_bad.to_numpy()
y_test = test.is_bad.to_numpy()
y_val = val1.is_bad.to_numpy()


In [3]:
model = pyvw.vw(d='../tmp/prepared_vw/train.txt', f='../models/fitted_models/model.vw', b=22, classweight='-1:0.32',
                loss_function='logistic', link='logistic',
                ngram='t3', skips='t2',
                passes=20, cache_file='../models/fitted_models/train.cache', k=True,
                decay_learning_rate=0.85, l1=10**-6, l2=10**-6, ftrl=True,
                random_seed=45)

In [11]:
pred = pyvw.vw(i='../models/fitted_models/model.vw', d='../tmp/prepared_vw/train.txt', p='../tmp/predictions/predictions_train.txt')
pred = pyvw.vw(i='../models/fitted_models/model.vw', d='../tmp/prepared_vw/test.txt', p='../tmp/predictions/predictions_test.txt')
pred = pyvw.vw(i='../models/fitted_models/model.vw', d='../tmp/prepared_vw/val.txt', p='../tmp/predictions/predictions_val.txt')

In [12]:
y_pred_train = pd.read_csv('../tmp/predictions/predictions_train.txt', header=None).to_numpy().T[0]
m = y_pred_train > 0.5
y_pred_train = m.astype(np.int64)

In [13]:
y_pred_test = pd.read_csv('../tmp/predictions/predictions_test.txt', header=None).to_numpy().T[0]
m = y_pred_test > 0.5
y_pred_test = m.astype(np.int64)

In [14]:
y_pred_val = pd.read_csv('../tmp/predictions/predictions_val.txt', header=None).to_numpy().T[0]
m = y_pred_val > 0.5
y_pred_val = m.astype(np.int64)

In [15]:
roc_auc_score(y_train, y_pred_train)

0.957225424672258

In [16]:
accuracy_score(y_train, y_pred_train)

0.9632608285243409

In [17]:
roc_auc_score(y_test, y_pred_test)

0.8420030028037228

In [18]:
accuracy_score(y_test, y_pred_test)

0.8378512192102983

In [19]:
roc_auc_score(y_val, y_pred_val)

0.8388252227203914

In [20]:
accuracy_score(y_val, y_pred_val)

0.8299083931666255