In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from catboost import CatBoostRegressor
import pyltr
import pickle

%matplotlib inline

In [2]:
import keras
from keras.models import load_model
from keras.losses import binary_crossentropy, mean_squared_error, mean_squared_error

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [3]:
import building, training, models, callbacks, wonder, generator, ranking

In [4]:
from importlib import reload

In [5]:
# Re-import the project modules so that edits made on disk are picked up
# without restarting the kernel.
for _module in (building, training, models, callbacks, wonder, generator):
    reload(_module)
# Kept as the cell's last expression so the reloaded module is still echoed.
reload(ranking)

<module 'ranking' from '/Users/sergmiller/Documents/code/python/alissa/ranking.py'>

In [6]:
# Embeddings are built first, then the sentence-level CSV splits are read.
emb = building.make_emb('datasets/ru.csv')

# Every split is read the same way: parse the CSV and replace missing cells
# with empty strings.
train, val, boost, test = (
    pd.read_csv(csv_path).fillna('')
    for csv_path in (
        'datasets/sents/train_sents',
        'datasets/sents/val_sents',
        'datasets/sents/boost_sents',
        'datasets/sents/test_sents',
    )
)

# The public frame additionally uses its saved 'Unnamed: 0' column as index.
public = pd.read_csv('datasets/public_df', index_col='Unnamed: 0').fillna('')

In [47]:
# Stack the train and val frames row-wise into a single frame. The original
# row labels are kept (no ignore_index), so index values may repeat.
train_val = pd.concat([train, val], axis=0)

In [48]:
# Sanity check: (rows, columns) of the combined train + val frame.
train_val.shape

(78757, 8)

In [7]:
# Load the saved Keras model from disk.
model = load_model('weights/golden/weights.02-0.985.hdf5')

# Recompile with one loss per named output. The 'auto3' and 'auto4' losses
# carry weight 0, so only the 'class_out' binary cross-entropy contributes
# to the total loss.
# NOTE(review): 'auto3'/'auto4' look like auxiliary autoencoder outputs -
# confirm against models.py.
model.compile(
    loss={'class_out': binary_crossentropy, 
          'auto3' : mean_squared_error, 
          'auto4': mean_squared_error},
    loss_weights = {'class_out': 1, 
          'auto3' : 0, 
          'auto4': 0},
    optimizer='rmsprop',
)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


must load end!

In [9]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,94846308296151,"я пойду , э , освежусь .","сделай массажный душ , джамбо .",хмм ?,0,"да , в ванной .",good,0.942226
1,94846308296151,"я пойду , э , освежусь .","сделай массажный душ , джамбо .",хмм ?,1,я не грязный .,bad,0.812665
2,94846308296151,"я пойду , э , освежусь .","сделай массажный душ , джамбо .",хмм ?,2,"так , будь готова .",good,0.488691
3,94846308296151,"я пойду , э , освежусь .","сделай массажный душ , джамбо .",хмм ?,3,на тот холм .,bad,0.996344
4,94846308296151,"я пойду , э , освежусь .","сделай массажный душ , джамбо .",хмм ?,4,ты идешь внутрь .,bad,0.506997


In [10]:
# Preview the first rows of the public (submission) frame.
public.head()

Unnamed: 0,0,1,2,3,4,5
0,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,0,неа .
1,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,1,"нет , не хочу ."
2,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,2,нет .
3,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,3,"конечно , нет ."
4,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,4,"разумеется , нет ."


In [63]:
# (rows, columns) of the boost split.
boost.shape

(13899, 8)

In [206]:
def run_model(train, val, emb, epochs=20, batch_size=64, sample_len=10000, load_file=None, save_file=None):
    """Train the network, or continue training it from a saved checkpoint.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Training and validation frames. Column '6' is compared with 'good'
        to build the boolean target, column '0' is the key used for
        group-wise shuffling, and column '7' of `train` is passed to
        `training.flow` as the fifth argument (presumably per-row sample
        weights - confirm in training.py).
    emb
        Embedding object forwarded to `training.flow` and to the callbacks.
    epochs : int
        Number of epochs passed to `fit_generator`.
    batch_size : int
        Batch size used by the generators and to derive the step counts.
    sample_len : int
        Number of rows of the group-shuffled train frame handed to the
        ROC / NDCG callbacks.
    load_file : str or None
        Path of a saved Keras model to resume from. When None a fresh model
        is built with `models.model((40, 300), eps=1e-5)`.
    save_file : str or None
        If given, the model is saved to this path after training.

    Returns
    -------
    None
        The model is only persisted through the per-epoch checkpoints and
        the optional `save_file`.
    """
    if load_file is None:
        # NOTE(review): no compile() here - models.model is presumably
        # returning an already compiled model; confirm in models.py.
        model = models.model((40, 300), eps=1e-5)
    else:
        model = load_model(load_file)

        # Only the classification output contributes to the total loss; the
        # 'auto3' / 'auto4' losses are weighted by 0.
        model.compile(
            loss={'class_out': binary_crossentropy,
                  'auto3': mean_squared_error,
                  'auto4': mean_squared_error},
            loss_weights={'class_out': 1,
                          'auto3': 0,
                          'auto4': 0},
            optimizer='rmsprop',
        )

    filepath = "weights/weights.{epoch:02d}-{val_loss:.3f}.hdf5"

    y_train = (train['6'] == 'good').values
    y_val = (val['6'] == 'good').values
    w_train = train['7']

    # Fixed-seed subsample of the training data used by the metric callbacks.
    sample_train = building.shuffle_by_groups(train, '0', random_state=0)[:sample_len]

    # Keras documents the step counts as integers; round up so that the last
    # partial batch is still consumed.
    steps_per_epoch = int(np.ceil(train.shape[0] / batch_size))
    validation_steps = int(np.ceil(val.shape[0] / batch_size))

    model.fit_generator(
        training.flow(train, emb, batch_size, y_train, w_train),
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        shuffle=True,
        callbacks=[
            callbacks.TelegramCallback(),
            callbacks.RocCallback(sample_train, val, emb),
            callbacks.NDCGCallback(sample_train, val, emb),
            keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.9,
                                              patience=3, min_lr=1e-5),
            keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0,
                                            save_best_only=False, save_weights_only=False,
                                            mode='auto', period=1),
        ],
        # Bug fix: the validation generator must be paired with the
        # validation labels (the original passed y_train here).
        validation_data=training.flow(val, emb, batch_size, y_val),
        validation_steps=validation_steps,
    )

    if save_file is not None:
        model.save(save_file)

In [207]:
# Continue training from a saved checkpoint with the default epochs and batch
# size. run_model returns nothing and keeps its model local, so the global
# `model` used by later cells is not replaced by this call.
run_model(train, val, emb, load_file='weights/weights.06-2.292.hdf5')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [196]:
# Score the test split with the network alone; with use_y=True the recorded
# output also reports the NDCG mean and standard deviation.
training.predict_with_nn(model, test, emb, use_y=True)

ndcg_mean: 0.9033897938335989, ndcg_std: 0.14468554771085015


array([[0.6410377 ],
       [0.97201127],
       [0.58125263],
       ...,
       [0.61131245],
       [0.74294466],
       [0.7219546 ]], dtype=float32)

у недоученной модели (автоэнкодер): NDCG 0.892 ± 0.150, лидерборд — 0.84742

у четкой модели: NDCG 0.903 ± 0.145, лидерборд — 0.84497

In [54]:
# Same network-only evaluation, this time on the boost split.
training.predict_with_nn(model, boost, emb, use_y=True)

ndcg_mean: 0.8998286971054842, ndcg_std: 0.14367885667958052


array([[0.4160223 ],
       [0.35829148],
       [0.3237929 ],
       ...,
       [0.29188445],
       [0.8074343 ],
       [0.60766774]], dtype=float32)

In [98]:
# Network-only predictions for the public frame; use_y=False because the
# public frame has no label column to evaluate against.
public_pred = training.predict_with_nn(model, public, emb, use_y=False)

In [99]:
# Build the submission from the public frame and its predictions.
# NOTE(review): presumably also writes the CSV at the given path - confirm in
# wonder.py.
sub = wonder.create_sub(public, public_pred, 'submissions/sub_only_with_tuned_model0.csv')

## pyltr

In [9]:
# Restore a previously pickled ranker; the pickle holds an indexable object
# whose first element is the model.
# NOTE(review): pickle.load can execute arbitrary code from the file - only
# load pickles you produced yourself. Also note that the next cell rebinds
# `rank_model` to a freshly configured LambdaMART.
with open('model_pyltr', 'rb') as f:
    rank_model = pickle.load(f)[0]

In [38]:
# Ranking metric: NDCG truncated at the top 6 positions, identity gain.
metric = pyltr.metrics.NDCG(k=6, gain_type='identity')

# LambdaMART ranker optimising the metric above. Each split considers 30% of
# the features (max_features=0.3), all rows are used for every tree
# (subsample=1), and trees are limited to 8 leaves with at least 100 samples
# per leaf.
rank_model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=1000,
    learning_rate=0.05,
    max_features=0.3,
    subsample=1,
    random_state=1,
    max_leaf_nodes=8,
    min_samples_leaf=100,
    verbose=1,
)

In [40]:
# Fit the pyltr ranker on top of the network, using the boost split.
# The helper returns a result object and the ranker.
res, rank_model = generator.train_pyltr(
    rank_model,
    metric,
    model,
    emb,
    boost,
    model_file='rank_model',
    use_cache=True,
    stop_after=500,
)

In [33]:
# Show the result returned by the pyltr training helper.
# NOTE(review): the pair looks like (ndcg_mean, ndcg_std) - confirm in
# generator.py.
res

(0.9172111600057461, 0.1250853174634983)

In [34]:
# Evaluate the two-level stack (network + ranker) on the test split;
# use_y=True makes the helper report NDCG, as seen in the recorded output.
y_pred = training.predict_with_stack_model(rank_model, model, test, emb, use_y=True)

  output=model.get_layer(model.layers[5].name).output)


ndcg_mean: 0.9017489184385077, ndcg_std: 0.14364126367230878


## catboost

In [44]:
# CatBoost second-level ranker trained with the YetiRank objective.
# NOTE(review): use_best_model=True needs a validation set at fit time; the
# fit call is inside generator.train_catboost, which is not visible here -
# confirm that an eval set is supplied there.
rank_model = CatBoostRegressor(
                            iterations=1000, 
                            depth=6, 
                            rsm=1, 
                            l2_leaf_reg=6,
                            thread_count=4,
                            learning_rate= 1e-2,
                            eval_metric='YetiRank',
                            loss_function='YetiRank',
                            random_state=1,
                            bagging_temperature=0.1,
                            use_best_model=True,
                                  )

In [49]:
# Pick up on-disk edits to generator.py without restarting the kernel.
reload(generator)

<module 'generator' from '/Users/sergmiller/Documents/code/python/alissa/generator.py'>

In [None]:
# Fit the CatBoost ranker on top of the network; the helper receives both the
# combined train+val frame and the boost split and returns a result object
# together with the ranker. Caching is disabled for this run.
res, rank_model = generator.train_catboost(
    rank_model,
    model,
    emb,
    train_val,
    boost,
    model_file='rank_model_cat',
    use_cache=False,
)

In [34]:
# Show the result returned by the CatBoost training helper.
# NOTE(review): the pair looks like (ndcg_mean, ndcg_std) - confirm in
# generator.py.
res

(0.9075720853700862, 0.13469497977927758)

In [57]:
# Evaluate the network + CatBoost stack on the test split; use_y=True makes
# the helper report NDCG, as seen in the recorded output.
y_pred = training.predict_with_stack_model(rank_model, model, test, emb, use_y=True)

ndcg_mean: 0.8962751367496137, ndcg_std: 0.15006599666691117


In [28]:
# Stack-model predictions for the public frame (use_y is left at its default).
public_pred = training.predict_with_stack_model(rank_model, model, public, emb)

  output=model.get_layer(model.layers[5].name).output)


In [46]:
# Build the submission for the two-level model from the public predictions.
# NOTE(review): presumably also writes the CSV at the given path - confirm in
# wonder.py.
sub = wonder.create_sub(public, public_pred, 'submissions/sub_pfound_1068_2level_model.csv')