In [1]:
#!mkdir -p ~/.kaggle
#!cp kaggle.json ~/.kaggle/
#!pip install kaggle
#!chmod 600 /root/.kaggle/kaggle.json
#!kaggle competitions download -c copy-of-ml-1-shad-fall-2022
#!unzip copy-of-ml-1-shad-fall-2022.zip

In [2]:
!pip install lightgbm



In [3]:
import random
from typing import Optional, Tuple

import lightgbm
import pandas as pd
from lightgbm.callback import early_stopping

TARGET = 'target'
REQUEST_ID = 'request_id'

In [4]:
pd.__version__

'1.2.2'

In [5]:
def df_to_dict(df: pd.DataFrame):
    """Map every request id to the list of its ``item_id`` values.

    Rows are grouped by the ``REQUEST_ID`` column; within a group the items
    keep the row order of ``df``.
    """
    items_per_request = df.groupby(REQUEST_ID)['item_id'].apply(list)
    return items_per_request.to_dict()

def mapk_df(df_with_prediction: pd.DataFrame, k: int) -> float:
    """Compute MAP@k for a scored ranking frame.

    Args:
        df_with_prediction: frame with the columns ``request_id``,
            ``item_id``, ``prediction`` (higher is better) and ``target``
            (1 marks a relevant row).
        k: cutoff; only the ``k`` best-scored items of each request are
            treated as the prediction.

    Returns:
        Mean average precision at ``k`` over the requests that have at least
        one row with ``target == 1``; ``0.0`` when there is no such request.
    """
    df_sorted = df_with_prediction.sort_values([REQUEST_ID, 'prediction'], ascending=[True, False])
    # cumcount() is 0-based, so `rank < k` keeps exactly the k best rows per request.
    df_sorted['rank'] = df_sorted.groupby(REQUEST_ID).cumcount()
    top = df_sorted[df_sorted['rank'] < k]
    pred_dict = df_to_dict(top)
    ans_dict = df_to_dict(df_sorted[df_sorted[TARGET] == 1])

    predicted = []
    actual = []
    # The loop variable must not be called `k`: that would overwrite the
    # cutoff and pass a request id to mapk() instead.
    for request_id, relevant_items in ans_dict.items():
        predicted.append(pred_dict.get(request_id, []))
        actual.append(relevant_items)

    if not actual:
        # No request has a positive row, so there is nothing to average.
        return 0.0

    return mapk(actual, predicted, k)


def apk(actual, predicted, k=30):
    """Average precision at ``k`` for a single query.

    Args:
        actual: collection of relevant items.
        predicted: ranked list of predicted items, best first.
        k: maximum number of predictions taken into account.

    Returns:
        ``0.0`` when ``actual`` is empty, otherwise the sum of precisions at
        each first-time hit divided by ``min(len(actual), k)``.
    """
    if not actual:
        return 0.0

    ranked = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for position, item in enumerate(ranked, start=1):
        # Count an item only if it is relevant and has not already appeared
        # earlier in the ranking.
        if item in actual and item not in ranked[:position - 1]:
            hits += 1.0
            precision_sum += hits / position

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=30):
    """Mean of ``apk`` over paired ``actual`` / ``predicted`` lists.

    The sum runs over ``zip(actual, predicted)`` and is divided by
    ``len(actual)``.
    """
    total = 0
    for relevant_items, ranked_items in zip(actual, predicted):
        total += apk(relevant_items, ranked_items, k)
    return total / len(actual)
    

In [6]:
import numpy as np

def get_group_for_lgb(sorted_vals):
    """Return the lengths of consecutive runs of equal values.

    The result is passed as the ``group`` argument of ``lightgbm.Dataset``
    elsewhere in this notebook.

    Args:
        sorted_vals: sequence (list or 1-D array) in which equal values are
            expected to be adjacent. Values are compared only with their
            neighbour, so a value that reappears after a different one
            starts a new run.

    Returns:
        List of run lengths in order of appearance; ``[]`` for empty input.
    """
    # len() rather than truthiness: a NumPy array has no unambiguous bool value.
    if len(sorted_vals) == 0:
        return []

    groups = []
    run_value = sorted_vals[0]
    run_length = 1
    for val in sorted_vals[1:]:
        if val != run_value:
            groups.append(run_length)
            run_value = val
            run_length = 1
        else:
            run_length += 1

    # Flush the final run, which the loop never closes.
    groups.append(run_length)

    return groups



class BaggingLightGBM:
    """Bagging ensemble of LightGBM boosters trained on random row subsamples.

    Each of the ``num_estimators`` boosters is fitted on an independent random
    subsample of the training rows in which every row is kept with
    probability ``1 / sqrt(num_estimators)``. ``predict`` returns the plain
    average of the individual boosters' predictions.
    """

    def __init__(self, num_estimators, seed=None):
        """
        Args:
            num_estimators: number of boosters to train.
            seed: optional seed for the subsampling masks. With ``None``
                (default) the masks are drawn from NumPy's global random
                state.
        """
        self.num_estimators = num_estimators
        self.k = num_estimators ** 0.5
        self.seed = seed
        self.models = []

    def _get_masks(self, n):
        """Return ``num_estimators`` boolean row masks of length ``n``."""
        rng = np.random if self.seed is None else np.random.RandomState(self.seed)
        keep_prob = 1 / self.k
        return [
            rng.choice([True, False], n, p=[keep_prob, 1 - keep_prob])
            for _ in range(self.num_estimators)
        ]

    def fit(self, train, val, feature_names=None):
        """Train ``num_estimators`` boosters; any previously trained ones are dropped.

        Args:
            train: training frame containing the feature columns, ``TARGET``
                and ``REQUEST_ID``.
            val: validation frame with the same columns; used unchanged as
                the early-stopping set of every booster.
            feature_names: feature columns to use. When omitted, the
                notebook-level ``features`` list is used.

        Returns:
            ``self``.
        """
        if feature_names is None:
            feature_names = features  # notebook-level global defined in a later cell

        # `print_evaluation` was renamed to `log_evaluation`; prefer the new
        # name and fall back for older LightGBM installs.
        log_evaluation = getattr(lightgbm, 'log_evaluation', None) or lightgbm.print_evaluation

        # The validation data is the same for every estimator.
        X_val, y_val = val[feature_names], val[TARGET]
        val_groups = get_group_for_lgb(val[REQUEST_ID].values)

        # Reset so that calling fit() twice does not mix two ensembles.
        self.models = []
        for mask in self._get_masks(len(train)):
            trn_cur = train.iloc[mask]
            X_train, y_train = trn_cur[feature_names], trn_cur[TARGET]
            train_groups = get_group_for_lgb(trn_cur[REQUEST_ID].values)

            lgb_train = lightgbm.Dataset(
                X_train, y_train, categorical_feature=['category_id'],
                group=train_groups, free_raw_data=False
            )
            lgb_eval = lightgbm.Dataset(
                X_val, y_val, reference=lgb_train, categorical_feature=['category_id'],
                group=val_groups, free_raw_data=False
            )

            model = lightgbm.train(
                {'objective': 'binary', 'metric': 'map', 'eval_at': [8, 30], 'learning_rate': 0.05},
                lgb_train,
                valid_sets=[lgb_eval],
                num_boost_round=500,
                callbacks=[
                    early_stopping(200),
                    log_evaluation(10)
                ],
            )
            self.models.append(model)
        return self

    def predict(self, test):
        """Average the predictions of all trained boosters.

        Raises:
            RuntimeError: if no booster has been trained yet.
        """
        if not self.models:
            raise RuntimeError('BaggingLightGBM.predict called before fit: no trained models')
        # Divide by the number of models actually present, not by the
        # configured num_estimators, so the result is always a true mean.
        return sum(model.predict(test) for model in self.models) / len(self.models)
        

In [7]:

def train_val_test_split(
        df: pd.DataFrame,
        split_column: str,
        val_part: float = 0.1,
        test_part: float = 0.1,
        seed: int = 32,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split ``df`` into train/val/test by the unique values of ``split_column``.

    All rows sharing a value of ``split_column`` land in the same part, so the
    three parts never share a group.

    Args:
        df: frame to split.
        split_column: column whose unique values are distributed between the
            parts.
        val_part: fraction of unique values assigned to validation.
        test_part: fraction of unique values assigned to test.
        seed: seed of the shuffle that decides the assignment.

    Returns:
        ``(train, val, test)`` row subsets of ``df``, each keeping the row
        order of ``df``.

    Raises:
        ValueError: if a fraction is negative or the two fractions sum to
            more than 1.
    """
    if val_part < 0 or test_part < 0 or val_part + test_part > 1:
        raise ValueError(
            f'val_part and test_part must be non-negative and sum to at most 1, '
            f'got val_part={val_part}, test_part={test_part}'
        )

    unique_values = df[split_column].unique().tolist()
    # A private Random instance gives the same permutation as seeding the
    # module-level generator, without resetting global random state.
    random.Random(seed).shuffle(unique_values)

    val_start = int(len(unique_values) * (1 - val_part - test_part))
    test_start = int(len(unique_values) * (1 - test_part))
    train_ids = unique_values[:val_start]
    val_ids = unique_values[val_start: test_start]
    test_ids = unique_values[test_start:]

    return (
        df[df[split_column].isin(train_ids)],
        df[df[split_column].isin(val_ids)],
        df[df[split_column].isin(test_ids)],
    )



def train_lightgbm(ranking_df: pd.DataFrame, features: list,
                   params: Optional[dict] = None, num_boost_round: int = 500):
    """Train one LightGBM booster and report MAP@30 on a held-out split.

    ``ranking_df`` is split by ``REQUEST_ID`` into 80% train, 10% validation
    and 10% test. The booster is trained on the train part with the
    validation part as ``valid_sets``, then scored on the test part.

    Args:
        ranking_df: frame containing ``features``, ``TARGET``, ``REQUEST_ID``
            and ``item_id``.
        features: names of the feature columns; ``'category_id'`` is declared
            categorical.
        params: LightGBM training parameters. Defaults to a binary objective
            with the ``map`` metric evaluated at 8 and 30 and a learning rate
            of 0.05.
        num_boost_round: number of boosting rounds.

    Returns:
        The trained booster returned by ``lightgbm.train``.
    """
    if params is None:
        params = {'objective': 'binary', 'metric': 'map', 'eval_at': [8, 30], 'learning_rate': 0.05}

    train, val, test = train_val_test_split(ranking_df, REQUEST_ID, val_part=0.1, test_part=0.1)
    X_train, y_train, train_groups = train[features], train[TARGET], get_group_for_lgb(train[REQUEST_ID].values)
    X_val, y_val, val_groups = val[features], val[TARGET], get_group_for_lgb(val[REQUEST_ID].values)

    lgb_train = lightgbm.Dataset(
        X_train, y_train, categorical_feature=['category_id'],
        group=train_groups, free_raw_data=False
    )
    lgb_eval = lightgbm.Dataset(
        X_val, y_val, reference=lgb_train, categorical_feature=['category_id'],
        group=val_groups, free_raw_data=False
    )

    # `print_evaluation` was renamed to `log_evaluation`; prefer the new name
    # and fall back for older LightGBM installs.
    log_evaluation = getattr(lightgbm, 'log_evaluation', None) or lightgbm.print_evaluation

    model = lightgbm.train(
        params,
        lgb_train,
        valid_sets=[lgb_eval],
        num_boost_round=num_boost_round,
        callbacks=[
            log_evaluation(10)
        ],
    )

    # `test` is a row subset of ranking_df; assign() builds a new frame
    # instead of writing a column into that subset.
    test = test.assign(prediction=model.predict(test[features]))
    print(f'map@30 on holdout test: {mapk_df(test, 30)}')

    return model

In [8]:
!pip install pyarrow 



In [9]:
import random
from typing import Optional

import pandas as pd
import lightgbm
from lightgbm.callback import early_stopping

TARGET = 'target'
REQUEST_ID = 'request_id'

In [10]:
!ls

 copy-of-ml-1-shad-fall-2022.zip      'lightgbm-baseline (2).ipynb'
 data				       lightgbm-baseline.ipynb
 data_cv.zip			       model_17_epoch.pt
 history_test.pq		       model_27_epoch.pt
 history_train.pq		       __pycache__
 history_train_sample.pq	       requests_embeddings_baseline.npy
'homework_part1_detection_(1).ipynb'   requests_test.pq
 homework-part1-segmentation.ipynb     requests_train.pq
'homework_part2 (1) (1).ipynb'	       requests_train_sample.pq
'homework_part2 (1).ipynb'	       seminar_nlp_2.ipynb
 homework_pytorch.ipynb		       test.csv
 items_embeddings_baseline.npy	       train_mod.csv
 items_test.pq			       train_mod_v3.csv
 items_train.pq			       Untitled1.ipynb
 items_train_sample.pq		       Untitled2.ipynb
 kaggle.json			       Untitled3.ipynb
 lgbm_baseline.csv		       Untitled.ipynb
 lgbm_baseline_old_train.txt	       voc_data
 lgbm_svd_baseline.csv		       week1_nets.ipynb


In [11]:
path_to_data = ''

In [12]:
train = pd.read_parquet(path_to_data + 'items_train.pq')

In [13]:
train

Unnamed: 0,item_id,category_id,request_id,rank,model_a_score,model_a_rank,model_b_score,model_b_rank,model_b_f1,model_b_f2,...,item_stats.clicks,item_stats.clicks_rec,item_stats.last_clicks,item_stats.long_clicks,item_stats.clicks_last_day,item_stats.clicks_rec_last_day,item_stats.segment_contact_conversion,item_stats.seller_contact_conversion,item_stats.segment_click_conversion,item_stats.seller_click_conversion
243499,153904,2,0,199,,,,,,,...,201,157,9,32.870800,26,17,0.026808,0.0025,0.035178,0.05
243444,426282,2,0,144,,,,,,,...,949,511,31,101.482094,60,46,0.020660,0.0025,0.035813,0.05
243491,692087,2,0,191,,,,,,,...,1154,692,84,160.471603,128,85,0.023634,0.0025,0.037988,0.05
243448,825959,2,0,148,11.11,16.0,,,,,...,332,183,14,34.140823,23,10,0.008888,0.0025,0.024652,0.05
243548,911885,25,0,248,,,,,,,...,3026,1890,95,475.407562,153,94,0.002561,0.0025,0.033806,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017132,33660072,25,209250,132,,,,,,,...,692,601,99,109.645386,692,601,0.012530,0.0025,0.029092,0.05
1017194,33693594,47,209250,194,,,,,,,...,1222,896,94,101.288750,446,342,0.008212,0.0025,0.020907,0.05
1017117,33697517,1,209250,117,,,0.04,4.0,2.0,0.25,...,166,87,34,16.724236,166,87,0.001459,0.0025,0.035080,0.05
1017151,33874555,1,209250,151,,,0.04,109.0,54.0,0.25,...,2145,1436,228,223.345917,694,470,0.001373,0.0025,0.018022,0.05


### Внедряем SVD

In [14]:
request_embeddings = np.load('requests_embeddings_baseline.npy')
items_embeddings = np.load('items_embeddings_baseline.npy')



In [15]:
items_embeddings.shape

(209251, 6)

In [16]:
from tqdm.auto import trange

# Attach one column per embedding dimension to `train`.
# item_emb_* is read from `request_embeddings` indexed by item_id, and
# req_emb_* from `items_embeddings` indexed by request_id.
# NOTE(review): the array names look swapped relative to what they hold
# (`items_embeddings` has 209251 rows, which matches the request_id range
# rather than the item_id range) -- confirm how the .npy files were written
# before "fixing" this lookup.
item_idx = train['item_id'].to_numpy()
req_idx = train['request_id'].to_numpy()

# One vectorised lookup per column instead of a Python call per row.
for i in trange(items_embeddings.shape[1]):
    train[f'item_emb_{i}'] = request_embeddings[item_idx, i]
    train[f'req_emb_{i}'] = items_embeddings[req_idx, i]


  0%|          | 0/6 [00:00<?, ?it/s]

In [17]:
request_ids = train.request_id.unique()
len(request_ids)

135412

In [18]:
features = [
       'category_id',
       'model_a_score',
       'model_a_rank', 
       'model_b_score',
       'model_b_rank', 
       'model_b_f1',
       'model_b_f2',
       'model_b_f3',
       'model_c_score',
       'model_c_rank',
       'model_d_score',
       'model_d_rank', 
       'shows_count', 
       'microcategory_id',
       'location_id',
       'price', 
       'sort_age', 
       'start_age',
       'latitude',
       'longitude',
       'item_stats.days',
       'item_stats.contact_conversion', 
       'item_stats.hides', 
       'item_stats.shows',
       'item_stats.shows_last_day',
       'item_stats.shows_rec',
       'item_stats.shows_rec_last_day', 
       'item_stats.contacts',
       'item_stats.contacts_last_day', 
       #'item_stats.contacts_rec_last_day',
       'item_stats.contacts_rec', 
       'item_stats.last_contacts',
       #'item_stats.delayed_contacts',
       #'item_stats.delayed_contacts_rec',
       'item_stats.clicks', 
       #'item_stats.clicks_rec', 
       'item_stats.last_clicks',
       'item_stats.long_clicks',
       'item_stats.clicks_last_day',
       'item_stats.clicks_rec_last_day',
       #'item_stats.segment_contact_conversion',
       'item_stats.seller_contact_conversion',
       'item_stats.segment_click_conversion',
       'item_stats.seller_click_conversion',
       'item_emb_0',
       'item_emb_1',
       'item_emb_2',
       'item_emb_3',
       'item_emb_4',
       'item_emb_5',
       'req_emb_0',
       'req_emb_1',
       'req_emb_2',
       'req_emb_3',
       'req_emb_4',
       'req_emb_5',

]

In [None]:
model = train_lightgbm(train, features)



[LightGBM] [Info] Number of positive: 176182, number of negative: 32322518
[LightGBM] [Info] Total groups: 108329, total data: 32498700
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11753
[LightGBM] [Info] Number of data points in the train set: 32498700, number of used features: 51




[LightGBM] [Info] Total groups: 13541, total data: 4062300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005421 -> initscore=-5.212002
[LightGBM] [Info] Start training from score -5.212002
[10]	valid_0's map@8: 0.102492	valid_0's map@30: 0.117225
[20]	valid_0's map@8: 0.108622	valid_0's map@30: 0.123671
[30]	valid_0's map@8: 0.11119	valid_0's map@30: 0.126642
[40]	valid_0's map@8: 0.113889	valid_0's map@30: 0.129379
[50]	valid_0's map@8: 0.115903	valid_0's map@30: 0.131497
[60]	valid_0's map@8: 0.117045	valid_0's map@30: 0.132787
[70]	valid_0's map@8: 0.117929	valid_0's map@30: 0.133805
[80]	valid_0's map@8: 0.11972	valid_0's map@30: 0.135653
[90]	valid_0's map@8: 0.120495	valid_0's map@30: 0.136565
[100]	valid_0's map@8: 0.12097	valid_0's map@30: 0.137212
[110]	valid_0's map@8: 0.121934	valid_0's map@30: 0.138275
[120]	valid_0's map@8: 0.1228	valid_0's map@30: 0.139261
[130]	valid_0's map@8: 0.123783	valid_0's map@30: 0.140323
[140]	valid_0's map@8: 0.124376	valid_0's map@30: 0.14

In [None]:
del train

In [None]:
#model.save_model('lgbm_baseline_old_train.txt')

In [None]:
# load from model:
#model = lightgbm.Booster(model_file='lgbm_baseline_old_train.txt')

In [None]:
test = pd.read_parquet(path_to_data + 'items_test.pq')

In [None]:
from tqdm.auto import trange

# Attach one column per embedding dimension to `test`, using the same lookup
# as for `train`: item_emb_* comes from `request_embeddings` indexed by
# item_id, req_emb_* from `items_embeddings` indexed by request_id.
# Ids at or beyond the end of the corresponding array get 0 for every
# dimension.
item_idx = test['item_id'].to_numpy()
req_idx = test['request_id'].to_numpy()
item_known = item_idx < request_embeddings.shape[0]
req_known = req_idx < items_embeddings.shape[0]

# Out-of-range ids are redirected to row 0 so the indexing is valid; their
# values are replaced by 0 in the np.where below.
safe_item_idx = np.where(item_known, item_idx, 0)
safe_req_idx = np.where(req_known, req_idx, 0)

# One vectorised lookup per column instead of a Python call per row.
for i in trange(items_embeddings.shape[1]):
    test[f'item_emb_{i}'] = np.where(item_known, request_embeddings[safe_item_idx, i], 0)
    test[f'req_emb_{i}'] = np.where(req_known, items_embeddings[safe_req_idx, i], 0)


In [None]:
def create_submission(filename, model, test, k: int = 30):
    """Score ``test`` and write the top-``k`` items of every request to a CSV.

    Args:
        filename: path of the CSV file to write.
        model: either an object exposing ``feature_name()`` and ``predict()``
            (such as the booster returned by ``train_lightgbm``) or an
            ensemble with a ``models`` list of such objects and its own
            ``predict()`` (such as ``BaggingLightGBM``).
        test: frame with ``request_id``, ``item_id`` and the model's feature
            columns. A ``pred`` column with the scores is added to it in
            place.
        k: number of items kept per request.

    The output has the columns ``request_id`` and ``item_ids``; ``item_ids``
    is a space-separated list ordered from highest to lowest score.
    """
    # An ensemble exposes its boosters through `.models`; a single booster is
    # used directly. Only the feature names are taken from it.
    booster = model.models[0] if hasattr(model, 'models') else model
    feature_names = booster.feature_name()

    test['pred'] = model.predict(test[feature_names])
    test_sorted = test.sort_values(['request_id', 'pred'], ascending=[True, False])
    # cumcount() is 0-based, so `rank < k` keeps the k best rows per request.
    test_sorted['rank'] = test_sorted.groupby('request_id').cumcount()
    top = test_sorted[test_sorted['rank'] < k]
    subm = top[['request_id', 'item_id']].groupby('request_id')['item_id'].apply(lambda x: ' '.join(map(str, x)))
    subm_df = subm.to_frame().reset_index().rename(columns={'item_id': 'item_ids'})
    subm_df.to_csv(filename, index=False)

In [None]:
create_submission('lgbm_svd_baseline.csv', model, test)

In [None]:
test.head(10)