In [1]:
#!mkdir -p ~/.kaggle
#!cp kaggle.json ~/.kaggle/
#!pip install kaggle
#!chmod 600 /root/.kaggle/kaggle.json
#!kaggle competitions download -c copy-of-ml-1-shad-fall-2022
#!unzip copy-of-ml-1-shad-fall-2022.zip

In [2]:
!pip install lightfm



In [3]:
import pandas as pd
import numpy as np 

df_history = pd.read_parquet('history_train.pq')
df_history.head()

Unnamed: 0,category_id,microcategory_id,item_id,event_type,source_event_type,request_id,location_id,event_age_seconds
0,31,189,979366,0,0,136128,5215,2651288
1,31,6321,979366,2,8,136128,161,2651246
2,31,6321,979366,4,8,136128,5215,2651229
3,31,189,11420517,0,0,136128,5215,2651206
4,31,189,15674214,0,0,136128,5215,2651198


In [4]:
from scipy.sparse import coo_matrix

def interactions_list_to_sparse_matrix(df):
    """Build a sparse item-by-request matrix from an interaction log.

    Rows are indexed by ``item_id`` and columns by ``request_id``; each log
    row contributes one entry whose value is its ``source_event_type``.
    The shape is ``(max item_id + 1, max request_id + 1)`` so the raw ids can
    be used directly as matrix indices. Repeated (item, request) pairs are
    kept as separate COO entries.

    Args:
        df: DataFrame with ``item_id``, ``request_id`` and
            ``source_event_type`` columns.

    Returns:
        scipy.sparse.coo_matrix with the layout described above.

    Raises:
        ValueError: if ``df`` has no rows (the maximum id is undefined).
    """
    item_ids = df['item_id'].to_numpy()
    request_ids = df['request_id'].to_numpy()
    values = df['source_event_type'].to_numpy()

    # ndarray.max() is vectorised; the built-in max() would walk the array
    # element by element in Python, which is slow on a large log.
    shape = (item_ids.max() + 1, request_ids.max() + 1)
    return coo_matrix((values, (item_ids, request_ids)), shape=shape)


In [5]:
from lightfm import LightFM
from scipy.sparse import csr_matrix, csc_matrix 


# Factorisation model whose learned parameters are reused further down as
# extra ranking features. No random_state is passed, so repeated runs are not
# guaranteed to give the same parameters.
model_features = LightFM(no_components=30)

# interactions_list_to_sparse_matrix puts item_id on the rows and request_id
# on the columns. LightFM treats rows as "users" and columns as "items", so in
# this notebook get_user_representations() describes catalogue items and
# get_item_representations() describes requests.
# NOTE(review): the matrix values are raw source_event_type codes — confirm
# they are meaningful as interaction strengths for the default loss.
sparse_matrix = interactions_list_to_sparse_matrix(df_history)

model_features.fit(sparse_matrix)


<lightfm.lightfm.LightFM at 0x7ff8a47d1490>

In [6]:
model_features.get_item_representations()[0].shape

(209251,)

In [7]:
!pip install lightgbm



In [8]:
import random
from typing import Optional

import pandas as pd
import lightgbm
from lightgbm.callback import early_stopping

TARGET = 'target'
REQUEST_ID = 'request_id'

In [9]:
pd.__version__

'1.2.2'

In [10]:
def df_to_dict(df: pd.DataFrame):
    """Collect the ``item_id`` values of every request into a dict.

    Returns a mapping ``request id -> list of item ids``, with the items of
    each request listed in the row order of ``df``.
    """
    items_per_request = df.groupby(REQUEST_ID)['item_id'].apply(list)
    return items_per_request.to_dict()

def mapk_df(df_with_prediction: pd.DataFrame, k: int) -> float:
    """Compute MAP@k for a scored ranking frame.

    Args:
        df_with_prediction: frame with ``request_id``, ``item_id``,
            ``prediction`` (higher is better) and ``target`` columns.
        k: ranking cutoff.

    Returns:
        Mean average precision at ``k`` over the requests that have at least
        one row with ``target == 1``; ``0.0`` when there is no such request.
    """
    df_sorted = df_with_prediction.sort_values(['request_id', 'prediction'], ascending=[True, False])
    df_sorted['rank'] = df_sorted.groupby('request_id').cumcount()
    top = df_sorted[df_sorted['rank'] < k]
    pred_dict = df_to_dict(top)
    ans_dict = df_to_dict(df_sorted.query('target == 1'))

    predicted = []
    actual = []
    # The loop variable must not be called `k`: that would overwrite the
    # cutoff with the last request id before it is handed to mapk().
    for request_id, relevant_items in ans_dict.items():
        predicted.append(pred_dict[request_id])
        actual.append(relevant_items)

    if not actual:
        # No positives anywhere: nothing to average over.
        return 0.0

    return mapk(actual, predicted, k)


def apk(actual, predicted, k=30):
    """Average precision at ``k`` for a single query.

    Args:
        actual: collection of relevant items.
        predicted: ranked list of predicted items, best first.
        k: number of leading predictions that are scored.

    Returns:
        Sum of precision values at every rank holding a relevant item seen
        for the first time, divided by ``min(len(actual), k)``; ``0.0`` when
        ``actual`` is empty.
    """
    if not actual:
        return 0.0

    shortlist = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for position, candidate in enumerate(shortlist, start=1):
        if candidate not in actual:
            continue
        # A repeated prediction is only credited at its first occurrence.
        if candidate in shortlist[:position - 1]:
            continue
        hits += 1.0
        precision_sum += hits / position

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=30):
    """Mean of ``apk`` over paired ``actual`` / ``predicted`` lists.

    The i-th entry of ``actual`` is scored against the i-th entry of
    ``predicted``; the total is divided by ``len(actual)``.
    """
    per_query_scores = [
        apk(relevant, ranked, k)
        for relevant, ranked in zip(actual, predicted)
    ]
    n_queries = len(actual)
    return sum(per_query_scores) / n_queries
    

In [11]:
import numpy as np

def get_group_for_lgb(sorted_vals):
    """Return the lengths of consecutive runs of equal values.

    This is the ``group`` layout passed to ``lightgbm.Dataset``: one count per
    query, in row order. Only *adjacent* equal values are merged, so rows that
    belong to the same query must be contiguous in ``sorted_vals``.

    Args:
        sorted_vals: sequence (list or 1-D array) of query ids.

    Returns:
        List of run lengths; an empty list for empty input.
    """
    if len(sorted_vals) == 0:
        # Nothing to group; indexing element 0 below would fail.
        return []

    groups = []
    curr_val = sorted_vals[0]
    cnt = 1
    for val in sorted_vals[1:]:
        if val != curr_val:
            # A new run starts: close the previous one.
            groups.append(cnt)
            curr_val = val
            cnt = 1
        else:
            cnt += 1

    # Close the final run.
    groups.append(cnt)

    return groups



class BaggingLightGBM:
    """Bagging ensemble of LightGBM boosters trained on random row subsets.

    Each of the ``num_estimators`` boosters is trained on the rows of the
    training frame selected by an independent random mask that keeps a row
    with probability ``1 / sqrt(num_estimators)``. Predictions are the mean
    of the individual boosters' predictions.

    ``fit`` reads the module-level names ``features``, ``TARGET`` and
    ``REQUEST_ID``.
    """

    def __init__(self, num_estimators):
        self.num_estimators = num_estimators
        # A row is kept with probability 1 / k, see _get_masks.
        self.k = num_estimators ** 0.5
        self.models = []

    def _get_masks(self, n):
        """Draw one boolean row mask of length ``n`` per estimator."""
        return [
            np.random.choice([True, False], n, p=[1 / self.k, 1 - (1 / self.k)])
            for _ in range(self.num_estimators)
        ]

    def fit(self, train, val):
        """Train ``num_estimators`` boosters, replacing any earlier ensemble.

        Args:
            train: training frame; rows of one request must be contiguous
                because groups are derived from runs of equal request ids.
            val: validation frame used for early stopping of every booster.

        Returns:
            self
        """
        masks = self._get_masks(len(train))
        # Start from an empty ensemble so a second fit() does not mix boosters
        # from different calls.
        self.models = []

        # The validation data is the same for every booster.
        X_val, y_val = val[features], val[TARGET]
        val_groups = get_group_for_lgb(val[REQUEST_ID].values)

        for mask in masks:
            trn_cur = train.iloc[mask]
            X_train, y_train = trn_cur[features], trn_cur[TARGET]
            train_groups = get_group_for_lgb(trn_cur[REQUEST_ID].values)

            lgb_train = lightgbm.Dataset(
                X_train, y_train, categorical_feature=['category_id'],
                group=train_groups, free_raw_data=False
            )
            lgb_eval = lightgbm.Dataset(
                X_val, y_val, reference=lgb_train, categorical_feature=['category_id'],
                group=val_groups, free_raw_data=False
            )

            # NOTE(review): lightgbm.print_evaluation is replaced by
            # lightgbm.log_evaluation in newer LightGBM releases — confirm
            # against the installed version before upgrading.
            model = lightgbm.train(
                {'objective': 'binary', 'metric': 'map', 'eval_at': [8, 30], 'learning_rate': 0.05},
                lgb_train,
                valid_sets=[lgb_eval],
                num_boost_round=500,
                callbacks=[
                    early_stopping(200),
                    lightgbm.print_evaluation(10)
                ],
            )
            self.models.append(model)
        return self

    def predict(self, test):
        """Return the mean prediction of all trained boosters.

        Raises:
            RuntimeError: if no booster has been trained yet.
        """
        if not self.models:
            raise RuntimeError('BaggingLightGBM.predict() called before fit()')
        # Average over the boosters actually present, not the configured count.
        return sum(model.predict(test) for model in self.models) / len(self.models)
        

In [12]:

def train_val_test_split(
        df: pd.DataFrame,
        split_column: str,
        val_part: float = 0.1,
        test_part: float = 0.1,
        seed: int = 32,
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """Split ``df`` into train / validation / test by whole groups.

    The unique values of ``split_column`` are shuffled with the global
    ``random`` generator (re-seeded with ``seed``) and cut into three
    consecutive chunks, so all rows sharing a value land in the same part.

    Args:
        df: frame to split.
        split_column: column whose values define the groups.
        val_part: fraction of groups assigned to validation.
        test_part: fraction of groups assigned to test.
        seed: seed passed to ``random.seed``.

    Returns:
        ``(train, val, test)`` frames, each keeping the row order of ``df``.
    """
    random.seed(seed)
    shuffled_ids = df[split_column].unique().tolist()
    random.shuffle(shuffled_ids)

    n_ids = len(shuffled_ids)
    val_start = int(n_ids * (1 - val_part - test_part))
    test_start = int(n_ids * (1 - test_part))

    # (start, stop) positions of the train, val and test chunks.
    chunk_bounds = [(0, val_start), (val_start, test_start), (test_start, n_ids)]
    group_of_row = df[split_column]
    return tuple(
        df[group_of_row.isin(shuffled_ids[start:stop])]
        for start, stop in chunk_bounds
    )



def train_lightgbm(ranking_df: pd.DataFrame, features: list):
    """Train a LightGBM model on a request-level split and report MAP@30.

    ``ranking_df`` is split 80/10/10 into train / validation / test by
    ``REQUEST_ID``. The booster is trained on the train part with the
    validation part as the evaluation set, then scored on the held-out test
    part and the resulting MAP@30 is printed.

    Args:
        ranking_df: frame with the ``features`` columns plus ``TARGET``,
            ``REQUEST_ID`` and ``item_id``; rows of one request must be
            contiguous because groups are derived from runs of equal ids.
        features: names of the columns used as model inputs.

    Returns:
        The trained ``lightgbm.Booster``.
    """
    train, val, test = train_val_test_split(ranking_df, REQUEST_ID, val_part=0.1, test_part=0.1)
    X_train, y_train, train_groups = train[features], train[TARGET], get_group_for_lgb(train[REQUEST_ID].values)
    X_val, y_val, val_groups = val[features], val[TARGET], get_group_for_lgb(val[REQUEST_ID].values)

    lgb_train = lightgbm.Dataset(
        X_train, y_train, categorical_feature=['category_id'],
        group=train_groups, free_raw_data=False
    )
    lgb_eval = lightgbm.Dataset(
        X_val, y_val, reference=lgb_train, categorical_feature=['category_id'],
        group=val_groups, free_raw_data=False
    )

    # NOTE(review): lightgbm.print_evaluation is replaced by
    # lightgbm.log_evaluation in newer LightGBM releases — confirm against the
    # installed version before upgrading.
    model = lightgbm.train(
        {'objective': 'binary', 'metric': 'map', 'eval_at': [8, 30], 'learning_rate': 0.05},
        lgb_train,
        valid_sets=[lgb_eval],
        num_boost_round=550,
        callbacks=[
            lightgbm.print_evaluation(10)
        ],
    )

    # Score into a new, narrow frame instead of writing a column onto the
    # filtered `test` slice: that assignment triggers SettingWithCopyWarning,
    # and the metric only needs these columns anyway.
    holdout_scored = test[[REQUEST_ID, 'item_id', TARGET]].assign(
        prediction=model.predict(test[features])
    )
    print(f'map@30 on holdout test: {mapk_df(holdout_scored, 30)}')

    return model

In [13]:
!pip install pyarrow 



In [14]:
import random
from typing import Optional

import pandas as pd
import lightgbm
from lightgbm.callback import early_stopping

TARGET = 'target'
REQUEST_ID = 'request_id'

In [15]:
!ls

 copy-of-ml-1-shad-fall-2022.zip      'lightgbm-baseline (2).ipynb'
 data				       lightgbm-baseline.ipynb
 data_cv.zip			       model_17_epoch.pt
 dssm_competition.ipynb		       model_27_epoch.pt
 history_test.pq		       PageRank.ipynb
 history_train.pq		       __pycache__
 history_train_sample.pq	       requests_embeddings_baseline.npy
'homework_part1_detection_(1).ipynb'   requests_test.pq
 homework-part1-segmentation.ipynb     requests_train.pq
'homework_part2 (1) (1).ipynb'	       requests_train_sample.pq
'homework_part2 (1).ipynb'	       seminar_nlp_2.ipynb
 homework_pytorch.ipynb		       test.csv
 items_embeddings_baseline.npy	       test_mod_v3.csv
 items_test.pq			       train_mod.csv
 items_train.pq			       train_mod_v3.csv
 items_train_sample.pq		       Untitled1.ipynb
 kaggle.json			       Untitled2.ipynb
 lgbm_baseline.csv		       Untitled3.ipynb
 lgbm_baseline_old_train.txt	       Untitled.ipynb
 lgbm_svd_baseline_2.csv	       voc_data
 lgbm_svd_base

In [16]:
path_to_data = ''

In [29]:
train = pd.read_parquet(path_to_data + 'items_train.pq')

### Внедряем LightFM

In [30]:
from tqdm.auto import trange

# get_*_representations() returns a (biases, embeddings) pair; [0] is the 1-D
# bias vector. The LightFM matrix has item_id on its rows, so its "user" side
# describes items and its "item" side describes requests.
# Fetch both vectors once: calling the getters inside a per-row lambda
# recomputes them for every row of the frame.
item_biases = model_features.get_user_representations()[0]
request_biases = model_features.get_item_representations()[0]

# Vectorised lookup by raw id; an id beyond the end of a vector raises
# IndexError, as the per-row lookup did. The new columns take the dtype of the
# bias vectors.
train['item_emb_'] = item_biases[train['item_id'].to_numpy()]
train['req_emb_'] = request_biases[train['request_id'].to_numpy()]


In [31]:
request_ids = train.request_id.unique()
len(request_ids)

135412

In [32]:
# Columns fed to LightGBM. The order of this list is the feature order seen by
# the model, so keep it stable between training and inference.
# NOTE(review): only 'category_id' is declared categorical when the LightGBM
# datasets are built; 'microcategory_id' and 'location_id' are passed as plain
# numeric columns — confirm that is intended.
features = [
       'category_id',
       # Scores and ranks of the upstream models a-d.
       'model_a_score',
       'model_a_rank', 
       'model_b_score',
       'model_b_rank', 
       'model_b_f1',
       'model_b_f2',
       'model_b_f3',
       'model_c_score',
       'model_c_rank',
       'model_d_score',
       'model_d_rank', 
       # Item / request attributes.
       'shows_count', 
       'microcategory_id',
       'location_id',
       'price', 
       'sort_age', 
       'start_age',
       'latitude',
       'longitude',
       # Aggregated item statistics.
       'item_stats.days',
       'item_stats.contact_conversion', 
       'item_stats.hides', 
       'item_stats.shows',
       'item_stats.shows_last_day',
       'item_stats.shows_rec',
       'item_stats.shows_rec_last_day', 
       'item_stats.contacts',
       'item_stats.contacts_last_day', 
       'item_stats.contacts_rec_last_day',
       'item_stats.contacts_rec', 
       'item_stats.last_contacts',
       'item_stats.delayed_contacts',
       'item_stats.delayed_contacts_rec',
       'item_stats.clicks', 
       'item_stats.clicks_rec', 
       'item_stats.last_clicks',
       'item_stats.long_clicks',
       'item_stats.clicks_last_day',
       'item_stats.clicks_rec_last_day',
       'item_stats.segment_contact_conversion',
       'item_stats.seller_contact_conversion',
       'item_stats.segment_click_conversion',
       'item_stats.seller_click_conversion',
       # LightFM-derived columns added to the frame in an earlier cell; they
       # hold element [0] of the get_*_representations() results.
       'item_emb_',
       'req_emb_',

]

In [33]:
model = train_lightgbm(train, features)



[LightGBM] [Info] Number of positive: 176182, number of negative: 32322518
[LightGBM] [Info] Total groups: 108329, total data: 32498700
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10353
[LightGBM] [Info] Number of data points in the train set: 32498700, number of used features: 46




[LightGBM] [Info] Total groups: 13541, total data: 4062300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005421 -> initscore=-5.212002
[LightGBM] [Info] Start training from score -5.212002
[10]	valid_0's map@8: 0.102951	valid_0's map@30: 0.117289
[20]	valid_0's map@8: 0.108913	valid_0's map@30: 0.12365
[30]	valid_0's map@8: 0.112405	valid_0's map@30: 0.127569
[40]	valid_0's map@8: 0.115701	valid_0's map@30: 0.131066
[50]	valid_0's map@8: 0.118042	valid_0's map@30: 0.133507
[60]	valid_0's map@8: 0.120364	valid_0's map@30: 0.135926
[70]	valid_0's map@8: 0.122313	valid_0's map@30: 0.13803
[80]	valid_0's map@8: 0.124618	valid_0's map@30: 0.140404
[90]	valid_0's map@8: 0.125718	valid_0's map@30: 0.141832
[100]	valid_0's map@8: 0.127194	valid_0's map@30: 0.143454
[110]	valid_0's map@8: 0.128342	valid_0's map@30: 0.144639
[120]	valid_0's map@8: 0.129149	valid_0's map@30: 0.145622
[130]	valid_0's map@8: 0.130464	valid_0's map@30: 0.146926
[140]	valid_0's map@8: 0.131606	valid_0's map@30: 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['prediction'] = model.predict(test[features])


map@30 on holdout test: 0.15626782954543683


0.15626782954543683 - 0.15128

In [36]:
del train

In [34]:
model.save_model('lgbm_baseline_old_train.txt')

<lightgbm.basic.Booster at 0x7ff8a47d1130>

In [35]:
# load from model:
model = lightgbm.Booster(model_file='lgbm_baseline_old_train.txt')

In [37]:
test = pd.read_parquet(path_to_data + 'items_test.pq')

In [38]:
from tqdm.auto import trange

# get_*_representations() returns a (biases, embeddings) pair; [0] is the 1-D
# bias vector. The LightFM matrix has item_id on its rows, so its "user" side
# describes items and its "item" side describes requests.
# Fetch both vectors once: calling the getters inside a per-row lambda
# recomputes them for every row of the frame.
item_biases = model_features.get_user_representations()[0]
request_biases = model_features.get_item_representations()[0]


def _lookup_bias(biases, ids):
    """Return ``biases[ids]``, with 0 for ids beyond the end of ``biases``."""
    ids = np.asarray(ids)
    known = ids < len(biases)
    values = np.zeros(len(ids), dtype=biases.dtype)
    values[known] = biases[ids[known]]
    return values


# Each id is bounds-checked against the vector it actually indexes; the
# request lookup must be checked against the request-side vector, not the
# item-side one.
test['item_emb_'] = _lookup_bias(item_biases, test['item_id'])
test['req_emb_'] = _lookup_bias(request_biases, test['request_id'])


In [39]:
def create_submission(filename, model, test, k: int = 30):
    """Score ``test`` and write the top-``k`` items per request to a CSV.

    Side effect: a ``pred`` column holding the model scores is added to
    ``test`` in place.

    Args:
        filename: path of the CSV file to write.
        model: object exposing ``feature_name()`` and ``predict(frame)``.
        test: frame with ``request_id``, ``item_id`` and the model's feature
            columns.
        k: number of best-scored items kept for each request.

    The CSV has the columns ``request_id`` and ``item_ids``, the latter being
    the kept item ids joined by single spaces, best first.
    """
    feature_columns = model.feature_name()
    test['pred'] = model.predict(test[feature_columns])

    ranked = test.sort_values(['request_id', 'pred'], ascending=[True, False])
    ranked['rank'] = ranked.groupby('request_id').cumcount()
    best = ranked[ranked['rank'] < k]

    items_by_request = best[['request_id', 'item_id']].groupby('request_id')['item_id']
    joined_ids = items_by_request.apply(lambda ids: ' '.join(map(str, ids)))

    submission = joined_ids.to_frame().reset_index()
    submission = submission.rename(columns={'item_id': 'item_ids'})
    submission.to_csv(filename, index=False)

In [40]:
create_submission('lgbm_svd_baseline_3.csv', model, test)

In [None]:
test.head(10)