In [None]:
%run boilerplate.py

In [None]:
import pandas as pd
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)
import lightgbm as lgb
import numpy as np
from tqdm import tqdm

In [None]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=DataConversionWarning)
import matplotlib


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile
from sklearn.preprocessing import TargetEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold,train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer


In [None]:
import shap

## Reading data

In [None]:
movies_ratings = pd.read_csv(DIR_DATA.joinpath('movies_ratings.csv'))

In [None]:
movies_ratings = movies_ratings.fillna(movies_ratings.median())

In [None]:
tags = pd.read_csv("data/tags.csv")

### General Variables

In [None]:
genre_cols = {'Fantasy', 'Children', 'Animation', 'Comedy', 'Adventure', 'Romance',
       'Drama', 'Action', 'Thriller', 'Crime', 'Horror', 'Mystery', 'Sci-Fi',
       'War', 'Musical', 'Documentary', 'IMAX', 'Western'}
catg_cols = {'primary_lang', 'cast_0', 'cast_1'}
ord_cols = {'release_date_month', 'release_date_dow'}
num_cols = { 'popularity','runtime', 'vote_average', 'vote_count', 'release_date_yr','years_from_release',
        'rating'}

feat_cols = genre_cols.union(num_cols).union(catg_cols).union(ord_cols)

tgt = 'rating'

In [None]:
usersIds = movies_ratings.userId.unique()
user_movie_watch = {u : movies_ratings.query(f'userId == {u}').movieId.unique() for u in usersIds}

### Features engineering

Median and mode fill NA

In [None]:
for i in feat_cols:
    if movies_ratings[i].dtype in  {float, int}:
        movies_ratings[i] = movies_ratings[i].fillna(movies_ratings[i].median())
    else:
        movies_ratings[i] = movies_ratings[i].fillna(movies_ratings[i].mode())

Movie tag aggregates

In [None]:
movie_tags_ts_std = tags.groupby('movieId')[['timestamp']].std().add_suffix('_item_ts_std').fillna(0)
movie_tags_ct = tags.groupby('movieId').size().to_frame(name = 'movi_tags_ct').fillna(0)

Movie items aggregates

In [None]:
# Aggregate movie features
agg_movie_grps_df = pd.concat([
        movies_ratings.groupby('movieId').rating.agg([np.min, np.max, np.mean, len, np.std]).add_prefix('item_rating_'),
        movies_ratings.eval('yr_vote_diff = rating_yr - release_date_yr').groupby('movieId').yr_vote_diff.mean().to_frame().add_suffix('_item'),
        movies_ratings.groupby('movieId').timestamp.std().to_frame().add_suffix('_item_std')
], 1).fillna(0)

In [None]:
# Merging movie tags 
agg_movie_grps_tag_df = agg_movie_grps_df.merge(movie_tags_ct, on = 'movieId', how = 'left').merge(movie_tags_ts_std, on = 'movieId', how = 'left')

In [None]:
full_item_df = movies_ratings.drop(['userId', 'rating', 'tmdbId', 'imdbId','timestamp','rating_yr',],1)
full_item_df = full_item_df.drop_duplicates().set_index('movieId')
tmp = full_item_df.groupby('movieId').size()
assert tmp.loc[lambda x :x>1].empty

Target encoding

In [None]:
# Fit target encoding
lbl_enc = {}
for cols in movies_ratings[feat_cols].select_dtypes(exclude=[float,int, bool]).columns:
    feat_cols.remove(cols)
    lbl_enc[cols] = TargetEncoder()
    movies_ratings[f"enc_{cols}"] = lbl_enc[cols].fit_transform(movies_ratings[[cols]], movies_ratings[[tgt]])
    num_cols.add(f"enc_{cols}")
    feat_cols.add(f"enc_{cols}")
    print(f"Removing {cols} and creating enc_{cols} ")


Removing cast_1 and creating enc_cast_1 
Removing primary_lang and creating enc_primary_lang 
Removing cast_0 and creating enc_cast_0 


In [None]:
# Perform Target encoding
for col, enc in lbl_enc.items():
    print(col)
    full_item_df[f"enc_{col}"] = lbl_enc[col].transform(full_item_df[[col]])

cast_1
primary_lang
cast_0


In [None]:
full_item_df['rating_yr'] = 2023

In [None]:
# Check feat_cols difference with full_item
assert len(feat_cols.difference(full_item_df.filter(feat_cols).columns).difference({tgt})) == 0

Create item df for inference evaluation

In [None]:
full_item_df = full_item_df.merge(agg_movie_grps_tag_df, on='movieId', how = 'left').fillna(0)

In [None]:
item_df = full_item_df.filter(feat_cols)

Generating user attributes

In [None]:
def genre_mean(x: pd.DataFrame):
    """Taking mean only when genre = 1

    Args:
        x (_type_): _description_

    Returns:
        _type_: _description_
    """
    res = {}
    for g in genre_cols:
        mask = x[g]==1
        res[g]  = x.loc[mask, tgt].mean()
    return pd.Series(res)
    
def wt_avg(x, cols, tgt):
    """Apply wt avg for each cols based on tgt column"""
    res = {}
    for c in cols:
        res[c] = np.average(x[c].values, weights=x[tgt].values)
    return pd.Series(res)

def genre_cum_norm(x):
    """Using timestamp apply Cumulative normalization by row for genere cols"""
    x = x.set_index(['userId','timestamp']).groupby('timestamp').apply(lambda x: x[list(genre_cols)].mul(x['rating'], axis=0))
    cumsum = (x.groupby('timestamp')[list(genre_cols)].sum()).cumsum()
    return cumsum.div(cumsum.sum(axis=1), axis=0)


In [None]:
def genre_rating_diff(x, genre_rating):
    """ Rating diff between user and global for each genre"""
    genre_dist = x[list(genre_cols)].sum()
    genre_dist = genre_dist/genre_dist.sum()
    # display(genre_dist)
    res = {}
    for g,r in genre_rating.items():
        mask = x[g]==1
        if mask.any():
            res[g] =  (x.loc[mask,'rating'].mean()) - r
        else:
            res[g] = 0
    return pd.Series(res)
        
genre_rating = {g: movies_ratings.loc[movies_ratings[g] == 1, 'rating'].mean() for g in genre_cols}

In [None]:
def genre_watched_diff(x, genre_watched):
    """Compare Scaled movie watch vs Global movie watch by Genre"""
    genre_dist = x[list(genre_cols)].sum()
    genre_dist = genre_dist/genre_dist.sum()
    return (genre_dist / genre_watched).apply(np.exp)
    # # display(genre_dist)
    # res = {}
    # for g,r in genre_rating.items():
    #     mask = x[g]==1
    #     if mask.any():
    #         res[g] =  (x.loc[mask,'rating'].mean()) - r
    #     else:
    #         res[g] = 0
    # return pd.Series(res)
        
genre_watched = movies_ratings[list(genre_cols)].sum()/movies_ratings[list(genre_cols)].sum().sum()


In [None]:
# Aggregate user features
def user_features(df):
    usr_grp = df.groupby('userId')
    usr_grp_genre_mean = usr_grp.apply(lambda x: genre_rating_diff(x, genre_rating)).add_suffix('_user_genre_rating_diff').fillna(0)
    usr_grp_genre_dist = usr_grp.apply(genre_watched_diff, genre_watched).add_suffix('_user_genre_dist').fillna(0)

    return pd.concat([
        usr_grp_genre_mean,
        usr_grp_genre_dist,
], axis=1)


In [None]:
user_gen_feat = user_features(movies_ratings).reset_index()

In [None]:
user_ts_genre_profile = movies_ratings.groupby('userId').apply(genre_cum_norm).add_suffix('_user_cum_norm').reset_index()

In [None]:
# No Na columns
assert (user_gen_feat.describe().T['count'] == user_gen_feat.shape[0]).all()
assert (user_ts_genre_profile.userId.nunique() == user_gen_feat.shape[0])

In [None]:
feat_cols = feat_cols.union(set(user_gen_feat.columns[1:])).union(set((user_ts_genre_profile.columns[2:])))
num_cols = num_cols.union(set(user_gen_feat.columns[1:])).union(set((user_ts_genre_profile.columns[2:])))

In [None]:
# User_gen_feat same number as unique userId
assert movies_ratings.userId.nunique() == user_gen_feat.shape[0]

Merge user info with movie_rating

In [None]:
tmp = movies_ratings.merge(user_gen_feat, on='userId', validate="m:1")
assert tmp.shape[0] == movies_ratings.shape[0]
movies_ratings = tmp.copy()
del tmp

In [None]:
tmp = movies_ratings.merge(user_ts_genre_profile, on=['userId', 'timestamp'], validate="m:1")
assert tmp.shape[0] == movies_ratings.shape[0]
movies_ratings = tmp.copy()
del tmp

In [None]:
if user_ts_genre_profile.columns[-1] not in user_gen_feat.columns:
    last_user_ts_genre_profile = user_ts_genre_profile.groupby('userId').last().drop('timestamp',1)
    user_gen_feat = user_gen_feat.merge(last_user_ts_genre_profile, on = 'userId', validate="1:1")
    user_gen_feat = user_gen_feat.set_index('userId')

### Training and Test data generation 

Stratified Split by User

In [None]:
train_df, test_df = ut.split_train_val_test(movies_ratings[feat_cols.union({'userId', 'movieId'})], val_frac = 0.1, stratify=movies_ratings['userId'])

train_size =  90752 
val_size =  10084 


In [None]:
train_df['has_rated'] = 1
test_df['has_rated'] = 1
# test_df = test_df.set_index('userId')

Generate negative samples

In [43]:
NEG_TRAIN_SAMPLES = 19

In [46]:
u_res = {}
user_cols = list(user_gen_feat.columns)
for u in tqdm(train_df.userId.unique()):
    user_train_df = train_df[train_df.userId == u]
    u_s = user_train_df.shape[0]
    item_sample_df = full_item_df.loc[~full_item_df.index.isin(user_movie_watch[u])].sample(NEG_TRAIN_SAMPLES  * (u_s), replace=True).reset_index()
    item_sample_df['rating'] = 0
    res = []
    for _, (i, user_row) in enumerate(user_train_df.iterrows()):
        pos_df = user_row.to_frame().T.set_index('movieId')
        neg_df = pos_df[user_cols].merge(item_sample_df.iloc[_ * NEG_TRAIN_SAMPLES: (_+1) *NEG_TRAIN_SAMPLES], how = 'cross').set_index('movieId')

        res.append(neg_df[feat_cols].eval("has_rated = 0"))
        res.append(pos_df[feat_cols].eval("has_rated = 1"))
    u_res[u] = pd.concat(res)


100%|██████████| 610/610 [19:46<00:00,  1.95s/it] 


In [47]:
train_neg_df = pd.concat(u_res, names=['userId'])

In [48]:
train_neg_df.reset_index().to_csv('data/train_rank_neg_df.csv')

In [49]:
NEG_TEST_SAMPLES = 19

In [50]:
u_res = {}
user_cols = list(user_gen_feat.columns)
for u in tqdm(test_df.userId.unique()):
# for u in tqdm(train_neg_df.index.get_level_values(0).unique()):
    user_train_df = test_df[test_df.userId == u]
    u_s = user_train_df.shape[0]
    movie_neg_test = full_item_df.index.difference(user_movie_watch[u]).difference(train_neg_df.loc[u].index)
    item_sample_df = full_item_df.loc[movie_neg_test].sample(NEG_TEST_SAMPLES  * (u_s), replace=True).reset_index()
    item_sample_df['rating'] = 0
    res = []
    
    for _, (i, user_row) in enumerate(user_train_df.iterrows()):
        pos_df = user_row.to_frame().T.set_index('movieId')
        neg_df = pos_df[user_cols].merge(item_sample_df.iloc[_ * NEG_TEST_SAMPLES: (_+1) * NEG_TEST_SAMPLES], how = 'cross').set_index('movieId')

        res.append(neg_df[feat_cols].eval("has_rated = 0"))
        res.append(pos_df[feat_cols].eval("has_rated = 1"))
    u_res[u] = pd.concat(res)





  0%|          | 0/610 [00:00<?, ?it/s]

100%|██████████| 610/610 [02:14<00:00,  4.53it/s]


In [51]:
test_neg_df = pd.concat(u_res, names = ['userId'])

In [52]:
test_neg_df.reset_index().to_csv('data/test_rank_neg_df.csv')

In [57]:
train_neg_df = pd.read_csv('data/train_rank_neg_df.csv')
train_neg_df = train_neg_df.set_index(['userId', 'movieId']).drop('Unnamed: 0',1)

In [58]:
test_neg_df = pd.read_csv('data/test_rank_neg_df.csv')
test_neg_df = test_neg_df.set_index(['userId', 'movieId']).drop('Unnamed: 0',1)

In [59]:
train_sample_wt =((train_neg_df['rating'].astype(int)*2 - 1) * train_neg_df['has_rated']) + 1
test_sample_wt =(((test_neg_df['rating'].astype(int) - 1) * test_neg_df['has_rated']) + 1)

In [60]:
feat_cols = feat_cols -{'rating', 'enc_primary_lang'}

In [56]:
train_neg_df.shape, test_neg_df.shape

((1815040, 85), (201680, 85))

In [62]:
(train_neg_df.reset_index().merge(agg_movie_grps_tag_df, on='movieId')
 .set_index(['userId', 'movieId'])
).fillna(0).shape

(1815040, 94)

In [63]:
(test_neg_df.reset_index().merge(agg_movie_grps_tag_df, on='movieId')
 .set_index(['userId', 'movieId'])
).fillna(0).shape

(201680, 94)

Merging movie attributes in train and test

In [64]:
train_neg_df =(train_neg_df.reset_index().reset_index().merge(agg_movie_grps_tag_df, on='movieId')
 .set_index(['index','userId', 'movieId'])
).fillna(0).sort_index().reset_index('index', drop=True)

test_neg_df = (test_neg_df.reset_index().reset_index().merge(agg_movie_grps_tag_df, on='movieId')
 .set_index(['index','userId', 'movieId'])
).fillna(0).sort_index().reset_index('index', drop=True)
feat_cols = feat_cols.union(agg_movie_grps_tag_df.columns)

Add query/sample index for each rated movie by user in train and test used for ranking

In [65]:
def add_queryid(x):
    q = 0
    prev_i = 0
    res = []
    for i in x:
        q +=  prev_i 
        res.append(q)
        prev_i = i
    return pd.Series(res)


In [66]:
# For testing added qid
if 'qid' not in test_neg_df.index.names:
    test_qids = test_neg_df.groupby('userId').has_rated.apply(add_queryid).reset_index(level=1, drop=True)
    test_neg_df['qid'] = test_qids.values
    test_neg_df.set_index('qid', append=True, inplace=True)

In [67]:
# For training added qid
if 'qid' not in train_neg_df.index.names:
    test_qids = train_neg_df.groupby('userId').has_rated.apply(add_queryid).reset_index(level=1, drop=True)
    train_neg_df['qid'] = test_qids.values
    train_neg_df.set_index('qid', append=True, inplace=True)

In [70]:
# check if has_rated is present in  20 samples (19 negative one positive)
assert (test_neg_df.loc[1, 'has_rated'].head(20).iloc[-1] == 1)

assert (train_neg_df.loc[1, 'has_rated'].head(20).iloc[-1] == 1)

Shuffle qid 

In [71]:
tmp = test_neg_df.groupby(['userId', 'qid']).apply(lambda x: x.sample(frac=1)).reset_index(level=[0,1], drop=True)
X_test, y_test = tmp[feat_cols], tmp[['has_rated']]

In [72]:
tmp = train_neg_df.groupby(['userId', 'qid']).apply(lambda x: x.sample(frac=1)).reset_index(level=[0,1], drop=True)
X_train, y_train = tmp[feat_cols], tmp[['has_rated']]

In [None]:
assert (test_neg_df.loc[1, 'has_rated'].head(20).sum() == 1)
assert (train_neg_df.loc[1, 'has_rated'].head(20).sum() == 1)

In [76]:
X_train.shape, X_test.shape

((1815040, 91), (201680, 91))

In [77]:
assert X_train.columns.symmetric_difference(X_test.columns).empty

In [134]:
train_feat = X_train.columns

### Building model


In [132]:
user_models = {}
i_train = 0
i_test  = 0
for u in tqdm(X_train.index.get_level_values(0).unique()):
    qids_train = [20] * y_train.loc[u].sum().iloc[0]
    qids_test = [20] * y_test.loc[u].sum().iloc[0]
    
    user_models[u] =  Pipeline(
    [('p', 
                   Pipeline([
                       ('poly', PolynomialFeatures(include_bias=False, interaction_only=True)),
                       ('vif', VarianceThreshold()),
                        ('ss', StandardScaler()),
                        # ('f_select',  SelectPercentile(f_classif, percentile=25)),
                       ])), 
                   ('reg', lgb.LGBMRanker(objective="lambdarank",
    metric="map", n_jobs=-1, learning_rate=0.1,  n_estimators = 400, reg_lambda=0.2, verbose = -1, min_child_samples = 5, sub_feature=.65))
    ])

    user_models[u].fit(X_train.loc[u],
        y_train.loc[u],
        **{
        "reg__group":qids_train,
        "reg__eval_set":[(X_test.loc[u], y_test.loc[u])],
        'reg__eval_metric': 'map',
        "reg__eval_group":[qids_test],
        "reg__eval_at":(3, 5),
        "reg__feature_name":None,
        # "reg__sample_weight": train_sample_wt[i:i + qid_len],
        # "reg__eval_sample_weight": [test_sample_wt[i:i + qid_len]]*5,
        }
    )


  0%|          | 0/610 [00:00<?, ?it/s]

100%|██████████| 610/610 [13:37<00:00,  1.34s/it]  


#### Understanding feat importance 

In [107]:
res_feat_imp = {}
for u,m in user_models.items():
    res_feat_imp[u] = pd.Series(m.named_steps['reg'].feature_importances_, m.named_steps['p'].get_feature_names_out())
res_feat_imp_agg = pd.concat(res_feat_imp).reset_index(name='feat_imp').groupby('level_1').feat_imp.mean().sort_values()

### Evaluating Map 5 on test data


In [87]:
wts = []
map_5 = []
for u in user_models:
    wts.append(movies_ratings.query(f'userId == {u}').shape[0])
    map_5.append(pd.DataFrame(user_models[u].named_steps['reg'].evals_result_['valid_0'])['map@5'].max())

In [97]:
np.average(map_5, weights= wts)

0.5501038474809534

In [92]:
train_feat = list(X_train.columns)

### Evaluation on inference result by Hit Ratio @ 5

In [128]:
rank_user_res = {}
for u in tqdm(user_models):
    pred = user_models[u].predict(X_test.loc[u])
    udf = y_test.loc[u]
    udf.loc[:,'pred'] = pred
    udf.loc[:, 'rank'] = (udf.groupby('qid')['pred']
                            .rank(method='dense', ascending=False).astype(int))

    rank_user_res[u] = udf.query('has_rated == 1 and rank<=5').shape[0]/udf.query('has_rated == 1').shape[0]


100%|██████████| 610/610 [00:27<00:00, 21.80it/s]


In [129]:
# Mean Hit ratio@5 for all user sample rated 
pd.Series(rank_user_res).mean()

0.9375072489310825

In [166]:
test_neg_df[['qid', 'userId', 'movieId', 'has_rated','pred','rank']]

Unnamed: 0_level_0,qid,userId,movieId,has_rated,pred,rank
group_userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
325,79015,325,58,1,104.609335,2
325,79015,603,2829,0,104.331136,3
325,79015,448,149354,0,104.053013,4
325,79015,434,68954,0,104.940571,1
325,79254,325,4086,1,104.265293,3
...,...,...,...,...,...,...
289,73226,558,367,0,105.429420,2
289,73221,289,594,1,104.775600,1
289,73221,610,111360,0,104.646484,2
289,73221,603,1280,0,104.505416,4


### Dumping pickles

In [98]:
import joblib,pickle

In [100]:
joblib.dump(user_models, "./results/best_model/pipeline_rank_poly_V3.joblib")

['./results/best_model/pipeline_rank_poly_V3.joblib']

In [135]:
with open('./results/best_model/rank_train_feat.pkl', 'wb') as f:
    pickle.dump(train_feat, f)