In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse

In [3]:
# Load the train and test frames and the items table from parquet files
# located under the relative `data/` directory.
train = pd.read_parquet(path='data/train.par')
test = pd.read_parquet(path='data/test.par')
items = pd.read_parquet(path='data/items.par')

In [13]:
# Preview the training frame (user_id / item_id pairs).
train

Unnamed: 0,user_id,item_id
0,4385,13532
1,13327,13532
2,2785,14098
3,386,14098
4,2806,14098
...,...,...
712378,3154,49206
712379,3551,49500
712380,12970,50079
712381,5337,50201


In [7]:
# Preview the items table.
items

Unnamed: 0,item_id,title,brand,category,price
0,15018,Buzz Lightyear Jet Pack,,"[Clothing, Shoes & Jewelry, Costumes & Accesso...",$7.41
1,15576,Converse Chuck Taylor All Star Canvas Low Top ...,,"[Clothing, Shoes & Jewelry, Men, Shoes, Fashio...",$18.29 - $189.99
2,15580,Converse Chuck Taylor All Star High Top,,"[Clothing, Shoes & Jewelry, Men, Shoes, Fashio...",$29.55 - $160.95
3,15603,Buxton Heiress Double Cardex Wallet,Buxton,"[Clothing, Shoes & Jewelry, Women, Accessories...",$15.00
4,15604,Buxton Heiress Ensemble Clutch Wallet,Buxton,"[Clothing, Shoes & Jewelry, Women, Accessories...",$19.99
...,...,...,...,...,...
53023,50828,Bnext 3D VR Headset Virtual Reality Glasses fo...,Bnext,"[Cell Phones & Accessories, Accessories, Virtu...",
53024,50835,"EXSHOW Car Mount,Universal Windshield Dashboar...",EXSHOW,"[Cell Phones & Accessories, Accessories, Car A...",$16.99
53025,50844,Car Holder Universal Windshield Cell Phone Hol...,POY,"[Cell Phones & Accessories, Accessories, Car A...",
53026,50854,Aduro Solid-Grip Phone Holder for Desk - Adjus...,Aduro,"[Cell Phones & Accessories, Accessories, Mount...",$9.99


In [8]:
# Keep the first row for every item_id. The result is rebound to `items`
# instead of using inplace=True, so the frame object read from parquet is
# not mutated and the cell stays a plain "input -> output" step.
items = items.drop_duplicates(subset=['item_id'])

## Top-Popular Model

In [9]:
def top_popular(interactions: pd.DataFrame, k=10):
    """Recommend the k most frequent items of `interactions` to every user.

    Parameters
    ----------
    interactions : pd.DataFrame
        Frame with `user_id` and `item_id` columns, one row per interaction.
    k : int
        Number of items to recommend.

    Returns
    -------
    pd.DataFrame
        One row per distinct `user_id` (in order of first appearance) with an
        `item_id` column; every row holds the same array of item ids ordered
        by decreasing number of interactions.
    """
    counts = interactions.groupby('item_id').size().reset_index(name='popularity')
    ranked = counts.sort_values('popularity', ascending=False)
    best_items = ranked.head(k)['item_id'].values

    prediction = interactions[['user_id']].drop_duplicates(ignore_index=True)
    # The list does not depend on the user: each row references the same array.
    prediction['item_id'] = prediction['user_id'].apply(lambda _: best_items)
    return prediction

# Baseline predictions: the default k=10 most popular train items for every train user.
toppop_prediction = top_popular(interactions=train)

In [10]:
import my_metrics
# Evaluate the top-popular predictions against the `test` frame; the recorded
# output contains the keys `recall` and `map`.
my_metrics.compute(toppop_prediction, test)

{'recall': 0.009369903632320238, 'map': 0.002193609022556391}

## ALS

In [11]:
# Matrix dimensions: user and item ids are used directly as sparse-matrix
# indices when the rating matrices are built, so each dimension is the
# largest id present in train plus one.
n_users = train['user_id'].max() + 1
n_items = train['item_id'].max() + 1

In [12]:
# Show the number of user rows and item rows the matrices will have.
n_users, n_items

(13490, 50863)

In [14]:
# One row per (item_id, user_id) pair; `rating` is how many times the pair
# occurs in train.
train_ratings = (
    train
    .groupby(['item_id', 'user_id'], as_index=False)
    .size()
    .rename(columns={'size': 'rating'})
)

# Sum of ratings per user, indexed by user_id.
user_sum_rating = train_ratings.groupby('user_id')['rating'].sum()

# Attach each user's total to their rows and express every rating as a
# share of that total.
train_ratings['rating_sum'] = train_ratings['user_id'].map(user_sum_rating)
train_ratings['rating_normal'] = train_ratings['rating'] / train_ratings['rating_sum']

In [15]:
# Inspect the per-pair ratings together with the per-user totals and the
# normalised values.
train_ratings

Unnamed: 0,item_id,user_id,rating,rating_sum,rating_normal
0,0,598,1,39,0.025641
1,0,954,1,42,0.023810
2,0,1372,1,41,0.024390
3,0,1401,1,48,0.020833
4,0,1576,1,102,0.009804
...,...,...,...,...,...
688754,50861,157,1,41,0.024390
688755,50861,2272,1,55,0.018182
688756,50862,2161,1,42,0.023810
688757,50862,3140,1,45,0.022222


In [16]:
# Confidence for every observed (item, user) pair: 1 plus the user-normalised
# rating scaled by 40.
confidence = train_ratings['rating_normal'].values * 40.0 + 1.0

# Items x users matrix holding the confidence values.
rating_matrix = sparse.csr_matrix(
    (confidence, (train_ratings['item_id'].values, train_ratings['user_id'].values)),
    shape=(n_items, n_users),
)

# Users x items matrix with a constant 1 for every observed pair. Despite the
# name this is not rating_matrix.T: the sparsity pattern is transposed, but
# the stored values are ones rather than confidences. The length of the data
# vector is taken from rating_matrix.nnz, which matches the number of rows in
# train_ratings because the (item_id, user_id) pairs come out of a groupby
# and are therefore unique.
rating_matrix_T = sparse.csr_matrix(
    (
        np.full(rating_matrix.nnz, 1),
        (train_ratings['user_id'].values, train_ratings['item_id'].values),
    ),
    shape=(n_users, n_items),
)

In [19]:
# Share of stored entries in the items x users matrix, as a percentage.
rating_matrix.nnz / (n_items * n_users) * 100

0.10038142694891947

In [20]:
import implicit

# ALS model with 128 latent factors, trained for 100 iterations; the training
# loss is calculated during fitting.
# NOTE(review): rating_matrix is built as items x users. That is presumably the
# orientation expected by fit() in implicit < 0.5, while newer releases expect
# users x items - confirm against the installed version.
# NOTE(review): no random seed is set here, so repeated runs may give slightly
# different factors and metrics - confirm whether that matters.
als = implicit.als.AlternatingLeastSquares(
    factors=128,
    iterations=100,
    calculate_training_loss=True,
)

als.fit(rating_matrix)



  0%|          | 0/100 [00:00<?, ?it/s]

In [21]:
import joblib

def predict_als_for_user(user_id):
    """Return the top-10 ALS recommendations for a single user.

    Reads the module-level `als` model and `rating_matrix_T` matrix.

    Parameters
    ----------
    user_id : id of the user, passed as-is to `als.recommend`.

    Returns
    -------
    tuple
        `(user_id, item_ids, scores)`, where `item_ids` and `scores` are
        parallel lists in the order returned by `als.recommend`.
    """
    # Each element yielded by recommend() is unpacked as an (item, score) pair.
    top_n = als.recommend(user_id, rating_matrix_T, N=10)

    item_ids, scores = [], []
    for item, score in top_n:
        item_ids.append(item)
        scores.append(score)
    return user_id, item_ids, scores

# Score every distinct train user in parallel: 32 worker processes on the
# multiprocessing backend, each task returning (user_id, items, scores).
als_prediction_raw = joblib.Parallel(n_jobs=32, backend='multiprocessing', verbose=1)(
    joblib.delayed(predict_als_for_user)(user) for user in train['user_id'].unique()
)

# One row per user; `item_id` and `score` hold the per-user lists.
als_prediction = pd.DataFrame(
    als_prediction_raw,
    columns=['user_id', 'item_id', 'score'],
)

[Parallel(n_jobs=32)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 224 tasks      | elapsed:    0.3s
[Parallel(n_jobs=32)]: Done 2000 tasks      | elapsed:    2.1s
[Parallel(n_jobs=32)]: Done 4800 tasks      | elapsed:    4.6s
[Parallel(n_jobs=32)]: Done 8400 tasks      | elapsed:    7.8s
[Parallel(n_jobs=32)]: Done 12800 tasks      | elapsed:   11.3s
[Parallel(n_jobs=32)]: Done 13490 out of 13490 | elapsed:   11.7s finished


In [22]:
# Evaluate the ALS predictions with the same my_metrics.compute call that was
# used for the top-popular baseline.
my_metrics.compute(als_prediction, test)

{'recall': 0.13897454904867804, 'map': 0.09399931165943025}