In [1]:
import pandas as pd
import numpy as np

In [2]:
interactions = pd.read_csv("processed_data/interactions_train.csv")
item_popularity = pd.read_csv("processed_data/popularity_item.csv")
user_popularity = pd.read_csv("processed_data/popularity_user.csv")
product_features = pd.read_csv("processed_data/product_features_train.csv")

### Recommendation as per recently viewed items by the user

In [6]:
viewed_items = interactions[interactions['event'] == 'view']
recently_viewed = viewed_items.sort_values(by=['visitorid', 'timestamp'], ascending=[True, False])
top_recently_viewed = recently_viewed.groupby('visitorid')['itemid'].apply(list).reset_index()

In [66]:
top_recently_viewed[top_recently_viewed['visitorid'] == 2]['itemid'].apply(list)
top_recently_viewed.head()

Unnamed: 0,timestamp,visitorid,event,itemid,index,202,283,364,6,678,...,839,888,917,available,categoryid,parent_level_1,parent_level_2,parent_level_3,parent_level_4,parent_level_5
2126260,2015-08-13 17:46:06.444,1,view,72028,698704.0,72800,887448 174342 820477 72800,1297407.0,887448 174342,820477,...,820477,72800,934982,0.0,1192.0,955.0,384.0,140.0,,
2025104,2015-08-07 18:20:57.845,2,view,325215,3145520.0,961487 755772 350564 n1980.000 1052766,36842 295499 961487 755772 350564 n1980.000 10...,1301516.0,36842,295499,...,295499,961487 755772 350564 n1980.000 1052766 327591 ...,961487 755772 533005 1052766 1121507 521742,1.0,299.0,73.0,1202.0,653.0,,
2025034,2015-08-07 18:17:43.170,2,view,216305,2093552.0,961487 755772 n1740.000,36842 295499 961487 755772 n1740.000 21488 658...,312806.0,36842,295499,...,295499,961487 755772 n1740.000,n81821076.000,1.0,299.0,73.0,1202.0,653.0,,
2025025,2015-08-07 18:17:24.375,2,view,342816,3315884.0,493427 1103345 274271,934173 150169 36842 295499 493427 1103345 2742...,866445.0,934173 150169 36842,295499,...,295499,683597 493427 1103345 274271,216742 1172395 1103345 554515 n24.000 1299781,1.0,444.0,73.0,1202.0,653.0,,
2024827,2015-08-07 18:08:25.669,2,view,342816,3315884.0,493427 1103345 274271,934173 150169 36842 295499 493427 1103345 2742...,866445.0,934173 150169 36842,295499,...,295499,683597 493427 1103345 274271,216742 1172395 1103345 554515 n24.000 1299781,1.0,444.0,73.0,1202.0,653.0,,


### Most popular items as per the number_of_views property

In [10]:
top_items = item_popularity.sort_values(by='number_of_views', ascending=False)

N = 10
top_popular_items = top_items['itemid'].head(N).tolist()

In [67]:
top_popular_items

[187946, 5411, 461686, 370653, 309778, 298009, 257040, 219512, 96924, 335975]

### Most popular items in each category as per number_of_views property

In [18]:
category_popularity = interactions.groupby(['categoryid', 'itemid']).agg(
    number_of_views=('event', lambda x: (x == 'view').sum())
).reset_index()
category_popularity['rank'] = category_popularity.groupby('categoryid')['number_of_views'].rank(ascending=False, method='first')
top_n = 10
category_top_items = category_popularity[category_popularity['rank'] <= top_n]
category_to_top_items = category_top_items.groupby('categoryid')['itemid'].apply(list).to_dict()

### Recommending Co-Viewed Items 

In [32]:
from collections import Counter

def get_co_viewed_items(interactions, itemid, top_n=5):
    sessions = interactions.groupby('visitorid')['itemid'].apply(list)

    co_viewed_counts = Counter()
    for session in sessions:
        if itemid in session:
            co_viewed_counts.update(session)
    co_viewed_counts.pop(itemid, None)
    return [item for item, _ in co_viewed_counts.most_common(top_n)]

co_viewed = get_co_viewed_items(interactions, itemid=187946, top_n=5)

In [33]:
co_viewed

[389814, 133549, 46399, 358895, 128296]

In [34]:
from collections import defaultdict

usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set)

for idx, row in interactions.iterrows():
    user,item = row['visitorid'], row['itemid']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

### Jaccard Similarity Model

In [37]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [44]:
def mostSimilar(i, N):
    similarities = []
    users = usersPerItem[i]
    for i2 in usersPerItem:
        if i2 == i: continue
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:N]

In [49]:
items_jacard = [k for (_,k) in mostSimilar(149767, 10)]
print(items_jacard)

[444363, 377133, 338395, 205143, 425920, 437778, 439963, 79956, 442725, 305634]


In [70]:
interactions_test = pd.read_csv("processed_data/interactions_valid.csv")

In [71]:
test_interactions = interactions_test.groupby('visitorid')['itemid'].apply(set).reset_index()
test_interactions.columns = ['visitorid', 'actual_items']

In [72]:
test_interactions.head()

Unnamed: 0,visitorid,actual_items
0,0,"{285930, 357564, 67045}"
1,4,{177677}
2,15,{22495}
3,18,{209302}
4,32,"{465465, 282491}"


In [91]:
def recently_viewed_recommendation(user_id,k):
    user_recent_items = top_recently_viewed[top_recently_viewed['visitorid'] == user_id]['itemid']
    return user_recent_items.iloc[0][:k] if not user_recent_items.empty else []

def popularity_recommendation(user_id):
    return top_popular_items 

In [92]:
def precision_at_k(actual, predicted, k=10):
    predicted = predicted[:k]
    relevant = set(actual)
    hits = len(set(predicted) & relevant)
    return hits / k

In [93]:
def evaluate_model(test_data, recommendation_func, k=10):
    precisions = []

    for _, row in test_data.iterrows():
        user_id = row['visitorid']
        actual_items = row['actual_items']

        predicted_items = recommendation_func(user_id, k)

        precisions.append(precision_at_k(actual_items, predicted_items, k))

    return {
        'Precision@K': np.mean(precisions),
    }

In [96]:
results_recently_viewed = evaluate_model(test_interactions, recently_viewed_recommendation, k=20)

In [95]:
results_recently_viewed

{'Precision@K': 0.004040799410572849}

In [90]:
top_popular_items

[187946, 5411, 461686, 370653, 309778, 298009, 257040, 219512, 96924, 335975]