In [6]:
import pandas as pd

train = pd.read_csv('coursera_sessions_train.txt', sep=';', header=None, names = ['views', 'buys'])
X_test = pd.read_csv('coursera_sessions_test.txt', sep=';', header=None, names = ['views', 'buys'])

In [7]:
import numpy as np

def string_features_to_int(data, nan_place):
    views, buys = list(), list()
    for view, buy in zip(data['views'], data['buys']):
        views.append([int(el) for el in view.split(',')])
        if type(buy) == float:
            buys.append(nan_place)
        else:
            buys.append([int(el) for el in buy.split(',')])
    return views, buys
views, buys = string_features_to_int(train, nan_place=[-1])

In [8]:
X_train = pd.DataFrame()
views, X_buys = string_features_to_int(train, nan_place=np.nan)
X_train['views'] = views
X_train['buys']  = X_buys
X_train.dropna(inplace=True)
X_train.reset_index(drop=True, inplace=True)
X_train.head()

Unnamed: 0,views,buys
0,"[59, 60, 61, 62, 60, 63, 64, 65, 66, 61, 67, 6...","[67, 60, 63]"
1,"[84, 85, 86, 87, 88, 89, 84, 90, 91, 92, 93, 86]",[86]
2,"[138, 198, 199, 127]",[199]
3,"[303, 304, 305, 306, 307, 308, 309, 310, 311, ...",[303]
4,"[352, 353, 352]",[352]


In [9]:
from collections import Counter

train['views'] = views
train['buys']  = buys

views_cnt = Counter()
buys_cnt  = Counter()
for v_lst, b_lst in zip(train['views'], train['buys']):
    for v_el in v_lst:
        views_cnt[v_el] += 1
    for b_el in b_lst:
        buys_cnt[b_el] += 1

In [10]:
test_views, test_buys = string_features_to_int(X_test, nan_place=np.nan)
X_test['views'] = test_views
X_test['buys']  = test_buys
X_test.dropna(inplace=True)
X_test.reset_index(drop=True, inplace=True)
X_test.head()


Unnamed: 0,views,buys
0,"[63, 68, 69, 70, 66, 61, 59, 61, 66, 68]","[66, 63]"
1,"[158, 159, 160, 159, 161, 162]",[162]
2,"[200, 201, 202, 203, 204]","[201, 205]"
3,"[371, 372, 371]","[371, 373]"
4,[422],[422]


In [11]:
def predict(viewed, k, method='popular'):
    unique_viewed = np.array(viewed)[np.sort(np.unique(viewed, return_index=True)[1])]    
    k = min(len(viewed), k)    
    ratings = []
    
    for item in unique_viewed:
        if method == 'popular':
            ratings.append(-views_cnt[item])
        else:
            ratings.append(-buys_cnt[item])
    
    sorted_items = np.argsort(ratings, kind='mergesort')
    return list(unique_viewed[sorted_items])[0:k]

In [12]:
def recall(predicted, buyed):
    rb = [x for x in buyed if x in predicted]
    return len(rb)/float(len(buyed))


In [13]:
def precision(predicted, buyed, k):
    rb = [x for x in buyed if x in predicted]
    return len(rb)/float(k)

In [14]:
def get_stats(data, method='popular'):
    ar1 = np.mean([recall   (predict(v, 1, method=method), b)    for v, b in zip(data['views'], data['buys'])])
    ap1 = np.mean([precision(predict(v, 1, method=method), b, 1) for v, b in zip(data['views'], data['buys'])])
    ar5 = np.mean([recall   (predict(v, 5, method=method), b)    for v, b in zip(data['views'], data['buys'])])
    ap5 = np.mean([precision(predict(v, 1, method=method), b, 5) for v, b in zip(data['views'], data['buys'])])
    return ar1, ap1, ar5, ap5

train_popular = get_stats(X_train, 'popular')
test_popular  = get_stats(X_test, 'popular')
train_purch   = get_stats(X_train, 'purch')
test_purch    = get_stats(X_test, 'purch')
print(train_popular)
print(test_popular)
print(train_purch)
print(test_purch)

(0.44263431659495955, 0.5121951219512195, 0.8246918247126118, 0.10243902439024391)
(0.41733266203252556, 0.48130968622100956, 0.8000340663538578, 0.0962619372442019)
(0.6884494924267651, 0.8037694013303769, 0.9263073024228791, 0.1607538802660754)
(0.4606201666660298, 0.5276944065484311, 0.8201874337490196, 0.10553888130968621)
