In [4]:
import numpy as np
import pandas as pd
import datetime
import os

In [5]:
# Directory (relative to the notebook's working directory) that holds the raw
# train-*.csv files read by the loading cells below.
dataset_path = './data/raw'

In [9]:
# Read the query log, keeping only the query id, its item list and the test flag.
queries = pd.read_csv(
    os.path.join(dataset_path, 'train-queries.csv'),
    usecols=['queryId', 'items', 'is.test'],
    low_memory=False,
    sep=';',
)
print('Total queries', queries.shape[0])
queries.head()

Total queries 923127


Unnamed: 0,queryId,items,is.test
0,1,"7518,71,30311,7837,30792,8252,81766,9338,62220...",False
1,2,"70095,15964,8627,134850,32754,100747,74771,314...",False
2,3,"59081,51125,9338,9550,32087,62793,2717,10403,3...",True
3,4,"46632,57465,79064,57748,6080,35997,47088,6078,...",False
4,5,"27312,84626,12621,46209,5140,57539,5368,12923,...",False


In [10]:
# Keep only the test queries (the ones whose items have to be sorted), drop the
# flag column, and renumber the rows from zero.
queries = (
    queries
    .loc[queries['is.test'] == True, ['queryId', 'items']]
    .reset_index(drop=True)
)
print('Test queries', queries.shape[0])

Test queries 286967


In [11]:
# Read the item-view log; only the itemId column is loaded.
item_views = pd.read_csv(
    os.path.join(dataset_path, 'train-item-views.csv'),
    usecols=['itemId'],
    low_memory=False,
    sep=';',
)
print('Item views', item_views.shape[0])
item_views.head()

Item views 1235380


Unnamed: 0,itemId
0,81766
1,31331
2,32118
3,9654
4,32627


In [12]:
# Read the click log; only the itemId column is loaded.
clicks = pd.read_csv(
    os.path.join(dataset_path, 'train-clicks.csv'),
    usecols=['itemId'],
    low_memory=False,
    sep=';',
)
print('Clicks', clicks.shape[0])
clicks.head()

Clicks 1127764


Unnamed: 0,itemId
0,24857
1,30792
2,8252
3,33969
4,7837


In [13]:
# Read the purchase log; only the itemId column is loaded.
purchases = pd.read_csv(
    os.path.join(dataset_path, 'train-purchases.csv'),
    usecols=['itemId'],
    low_memory=False,
    sep=';',
)
print('Purchases', purchases.shape[0])
purchases.head()

Purchases 18025


Unnamed: 0,itemId
0,25911
1,175874
2,35324
3,31233
4,34677


In [14]:
# Calculating popularity as
#   [Amount of views] * 1 + [Amount of clicks] * 2 + [Amount of purchases] * 3
# prod_pop maps itemId (as a string, to match the ids parsed out of the queries'
# comma-separated `items` column) to its accumulated score.
print('Scoring popularity for each item ...')
prod_pop = {}
# start=1 so the weights are 1 (views), 2 (clicks), 3 (purchases). With the
# default start of 0, views would contribute nothing and a view-only item would
# tie with an item that was never seen at all.
for cost, container in enumerate([item_views, clicks, purchases], start=1):
    for prod in container['itemId']:
        product = str(prod)
        prod_pop[product] = prod_pop.get(product, 0) + cost

print('Popularity scored for', len(prod_pop), 'products')

Scoring popularity for each item ...
Popularity scored for 128910 products


In [4]:
# For each query:
#   parse items (comma-separated values in the `items` column)
#   sort them by score;
#   write them to the submission file.
# This is longest part; it usually takes around 5 minutes.
print('Sorting items per query by popularity...')

# Start the timer inside this cell so the elapsed-time report at the bottom does
# not depend on a variable left over from some other (possibly deleted) cell.
start_time = datetime.datetime.now()

answers = []  # never filled in this cell; kept in case another cell references it
# Report progress roughly every 5% of the queries. Clamp to at least 1 so that
# `i % step` cannot divide by zero when there are fewer than 20 queries.
step = max(1, len(queries) // 20)

with open('submission.txt', 'w') as submission:
    for i, (query_id, raw_items) in enumerate(zip(queries['queryId'], queries['items'])):

        # Fancy progressbar
        if i % step == 0:
            print(5 * i / step, '%...')

        # Splitting the column which contains comma-separated items
        items = raw_items.split(',')
        # Most popular first; items missing from prod_pop score 0. sorted() is
        # stable, so items with equal scores keep the order they had in the query.
        sorted_items = sorted(items, key=lambda item: -prod_pop.get(item, 0))
        # Squashing items together and writing them to submission
        submission.write(str(query_id) + " " + ','.join(sorted_items) + "\n")

end_time = datetime.datetime.now()
print("Done. Now it's ", end_time.isoformat())
# total_seconds() covers the whole duration; `.seconds` alone wraps around at one day.
print("Calculated baseline in ", int((end_time - start_time).total_seconds()), " seconds")

(1235380, 5)