In [1]:
import pandas as pd
import numpy as np

In [175]:
EDGE_MULTIPLIER = 0.8

In [3]:
catalog_items = pd.read_csv('catalog_items.csv')
# purchased_products = pd.read_csv('purchased_producs.csv')
target_group = pd.read_csv('target_group.csv')['customer_id'].tolist()
visited_products = pd.read_csv('visited_products.csv')

# print(catalog_items)
# print(purchased_products)
# print(target_group)
# print(visited_products)

In [176]:
class Graph():
    def __init__(self, products):
        n = len(products) # number of products
        bijection = list(enumerate(products))
        self.vertex_to_product = {bijection[i][0]: bijection[i][1] for i in range(n)}
        self.product_to_vertex = {bijection[i][1]: bijection[i][0] for i in range(n)}
        self.edges = {i: dict() for i in range(n)}
    
    def __repr__(self):
        return f"Graph with {len(self.vertex_to_product)} products"
    
    def update_edge(self, prod_1, prod_2, weight): # v1 --> v2, v2 -> v1
        v1 = self.product_to_vertex[prod_1]
        v2 = self.product_to_vertex[prod_2]
        if v2 not in self.edges[v1]:
            self.edges[v1][v2] = 0
        if v1 not in self.edges[v2]:
            self.edges[v2][v1] = 0
        self.edges[v1][v2] += min(weight, 1) # add to edge weight
        self.edges[v2][v1] += min(weight, 1) * EDGE_MULTIPLIER # add to edge weight

In [177]:
products = catalog_items['product_id'].tolist()
graph = Graph(products)
print(graph)

Graph with 112561 products


In [6]:
# This part of the code creates the lists of products the people visited, in chronological order.
# Independent of the graph itself, so no need to run again after initialisation.
from datetime import datetime

def convert_to_seconds(date_time):
    date_time = date_time[:-6] # remove timezone info
    try:
        date_time = datetime.strptime(date_time, '%Y-%m-%d %H:%M:%S.%f')
    except Exception as err:
        return None
    return (date_time - datetime(2000, 1, 1)).total_seconds()

find_category = dict()
M = len(catalog_items['product_id'])
for i in range(M):
    prod_id = catalog_items['product_id'][i]
    category = catalog_items['category'][i]
    find_category[prod_id] = category

find_subcategory = dict()
for i in range(M):
    prod_id = catalog_items['product_id'][i]
    subcategory = catalog_items['subcategory'][i]
    find_subcategory[prod_id] = subcategory
    

customer_adjacency = dict()

customers = visited_products['customer_id'].unique().tolist()
N = len(customers)

for i in range(N):
    cust_id = visited_products["customer_id"][i]
    prod_id = visited_products["product_id"][i]
    if prod_id not in find_category:
        continue
    
    time = convert_to_seconds(visited_products["timestamp"][i])
    if time == None:
        continue
    price = visited_products["price"][i]
    # have: cust_id, prod_id, time, price
    # for now, only need the time, and the product.
    if cust_id in customer_adjacency:
        customer_adjacency[cust_id].append([time, prod_id])
    else:
        customer_adjacency[cust_id] = [[time, prod_id]]
        

In [178]:
# This part of the code adds edges to the graph, after traversing through the "path" of products
# that each customer has obtained.
# To make shit easier (while Sharvil finishes the graph implementation): we will store stuff
# in a list, or something

# Only use 80% of the test cases, and use the rest 20% for testing.
edges = []
num_clicks = dict()
for customer in customer_adjacency:
    customer_adjacency[customer].sort(key = lambda x: x[0])
    # for reference: customer_adjacency[customer] gives the pairs of information [time, product they searched] in a list.
    # compute: sum of adjacencies
    for node in customer_adjacency[customer]:
        prod_id = node[1]
        if prod_id not in num_clicks:
            num_clicks[prod_id] = 0
        num_clicks[prod_id] += 1
    # next: need to add stuff to the graph.
    views = len(customer_adjacency[customer])
    for i in range(1, views):
        old_time, old_prod = tuple(customer_adjacency[customer][i-1])
        new_time, new_prod = tuple(customer_adjacency[customer][i])
        curr_category = find_category[old_prod]
        if old_prod == new_prod:
#             print(customer_adjacency[customer][i-1], customer_adjacency[customer][i])
            continue
        # add edges?
#         edges.append([old_prod, new_prod, new_time - old_time])
        if new_time != old_time:
            score = 10
            graph.update_edge(old_prod, new_prod, score)

In [179]:
# metric for most popular = most number of clicks
best_five_recommend = sorted(num_clicks.items(), key = lambda x: -x[1])[:5]

In [180]:
# CATEGORY_WEIGHT = 1 # best
SUBCATEGORY_WEIGHT = 1 # best
for CATEGORY_WEIGHT in [1]:
    for DECAY_PARAMETER in [1]:
        def recommendations(customer):
            if customer not in customer_adjacency:
                return best_five_recommend
            else:
                path = customer_adjacency[customer][::-1]
                adjacent_nodes = dict()
                i = 0
                for i in range(len(path)):
                    node = path[i]
                    bad_nodes = []
                    if i != 0:
                        bad_nodes.append(path[i-1])
                    elif i != len(path) - 1:
                        bad_nodes.append(path[i+1])
                    category = find_category[node[1]]
                    prod_id = graph.product_to_vertex[node[1]]
                    adjacent_dict_of_weights = graph.edges[prod_id]
                    for adj_node in adjacent_dict_of_weights:
                        if adj_node not in bad_nodes:
                            weight = graph.edges[prod_id][adj_node]
                            if find_category[graph.vertex_to_product[adj_node]] == find_category[node[1]]:
                                weight *= CATEGORY_WEIGHT
                                if find_subcategory[graph.vertex_to_product[adj_node]] == find_subcategory[node[1]]:
                                    weight *= SUBCATEGORY_WEIGHT
                            decay = np.exp(-DECAY_PARAMETER*i) # This is current best model
        #                     decay = np.exp((path[0][0] - node[0])/1e+5)
#                             if path[0][0] - node[0] < -1800:
#                                 decay = np.exp(-2*i)
#                             else:
#                                 decay = 1
                            if adj_node not in adjacent_nodes:
                                adjacent_nodes[adj_node] = weight * decay
                            else:
                                adjacent_nodes[adj_node] = max(adjacent_nodes[adj_node], weight * decay)
                scores = sorted(list(adjacent_nodes.items()), key = lambda x: -x[1])[:4]
                conv_scores = [(graph.vertex_to_product[score[0]], score[1]) for score in scores] + [best_five_recommend[0]]
#                 conv_scores = best_five_recommend
                return conv_scores

        import os
        i = 1
        while os.path.isfile(f"recommendations{i}.csv"):
        #     print("file exists")
            i += 1
        else:
            with open(f"recommendations{i}.csv", 'w') as f:
                f.write('customer_id,product_id\n')
                flag = False
                for customer in target_group:
                    output = recommendations(customer)
                    if not flag and customer in customer_adjacency:
                        print(i, output)
                        flag = True
                    for x in output:
                        f.write(customer + str(',') + x[0] + str('\n'))

10 [('0049bcbb-205d-4cab-9958-0f87a0f764d7', 937.1999999999923), ('0b3cfce1-8b7d-469d-90c7-d9fbec290270', 412.98146065905627), ('0c9061fb-0b78-4474-8e0b-6f4f021d8791', 368.4680482773131), ('e95855d5-fa06-4501-bcc0-b5008c56519b', 361.47833889505546), ('a8f0292d-5fca-42b9-b0d3-b38e7efa416b', 10909)]
