In [None]:
import numpy as np
import pandas as pd

import json
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import random

from sklearn.model_selection import train_test_split
from mlxtend.frequent_patterns import apriori

import csv

# nDCG scoring

In [None]:
def ndcg(y_pred, y_true):
    """Score recommendation lists against the items that were actually bought.

    For every (prediction list, bought item) pair, each recommended item adds
    to the discounted cumulative gain:

    * ``12 / ln(1 + position)`` when it is the bought item itself;
    * ``1 / ln(1 + position)`` when it only shares the bought item's
      ``domain_id`` (looked up in the module-level ``metadata`` dict);
    * nothing otherwise.

    Positions are 1-based. The total is divided by ``22.42461597`` per row,
    which is the gain of a 10-item list with the bought item first followed by
    nine same-domain items (``12/ln 2 + sum(1/ln(1+p) for p in 2..10)``).

    Parameters
    ----------
    y_pred : sequence of sequences of item ids, one list per row.
    y_true : sequence of bought item ids, aligned with ``y_pred``.

    Returns
    -------
    float
        ``dcg / idcg``; ``0.0`` when ``y_true`` is empty (the original code
        divided by zero in that case).

    Raises
    ------
    KeyError
        If a recommended item that is not the bought item, or the bought item
        it is compared against, is missing from ``metadata``.
    """
    n_rows = len(y_true)
    if n_rows == 0:
        # Nothing to evaluate; avoid 0 / 0.
        return 0.0

    idcg = 22.42461597 * n_rows
    dcg = 0
    for pred, true in zip(y_pred, y_true):
        for position, item in enumerate(pred, start=1):
            discount = np.log(1 + position)
            if item == true:
                dcg += 12 / discount
            elif metadata[item]['domain_id'] == metadata[true]['domain_id']:
                dcg += 1 / discount

    return dcg / idcg

# **Workshop**

In [None]:
def jl_to_list(fname):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of ``fname`` is parsed with ``json.loads``. Blank
    lines (for instance a trailing newline at the end of the file) are
    skipped instead of raising. The file is decoded as UTF-8 so the result
    does not depend on the platform's default encoding.

    Parameters
    ----------
    fname : path of the ``.jl`` file to read.

    Returns
    -------
    list
        One parsed object per non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(fname, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

In [None]:
# Parse the training sessions and keep only the first 80,000 of them
# (presumably to bound memory use and run time -- confirm before changing).
train_rows = jl_to_list('/kaggle/input/meli-data-challenge-2020/train_dataset.jl')[:80000]

In [None]:
# Hold out 20% of the loaded sessions for local validation. The seed is fixed
# so the split -- and every statistic derived from it -- is the same on each
# Restart & Run All.
# NOTE: this cell rebinds train_rows, so re-running it without reloading the
# data splits the already-reduced training set again.
train_rows, test_rows = train_test_split(train_rows, test_size=0.2, random_state=42)

# Bought item of every held-out session, aligned with test_rows.
y_true = [row['item_bought'] for row in test_rows]

In [None]:
# Item catalogue: the raw records, an item_id -> record lookup, and the list
# of all known item ids (used as the pool for random padding).
item_data = jl_to_list('/kaggle/input/meli-data-challenge-2020/item_data.jl')
metadata = {record['item_id']: record for record in item_data}
all_items = [*metadata]

# Domain prediction

In [None]:
# vistas_compras_dominios[viewed_domain][bought_domain] counts the training
# sessions in which some item of viewed_domain was viewed and the purchase
# fell in bought_domain. Each distinct viewed item counts once per session.
# The extra key 'Total' accumulates the sum over all bought domains.
vistas_compras_dominios = defaultdict(lambda: defaultdict(int))
for row in tqdm(train_rows):
    unique_views = set(
        ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view'
    )
    for viewed_item in unique_views:
        viewed_domain = metadata[viewed_item]['domain_id']
        bought_domain = metadata[row['item_bought']]['domain_id']
        counts = vistas_compras_dominios[viewed_domain]
        counts[bought_domain] += 1
        counts['Total'] += 1

In [None]:
# Build one candidate rule per viewed domain:
#   domain        -> the viewed domain
#   support       -> total number of (view, purchase) co-occurrences ('Total')
#   domain_bought -> the purchase domain seen most often after viewing it
#   confidence    -> the raw count for that purchase domain
# (support and confidence are turned into ratios in the next cell).
#
# 'Total' is read by key rather than assumed to be the largest entry: when
# every purchase falls in a single domain its count ties with 'Total', and a
# ranking by count would then pick 'Total' itself as the predicted domain.
dic = {}
for i, (k, v) in enumerate(tqdm(vistas_compras_dominios.items())):
    total = v['Total']
    purchases = [(bought, n) for bought, n in v.items() if bought != 'Total']
    # max() keeps the first maximal entry, i.e. the earliest-seen domain on ties.
    domain_bought, count = max(purchases, key=lambda pair: pair[1])
    dic[i] = {'domain': k, 'support': total, 'domain_bought': domain_bought, 'confidence': count}

In [None]:
# One row per rule. Turn the raw counts into ratios: confidence relative to
# the rule's own support count, support relative to the number of training
# sessions.
df = pd.DataFrame.from_dict(dic).T
df['confidence'] = df['confidence'].div(df['support'])
df['support'] = df['support'].div(len(train_rows))

In [None]:
# Discard rules with support below 0.001 or confidence below 0.32, then rank
# the survivors from most to least confident.
df = df[~(df.support < 0.001)]
df = df[~(df.confidence < 0.32)]
df = df.sort_values(by=['confidence'], ascending=False)

In [None]:
# Preview the first rows of the rule table.
df.head()

In [None]:
def predict_domain(row):
    """Guess the domain of the item a session will end up buying.

    The domains viewed in ``row`` are obtained from ``dominios_visitados``.
    The rule table ``df`` is scanned in its current row order and the first
    rule whose ``domain`` was viewed in the session decides the answer
    (its ``domain_bought``). If no rule applies, the most viewed domain of
    the session is returned.

    Parameters
    ----------
    row : session dict with a ``'user_history'`` list.

    Returns
    -------
    The predicted domain id, or ``""`` when the session has no viewed domains.
    """
    visited = dominios_visitados(row)
    if not len(visited):
        return ""

    fallback = visited.most_common(1)[0][0]
    seen = set(visited.elements())
    rules = zip(df['domain'], df['domain_bought'])
    return next((recom for domain, recom in rules if domain in seen), fallback)

# Baseline ventas por dominio

In [None]:
# ventas_por_dominio[domain][item] = number of training sessions that bought
# `item`. Every catalogue item gets an entry (0 when never bought), inserted
# in all_items order.
ventas_por_dominio = {}
for item in all_items:
    domain = metadata[item]['domain_id']
    ventas_por_dominio.setdefault(domain, {})[item] = 0

# Purchases per item over the training sessions (temporary, deleted below).
items_vendidos = Counter()
for row in train_rows:
    items_vendidos[row['item_bought']] += 1

for item, sold in items_vendidos.items():
    domain = metadata[item]['domain_id']
    ventas_por_dominio[domain][item] = sold

del items_vendidos

In [None]:
def dominios_visitados(row, max_views=25):
    """Count how often each domain appears among a session's viewed items.

    Only events with ``event_type == 'view'`` are considered, and only the
    first ``max_views`` of them in ``user_history`` order.
    NOTE(review): if the history is chronological these are the oldest views,
    not the most recent ones -- confirm that this is intended.

    Parameters
    ----------
    row : session dict with a ``'user_history'`` list of events.
    max_views : maximum number of view events taken into account.

    Returns
    -------
    collections.Counter
        ``domain_id -> number of considered views in that domain``.
    """
    viewed_items = [
        ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view'
    ]
    if len(viewed_items) > max_views:
        del viewed_items[max_views:]
    return Counter(metadata[item]['domain_id'] for item in viewed_items)

In [None]:
def top_bought_items(domain, k=10):
    """Return the ``k`` best-selling items of ``domain``.

    Sales counts come from ``ventas_por_dominio[domain]``; items are ordered
    by descending count as ranked by ``Counter.most_common``.

    Raises
    ------
    KeyError
        If ``domain`` has no entry in ``ventas_por_dominio``.
    """
    ranked = Counter(ventas_por_dominio[domain]).most_common(k)
    return [item for item, _ in ranked]

In [None]:
def top_buys_by_best_domain(row, k=10):
    """Recommend the ``k`` best sellers of the session's predicted domain.

    Sessions without any ``'view'`` event get ``k`` items drawn at random
    (with replacement) from ``all_items``. Otherwise the domain comes from
    ``predict_domain`` and the items from ``top_bought_items``.
    """
    has_views = any(ev['event_type'] == 'view' for ev in row['user_history'])
    if not has_views:
        return random.choices(all_items, k=k)
    return top_bought_items(predict_domain(row), k=k)

# Baseline vistas por dominio

In [None]:
# vistas_por_dominio[domain][item] = number of view events of `item` across
# the training sessions (repeated views within a session all count).
vistas_por_dominio = defaultdict(lambda: defaultdict(int))

for row in tqdm(train_rows):
    for ev in row['user_history']:
        if ev['event_type'] != 'view':
            continue
        item = ev['event_info']
        vistas_por_dominio[metadata[item]['domain_id']][item] += 1

In [None]:
def top_viewed_items(domain, k=10):
    """Return the ``k`` most viewed items of ``domain``.

    View counts come from ``vistas_por_dominio[domain]``; items are ordered
    by descending count as ranked by ``Counter.most_common``. The lookup is a
    plain subscript, so a mapping that creates missing keys on access (such as
    a ``defaultdict``) gains an entry for an unknown ``domain``.
    """
    ranked = Counter(vistas_por_dominio[domain]).most_common(k)
    return [item for item, _ in ranked]

In [None]:
def top_views_by_best_domain(row, k=10):
    """Recommend the ``k`` most viewed items of the session's predicted domain.

    Sessions without any ``'view'`` event get ``k`` items drawn at random
    (with replacement) from ``all_items``. Otherwise the domain comes from
    ``predict_domain`` and the items from ``top_viewed_items``.
    """
    has_views = any(ev['event_type'] == 'view' for ev in row['user_history'])
    if not has_views:
        return random.choices(all_items, k=k)
    return top_viewed_items(predict_domain(row), k=k)

# Baseline ultimos vistos

In [None]:
def last_viewed(row, k=10):
    """Recommend the session's most recently viewed items.

    View events are ordered by ``event_timestamp``, newest first, and
    de-duplicated keeping the most recent occurrence of each item. If fewer
    than ``k`` distinct items were viewed, the list is padded with items drawn
    at random (with replacement) from ``all_items``.

    The ``k`` parameter used to be ignored in favour of a hard-coded 10; it is
    now honoured. The default of 10 keeps existing calls unchanged.

    Parameters
    ----------
    row : session dict with a ``'user_history'`` list of events.
    k : length of the returned list.

    Returns
    -------
    list
        Exactly ``k`` item ids.
    """
    views = [ev for ev in row['user_history'] if ev['event_type'] == 'view']
    # Stable sort: events with equal timestamps keep their history order.
    views.sort(key=lambda ev: ev['event_timestamp'], reverse=True)

    # dict.fromkeys removes repeats while preserving first-seen order.
    recom = list(dict.fromkeys(ev['event_info'] for ev in views))[:k]

    missing = k - len(recom)
    if missing <= 0:
        return recom
    return recom + random.choices(all_items, k=missing)

# Baseline vistas-compras

In [None]:
# vistas_compras[viewed_item][bought_item] = number of view events of
# viewed_item in training sessions that ended up buying bought_item.
# Both ids are normalised with int().
vistas_compras = defaultdict(lambda: defaultdict(int))
for row in tqdm(train_rows):
    viewed_ids = (
        int(ev['event_info']) for ev in row['user_history'] if ev['event_type'] == 'view'
    )
    for viewed_id in viewed_ids:
        vistas_compras[viewed_id][int(row['item_bought'])] += 1

In [None]:
def get_item_scores(row):
    """Score candidate purchases for a session from its view events.

    For every ``'view'`` event, the ``vistas_compras`` counts recorded for the
    viewed item are added to the running score of each associated item. An
    item viewed several times contributes once per view.

    Returns
    -------
    collections.Counter
        ``item id -> accumulated score``.
    """
    scores = Counter()
    for ev in row['user_history']:
        if ev['event_type'] != 'view':
            continue
        scores.update(vistas_compras[int(ev['event_info'])])
    return scores

In [None]:
def vc_reco(row):
    """Recommend 10 items ranked by their view-to-purchase score.

    Takes the ten highest-scoring items from ``get_item_scores(row)``. When
    fewer than ten items have a score, the remainder is filled with items
    drawn at random (with replacement) from ``all_items``.

    Returns
    -------
    list
        Exactly 10 item ids.
    """
    recom = [item for item, _ in get_item_scores(row).most_common(10)]
    missing = 10 - len(recom)
    if missing == 0:
        return recom
    return recom + random.choices(all_items, k=missing)

# Ultimos vistos con relleno de vistas por dominio

In [None]:
def last_viewed_with_views_fill(row, k=10):
    """Recommend recently viewed items, padded with the predicted domain's most viewed.

    View events are ordered by ``event_timestamp``, newest first, and
    de-duplicated. If that already yields ``k`` items they are returned.
    Otherwise candidates from ``top_views_by_best_domain`` are appended,
    repeated items are removed (keeping the first occurrence) and the list is
    cut to ``k``.

    Fixes over the previous version:

    * the de-duplication result was discarded (``list(set(recom))`` with no
      assignment), so an item could be recommended twice;
    * the number of missing items was computed against a hard-coded 10
      instead of ``k``.

    Parameters
    ----------
    row : session dict with a ``'user_history'`` list of events.
    k : maximum length of the returned list.

    Returns
    -------
    list
        Up to ``k`` distinct item ids (fewer only if the filler cannot supply
        enough new items).
    """
    views = [ev for ev in row['user_history'] if ev['event_type'] == 'view']
    views.sort(key=lambda ev: ev['event_timestamp'], reverse=True)
    recom = list(dict.fromkeys(ev['event_info'] for ev in views))[:k]
    if len(recom) == k:
        return recom

    # Ask for enough candidates that k distinct items remain even if every
    # already-recommended item shows up again in the filler, plus the
    # original safety margin of 3.
    relleno = top_views_by_best_domain(row, k=k + 3)

    # Order-preserving de-duplication: viewed items first, then the filler.
    return list(dict.fromkeys(recom + relleno))[:k]

# Ultimos vistos con relleno de vistas-compras

In [None]:
def last_viewed_with_vc_reco_fill(row, k=10):
    """Recommend recently viewed items, padded with view-to-purchase recommendations.

    View events are ordered by ``event_timestamp``, newest first, and
    de-duplicated. If that already yields ``k`` items they are returned.
    Otherwise the output of ``vc_reco(row)`` is appended, repeated items are
    removed (keeping the first occurrence) and the list is cut to ``k``.

    Fix over the previous version: the de-duplication result was discarded
    (``list(set(recom))`` with no assignment), so an item could be
    recommended twice.

    Parameters
    ----------
    row : session dict with a ``'user_history'`` list of events.
    k : maximum length of the returned list.

    Returns
    -------
    list
        Up to ``k`` distinct item ids.
    """
    views = [ev for ev in row['user_history'] if ev['event_type'] == 'view']
    views.sort(key=lambda ev: ev['event_timestamp'], reverse=True)
    recom = list(dict.fromkeys(ev['event_info'] for ev in views))[:k]
    if len(recom) == k:
        return recom

    relleno = vc_reco(row)

    # Order-preserving de-duplication: viewed items first, then the filler.
    return list(dict.fromkeys(recom + relleno))[:k]

# Últimos vistos con relleno de ventas por dominio

In [None]:
def last_viewed_with_buys_fill(row, k=10):
    """Recommend recently viewed items, padded with the predicted domain's best sellers.

    View events are ordered by ``event_timestamp``, newest first, and
    de-duplicated. If that already yields ``k`` items they are returned.
    Otherwise candidates from ``top_buys_by_best_domain`` are appended,
    repeated items are removed (keeping the first occurrence) and the list is
    cut to ``k``.

    Fix over the previous version: the de-duplication result was discarded
    (``list(set(recom))`` with no assignment), so an item could be
    recommended twice.

    Parameters
    ----------
    row : session dict with a ``'user_history'`` list of events.
    k : maximum length of the returned list.

    Returns
    -------
    list
        Up to ``k`` distinct item ids (fewer only if the filler cannot supply
        enough new items).
    """
    views = [ev for ev in row['user_history'] if ev['event_type'] == 'view']
    views.sort(key=lambda ev: ev['event_timestamp'], reverse=True)
    recom = list(dict.fromkeys(ev['event_info'] for ev in views))[:k]
    if len(recom) == k:
        return recom

    # Ask for enough candidates that k distinct items remain even if every
    # already-recommended item shows up again in the filler, plus the
    # original safety margin of 3.
    relleno = top_buys_by_best_domain(row, k=k + 3)

    # Order-preserving de-duplication: viewed items first, then the filler.
    return list(dict.fromkeys(recom + relleno))[:k]

# Analysis

In [None]:
def analysis(rows):
    """Print how often simple signals agree with the actual purchase.

    For every session in ``rows`` the bought item is compared against:

    * the domain returned by ``predict_domain``;
    * the domain of the first item returned by ``last_viewed``;
    * the session's three most viewed domains (``dominios_visitados``);
    * the top-10 bought and top-10 viewed items of the bought item's domain;
    * the list returned by ``last_viewed``.

    All figures are printed as fractions of ``len(rows)``. For the ranked
    lists both the per-position and the cumulative fraction are shown.
    Nothing is returned.
    """
    data = {'last_domain_viewed': 0, 'domain_predicted': 0}
    domain_position = dict.fromkeys(range(1, 4), 0)
    # Index 0 stays at zero so that position 1 can be differenced against it.
    buy_position = dict.fromkeys(range(11), 0)
    top_viewed_position = dict.fromkeys(range(11), 0)
    top_bought_position = dict.fromkeys(range(11), 0)

    def count_hits(target, ranking, positions):
        # positions[i] counts the rows whose target is within the first i entries.
        for i in range(1, 11):
            if target in ranking[:i]:
                positions[i] += 1

    for row in tqdm(rows):
        item = row['item_bought']
        domain_bought = metadata[item]['domain_id']
        domain_predicted = predict_domain(row)

        viewed = last_viewed(row)
        count_hits(item, viewed, buy_position)
        if metadata[viewed[0]]['domain_id'] == domain_bought:
            data['last_domain_viewed'] += 1
        if domain_bought == domain_predicted:
            data['domain_predicted'] += 1

        visited = dominios_visitados(row)
        top_domains = visited.most_common(3) if len(visited) != 0 else ""
        for rank, entry in enumerate(top_domains, start=1):
            if domain_bought == entry[0]:
                domain_position[rank] += 1

        count_hits(item, top_viewed_items(domain_bought), top_viewed_position)
        count_hits(item, top_bought_items(domain_bought), top_bought_position)

    count = len(rows)

    def report(positions, where):
        # Per-position share is the difference of consecutive cumulative counts.
        for i in range(1, 11):
            percentage = (positions[i] - positions[i-1])/count
            print('Item was in position ' + str(i) + where + str(percentage))
            print('Cumulative: ' + str(positions[i]/count))

    print('Domain predicted correctly in ' + str(data['domain_predicted']/count))
    print('')
    print('Domain matches with the last domain viewed in ' + str(data['last_domain_viewed']/count))
    print('')
    for i in range(1, 4):
        print('Item belongs to ' + str(i) + ' most viewed domain in ' + str(domain_position[i]/count))
    print('')
    report(top_bought_position, ' in bought items in domain in ')
    print('')
    report(top_viewed_position, ' in viewed items in domain in ')
    print('')
    report(buy_position, ' in user last views in ')

In [None]:
# Print the diagnostics for the rows currently bound to test_rows.
analysis(test_rows)

# CSV

In [None]:
# Load the competition test file.
# NOTE(review): this rebinds test_rows; any earlier value bound to that name
# is no longer reachable after this cell runs.
test_rows = jl_to_list('/kaggle/input/meli-data-challenge-2020/test_dataset.jl')

In [None]:
# Write one CSV row per prediction list to submission.csv.
# NOTE(review): y_pred is not assigned in any cell visible here -- confirm
# which recommender is meant to produce it before running this cell.
with open('submission.csv', 'w+', newline ='') as file:
    write = csv.writer(file)
    write.writerows(y_pred)