# Imports 📥

**🟦EN**



**🟦EN** Install [polars](https://www.pola.rs/)



In [None]:
!pip install polars

Collecting polars
  Downloading polars-0.15.8-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: polars
Successfully installed polars-0.15.8
[0m

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import multiprocessing
import polars as pl
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import os
from annoy import AnnoyIndex
import collections

# Constants 📋

In [None]:
# Input directory and the two parquet files that are read from it.
PATH = "/kaggle/input/otto-full-optimized-memory-footprint"
TRAIN_NAME = "train.parquet"
TEST_NAME = "test.parquet"

# Full paths, built the same way for both files.
TRAIN_PATH, TEST_PATH = (
    os.path.join(PATH, file_name) for file_name in (TRAIN_NAME, TEST_NAME)
)

# Functions 📘

In [None]:
def m20_mult(x, n_length, ann_index=None):
    """Pad a session's item list up to 20 entries with nearest-neighbour items.

    Only the last 20 entries of ``x`` are kept. If fewer than 20 remain, the
    list is topped up with neighbours looked up in an Annoy-style index:

    * if ``len(x) >= n_length`` the neighbours of the last ``n_length`` items
      (capped at 19) are pooled and ranked by how many of those items they
      are a neighbour of; ties keep the order in which they were first seen,
      starting from the last item of ``x``;
    * otherwise only the neighbours of the last item are used, in the order
      the index returns them.

    Candidates that are already in ``x`` (including the query item itself)
    are never added again.

    Parameters
    ----------
    x : sequence
        Item ids of one session; only its last 20 entries are used.
    n_length : int
        How many trailing items of ``x`` to query for neighbours.
    ann_index : object, optional
        Anything exposing ``get_nns_by_item(item, n)``. Defaults to the
        notebook-level ``index``.

    Returns
    -------
    list
        At most 20 items: the kept tail of ``x`` followed by the selected
        neighbours. An empty ``x`` yields an empty list.
    """
    if ann_index is None:
        # Fall back to the global index created elsewhere in the notebook.
        ann_index = index

    x = list(x[-20:])
    n_missing = 20 - len(x)
    if not x or n_missing <= 0:
        # Nothing to query with, or nothing left to fill.
        return x

    n_length = min(n_length, 19)
    # One extra neighbour is requested because the query item is normally
    # returned as its own nearest neighbour.
    n_neighbors = n_missing + 1
    already_present = set(x)

    if len(x) >= n_length:
        # Start from an empty tally; seeding it with placeholder values would
        # let the placeholder win the vote and end up in the output.
        votes = collections.Counter()
        for offset in range(1, n_length + 1):
            neighbours = ann_index.get_nns_by_item(x[-offset], n_neighbors)
            votes.update(
                candidate for candidate in neighbours
                if candidate not in already_present
            )
        res = [candidate for candidate, _ in votes.most_common()]
    else:
        neighbours = ann_index.get_nns_by_item(x[-1], n_neighbors)
        res = [
            candidate for candidate in neighbours
            if candidate not in already_present
        ]

    return x + res[:n_missing]



# Data modification 📊

**🟦EN** Load the data using *polars* and its *read_parquet* function



In [None]:
# Read the train and test parquet files (in that order) into polars DataFrames.
train_df, test_df = (
    pl.read_parquet(parquet_path) for parquet_path in (TRAIN_PATH, TEST_PATH)
)

**🟦EN** Show the raw data



In [None]:
# Preview the first rows of the training events.
train_df.head()

session,aid,ts,type
i32,i32,i32,u8
0,1517085,1659304800,0
0,1563459,1659304904,0
0,1309446,1659367439,0
0,16246,1659367719,0
0,1781822,1659367871,0


In [None]:
# Preview the first rows of the test events.
test_df.head()

session,aid,ts,type
i32,i32,i32,u8
12899779,59625,1661724000,0
12899780,1142000,1661724000,0
12899780,582732,1661724058,0
12899780,973453,1661724109,0
12899780,736515,1661724136,0


In [None]:
# One "sentence" per session: the list of aids of that session, built from
# the train and test events together.
sentences_df = (
    pl.concat([train_df, test_df])
    .groupby("session")
    .agg(pl.col("aid").alias("sentence"))
)

In [None]:
# Preview the per-session aid lists.
sentences_df.head()

session,sentence
i32,list[i32]
585540,"[1390152, 450695, ... 1132355]"
1777864,"[1502122, 822934, ... 758858]"
8497000,"[1517086, 647838, ... 1448942]"
10937276,"[1843722, 1843722, ... 462858]"
12885164,"[100178, 1635884, ... 304832]"


**🟦EN** Modify the data to improve the results of the model.



In [None]:
# Per-session aid lists for the test set only. The frame is grouped directly:
# wrapping a single frame in pl.concat([...]) adds nothing.
test_pred_df = (
    test_df
    .groupby("session")
    .agg(pl.col("aid").alias("sentence"))
)
test_pred_df.head()

session,sentence
i32,list[i32]
13173732,"[176502, 176502, ... 1716698]"
13198020,"[79595, 268265]"
14279316,"[1157167, 1793236]"
13230216,[1289949]
13539380,"[465861, 579690, 1296039]"


In [None]:
# Same preview as shown earlier; sentences_df has not been reassigned since it was built.
sentences_df.head()

session,sentence
i32,list[i32]
585540,"[1390152, 450695, ... 1132355]"
1777864,"[1502122, 822934, ... 758858]"
8497000,"[1517086, 647838, ... 1448942]"
10937276,"[1843722, 1843722, ... 462858]"
12885164,"[100178, 1635884, ... 304832]"


In [None]:
# Switch the per-session frame to pandas; 'labels' holds every aid of the session.
test_pred_df = test_pred_df.to_pandas().rename(columns={'sentence':'labels'})


def _session_aids_of_type(events_df, type_code, column_name):
    """Return a pandas frame with, per session, the aids of the events whose
    'type' equals ``type_code``; the list column is named ``column_name``."""
    return (
        events_df
        .filter(pl.col('type') == type_code)
        .groupby('session')
        .agg(pl.col('aid').alias('sentence'))
        .to_pandas()
        .rename(columns={'sentence': column_name})
    )


# One frame per event type (0 -> clicks, 1 -> carts, 2 -> orders), all built
# by the same helper instead of three copies of the same pipeline.
sentences_df_clicks = _session_aids_of_type(test_df, 0, 'labels_clicks')
sentences_df_carts = _session_aids_of_type(test_df, 1, 'labels_carts')
sentences_df_orders = _session_aids_of_type(test_df, 2, 'labels_orders')

In [None]:
# Left-join each per-type frame onto the session frame, in the order
# clicks, carts, orders; every session row of test_pred_df is kept.
for per_type_df in (sentences_df_clicks, sentences_df_carts, sentences_df_orders):
    test_pred_df = test_pred_df.merge(per_type_df, how='left', on='session')
test_pred_df.head()

Unnamed: 0,session,labels,labels_clicks,labels_carts,labels_orders
0,13173732,"[176502, 176502, 964381, 964381, 1334074, 1334...","[176502, 176502, 964381, 964381, 1334074, 1334...",,
1,13198020,"[79595, 268265]","[79595, 268265]",,
2,14279316,"[1157167, 1793236]","[1157167, 1793236]",,
3,13230216,[1289949],[1289949],,
4,13539380,"[465861, 579690, 1296039]","[465861, 579690, 1296039]",,


In [None]:
# Plain Python list of per-session aid lists (the 'sentence' column).
sentences_list = sentences_df.get_column('sentence').to_list()

In [None]:
# Use all cores but one, and never fewer than one worker:
# cpu_count() - 1 evaluates to 0 on a single-core machine.
n_cores = max(1, multiprocessing.cpu_count() - 1)

# Train Word2Vec on the per-session aid lists.
w2v = Word2Vec(
    sentences = sentences_list,
    vector_size = 100,
    alpha = 0.02,
    min_alpha = 0.01,
    min_count = 1,  # keep every aid, including those that occur only once
    workers = n_cores)

**🟦EN** We save the trained model to disk and load it back; the reloaded model is used to predict the results.



In [None]:
# Persist the trained model, then read it back from the same file into `model`.
model_file = "w2vc.model"
w2v.save(model_file)
model = Word2Vec.load(model_file)

In [None]:
# Position of each aid in the Word2Vec vocabulary, i.e. its row in model.wv.vectors.
aid2idx = {aid: i for i, aid in enumerate(model.wv.index_to_key)}

# The Annoy dimensionality has to equal the embedding width, so it is read
# from the model rather than repeating the literal used at training time.
index = AnnoyIndex(model.wv.vector_size, 'angular')

# Items are keyed by the aid itself, so neighbour queries return aids directly.
for aid, idx in aid2idx.items():
    index.add_item(aid, model.wv.vectors[idx])

index.build(50)

True

**🟦EN** Split the labels by carts, orders or clicks.



In [None]:
# Where a session has no value in a per-type column, fall back to its full
# 'labels' list. Columns are processed in the order carts, orders, clicks.
for per_type_column in ('labels_carts', 'labels_orders', 'labels_clicks'):
    test_pred_df[per_type_column] = test_pred_df[per_type_column].fillna(test_pred_df['labels'])

In [None]:
def _unique_keep_latest(aids):
    """Drop repeated aids while preserving order; each aid stays at the
    position of its last occurrence."""
    return list(dict.fromkeys(reversed(list(aids))))[::-1]


def _tag_sessions(frame, suffix):
    """Return a copy of ``frame`` whose 'session' values are strings ending in ``suffix``."""
    tagged = frame.copy()
    tagged.session = tagged.session.apply(lambda session_id: str(session_id) + suffix)
    return tagged


# De-duplicate without scrambling the list: m20_mult keeps the last 20 entries
# and queries neighbours starting from the end of the list, so the order of
# the aids must survive this step (a plain set() would not preserve it).
# NOTE(review): assumes each list is in event order - the input rows appear
# sorted by ts; confirm.
test_pred_df['labels'] = test_pred_df.labels.apply(_unique_keep_latest)
# Top every session up to 20 aids, then serialise as a space-separated string.
test_pred_df['labels'] = test_pred_df.labels.apply(lambda x: m20_mult(x, len(x)))
test_pred_df['labels'] = test_pred_df.labels.apply(lambda x: " ".join(map(str,x)))
test_pred_df = test_pred_df.drop(['labels_clicks', 'labels_carts', 'labels_orders'], axis=1)

# The same predictions are emitted once per event type, distinguished only by
# the suffix appended to the session id.
clicks_pred_df = _tag_sessions(test_pred_df, '_clicks')
orders_pred_df = _tag_sessions(test_pred_df, '_orders')
carts_pred_df = _tag_sessions(test_pred_df, '_carts')

In [None]:
# Stack the three per-type frames (clicks, orders, carts) and name the two columns.
per_type_predictions = [clicks_pred_df, orders_pred_df, carts_pred_df]
pred_df = pd.concat(per_type_predictions)
pred_df.columns = ['session_type', 'labels']

**🟦EN** Predict the results


In [None]:
# Order rows by session_type and replace the index with a fresh 0..n-1 range,
# discarding the old index values.
pred_df = pred_df.sort_values(by='session_type').reset_index(drop=True)

## Submission 📤

In [None]:
# Write the predictions to submission.csv without the row index column.
pred_df.to_csv("submission.csv", index=False)