In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# val

In [3]:
!pip install -q polars
!pip install -q gensim==4.0.1
import polars as pl
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

train = pl.read_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-train-and-test-data-for-local-validation/train.parquet')
test = pl.read_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-train-and-test-data-for-local-validation/test.parquet')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.2/15.2 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.9/23.9 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25h



In [4]:
sentences_df = pl.concat([train, test]).groupby('session').agg(
    pl.col('aid').alias('sentence'))
sentences = sentences_df['sentence'].to_list()

In [5]:
%%time
w2vec = Word2Vec(sentences=sentences, vector_size=32, min_count=1, workers=4)

CPU times: user 51min 4s, sys: 13.6 s, total: 51min 18s
Wall time: 17min 54s


In [6]:
%%time
!pip install -q annoy==1.17.1
from annoy import AnnoyIndex

aid2idx = {aid: i for i, aid in enumerate(w2vec.wv.index_to_key)}
index = AnnoyIndex(32, 'euclidean')

for aid, idx in aid2idx.items():
    index.add_item(idx, w2vec.wv.vectors[idx])
    
index.build(10)

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/648.0 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m648.0/648.0 KB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
CPU times: user 44.9 s, sys: 1.89 s, total: 46.8 s
Wall time: 31.6 s


True

In [7]:
import pandas as pd
import numpy as np

from collections import defaultdict

sample_sub = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/original/sample_submission.csv')

session_types = ['clicks', 'carts', 'orders']
test_session_AIDs = test.to_pandas().reset_index(drop=True).groupby('session')['aid'].apply(list)
test_session_types = test.to_pandas().reset_index(drop=True).groupby('session')['type'].apply(list)

labels = []

type_weight_multipliers = {0: 1, 1: 6, 2: 3}
for AIDs, types in zip(test_session_AIDs, test_session_types):
    if len(AIDs) >= 20:
        # if we have enough aids (over equals 20) we don't need to look for candidates! we just use the old logic
        weights=np.logspace(0.1,1,len(AIDs),base=2, endpoint=True)-1
        aids_temp=defaultdict(lambda: 0)
        for aid,w,t in zip(AIDs,weights,types): 
            aids_temp[aid]+= w * type_weight_multipliers[t]
            
        sorted_aids=[k for k, v in sorted(aids_temp.items(), key=lambda item: -item[1])]
        labels.append(sorted_aids[:20])
    else:
        # here we don't have 20 aids to output -- we will use word2vec embeddings to generate candidates!
        AIDs = list(dict.fromkeys(AIDs[::-1]))
        
        # let's grab the most recent aid
        most_recent_aid = AIDs[0]
        
        # and look for some neighbors!
        nns = [w2vec.wv.index_to_key[i] for i in index.get_nns_by_item(aid2idx[most_recent_aid], 21)[1:]]
                        
        labels.append((AIDs+nns)[:20])

In [8]:
labels_as_strings = [' '.join([str(l) for l in lls]) for lls in labels]

predictions = pd.DataFrame(data={'session_type': test_session_AIDs.index, 'labels': labels_as_strings})

prediction_dfs = []

for st in session_types:
    modified_predictions = predictions.copy()
    modified_predictions.session_type = modified_predictions.session_type.astype('str') + f'_{st}'
    prediction_dfs.append(modified_predictions)

submission = pd.concat(prediction_dfs).reset_index(drop=True)
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/word2vec/val-word2vec1.csv', index=False)

In [5]:
import pandas as pd
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/word2vec/val_word2vec1.csv')

In [6]:
submission_clicks = submission[submission.session_type.str.contains('clicks')]
submission_carts = submission[submission.session_type.str.contains('carts')]
submission_orders = submission[submission.session_type.str.contains('orders')]

In [7]:
def pprocess(sub):
    sub.session_type = sub.session_type.apply(lambda x:x.split('_')[0])
    sub.labels = sub.labels.apply(lambda x:x.split(' '))
    sub = pd.DataFrame([i for k in sub.labels for i in k],
                            index=[sub.session_type.iloc[i] for k in range(len(sub)) for i in range(len(sub.iloc[k].labels))],columns=['item'])
    sub.item = sub.astype(int)
    return sub

In [8]:
val_clicks = pprocess(submission_clicks)
val_clicks.to_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/word2vec/val_clicks1.pqt')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [9]:
val_carts = pprocess(submission_carts)
val_carts.to_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/word2vec/val_carts1.pqt')

In [10]:
val_orders = pprocess(submission_orders)
val_orders.to_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/word2vec/val_orders1.pqt')

# test

In [16]:
train = pl.read_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-full-optimized-memory-footprint/train.parquet')
test = pl.read_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-full-optimized-memory-footprint/test.parquet')

In [17]:
sentences_df = pl.concat([train, test]).groupby('session').agg(
    pl.col('aid').alias('sentence'))
sentences = sentences_df['sentence'].to_list()

In [18]:
%%time
w2vec = Word2Vec(sentences=sentences, vector_size=32, min_count=1, workers=4)

CPU times: user 1h 12min 4s, sys: 19.9 s, total: 1h 12min 24s
Wall time: 24min 21s


In [19]:
%%time
!pip install -q annoy==1.17.1
from annoy import AnnoyIndex

aid2idx = {aid: i for i, aid in enumerate(w2vec.wv.index_to_key)}
index = AnnoyIndex(32, 'euclidean')

for aid, idx in aid2idx.items():
    index.add_item(idx, w2vec.wv.vectors[idx])
    
index.build(10)

CPU times: user 43.9 s, sys: 2.13 s, total: 46 s
Wall time: 20.9 s


True

In [20]:
import pandas as pd
import numpy as np

from collections import defaultdict

sample_sub = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/original/sample_submission.csv')

session_types = ['clicks', 'carts', 'orders']
test_session_AIDs = test.to_pandas().reset_index(drop=True).groupby('session')['aid'].apply(list)
test_session_types = test.to_pandas().reset_index(drop=True).groupby('session')['type'].apply(list)

labels = []

type_weight_multipliers = {0: 1, 1: 6, 2: 3}
for AIDs, types in zip(test_session_AIDs, test_session_types):
    if len(AIDs) >= 20:
        # if we have enough aids (over equals 20) we don't need to look for candidates! we just use the old logic
        weights=np.logspace(0.1,1,len(AIDs),base=2, endpoint=True)-1
        aids_temp=defaultdict(lambda: 0)
        for aid,w,t in zip(AIDs,weights,types): 
            aids_temp[aid]+= w * type_weight_multipliers[t]
            
        sorted_aids=[k for k, v in sorted(aids_temp.items(), key=lambda item: -item[1])]
        labels.append(sorted_aids[:20])
    else:
        # here we don't have 20 aids to output -- we will use word2vec embeddings to generate candidates!
        AIDs = list(dict.fromkeys(AIDs[::-1]))
        
        # let's grab the most recent aid
        most_recent_aid = AIDs[0]
        
        # and look for some neighbors!
        nns = [w2vec.wv.index_to_key[i] for i in index.get_nns_by_item(aid2idx[most_recent_aid], 21)[1:]]
                        
        labels.append((AIDs+nns)[:20])

In [21]:
labels_as_strings = [' '.join([str(l) for l in lls]) for lls in labels]

predictions = pd.DataFrame(data={'session_type': test_session_AIDs.index, 'labels': labels_as_strings})

prediction_dfs = []

for st in session_types:
    modified_predictions = predictions.copy()
    modified_predictions.session_type = modified_predictions.session_type.astype('str') + f'_{st}'
    prediction_dfs.append(modified_predictions)

submission = pd.concat(prediction_dfs).reset_index(drop=True)
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/word2vec/test-word2vec1.csv', index=False)

In [68]:
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/word2vec/test-word2vec1.csv')

In [69]:
submission_clicks = submission[submission.session_type.str.contains('clicks')]
submission_clicks.shape

(1671803, 2)

In [70]:
submission_carts = submission[submission.session_type.str.contains('carts')]
submission_carts.shape

(1671803, 2)

In [71]:
submission_orders = submission[submission.session_type.str.contains('orders')]
submission_orders.shape

(1671803, 2)

In [None]:
submission_clicks.session_type = submission_clicks.session_type.apply(lambda x:x.split('_')[0])
submission_clicks.labels = submission_clicks.labels.apply(lambda x:x.split(' '))
test_clicks = pd.DataFrame([i for k in submission_clicks.labels for i in k],
                           index=[submission_clicks.session_type.iloc[i] for k in range(len(submission_clicks)) for i in range(len(submission_clicks.iloc[k].labels))],columns=['item'])
test_clicks.item = test_clicks.astype(int)

In [108]:
test_clicks.to_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/word2vec/test_clicks1.pqt')

In [104]:
def pprocess(sub):
    sub.session_type = sub.session_type.apply(lambda x:x.split('_')[0])
    sub.labels = sub.labels.apply(lambda x:x.split(' '))
    sub = pd.DataFrame([i for k in sub.labels for i in k],
                            index=[sub.session_type.iloc[i] for k in range(len(sub)) for i in range(len(sub.iloc[k].labels))],columns=['item'])
    sub.item = sub.astype(int)
    return sub

In [105]:
test_carts = pprocess(submission_carts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [109]:
test_carts.to_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/word2vec/test_carts1.pqt')

In [114]:
test_orders = pprocess(submission_orders)
test_orders.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,item
12899779,59625
12899780,1750518
12899781,1036258
12899782,1591716
12899783,133038


In [115]:
test_orders.to_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/word2vec/test_orders1.pqt')