In [1]:
from lightgbm.sklearn import LGBMRanker
import lightgbm as lgb
from datetime import timedelta
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, date, timedelta

In [2]:
# Directory the raw CSV files are read from.
INPUT_DIR = '../input/'
# Directory the processed parquet files are written to.
OUTPUT_DIR = '../output/'

In [4]:
# helper functions
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists. 0.0 is returned
            when `actual` is empty or `k` is not positive, because there is
            nothing that could be hit in either case.
    """
    # Without this guard the final division is by min(len(actual), k) == 0
    # and raises ZeroDivisionError. len() is used instead of truthiness so
    # array-like inputs work as well.
    if len(actual) == 0 or k <= 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A relevant item only counts the first time it is predicted;
        # repeated predictions of the same item are ignored.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    Each entry of `actual` is paired with the entry of `predicted` at the
    same position, scored with `apk`, and the scores are averaged.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    per_pair_scores = []
    for relevant_items, ranked_items in zip(actual, predicted):
        per_pair_scores.append(apk(relevant_items, ranked_items, k))
    return np.mean(per_pair_scores)

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal id strings to integers.

    Only the last 16 characters of every string are kept; each of them is
    then parsed element-wise with `hex_id_to_int`.
    """
    trailing_hex = series.str[-16:]
    return trailing_hex.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the trailing 16 characters of `str` as a base-16 integer.

    Strings shorter than 16 characters are parsed in full.
    """
    # NOTE: the parameter name shadows the built-in `str`; it is kept as is
    # so that keyword callers continue to work.
    trailing_hex = str[-16:]
    return int(trailing_hex, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to the ``int32`` dtype."""
    return series.astype(dtype='int32')

def article_id_int_to_str(series):
    """Render a Series of article ids as strings prefixed with a single '0'."""
    as_text = series.astype('str')
    return '0' + as_text

# Something like a label encoder would probably work here just as well.
class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    `fit` records, for each column, the values whose count is strictly
    greater than `min_examples`, in the order returned by `value_counts`.
    `transform` replaces each value by its position in that list; values
    that are not in the list (including missing values) are encoded as -1.

    Parameters
    ----------
    min_examples : int, optional
        A value must occur more than this many times in the fitted column
        to receive its own code.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # Per-column lists of known values, filled in by `fit`.
        self.categories = []

    def fit(self, X, y=None):
        """Learn the category list of every column of `X`.

        `y` is accepted and ignored so the transformer can be called as
        `fit(X, y)`, e.g. from a scikit-learn Pipeline.
        """
        # Build into a fresh list: appending to the existing one made a second
        # `fit` keep the stale lists, which `transform` (indexing by column
        # position) would then still use.
        learned = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            learned.append(vc[vc > self.min_examples].index.tolist())
        self.categories = learned
        return self

    def transform(self, X):
        """Return a DataFrame of category codes with the columns and index of `X`."""
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        # Keep the input index so that assigning the result back into the
        # source frame aligns row by row even when the index is not 0..n-1.
        return pd.DataFrame(data=data, index=X.index)

# Read Data

In [7]:
# Load the three raw tables from INPUT_DIR. article_id is parsed as a string
# in the two tables where a dtype is given.
transactions = pd.read_csv(
    os.path.join(INPUT_DIR, 'transactions_train.csv'),
    dtype={"article_id": "str"},
)
customers = pd.read_csv(
    os.path.join(INPUT_DIR, 'customers.csv'),
)
articles = pd.read_csv(
    os.path.join(INPUT_DIR, 'articles.csv'),
    dtype={"article_id": "str"},
)

In [8]:
# Replace the hex customer id by the integer value of its last 16 hex characters.
transactions['customer_id'] = customer_hex_id_to_int(transactions['customer_id'])

transactions['t_dat'] = pd.to_datetime(transactions['t_dat'], format='%Y-%m-%d')

# Week index counted back from the most recent transaction date: the last
# 7-day window is week 104, the one before it 103, and so on.
transactions['week'] = 104 - (transactions['t_dat'].max() - transactions['t_dat']).dt.days // 7

transactions['article_id'] = article_id_str_to_int(transactions['article_id'])
articles['article_id'] = article_id_str_to_int(articles['article_id'])

# Downcast to the narrow dtypes to reduce memory use.
transactions['week'] = transactions['week'].astype('int8')
transactions['sales_channel_id'] = transactions['sales_channel_id'].astype('int8')
transactions['price'] = transactions['price'].astype('float32')

customers['customer_id'] = customer_hex_id_to_int(customers['customer_id'])
for col in ['FN', 'Active', 'age']:
    # Missing values become -1 so the column can be stored as int8.
    # The filled column is assigned back rather than calling
    # fillna(inplace=True) on the `customers[col]` temporary: under pandas
    # Copy-on-Write that in-place call never reaches `customers`, the NaNs
    # survive, and the integer cast fails.
    customers[col] = customers[col].fillna(-1).astype('int8')

1362281

In [13]:
# label encode
# Each listed customer column is replaced by the integer codes produced by
# a freshly fitted Categorize.
for col in ['club_member_status', 'postal_code', 'fashion_news_frequency']:
    customers[col] = Categorize().fit_transform(customers[[col]])[col]

# First pass over articles: encode every object-dtype column.
for col in articles.columns:
    if articles[col].dtype != 'object':
        continue
    articles[col] = Categorize().fit_transform(articles[[col]])[col]

# Second pass over articles: narrow int64 columns to int32.
for col in articles.columns:
    if articles[col].dtype != 'int64':
        continue
    articles[col] = articles[col].astype('int32')

In [17]:
# Order the transactions by date, then by customer id within a date (in place).
transactions.sort_values(by=['t_dat', 'customer_id'], inplace=True)

In [19]:
# Create the destination directory first so the writes below do not fail
# on a fresh checkout where OUTPUT_DIR does not exist yet.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Persist the three preprocessed tables as parquet files under OUTPUT_DIR.
transactions.to_parquet(os.path.join(OUTPUT_DIR, 'transactions_train.parquet'))
customers.to_parquet(os.path.join(OUTPUT_DIR, 'customers.parquet'))
articles.to_parquet(os.path.join(OUTPUT_DIR, 'articles.parquet'))