In [215]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

# Transform from hexadecimal to decimal number
# from str(64bytes) to int(64bits)
def hex_id_to_int(string):
    return int(string[-16:], 16)

# Transform from int(64bits) to int(32bits)
def article_id_str_to_int(series):
    return series.astype("int32")

# Transform(from int 32bits to strings) and pre zero padding
def article_id_int_to_str(series):
    return "0" + series.astype("str")

In [216]:
import os
import pandas as pd

# Load Data
DIR = "/Users/seema/Downloads/h-and-m-personalized-fashion-recommendations"
transactions = pd.read_csv(os.path.join(DIR, "transactions_train.csv"))
articles = pd.read_csv(os.path.join(DIR, "articles.csv"))
customers = pd.read_csv(os.path.join(DIR, "customers.csv"))

#Convert customer ids which are in hexadecimal to integers
# Use "customer_hex_id_to_int" 
#display(customers["customer_id"])  # Original data
display(customer_hex_id_to_int(customers["customer_id"]))  # Applied data


0           6883939031699146327
1          11246327431398957306
2          18439897732908966680
3          18352672461570950206
4          18162778555210377306
                   ...         
1371975     7551062398649767985
1371976     9305341941720086711
1371977    10160427316885688932
1371978     2551401172826382186
1371979    16929951892735599169
Name: customer_id, Length: 1371980, dtype: uint64

In [217]:
# Convert article_id in "transactions" to string
# Use "article_id_str_to_int" and "atticle_id_int_to_str"
#display(transactions["article_id"])   # Original data
#display(article_id_str_to_int(transactions["article_id"]))  # Transform from int64 to int 32
display(article_id_int_to_str(transactions["article_id"]))  # Transform from int to str and zero padding

0           0663713001
1           0541518023
2           0505221004
3           0685687003
4           0685687004
               ...    
31788319    0929511001
31788320    0891322004
31788321    0918325001
31788322    0833459002
31788323    0898573003
Name: article_id, Length: 31788324, dtype: object

In [218]:
#Add new columns to transaction table. Transaction year, week,. For each customer post 2020-05-04 which week, year did they purchase artcile
# Create new columns
transactions["t_dat"] = pd.to_datetime(transactions["t_dat"])
transactions["year"] = transactions["t_dat"].dt.year

shape_2018 = transactions.query("year == 2018").shape[0]
shape_2019 = transactions.query("year == 2019").shape[0] + shape_2018
shape_2020 = transactions.query("year == 2020").shape[0] + shape_2019

transactions["week"] = transactions["t_dat"].dt.week
transactions.loc[shape_2018:shape_2019-1, "week"] = transactions.loc[shape_2018:shape_2019-1, "week"] + transactions.query("year == 2018")["week"].max() - 1
transactions.loc[shape_2019:shape_2020-1, "week"] = transactions.loc[shape_2019:shape_2020-1, "week"] + transactions.query("year == 2019")["week"].max() - 1
transactions["week"] = transactions["week"] - 37

transactions = transactions.query("t_dat >= '2020-05-04' & t_dat <= '2020-06-29'")
display(transactions)
display(transactions.groupby(["week"])["t_dat"].agg(["min", "max"]))

  transactions["week"] = transactions["t_dat"].dt.week


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week
25360130,2020-05-04,0001177027259b455f979d85a278e4b280205d4de5cce4...,863456003,0.025407,2,2020,84
25360131,2020-05-04,0001177027259b455f979d85a278e4b280205d4de5cce4...,863456003,0.025407,2,2020,84
25360132,2020-05-04,0001177027259b455f979d85a278e4b280205d4de5cce4...,570002090,0.013542,2,2020,84
25360133,2020-05-04,0001177027259b455f979d85a278e4b280205d4de5cce4...,570002090,0.013542,2,2020,84
25360134,2020-05-04,0001177027259b455f979d85a278e4b280205d4de5cce4...,863456005,0.025407,2,2020,84
...,...,...,...,...,...,...,...
28340203,2020-06-29,fffa67737587e52ff1afa9c7c6490b5eb7acbc439fe82b...,857030001,0.050831,2,2020,92
28340204,2020-06-29,fffa67737587e52ff1afa9c7c6490b5eb7acbc439fe82b...,830489004,0.022017,2,2020,92
28340205,2020-06-29,fffd21f1e18d27df08db7814e8d76be0d59903d6d42fd4...,826838001,0.022017,2,2020,92
28340206,2020-06-29,fffd21f1e18d27df08db7814e8d76be0d59903d6d42fd4...,817110001,0.022017,2,2020,92


Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
84,2020-05-04,2020-05-10
85,2020-05-11,2020-05-17
86,2020-05-18,2020-05-24
87,2020-05-25,2020-05-31
88,2020-06-01,2020-06-07
89,2020-06-08,2020-06-14
90,2020-06-15,2020-06-21
91,2020-06-22,2020-06-28
92,2020-06-29,2020-06-29


In [231]:
# List of weeks when each customer bought an article.
# Transform "custoemr_id" hexadecimal to decimal number
transactions["customer_id"] = customer_hex_id_to_int(transactions["customer_id"])

# To merge customers and transactions so transform "customer_id" as same
customers["customer_id"] = customer_hex_id_to_int(customers["customer_id"])

# Results is slightly different due to day of the start week is Monday
c2weeks = transactions.groupby(["customer_id"])["week"].unique()
display(c2weeks)

customer_id
28847241659200                      [85, 91]
41318098387474                      [87, 90]
77117344919861                      [84, 90]
200292573348128         [84, 85, 87, 89, 91]
248294615847351                     [90, 91]
                                ...         
18446419945542455645                    [90]
18446420423308293068                [84, 89]
18446433324124325877                [85, 89]
18446566209623725451                    [91]
18446571879212697038                [87, 91]
Name: week, Length: 423963, dtype: object

In [232]:
#Creating dictionary with the weeks a customer purchased: the week the custgomer purchased next(for last item test week)
# Test week is the week after the last data of the train data
test_week = transactions["week"].max() + 1
print("test_week=", test_week)

c2weeks2shifted_weeks = dict()

for c_id, weeks in c2weeks.items():  # customer_id and list of weeks when the customer bought an article.
    c2weeks2shifted_weeks[c_id] = dict()
    for i in range(weeks.shape[0]-1):
        # key: the week when customer bought an article value: the week when customer bought an article next.
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

# Display an example
print(c2weeks2shifted_weeks[28847241659200])

print(c2weeks2shifted_weeks[272412481300040])

test_week= 93
{85: 91, 91: 93}
{85: 91, 91: 93}


In [233]:
# Make candidates_last_purchase table which has the transactions table+ week of next purchase+ year
# Copy transactions data and week is replaced the customer bought and article next
candidates_last_purchase = transactions.copy()

weeks = []
# Create list to replace with next week by using dict
for i, (c_id, week) in enumerate(zip(transactions["customer_id"], transactions["week"])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])

candidates_last_purchase["week"] = weeks

# Display replace data
display(candidates_last_purchase[candidates_last_purchase['customer_id']==272412481300040])

# Display original transactions data to compare
display(transactions[transactions['customer_id']==272412481300040])

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased
25719491,2020-05-14,272412481300040,855249006,0.045746,2,2020,91,1
25719492,2020-05-14,272412481300040,827370001,0.022864,2,2020,91,1
25719493,2020-05-14,272412481300040,799365002,0.045746,2,2020,91,1
25719494,2020-05-14,272412481300040,855249008,0.045746,2,2020,91,1
25719495,2020-05-14,272412481300040,857224001,0.030492,2,2020,91,1
27739098,2020-06-22,272412481300040,825581001,0.025407,1,2020,93,1
27739099,2020-06-22,272412481300040,877643001,0.027102,1,2020,93,1


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased
25719491,2020-05-14,272412481300040,855249006,0.045746,2,2020,85,1
25719492,2020-05-14,272412481300040,827370001,0.022864,2,2020,85,1
25719493,2020-05-14,272412481300040,799365002,0.045746,2,2020,85,1
25719494,2020-05-14,272412481300040,855249008,0.045746,2,2020,85,1
25719495,2020-05-14,272412481300040,857224001,0.030492,2,2020,85,1
27739098,2020-06-22,272412481300040,825581001,0.025407,1,2020,91,1
27739099,2020-06-22,272412481300040,877643001,0.027102,1,2020,91,1


In [234]:
#For each week in transactions orginal+week+year per article id whats the mean price(multiple customers price mean)
# Calculate mean price for a week
mean_price = transactions.groupby(["week", "article_id"])["price"].mean()


In [235]:
#For each week, count no. of articles of each type sold and then find the top 12 most popular articles and rank them in descending order
sales = transactions \
    .groupby(["week"])["article_id"].value_counts() \
    .groupby(["week"]).rank(method="dense", ascending=False) \
    .groupby(["week"]).head(5).rename("bestseller_rank").astype("int8")

display(sales.head(5))
display("counts=", transactions.groupby(["week"])["article_id"].value_counts())


week  article_id
84    841260011     1
      741356002     2
      720125001     3
      706016001     4
      610776001     5
Name: bestseller_rank, dtype: int8

'counts='

week  article_id
84    841260011     711
      741356002     663
      720125001     547
      706016001     514
      610776001     487
                   ... 
92    921090001       1
      921531002       1
      923466001       1
      924699001       1
      926739002       1
Name: article_id, Length: 181172, dtype: int64

In [236]:
# Divide their processings to understand what did in this code 
# line 1

# Count number of sold articles in each week
display(transactions.groupby(["week"])["article_id"].value_counts())

week  article_id
84    841260011     711
      741356002     663
      720125001     547
      706016001     514
      610776001     487
                   ... 
92    921090001       1
      921531002       1
      923466001       1
      924699001       1
      926739002       1
Name: article_id, Length: 181172, dtype: int64

In [237]:
# Merge data week wise top 12 artciles and their. mean price
bestsellers_previous_week = pd.merge(sales, mean_price, on=["week", "article_id"]).reset_index()
bestsellers_previous_week["week"] += 1

display(bestsellers_previous_week.query("week == 90"))

Unnamed: 0,week,article_id,bestseller_rank,price
25,90,817472002,1,0.016646
26,90,599580038,2,0.016756
27,90,372860001,3,0.013053
28,90,879781003,4,0.016618
29,90,599580052,5,0.016633


In [239]:
#In each week only consider 1 transaction for each customer(the first transaction)
# This code is equal following
# transactions.drop_duplicates(["week", "customer_id"])
display(transactions.head(25))
unique_transactions = transactions \
    .groupby(["week", "customer_id"]) \
    .head(1) \
    .drop(columns=["article_id", "price"]) \
    .copy()

display(unique_transactions)
display(unique_transactions.shape)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased
25360130,2020-05-04,9497171930054871954,863456003,0.025407,2,2020,84,1
25360131,2020-05-04,9497171930054871954,863456003,0.025407,2,2020,84,1
25360132,2020-05-04,9497171930054871954,570002090,0.013542,2,2020,84,1
25360133,2020-05-04,9497171930054871954,570002090,0.013542,2,2020,84,1
25360134,2020-05-04,9497171930054871954,863456005,0.025407,2,2020,84,1
25360135,2020-05-04,9497171930054871954,863456005,0.025407,2,2020,84,1
25360136,2020-05-04,10027439189241372429,849859002,0.010661,2,2020,84,1
25360137,2020-05-04,10027439189241372429,780297001,0.022864,2,2020,84,1
25360138,2020-05-04,10027439189241372429,784396001,0.019814,2,2020,84,1
25360139,2020-05-04,10027439189241372429,675027003,0.013712,2,2020,84,1


Unnamed: 0,t_dat,customer_id,sales_channel_id,year,week,purchased
25360130,2020-05-04,9497171930054871954,2,2020,84,1
25360136,2020-05-04,10027439189241372429,2,2020,84,1
25360141,2020-05-04,3643303674040122763,2,2020,84,1
25360144,2020-05-04,5066037363543359322,2,2020,84,1
25360146,2020-05-04,960034110748388768,2,2020,84,1
...,...,...,...,...,...,...
28340186,2020-06-29,17055865410987022059,1,2020,92,1
28340189,2020-06-29,12805956369521093411,1,2020,92,1
28340195,2020-06-29,17924054183488701256,1,2020,92,1
28340201,2020-06-29,2122442112338100006,2,2020,92,1


(702030, 6)

In [241]:
# Merge data #week is previous week
#for each customer, in the last week they purchased, get the top 12 items in ranked and mean price
candidates_bestsellers = pd.merge(unique_transactions, bestsellers_previous_week, on="week")
display(candidates_bestsellers.head(5))


Unnamed: 0,t_dat,customer_id,sales_channel_id,year,week,purchased,article_id,bestseller_rank,price
0,2020-05-11,7433207736229661742,1,2020,85,1,841260011,1,0.016012
1,2020-05-11,7433207736229661742,1,2020,85,1,741356002,2,0.03226
2,2020-05-11,7433207736229661742,1,2020,85,1,720125001,3,0.032061
3,2020-05-11,7433207736229661742,1,2020,85,1,706016001,4,0.03241
4,2020-05-11,7433207736229661742,1,2020,85,1,610776001,5,0.008125


In [242]:
#get only uniqiue customers
test_set_transactions = unique_transactions.drop_duplicates("customer_id").reset_index(drop=True)
display(test_set_transactions)
test_set_transactions["week"] = test_week #test_week=105 for all customers #for each customer test_week is unique its the week after the week of latest purchase for hat customer

display(test_set_transactions.head(5))

Unnamed: 0,t_dat,customer_id,sales_channel_id,year,week,purchased
0,2020-05-04,9497171930054871954,2,2020,84,1
1,2020-05-04,10027439189241372429,2,2020,84,1
2,2020-05-04,3643303674040122763,2,2020,84,1
3,2020-05-04,5066037363543359322,2,2020,84,1
4,2020-05-04,960034110748388768,2,2020,84,1
...,...,...,...,...,...,...
423958,2020-06-29,11862473267660028250,2,2020,92,1
423959,2020-06-29,5169170505799009956,1,2020,92,1
423960,2020-06-29,17055865410987022059,1,2020,92,1
423961,2020-06-29,12805956369521093411,1,2020,92,1


Unnamed: 0,t_dat,customer_id,sales_channel_id,year,week,purchased
0,2020-05-04,9497171930054871954,2,2020,93,1
1,2020-05-04,10027439189241372429,2,2020,93,1
2,2020-05-04,3643303674040122763,2,2020,93,1
3,2020-05-04,5066037363543359322,2,2020,93,1
4,2020-05-04,960034110748388768,2,2020,93,1


In [220]:
# Transactions data last bought before test term and best sellers
candidates_bestsellers_test_week = pd.merge(test_set_transactions,
                                            bestsellers_previous_week,
                                            on="week")

display(candidates_bestsellers_test_week.head(5))# week column is the test week

Unnamed: 0,t_dat,customer_id,sales_channel_id,year,week,article_id,bestseller_rank,price
0,2020-05-04,9497171930054871954,2,2020,93,881031001,1,0.0247
1,2020-05-04,9497171930054871954,2,2020,93,866383006,2,0.024703
2,2020-05-04,9497171930054871954,2,2020,93,871581002,3,0.016446
3,2020-05-04,9497171930054871954,2,2020,93,372860069,4,0.005229
4,2020-05-04,9497171930054871954,2,2020,93,817353008,5,0.023234


In [243]:
# Merge train term data and test term data
#display(candidates_bestsellers, candidates_bestsellers_test_week)
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
# drop bestseller_rank column
candidates_bestsellers = candidates_bestsellers.drop("bestseller_rank", axis=1)
#display(candidates_bestsellers)

In [244]:
candidates_bestsellers #for each candidate the top 12 best sellers without the rank

Unnamed: 0,t_dat,customer_id,sales_channel_id,year,week,purchased,article_id,price
0,2020-05-11,7433207736229661742,1,2020,85,1.0,841260011,0.016012
1,2020-05-11,7433207736229661742,1,2020,85,1.0,741356002,0.032260
2,2020-05-11,7433207736229661742,1,2020,85,1.0,720125001,0.032061
3,2020-05-11,7433207736229661742,1,2020,85,1.0,706016001,0.032410
4,2020-05-11,7433207736229661742,1,2020,85,1.0,610776001,0.008125
...,...,...,...,...,...,...,...,...
5087551,2020-06-29,17924054183488701256,1,2020,93,,866261001,0.016751
5087552,2020-06-29,17924054183488701256,1,2020,93,,880749002,0.021444
5087553,2020-06-29,17924054183488701256,1,2020,93,,733749001,0.004975
5087554,2020-06-29,17924054183488701256,1,2020,93,,874078003,0.016748


In [245]:
transactions['purchased'] = 1

In [246]:

data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True) # purchased for future trans ations(test week) will be NaN

In [247]:
data.head(5)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased
25360130,2020-05-04,9497171930054871954,863456003,0.025407,2,2020,84,1.0
25360131,2020-05-04,9497171930054871954,863456003,0.025407,2,2020,84,1.0
25360132,2020-05-04,9497171930054871954,570002090,0.013542,2,2020,84,1.0
25360133,2020-05-04,9497171930054871954,570002090,0.013542,2,2020,84,1.0
25360134,2020-05-04,9497171930054871954,863456005,0.025407,2,2020,84,1.0


In [248]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)
display(data)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased
25360130,2020-05-04,9497171930054871954,863456003,0.025407,2,2020,84,1.0
25360132,2020-05-04,9497171930054871954,570002090,0.013542,2,2020,84,1.0
25360134,2020-05-04,9497171930054871954,863456005,0.025407,2,2020,84,1.0
25360136,2020-05-04,10027439189241372429,849859002,0.010661,2,2020,84,1.0
25360137,2020-05-04,10027439189241372429,780297001,0.022864,2,2020,84,1.0
...,...,...,...,...,...,...,...,...
5087551,2020-06-29,17924054183488701256,866261001,0.016751,1,2020,93,0.0
5087552,2020-06-29,17924054183488701256,880749002,0.021444,1,2020,93,0.0
5087553,2020-06-29,17924054183488701256,733749001,0.004975,1,2020,93,0.0
5087554,2020-06-29,17924054183488701256,874078003,0.016748,1,2020,93,0.0


In [249]:
data.purchased.mean()

0.6259428581612718

In [250]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)
display(data)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased,bestseller_rank
0,2020-05-04,9497171930054871954,863456003,0.025407,2,2020,84,1.0,
1,2020-05-04,9497171930054871954,570002090,0.013542,2,2020,84,1.0,
2,2020-05-04,9497171930054871954,863456005,0.025407,2,2020,84,1.0,
3,2020-05-04,10027439189241372429,849859002,0.010661,2,2020,84,1.0,
4,2020-05-04,10027439189241372429,780297001,0.022864,2,2020,84,1.0,
...,...,...,...,...,...,...,...,...,...
13578522,2020-06-29,17924054183488701256,866261001,0.016751,1,2020,93,0.0,
13578523,2020-06-29,17924054183488701256,880749002,0.021444,1,2020,93,0.0,
13578524,2020-06-29,17924054183488701256,733749001,0.004975,1,2020,93,0.0,
13578525,2020-06-29,17924054183488701256,874078003,0.016748,1,2020,93,0.0,


In [251]:
display(data.week.min())

84

In [252]:

data = data[data.week != data.week.min()] #why?
#display(data)
data.bestseller_rank.fillna(999, inplace=True)  #why?
display(data)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased,bestseller_rank
211236,2020-05-11,7433207736229661742,832331002,0.014390,1,2020,85,1.0,999.0
211237,2020-05-11,562033157658334051,808698004,0.043492,2,2020,85,1.0,999.0
211238,2020-05-11,562033157658334051,854305002,0.086983,2,2020,85,1.0,999.0
211239,2020-05-11,2160337529732391090,888727008,0.067780,2,2020,85,1.0,999.0
211240,2020-05-11,2160337529732391090,872707002,0.067780,2,2020,85,1.0,999.0
...,...,...,...,...,...,...,...,...,...
13578522,2020-06-29,17924054183488701256,866261001,0.016751,1,2020,93,0.0,999.0
13578523,2020-06-29,17924054183488701256,880749002,0.021444,1,2020,93,0.0,999.0
13578524,2020-06-29,17924054183488701256,733749001,0.004975,1,2020,93,0.0,999.0
13578525,2020-06-29,17924054183488701256,874078003,0.016748,1,2020,93,0.0,999.0


In [103]:
data = pd.merge(data, articles, on='article_id', how='left')
display(data)
data = pd.merge(data, customers, on='customer_id', how='left')
display(data)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased,bestseller_rank,product_code,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,2020-05-11,7433207736229661742,832331002,0.014390,1,2020,85,1.0,999.0,832331,...,Swimwear,B,Lingeries/Tights,1,Ladieswear,60,"Womens Swimwear, beachwear",1018,Swimwear,"Fully lined bikini bottoms with a high waist, ..."
1,2020-05-11,562033157658334051,808698004,0.043492,2,2020,85,1.0,999.0,808698,...,Heels,C,Ladies Accessories,1,Ladieswear,64,Womens Shoes,1020,Shoes,Wedge-heeled sandals with a braided jute trim ...
2,2020-05-11,562033157658334051,854305002,0.086983,2,2020,85,1.0,999.0,854305,...,Premium Quality,C,Ladies Accessories,1,Ladieswear,64,Womens Shoes,1020,Shoes,Wedge-heeled sandals in suede with long straps...
3,2020-05-11,2160337529732391090,888727008,0.067780,2,2020,85,1.0,999.0,888727,...,Jersey,A,Ladieswear,1,Ladieswear,18,Womens Trend,1005,Jersey Fancy,"Short, fitted dress in jersey crêpe with a sew..."
4,2020-05-11,2160337529732391090,872707002,0.067780,2,2020,85,1.0,999.0,872707,...,Jersey,A,Ladieswear,1,Ladieswear,18,Womens Trend,1005,Jersey Fancy,"Short, fitted dress in airy, patterned jersey ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17846241,2020-06-29,17924054183488701256,866261001,0.016751,1,2020,93,0.0,8.0,866261,...,Swimwear,B,Lingeries/Tights,1,Ladieswear,60,"Womens Swimwear, beachwear",1018,Swimwear,"Fully lined bikini bottoms with a low, V-shape..."
17846242,2020-06-29,17924054183488701256,880749002,0.021444,1,2020,93,0.0,9.0,880749,...,Dresses,D,Divided,2,Divided,53,Divided Collection,1013,Dresses Ladies,"Short, wide dress in a viscose crêpe weave. Ro..."
17846243,2020-06-29,17924054183488701256,733749001,0.004975,1,2020,93,0.0,10.0,733749,...,EQ Divided Basics,D,Divided,2,Divided,80,Divided Complements Other,1002,Jersey Basic,Cropped top in soft cotton jersey with conceal...
17846244,2020-06-29,17924054183488701256,874078003,0.016748,1,2020,93,0.0,11.0,874078,...,Swimwear,B,Lingeries/Tights,1,Ladieswear,60,"Womens Swimwear, beachwear",1018,Swimwear,Fully lined bikini bottoms with a mid waist an...


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased,bestseller_rank,product_code,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,2020-05-11,7433207736229661742,832331002,0.014390,1,2020,85,1.0,999.0,832331,...,"Womens Swimwear, beachwear",1018,Swimwear,"Fully lined bikini bottoms with a high waist, ...",,,ACTIVE,NONE,17.0,bfb09a56a46038e18a2ecdad3823e8e465e2e03f8af133...
1,2020-05-11,562033157658334051,808698004,0.043492,2,2020,85,1.0,999.0,808698,...,Womens Shoes,1020,Shoes,Wedge-heeled sandals with a braided jute trim ...,1.0,1.0,ACTIVE,Regularly,54.0,a541d4639a3d07246cb7ce7a609ee52c8faacb86d26311...
2,2020-05-11,562033157658334051,854305002,0.086983,2,2020,85,1.0,999.0,854305,...,Womens Shoes,1020,Shoes,Wedge-heeled sandals in suede with long straps...,1.0,1.0,ACTIVE,Regularly,54.0,a541d4639a3d07246cb7ce7a609ee52c8faacb86d26311...
3,2020-05-11,2160337529732391090,888727008,0.067780,2,2020,85,1.0,999.0,888727,...,Womens Trend,1005,Jersey Fancy,"Short, fitted dress in jersey crêpe with a sew...",1.0,1.0,ACTIVE,Regularly,22.0,db49a50939e79daa44a585660f5881c4144501f0c1ad8c...
4,2020-05-11,2160337529732391090,872707002,0.067780,2,2020,85,1.0,999.0,872707,...,Womens Trend,1005,Jersey Fancy,"Short, fitted dress in airy, patterned jersey ...",1.0,1.0,ACTIVE,Regularly,22.0,db49a50939e79daa44a585660f5881c4144501f0c1ad8c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17846241,2020-06-29,17924054183488701256,866261001,0.016751,1,2020,93,0.0,8.0,866261,...,"Womens Swimwear, beachwear",1018,Swimwear,"Fully lined bikini bottoms with a low, V-shape...",,,ACTIVE,NONE,21.0,755c73dcc3f3397be9e4dc663c92cfd238f48a13d8b5d1...
17846242,2020-06-29,17924054183488701256,880749002,0.021444,1,2020,93,0.0,9.0,880749,...,Divided Collection,1013,Dresses Ladies,"Short, wide dress in a viscose crêpe weave. Ro...",,,ACTIVE,NONE,21.0,755c73dcc3f3397be9e4dc663c92cfd238f48a13d8b5d1...
17846243,2020-06-29,17924054183488701256,733749001,0.004975,1,2020,93,0.0,10.0,733749,...,Divided Complements Other,1002,Jersey Basic,Cropped top in soft cotton jersey with conceal...,,,ACTIVE,NONE,21.0,755c73dcc3f3397be9e4dc663c92cfd238f48a13d8b5d1...
17846244,2020-06-29,17924054183488701256,874078003,0.016748,1,2020,93,0.0,11.0,874078,...,"Womens Swimwear, beachwear",1018,Swimwear,Fully lined bikini bottoms with a mid waist an...,,,ACTIVE,NONE,21.0,755c73dcc3f3397be9e4dc663c92cfd238f48a13d8b5d1...


In [104]:
data.sort_values(['week', 'customer_id'], inplace=True)
display(data)
data.reset_index(drop=True, inplace=True)
display(data)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased,bestseller_rank,product_code,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
103699,2020-05-13,28847241659200,880768001,0.027102,2,2020,85,1.0,999.0,880768,...,Divided Projects,1010,Blouses,Cropped top in softly draping satin with a dra...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
5449079,2020-05-13,28847241659200,841260011,0.016012,2,2020,85,0.0,1.0,841260,...,Womens Everyday Collection,1005,Jersey Fancy,T-shirt in soft jersey with a round neckline w...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
5449080,2020-05-13,28847241659200,741356002,0.032260,2,2020,85,0.0,2.0,741356,...,Womens Everyday Collection,1025,Shorts,"Short, 5-pocket shorts in washed denim with a ...",1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
5449081,2020-05-13,28847241659200,720125001,0.032061,2,2020,85,0.0,3.0,720125,...,Ladies H&M Sport,1005,Jersey Fancy,Sports tights in fast-drying functional fabric...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
5449082,2020-05-13,28847241659200,706016001,0.032410,2,2020,85,0.0,4.0,706016,...,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15368394,2020-05-29,18446571879212697038,866261001,0.016751,1,2020,93,0.0,8.0,866261,...,"Womens Swimwear, beachwear",1018,Swimwear,"Fully lined bikini bottoms with a low, V-shape...",,,ACTIVE,NONE,46.0,d9a4c4ee64dab2396525cc8639d8534db2b2503c9d9645...
15368395,2020-05-29,18446571879212697038,880749002,0.021444,1,2020,93,0.0,9.0,880749,...,Divided Collection,1013,Dresses Ladies,"Short, wide dress in a viscose crêpe weave. Ro...",,,ACTIVE,NONE,46.0,d9a4c4ee64dab2396525cc8639d8534db2b2503c9d9645...
15368396,2020-05-29,18446571879212697038,733749001,0.004975,1,2020,93,0.0,10.0,733749,...,Divided Complements Other,1002,Jersey Basic,Cropped top in soft cotton jersey with conceal...,,,ACTIVE,NONE,46.0,d9a4c4ee64dab2396525cc8639d8534db2b2503c9d9645...
15368397,2020-05-29,18446571879212697038,874078003,0.016748,1,2020,93,0.0,11.0,874078,...,"Womens Swimwear, beachwear",1018,Swimwear,Fully lined bikini bottoms with a mid waist an...,,,ACTIVE,NONE,46.0,d9a4c4ee64dab2396525cc8639d8534db2b2503c9d9645...


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased,bestseller_rank,product_code,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,2020-05-13,28847241659200,880768001,0.027102,2,2020,85,1.0,999.0,880768,...,Divided Projects,1010,Blouses,Cropped top in softly draping satin with a dra...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
1,2020-05-13,28847241659200,841260011,0.016012,2,2020,85,0.0,1.0,841260,...,Womens Everyday Collection,1005,Jersey Fancy,T-shirt in soft jersey with a round neckline w...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
2,2020-05-13,28847241659200,741356002,0.032260,2,2020,85,0.0,2.0,741356,...,Womens Everyday Collection,1025,Shorts,"Short, 5-pocket shorts in washed denim with a ...",1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
3,2020-05-13,28847241659200,720125001,0.032061,2,2020,85,0.0,3.0,720125,...,Ladies H&M Sport,1005,Jersey Fancy,Sports tights in fast-drying functional fabric...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
4,2020-05-13,28847241659200,706016001,0.032410,2,2020,85,0.0,4.0,706016,...,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17846241,2020-05-29,18446571879212697038,866261001,0.016751,1,2020,93,0.0,8.0,866261,...,"Womens Swimwear, beachwear",1018,Swimwear,"Fully lined bikini bottoms with a low, V-shape...",,,ACTIVE,NONE,46.0,d9a4c4ee64dab2396525cc8639d8534db2b2503c9d9645...
17846242,2020-05-29,18446571879212697038,880749002,0.021444,1,2020,93,0.0,9.0,880749,...,Divided Collection,1013,Dresses Ladies,"Short, wide dress in a viscose crêpe weave. Ro...",,,ACTIVE,NONE,46.0,d9a4c4ee64dab2396525cc8639d8534db2b2503c9d9645...
17846243,2020-05-29,18446571879212697038,733749001,0.004975,1,2020,93,0.0,10.0,733749,...,Divided Complements Other,1002,Jersey Basic,Cropped top in soft cotton jersey with conceal...,,,ACTIVE,NONE,46.0,d9a4c4ee64dab2396525cc8639d8534db2b2503c9d9645...
17846244,2020-05-29,18446571879212697038,874078003,0.016748,1,2020,93,0.0,11.0,874078,...,"Womens Swimwear, beachwear",1018,Swimwear,Fully lined bikini bottoms with a mid waist an...,,,ACTIVE,NONE,46.0,d9a4c4ee64dab2396525cc8639d8534db2b2503c9d9645...


In [171]:
train = data[data.week != test_week] 
display("train data=",train)
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()
#In test data all purchased=0 and rank given to each item, either 999 or 1 to 12. For different customerids when the artcileis same, rank is constant
display("test data=",test) # what all customers buy in the test week=93

'train data='

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased,bestseller_rank,product_code,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,2020-05-13,28847241659200,880768001,0.027102,2,2020,85,1.0,999.0,880768,...,Divided Projects,1010,Blouses,Cropped top in softly draping satin with a dra...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
1,2020-05-13,28847241659200,841260011,0.016012,2,2020,85,0.0,1.0,841260,...,Womens Everyday Collection,1005,Jersey Fancy,T-shirt in soft jersey with a round neckline w...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
2,2020-05-13,28847241659200,741356002,0.032260,2,2020,85,0.0,2.0,741356,...,Womens Everyday Collection,1025,Shorts,"Short, 5-pocket shorts in washed denim with a ...",1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
3,2020-05-13,28847241659200,720125001,0.032061,2,2020,85,0.0,3.0,720125,...,Ladies H&M Sport,1005,Jersey Fancy,Sports tights in fast-drying functional fabric...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
4,2020-05-13,28847241659200,706016001,0.032410,2,2020,85,0.0,4.0,706016,...,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11181640,2020-06-29,18443851993991110350,881031001,0.023862,2,2020,92,0.0,8.0,881031,...,Ladies H&M Sport,1005,Jersey Fancy,Cycling shorts in fast-drying functional fabri...,1.0,1.0,ACTIVE,Regularly,31.0,193d006133fd3a00999562d5e4c7ca2ecdb23b7297e15b...
11181641,2020-06-29,18443851993991110350,823118001,0.016473,2,2020,92,0.0,9.0,823118,...,"Womens Swimwear, beachwear",1018,Swimwear,"Fully lined, non-wired bikini top with adjusta...",1.0,1.0,ACTIVE,Regularly,31.0,193d006133fd3a00999562d5e4c7ca2ecdb23b7297e15b...
11181642,2020-06-29,18443851993991110350,824352001,0.015230,2,2020,92,0.0,10.0,824352,...,Divided Collection,1025,Shorts,Shorts in a crinkled viscose weave. High waist...,1.0,1.0,ACTIVE,Regularly,31.0,193d006133fd3a00999562d5e4c7ca2ecdb23b7297e15b...
11181643,2020-06-29,18443851993991110350,599580038,0.016469,2,2020,92,0.0,11.0,599580,...,"Womens Swimwear, beachwear",1018,Swimwear,Fully lined bikini bottoms with a mid waist an...,1.0,1.0,ACTIVE,Regularly,31.0,193d006133fd3a00999562d5e4c7ca2ecdb23b7297e15b...


'test data='

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased,bestseller_rank,product_code,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
11181645,2020-06-23,28847241659200,859076001,0.023712,2,2020,93,0.0,999.0,859076,...,Womens Tailoring,1009,Trousers,Ankle-length trousers woven in a viscose blend...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
11181646,2020-06-23,28847241659200,862272001,0.050831,2,2020,93,0.0,999.0,862272,...,Womens Tailoring,1008,Dressed,Fitted jacket in woven fabric with notch lapel...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
11181647,2020-06-26,28847241659200,838825003,0.030492,2,2020,93,0.0,999.0,838825,...,Ladies H&M Sport,1005,Jersey Fancy,"Cropped, sleeveless sports top in fast-drying ...",1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
11181648,2020-05-13,28847241659200,881031001,0.024700,2,2020,93,0.0,1.0,881031,...,Ladies H&M Sport,1005,Jersey Fancy,Cycling shorts in fast-drying functional fabri...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
11181649,2020-05-13,28847241659200,866383006,0.024703,2,2020,93,0.0,2.0,866383,...,"Womens Swimwear, beachwear",1018,Swimwear,Lined bikini top with padded cups for a larger...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17846241,2020-05-29,18446571879212697038,866261001,0.016751,1,2020,93,0.0,8.0,866261,...,"Womens Swimwear, beachwear",1018,Swimwear,"Fully lined bikini bottoms with a low, V-shape...",,,ACTIVE,NONE,46.0,d9a4c4ee64dab2396525cc8639d8534db2b2503c9d9645...
17846242,2020-05-29,18446571879212697038,880749002,0.021444,1,2020,93,0.0,9.0,880749,...,Divided Collection,1013,Dresses Ladies,"Short, wide dress in a viscose crêpe weave. Ro...",,,ACTIVE,NONE,46.0,d9a4c4ee64dab2396525cc8639d8534db2b2503c9d9645...
17846243,2020-05-29,18446571879212697038,733749001,0.004975,1,2020,93,0.0,10.0,733749,...,Divided Complements Other,1002,Jersey Basic,Cropped top in soft cotton jersey with conceal...,,,ACTIVE,NONE,46.0,d9a4c4ee64dab2396525cc8639d8534db2b2503c9d9645...
17846244,2020-05-29,18446571879212697038,874078003,0.016748,1,2020,93,0.0,11.0,874078,...,"Womens Swimwear, beachwear",1018,Swimwear,Fully lined bikini bottoms with a mid waist an...,,,ACTIVE,NONE,46.0,d9a4c4ee64dab2396525cc8639d8534db2b2503c9d9645...


In [106]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values
display(train_baskets)

array([13, 17, 17, ..., 15, 21, 17])

In [109]:
columns_to_use =['customer_id','article_id','bestseller_rank','week']
display(columns_to_use)

['customer_id', 'article_id', 'bestseller_rank', 'week']

In [110]:
%%time
#training data is the data with many columns, y value is purchased/not 
train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]  # use only test data for week 93 

CPU times: user 159 ms, sys: 220 ms, total: 379 ms
Wall time: 596 ms


In [253]:
display(train_X.head(5))

Unnamed: 0,customer_id,article_id,bestseller_rank,week
0,28847241659200,880768001,999.0,85
1,28847241659200,841260011,1.0,85
2,28847241659200,741356002,2.0,85
3,28847241659200,720125001,3.0,85
4,28847241659200,706016001,4.0,85


In [254]:
train_y.head(5) #purchased or not 

0    1.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: purchased, dtype: float64

In [None]:
#LGBM model training

In [118]:
from lightgbm.sklearn import LGBMRanker

In [119]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)
print(ranker)

LGBMRanker(boosting_type='dart', importance_type='gain', metric='ndcg',
           n_estimators=1, objective='lambdarank', verbose=10)


In [120]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000000
[LightGBM] [Debug] init for col-wise cost 0.000039 seconds, init for row-wise cost 0.229054 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 533
[LightGBM] [Info] Number of data points in the train set: 11181645, number of used features: 4
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
CPU times: user 6.81 s, sys: 947 ms, total: 7.75 s
Wall time: 2.33 s


In [121]:
#Get feature importances
for i in ranker.feature_importances_.argsort()[::-1]:
   
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9982842048975095
week 0.001420193679458586
article_id 0.0002956014230319974
customer_id 0.0


In [122]:
%time

test['preds'] = ranker.predict(test_X)
display(test.head(15))

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()  #top 12 pred per customer
display(c_id2predicted_article_ids.head(5))
bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

print(bestsellers_last_week)

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 3.1 µs


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week,purchased,bestseller_rank,product_code,...,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,preds
11181645,2020-06-23,28847241659200,859076001,0.023712,2,2020,93,0.0,999.0,859076,...,1009,Trousers,Ankle-length trousers woven in a viscose blend...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...,0.148984
11181646,2020-06-23,28847241659200,862272001,0.050831,2,2020,93,0.0,999.0,862272,...,1008,Dressed,Fitted jacket in woven fabric with notch lapel...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...,0.148984
11181647,2020-06-26,28847241659200,838825003,0.030492,2,2020,93,0.0,999.0,838825,...,1005,Jersey Fancy,"Cropped, sleeveless sports top in fast-drying ...",1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...,0.148984
11181648,2020-05-13,28847241659200,881031001,0.0247,2,2020,93,0.0,1.0,881031,...,1005,Jersey Fancy,Cycling shorts in fast-drying functional fabri...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...,-0.19186
11181649,2020-05-13,28847241659200,866383006,0.024703,2,2020,93,0.0,2.0,866383,...,1018,Swimwear,Lined bikini top with padded cups for a larger...,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...,-0.19186
11181650,2020-05-13,28847241659200,871581002,0.016446,2,2020,93,0.0,3.0,871581,...,1005,Jersey Fancy,"Jersey playsuit with a smocked bodice, small f...",1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...,-0.19186
11181651,2020-05-13,28847241659200,372860069,0.005229,2,2020,93,0.0,4.0,372860,...,1021,Socks and Tights,Fine-knit trainer socks in a soft cotton blend.,1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...,-0.194947
11181652,2020-05-13,28847241659200,817353008,0.023234,2,2020,93,0.0,5.0,817353,...,1013,Dresses Ladies,"Short dress in woven fabric with a V-neck, but...",1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...,-0.194947
11181653,2020-05-13,28847241659200,871581001,0.016337,2,2020,93,0.0,6.0,871581,...,1005,Jersey Fancy,"Jersey playsuit with a smocked bodice, small f...",1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...,-0.19186
11181654,2020-05-13,28847241659200,448509014,0.040584,2,2020,93,0.0,7.0,448509,...,1009,Trousers,"5-pocket, ankle-length jeans in washed, sturdy...",1.0,1.0,ACTIVE,Regularly,21.0,f3dd793c657b414a4f7b0738f78b9223f7a6a8b844a0ad...,-0.194947


[881031001, 866383006, 871581002, 372860069, 817353008, 871581001, 448509014, 866261001, 880749002, 733749001, 874078003, 912418001]


In [123]:
#sub = pd.read_csv('/Users/seema/Downloads/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [124]:
'''
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])
'''

CPU times: user 3.53 s, sys: 220 ms, total: 3.75 s
Wall time: 3.91 s


In [125]:
# preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
# sub.prediction = preds

In [126]:
# sub_name = 'basic_model_submission'
# sub.to_csv(f'{sub_name}.csv.gz', index=False)

In [127]:
#sub['prediction'].shape

(1371980,)

In [255]:
#sub.head(20)

In [138]:
# Load Data
DIR = "/Users/seema/Downloads/h-and-m-personalized-fashion-recommendations"
transactions = pd.read_csv(os.path.join(DIR, "transactions_train.csv"))

display(transactions.head(5))


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [139]:
#Add new columns to transaction table. Transaction year, week,. For each customer post 2020-07-15 which week, year did they purchase artcile
# Create new columns
transactions["t_dat"] = pd.to_datetime(transactions["t_dat"])
transactions["year"] = transactions["t_dat"].dt.year

shape_2018 = transactions.query("year == 2018").shape[0]
shape_2019 = transactions.query("year == 2019").shape[0] + shape_2018
shape_2020 = transactions.query("year == 2020").shape[0] + shape_2019

transactions["week"] = transactions["t_dat"].dt.week
transactions.loc[shape_2018:shape_2019-1, "week"] = transactions.loc[shape_2018:shape_2019-1, "week"] + transactions.query("year == 2018")["week"].max() - 1
transactions.loc[shape_2019:shape_2020-1, "week"] = transactions.loc[shape_2019:shape_2020-1, "week"] + transactions.query("year == 2019")["week"].max() - 1
transactions["week"] = transactions["week"] - 37

transactions = transactions.query("t_dat >= '2020-05-04' & t_dat <= '2020-07-12'")
display(transactions)
display(transactions.groupby(["week"])["t_dat"].agg(["min", "max"]))

  transactions["week"] = transactions["t_dat"].dt.week


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,week
25360130,2020-05-04,0001177027259b455f979d85a278e4b280205d4de5cce4...,863456003,0.025407,2,2020,84
25360131,2020-05-04,0001177027259b455f979d85a278e4b280205d4de5cce4...,863456003,0.025407,2,2020,84
25360132,2020-05-04,0001177027259b455f979d85a278e4b280205d4de5cce4...,570002090,0.013542,2,2020,84
25360133,2020-05-04,0001177027259b455f979d85a278e4b280205d4de5cce4...,570002090,0.013542,2,2020,84
25360134,2020-05-04,0001177027259b455f979d85a278e4b280205d4de5cce4...,863456005,0.025407,2,2020,84
...,...,...,...,...,...,...,...
28945638,2020-07-12,fffbdd2f8e59d45c0fb50a14b0ea555f7daa5307b3f922...,810557006,0.025407,2,2020,93
28945639,2020-07-12,fffbdd2f8e59d45c0fb50a14b0ea555f7daa5307b3f922...,871243001,0.030492,2,2020,93
28945640,2020-07-12,fffbdd2f8e59d45c0fb50a14b0ea555f7daa5307b3f922...,798407010,0.010153,2,2020,93
28945641,2020-07-12,fffbdd2f8e59d45c0fb50a14b0ea555f7daa5307b3f922...,871243003,0.030492,2,2020,93


Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
84,2020-05-04,2020-05-10
85,2020-05-11,2020-05-17
86,2020-05-18,2020-05-24
87,2020-05-25,2020-05-31
88,2020-06-01,2020-06-07
89,2020-06-08,2020-06-14
90,2020-06-15,2020-06-21
91,2020-06-22,2020-06-28
92,2020-06-29,2020-07-05
93,2020-07-06,2020-07-12


In [190]:
val_week=93
val = transactions[transactions['week']==val_week].reset_index()
#transactions["customer_id"] = customer_hex_id_to_int(transactions["customer_id"])
#display(val)
val_cus = val.groupby('customer_id', sort=False)['article_id'].apply(list).reset_index()
print(val_cus.dtypes)
print(transactions.dtypes)
display(val_cus)
#val_cus['customer_id'] = customer_hex_id_to_int(val_cus['customer_id'])
#display(val_cus.query('customer_id= 00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657''))

customer_id    uint64
article_id     object
dtype: object
t_dat               datetime64[ns]
customer_id                 uint64
article_id                   int64
price                      float64
sales_channel_id             int64
year                         int64
week                         int64
dtype: object


Unnamed: 0,customer_id,article_id
0,5215038009126703949,"[875469002, 840351001, 524529001]"
1,4969337857234925111,"[509091026, 509091026]"
2,2160337529732391090,[840607001]
3,14474665695916352015,"[826508002, 841793003, 826508005]"
4,14060041565966549171,"[849738003, 810557003, 822311003, 835704004]"
...,...,...
77154,10776785599529806856,[818886004]
77155,9046159785883995059,"[870940001, 870940001, 852190001, 767473006]"
77156,520673247692616493,[833499002]
77157,15545599357008835407,"[848883001, 848883001, 848883001, 869131001]"


In [191]:
#predictions to a list

sub_pred = sub.groupby('customer_id', sort=False)['prediction'].apply(list).reset_index()

sub_pred["customer_id"] = customer_hex_id_to_int(sub_pred["customer_id"])
display(sub_pred)
print(sub_pred.dtypes)

Unnamed: 0,customer_id,prediction
0,6883939031699146327,[0881031001 0866383006 0871581002 0372860069 0...
1,11246327431398957306,[0881031001 0866383006 0871581002 0372860069 0...
2,18439897732908966680,[0881031001 0866383006 0871581002 0372860069 0...
3,18352672461570950206,[0881031001 0866383006 0871581002 0372860069 0...
4,18162778555210377306,[0589440005 0827971001 0818320001 0881031001 0...
...,...,...
1371975,7551062398649767985,[0739590033 0832321002 0832320002 0860949003 0...
1371976,9305341941720086711,[0834217009 0841699003 0511105013 0835802004 0...
1371977,10160427316885688932,[0881031001 0866383006 0871581002 0372860069 0...
1371978,2551401172826382186,[0882810001 0881031001 0866383006 0871581002 0...


customer_id    uint64
prediction     object
dtype: object


In [192]:
#making dictionary of the predictions
predic = dict(zip(sub_pred['customer_id'],sub_pred['prediction']))
display(predic)


{6883939031699146327: ['0881031001 0866383006 0871581002 0372860069 0817353008 0871581001 0448509014 0866261001 0880749002 0733749001 0874078003 0912418001'],
 11246327431398957306: ['0881031001 0866383006 0871581002 0372860069 0817353008 0871581001 0448509014 0866261001 0880749002 0733749001 0874078003 0912418001'],
 18439897732908966680: ['0881031001 0866383006 0871581002 0372860069 0817353008 0871581001 0448509014 0866261001 0880749002 0733749001 0874078003 0912418001'],
 18352672461570950206: ['0881031001 0866383006 0871581002 0372860069 0817353008 0871581001 0448509014 0866261001 0880749002 0733749001 0874078003 0912418001'],
 18162778555210377306: ['0589440005 0827971001 0818320001 0881031001 0866383006 0871581002 0871581001 0866261001 0880749002 0874078003 0912418001 0372860069'],
 15969713857127118246: ['0881031001 0866383006 0871581002 0372860069 0817353008 0871581001 0448509014 0866261001 0880749002 0733749001 0874078003 0912418001'],
 15044033931471387178: ['0881031001 08663

In [193]:
def calc_map(y_true, prediction, k=None):
    if k is None: k=len(prediction)
    relevant = np.isin(prediction[:k], y_true) # relevant[i]==1 if y_score[i] is correct
    patk = np.cumsum(relevant)/np.arange(1, len(prediction[:k])+1) # patk[0]==P@1, patk[1]==P@2, ...
    return(np.sum(patk*relevant)/min(len(y_true), k)) # as defined by the competition

target = dict(zip(val_cus['customer_id'],val_cus['article_id']))


In [214]:
#MAP testing
map=0

for u in c_id2predicted_article_ids.keys():
  #print(u)
    if u in target.keys():
        map+=calc_map(np.array(target[u]), np.array(c_id2predicted_article_ids[u]), k=12)

map/len(target.keys())

0.0073786593522280714

In [None]:
#MAP obtained for ranking is 0.0073786593522280714