In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np

# Loading Datasets

In [2]:
# Article metadata. article_id is read as a string here and cast to int in a later cell.
df_articles = pd.read_csv('articles.csv', dtype={"article_id": "str"})

In [3]:
# Training transactions. article_id is read as a string here and cast to int in a later cell.
df_transactions = pd.read_csv('transactions_train.csv', dtype={"article_id": "str"})

In [5]:
# Customer table; its columns are cleaned and label-encoded in the cells below.
df_customers = pd.read_csv('customers.csv')

# Preprocessing and Creating New Features

In [6]:
# Parse t_dat into datetimes so the date arithmetic used for the week index works.
df_transactions['t_dat'] = pd.to_datetime(df_transactions['t_dat'])

In [7]:
# Week index counted backwards from the most recent transaction date:
# the final 7-day window of the data is week 104, the one before it 103, ...
days_before_last_date = (df_transactions['t_dat'].max() - df_transactions['t_dat']).dt.days
df_transactions['week'] = 104 - days_before_last_date // 7

In [8]:
# Cast article_id from str to int (df_transactions gets the same cast) so both
# frames share one key dtype when they are merged on article_id later.
df_articles['article_id'] = df_articles['article_id'].astype(int)

In [9]:
# Cast article_id from str to int (df_articles gets the same cast) so both
# frames share one key dtype when they are merged on article_id later.
df_transactions['article_id'] = df_transactions['article_id'].astype(int)

In [10]:
# Median customer age, shown for reference only: missing ages are filled
# with -1 in the next cell, not with this median.
df_customers['age'].median()

32.0

In [11]:
# Replace missing values with the sentinel -1.
# Each column is assigned back explicitly instead of calling
# `df_customers[col].fillna(..., inplace=True)`: that chained in-place call
# operates on a temporary Series under pandas Copy-on-Write and would leave the
# NaNs in place, which in turn makes the integer cast below fail.
df_customers['age'] = df_customers['age'].fillna(-1)
for col in ['FN', 'Active']:
    df_customers[col] = df_customers[col].fillna(-1).astype('int')

In [12]:
# Missing club_member_status / fashion_news_frequency become the explicit
# category 'Unknown' so the label encoders below see no NaN.
for col in ('club_member_status', 'fashion_news_frequency'):
    df_customers[col] = df_customers[col].fillna('Unknown')

In [13]:
# Integer-encode club_member_status and save the fitted class labels so the
# integer codes can be mapped back to the original strings later.
club_number_status_encoder = preprocessing.LabelEncoder().fit(df_customers['club_member_status'])
df_customers['club_member_status'] = club_number_status_encoder.transform(
    df_customers['club_member_status']
)
np.save('club_member_status_classes.npy', club_number_status_encoder.classes_)

In [14]:
# Integer-encode fashion_news_frequency and save the fitted class labels so the
# integer codes can be mapped back to the original strings later.
fashion_news_frequency_encoder = preprocessing.LabelEncoder().fit(df_customers['fashion_news_frequency'])
df_customers['fashion_news_frequency'] = fashion_news_frequency_encoder.transform(
    df_customers['fashion_news_frequency']
)
np.save('fashion_news_frequency_classes.npy', fashion_news_frequency_encoder.classes_)

In [15]:
# Integer-encode postal_code and save the fitted class labels so the
# integer codes can be mapped back to the original values later.
postal_code_encoder = preprocessing.LabelEncoder().fit(df_customers['postal_code'])
df_customers['postal_code'] = postal_code_encoder.transform(
    df_customers['postal_code']
)
np.save('postal_code_classes.npy', postal_code_encoder.classes_)

In [16]:
# Sanity check after cleaning/encoding: column dtypes and non-null counts.
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1371980 non-null  object 
 1   FN                      1371980 non-null  int64  
 2   Active                  1371980 non-null  int64  
 3   club_member_status      1371980 non-null  int64  
 4   fashion_news_frequency  1371980 non-null  int64  
 5   age                     1371980 non-null  float64
 6   postal_code             1371980 non-null  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 73.3+ MB


In [17]:
# Order transactions chronologically (ties broken by customer). Later cells
# read each customer's purchase weeks in order of appearance, so this ordering
# is what makes those weeks come out oldest-first.
df_transactions = df_transactions.sort_values(['t_dat', 'customer_id'])

# Feature Extraction

In [18]:
# A customer's purchases in one week are used to predict what they buy in their
# next purchase week, so only the most recent 10 weeks of history are kept.
last_week = df_transactions.week.max()
# The week to predict is the one right after the data ends
# (105, because the last week of data is numbered 104).
test_week = last_week + 1
# Keep weeks 95-104. `.copy()` makes the filtered frame independent of the full
# transaction table, so adding columns to it later is a plain assignment rather
# than a write into a slice (SettingWithCopyWarning).
df_transactions = df_transactions[df_transactions.week > last_week - 10].copy()

### Generating candidates

In [19]:
# For each customer, the distinct weeks in which they made a purchase.
# `unique()` keeps order of first appearance, which is chronological here
# because df_transactions was sorted by t_dat earlier.
unique_customer_week = df_transactions.groupby('customer_id')['week'].unique()

In [20]:
# c2weeks2shifted_weeks[customer][purchase_week] -> that customer's NEXT purchase
# week. The customer's final purchase week has no successor in the data, so it
# maps to test_week (the week being predicted).
c2weeks2shifted_weeks = {}

for c_id, weeks in unique_customer_week.items():
    # Pair every purchase week with the one that follows it.
    next_week_of = dict(zip(weeks[:-1], weeks[1:]))
    next_week_of[weeks[-1]] = test_week
    c2weeks2shifted_weeks[c_id] = next_week_of

In [21]:
# Start the "last purchase" candidates from a copy of the transactions;
# its week column is overwritten with the shifted weeks in the next cell.
candidates_last_purchase = df_transactions.copy()

In [22]:
# Re-label every transaction with the customer's next purchase week, so that
# what was bought in one week becomes a candidate for the following one.
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(df_transactions['customer_id'], df_transactions['week'])
]
candidates_last_purchase['week'] = weeks

### Bestseller Candidates

In [23]:
# Mean price of each article within each week (Series indexed by week, article_id).
mean_price = df_transactions.groupby(['week', 'article_id'])['price'].mean()

In [24]:
# Per-week bestseller ranks.
# 1. Count how often each article was bought in each week.
weekly_article_counts = df_transactions.groupby('week')['article_id'].value_counts()
# 2. Rank the counts within each week; dense ranking gives tied counts the same rank.
weekly_article_ranks = weekly_article_counts.groupby('week').rank(method='dense', ascending=False)
# 3. Keep the first 12 rows of every week and store the rank as a small integer.
sales = weekly_article_ranks.groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [25]:
# Attach the weekly mean price to each bestseller, then shift the week forward
# by one: a week's bestsellers serve as candidates for the FOLLOWING week.
bestsellers_previous_week = (
    pd.merge(sales, mean_price, on=['week', 'article_id'])
    .reset_index()
    .assign(week=lambda frame: frame['week'] + 1)
)

In [26]:
# One row per (week, customer): the first transaction of each pair, without the
# article-specific columns.
first_rows = df_transactions.groupby(['week', 'customer_id']).head(1)
unique_transactions = first_rows.drop(columns=['article_id', 'price']).copy()

# Pair every (week, customer) with the bestsellers assigned to that week.
candidates_bestsellers = unique_transactions.merge(bestsellers_previous_week, on='week')

# One row per customer, re-labelled with test_week: the skeleton used to build
# the bestseller candidates for the week being predicted.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
    .assign(week=test_week)
)

In [27]:
# Bestseller candidates for the test week, appended to the ones built for the
# training weeks. bestseller_rank is dropped here; it is merged back onto the
# combined data later.
candidates_bestsellers_test_week = test_set_transactions.merge(bestsellers_previous_week, on='week')
candidates_bestsellers = pd.concat(
    [candidates_bestsellers, candidates_bestsellers_test_week]
).drop(columns='bestseller_rank')

### Combining transactions and candidates / negative examples

In [28]:
# Label every observed transaction row as a positive example (purchased = 1).
df_transactions['purchased'] = 1

In [29]:
# Row/column count of the positive examples.
df_transactions.shape

(2762872, 7)

In [30]:
# Stack the real purchases with both candidate sets.
data = pd.concat([df_transactions, candidates_last_purchase, candidates_bestsellers])
# Only the rows coming from df_transactions carry purchased = 1; candidate rows
# have no label yet (NaN) and act as negative samples, so they get 0.
# The column is assigned back explicitly: `data.purchased.fillna(0, inplace=True)`
# is a chained in-place call that does not update `data` under pandas Copy-on-Write.
data['purchased'] = data['purchased'].fillna(0)

In [31]:
# Keep a single row per (customer_id, article_id, week). The first occurrence
# wins, and real purchases were concatenated ahead of the candidates, so a
# candidate that was actually bought keeps its purchased = 1 label.
data = data.drop_duplicates(subset=['customer_id', 'article_id', 'week'])

### Add bestseller information

In [32]:
# Attach bestseller_rank to rows whose (week, article_id) is a bestseller for
# that week; all other rows get NaN from the left join.
bestseller_ranks = bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']]
data = data.merge(bestseller_ranks, how='left', on=['week', 'article_id'])

In [33]:
# Remove infrequent customers: anyone with fewer than 10 positive (purchased == 1)
# rows in `data` is dropped entirely, candidates included.
positive_rows = data[data['purchased'] == 1]
data_customer = positive_rows.groupby('customer_id').size().reset_index(name='count')
non_freq_customer = data_customer.loc[data_customer['count'] < 10, 'customer_id'].tolist()
data = data[~data['customer_id'].isin(non_freq_customer)]

In [34]:
# The per-customer counts were only needed to build the filter above.
del data_customer

In [35]:
# Size of the combined frame after removing infrequent customers.
data.shape

(5516360, 8)

In [36]:
# Peek at the first rows of the combined frame.
data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank
0,2020-07-15,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,796239001,0.022017,2,95,1.0,
1,2020-07-15,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,880696008,0.084729,2,95,1.0,
2,2020-07-15,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,885827001,0.050831,2,95,1.0,
3,2020-07-15,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,868544001,0.025407,2,95,1.0,
4,2020-07-15,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,888519001,0.030492,2,95,1.0,


In [37]:
# Drop the EARLIEST week in `data`: bestseller weeks were shifted forward by one,
# so no bestseller information exists for the first week and bestseller_rank
# would be empty on every one of its rows.
# `.copy()` makes the filtered frame independent, so the assignment below is a
# plain column write rather than a write into a slice.
data = data[data.week != data.week.min()].copy()
# Rows that are not among that week's bestsellers get the sentinel rank 999.
# The column is assigned back explicitly because a chained
# `data.bestseller_rank.fillna(..., inplace=True)` does not update `data`
# under pandas Copy-on-Write.
data['bestseller_rank'] = data['bestseller_rank'].fillna(999)

In [38]:
# Enrich every row with the article attributes and the customer attributes.
# Left joins keep every row of `data`, even without a match.
data = (
    data
    .merge(df_articles, how='left', on='article_id')
    .merge(df_customers, how='left', on='customer_id')
)

In [39]:
# Make the rows of each (week, customer_id) pair contiguous and give the frame a
# fresh 0..n-1 index. The ranker's group sizes are computed per
# (week, customer_id) later and rely on this row order.
data = data.sort_values(['week', 'customer_id']).reset_index(drop=True)

In [40]:
# Rows labelled with test_week are the candidates to score; everything else is
# training data. Test candidates are de-duplicated per
# (customer_id, article_id, sales_channel_id).
is_test_week = data.week == test_week
train = data[~is_test_week]
test = (
    data[is_test_week]
    .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)

In [41]:
# Number of training rows per (week, customer_id), in sorted key order. Passed
# to the ranker as `group`, so each (week, customer) pair forms one ranking query.
train_baskets = (
    train
    .groupby(['week', 'customer_id'])['article_id']
    .count()
    .values
)

In [42]:
# Feature columns fed to the ranker, grouped by where they come from.
columns_to_use = [
    # article attributes
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_group_no',
    'section_no',
    'garment_group_no',
    # customer attributes
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
    # candidate-generation feature
    'bestseller_rank',
]

In [43]:
# Feature matrix and label for training; feature matrix only for the test week.
train_X, train_y = train[columns_to_use], train['purchased']
test_X = test[columns_to_use]

In [44]:
# Spot-check the labels of the first training rows.
train[['article_id','purchased']].head(10)

Unnamed: 0,article_id,purchased
0,869095001,1.0
1,866097004,1.0
2,921646001,1.0
3,796239001,0.0
4,880696008,0.0
5,885827001,0.0
6,868544001,0.0
7,888519001,0.0
8,888010002,0.0
9,851802001,0.0


In [45]:
# Peek at the training feature matrix.
train_X.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_group_no,section_no,garment_group_no,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,bestseller_rank
0,869095001,92,1010016,72,5,2,3040,1,64,1020,1,1,0,3,44.0,202732,999.0
1,866097004,91,1010017,9,4,5,3527,1,64,1020,1,1,0,3,44.0,202732,999.0
2,921646001,254,1010016,9,4,5,8316,26,5,1005,1,1,0,3,44.0,202732,999.0
3,796239001,265,1010016,32,5,3,3090,1,15,1023,1,1,0,3,44.0,202732,999.0
4,880696008,265,1010016,32,7,3,1941,1,18,1010,1,1,0,3,44.0,202732,999.0


In [46]:
# Training rows x number of features.
train_X.shape

(4136829, 17)

In [47]:
# Test-week candidate rows x number of features.
test_X.shape

(1257962, 17)

# Modeling

In [48]:
!pip install lightgbm



In [49]:
from lightgbm.sklearn import LGBMRanker

In [50]:
# LambdaRank model with DART boosting. verbose=100 enables LightGBM's
# debug-level log lines during training.
ranker_params = dict(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=200,
    importance_type='gain',
    verbose=100,
    learning_rate=0.01,
    random_state=42,
)
ranker = LGBMRanker(**ranker_params)

In [51]:
# Train the ranker. `group` holds the number of consecutive rows belonging to
# each (week, customer_id) query, in the same order as the rows of train_X.
# The former `verbose=20` argument is dropped: it is no longer accepted by
# `fit()` in LightGBM >= 4.0, and with no eval_set supplied it had no effect.
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
    eval_metric='ndcg'
)



[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.058509
[LightGBM] [Debug] init for col-wise cost 0.000065 seconds, init for row-wise cost 0.176458 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 4136829, number of used features: 17
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[L

In [52]:
# Print each feature's share of the total importance, largest first.
importances = ranker.feature_importances_
total_importance = importances.sum()
for feature_idx in importances.argsort()[::-1]:
    print(columns_to_use[feature_idx], importances[feature_idx] / total_importance)

bestseller_rank 0.9557118261864753
article_id 0.02092231936665285
product_type_no 0.008635312933885126
garment_group_no 0.005630683972760533
department_no 0.004127778209940176
colour_group_code 0.0015658662031746815
section_no 0.0010130501481700061
perceived_colour_master_id 0.0007294997503880602
perceived_colour_value_id 0.0007257200407689689
graphical_appearance_no 0.0006335585654140805
index_group_no 0.00030438462237016034
age 0.0
fashion_news_frequency 0.0
club_member_status 0.0
Active 0.0
FN 0.0
postal_code 0.0


# submission

In [None]:
# Score every test-week candidate.
test['preds'] = ranker.predict(test_X)

# Per customer, the candidate article ids ordered from highest to lowest score.
ranked_test = test.sort_values(['customer_id', 'preds'], ascending=False)
c_id2predicted_article_ids = (
    ranked_test.groupby('customer_id')['article_id'].apply(list).to_dict()
)

# Fallback list: the bestsellers attached to the latest week in
# bestsellers_previous_week, used to pad or replace a customer's predictions.
is_latest_week = bestsellers_previous_week.week == bestsellers_previous_week.week.max()
bestsellers_last_week = bestsellers_previous_week.loc[is_latest_week, 'article_id'].tolist()


In [None]:
# Submission template; its customer_id column lists every customer that needs a prediction.
sub = pd.read_csv('sample_submission.csv')

In [None]:
# For every customer in the template: the model's ranked articles followed by
# the fallback bestsellers, cut to 12 items. Customers the model did not score
# get the bestsellers only.
top12_per_customer = [
    (c_id2predicted_article_ids.get(c_id, []) + bestsellers_last_week)[:12]
    for c_id in sub.customer_id
]
# Article ids were cast to int earlier, so a '0' is prefixed when writing each
# one back out as a string; ids are space-separated.
preds = [
    ' '.join('0' + str(article) for article in articles)
    for articles in top12_per_customer
]
sub.prediction = preds

In [62]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission_ltr.csv', index=False)