In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [2]:
#Load csv files 
df_articles = pd.read_csv('articles.csv')
df_transactions = pd.read_csv('transactions_train.csv')
df_customers = pd.read_csv('customers.csv')

In [3]:
N = 12 #Number of item required to recommend

In [4]:
#Assigning age bins to each customer
age_bin = [-1, 19, 29, 39, 49, 59, 69, 119]
df_customers['age_bins'] = pd.cut(df_customers['age'], age_bin)

In [5]:
df_customers['age_bins'].drop_duplicates()

0      (39.0, 49.0]
1      (19.0, 29.0]
3      (49.0, 59.0]
5               NaN
7      (29.0, 39.0]
16    (69.0, 119.0]
31     (59.0, 69.0]
33     (-1.0, 19.0]
Name: age_bins, dtype: category
Categories (7, interval[int64, right]): [(-1, 19] < (19, 29] < (29, 39] < (39, 49] < (49, 59] < (59, 69] < (69, 119]]

In [6]:
df_transactions['t_dat'] = pd.to_datetime(df_transactions['t_dat'])

In [8]:
#looping over each age bracket
for each_age_bracket in age_bin:
    df = df_transactions.copy()
    
    if str(each_age_bracket) == 'nan':
        df_customer_age_bracket = df_customers[df_customers['age_bins'].isnull()]
    else:
        df_customer_age_bracket = df_customers[df_customers['age_bins'] == each_age_bracket]
    
    df_customer_age_bracket = df_customers.drop(['age_bins'], axis=1)
    
    df = df.merge(df_customer_age_bracket[['customer_id', 'age']], on='customer_id', how='inner')
    print(df.head())
    
    #Calculate week for each date
    df['week'] = 104 - (df['t_dat'].max() - df['t_dat']).dt.days // 7
    
    #Weekly count of each articles
    weekly_sales = df.drop('customer_id', axis=1).groupby(['week', 'article_id']).size().reset_index().rename(columns={0: 'count'})
    #calculate week and article id wise purchase count
    df = df.merge(weekly_sales, on=['week', 'article_id'], how = 'left')
    df = df.merge(weekly_sales.loc[weekly_sales['week']==104, ['count', 'article_id']], on='article_id', suffixes=("", "_target"))
    df['count_target'].fillna(0, inplace=True)   
    
    #Normalize article counts with respect to week to remove bias and seasonlity 
    df['norm_count'] = df['count_target'] / df['count']
    #Target sales for articles irrespective of the customer
    sales = df.drop('customer_id', axis=1).groupby('article_id')['norm_count'].sum()
    
    #Default general predictions for customers who  doesn't have purchased
    default_pred = sales.nlargest(N).index.tolist()
    default_pred = ['0' + str(article_id) for article_id in default_pred] #Adding 0 in start of the article id
    default_pred_str =  ' '.join(default_pred)
    del sales
    
    #purchase dict contain customer article value dict
    customer_article_value_dict = {}
    df['days_from_maximum_date'] = (df['t_dat'].max() - df['t_dat'] ).dt.days + 1    # adding 1 to avoid divided by zero case
    
    #Assigning weight according to day from maximum purchase date
    df['weight'] = 25000 / np.sqrt(df['days_from_maximum_date']) + 100000 * np.exp(-0.2*df['days_from_maximum_date']) - 1000
    df['value'] = df['norm_count'] * df['weight'] 

    #Summing the weekly value of article on customer id and article id group
    df_customer_article_val = df.groupby(['customer_id', 'article_id']).agg({'value': 'sum'}).reset_index()
    #Assigning rank to articles on customer level on the basis of value
    df_customer_article_val['rank'] = df_customer_article_val.groupby("customer_id")["value"].rank("dense", ascending=False)
    #Retaining only top 12 articles for each customer
    df_customer_article_val = df_customer_article_val[df_customer_article_val['rank'] <= 12]
    
    #Sort in terms of value
    purchase_df = df_customer_article_val.sort_values(['customer_id', 'value'], ascending = False).reset_index(drop = True)
    #adding 0 in each article_id for the submission
    purchase_df['prediction'] = '0' + purchase_df['article_id'].astype(str) + ' '
    #aggregating on the customer_id level
    purchase_df = purchase_df.groupby('customer_id').agg({'prediction': sum}).reset_index()
    purchase_df['prediction'] = purchase_df['prediction'].str.strip()
    
    submisison_df  = pd.read_csv('sample_submission.csv')
    
    #Only the users which come in the age bracket for which loop is running
    submisison_df = submisison_df.merge(df_customer_age_bracket[['customer_id', 'age']], on='customer_id', how='inner')
    submisison_df['customer_id2'] = submisison_df['customer_id']
    submisison_df = submisison_df.merge(purchase_df, left_on = 'customer_id2', right_on = 'customer_id', how = 'left',
                   suffixes = ('', '_'))
    submisison_df['prediction'] = submisison_df['prediction'].fillna(default_pred_str)
    submisison_df['prediction'] = submisison_df['prediction'] + ' ' +  default_pred_str
    submisison_df['prediction'] = submisison_df['prediction'].str.strip()
    submisison_df['prediction'] = submisison_df['prediction'].str[:131] #Removing if it is greater than 12 in the length
    submisison_df = submisison_df[['customer_id', 'prediction']]
    submisison_df.to_csv(f'submission_age_' + str(each_age_bracket) + '.csv',index=False)#Storing the file as csv
    print(f'Saved prediction for '+ 'submission_age_' + str(each_age_bracket) + '.csv')
print('Finished.\n')

       t_dat                                        customer_id  article_id  \
0 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   663713001   
1 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   541518023   
2 2018-09-24  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   663713001   
3 2019-03-01  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   578020002   
4 2020-02-03  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   723529001   

      price  sales_channel_id   age  
0  0.050831                 2  24.0  
1  0.030492                 2  24.0  
2  0.050831                 2  24.0  
3  0.013542                 2  24.0  
4  0.025407                 2  24.0  
Saved prediction forsubmission_age_-1.csv
       t_dat                                        customer_id  article_id  \
0 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   663713001   
1 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   541518023   
2 2018-09-24  0000

In [9]:
for counter, age_bracket in enumerate(age_bin):
    df_age_bracket  = pd.read_csv('submission_age_' + str(age_bracket) + '.csv')
    if counter == 0:
        df_for_submission = df_age_bracket
    else:
        df_for_submission = pd.concat([df_for_submission, df_age_bracket], axis=0)


df_for_submission.to_csv('submission_age.csv', index=False)