In [None]:
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Load files

In [None]:
# Directory prefix of the competition input files; the loading cell appends
# the individual CSV file names to this string, so it must end with '/'.
general_path = '../input/h-and-m-personalized-fashion-recommendations/'

In [None]:
# Read the four input tables located under `general_path` into DataFrames.
articles = pd.read_csv(f'{general_path}articles.csv')
customers = pd.read_csv(f'{general_path}customers.csv')
sample_submission = pd.read_csv(f'{general_path}sample_submission.csv')
transactions_train = pd.read_csv(f'{general_path}transactions_train.csv')

# Get info about transactions

In [None]:
transactions_train.info()

In [None]:
transactions_train.sample(5)

In [None]:
# Restore the leading zero(s) of `article_id` after converting it to a string.
# Left-padding to a fixed width (instead of blindly prepending one '0') also
# repairs ids that lost more than one zero and makes this cell safe to re-run.
# NOTE(review): assumes ids are 10 characters wide (the original code turned
# 9-digit numbers into 10-character strings) — confirm against the raw CSV.
transactions_train['article_id'] = transactions_train['article_id'].astype(str).str.zfill(10)

In [None]:
# Restore the leading zero(s) of `article_id` after converting it to a string.
# Left-padding to a fixed width (instead of blindly prepending one '0') also
# repairs ids that lost more than one zero and makes this cell safe to re-run.
# NOTE(review): assumes ids are 10 characters wide (the original code turned
# 9-digit numbers into 10-character strings) — confirm against the raw CSV.
articles['article_id'] = articles['article_id'].astype(str).str.zfill(10)

We should transform 't_dat' to datetime format.

In [None]:
# Parse the 't_dat' column with the explicit YYYY-MM-DD format so that it can
# be compared against Timedelta-based cut-offs further down.
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'], format='%Y-%m-%d')

In [None]:
# Most recent transaction date in the data; Step 2 uses it as the reference
# point for the 30-day "not sold recently" cut-off.
maximum_history_date = transactions_train['t_dat'].max()

# Creating features for personal recommendations
## Location

In [None]:
# Report the number of customer rows and of distinct postal codes.
n_customers = len(customers)
n_locations = len(customers['postal_code'].unique())
print(f'We have {n_customers} unique customers')
print(f'And {n_locations} unique locations')

In [None]:
# Number of (non-null) customer ids registered under each postal code.
customer_per_location = (
    customers
    .groupby('postal_code')
    .agg({'customer_id': 'count'})
)

In [None]:
customer_per_location.sort_values(by=['customer_id'])

We can split all customers into two groups:
1. 2c29ae653a9282cce4151bd87643c907644e09541abc28ae87dea0d1f6603b1c - big_city
2. Others

In [None]:
# Flag customers of the single dominant postal code as 'big_city';
# every other customer (including missing postal codes) is 'other'.
BIG_CITY_POSTAL_CODE = '2c29ae653a9282cce4151bd87643c907644e09541abc28ae87dea0d1f6603b1c'
is_big_city = customers['postal_code'] == BIG_CITY_POSTAL_CODE
customers['location'] = is_big_city.map({True: 'big_city', False: 'other'})

## Price

In [None]:
# Histogram of transaction prices, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(16, 5))
transactions_train['price'].hist(bins=30, ax=ax)
ax.set_title('Prices')
plt.show()

## Articles characteristics
We will try to find the most comprehensive category.

In [None]:
articles.nunique()

In [None]:
articles.sample(1)

In [None]:
articles['index_name'].unique()

We can use 'index_name' as a categorical feature that divides articles into women's, men's and children's clothing.

## Age

In [None]:
# Bucket customers into three age groups.
# Everything that is not strictly above 20 keeps the '<20' label; this
# includes age == 20 and rows with a missing age (NaN comparisons are False).
customer_age = customers['age']
age_group = pd.Series('<20', index=customers.index)
age_group = age_group.mask(customer_age > 20, '20-45')
age_group = age_group.mask(customer_age > 45, '>45')
customers['age_group'] = age_group

# Let's use the features that I created previously
## Step 1: Create a view in which we can see clients' preferences

In [None]:
def aggregate_to_list(row):
    """Collect the values of an iterable (e.g. a group of a column) into a list.

    Used as an aggregation function so that each group yields one Python list
    with its elements in their original order.
    """
    return list(row)

In [None]:
# Attach each article's `index_name` to its transactions.
# Left join: transactions whose article is missing from `articles` are kept.
article_categories = articles[['article_id', 'index_name']]
transactions_train = transactions_train.merge(article_categories,
                                              on='article_id',
                                              how='left')

In [None]:
# Attach the buyer's `age_group` to every transaction.
# Left join: transactions whose customer is missing from `customers` are kept.
customer_age_groups = customers[['customer_id', 'age_group']]
transactions_train = transactions_train.merge(customer_age_groups,
                                              on='customer_id',
                                              how='left')

In [None]:
# One row per (customer, article category) holding the list of purchased
# article ids and how many purchases there were.
# Named aggregation yields flat, explicitly named columns. The previous
# pivot_table produced two-level column labels, which (a) had to be renamed
# by position and (b) cannot be merged with the single-level `customers`
# frame on recent pandas versions.
purchases_per_category = (
    transactions_train
    .groupby(['customer_id', 'index_name'])['article_id']
    .agg(article_purchased=aggregate_to_list, article_id_count='count')
    .reset_index()
)

# Left join keeps customers without any transaction; their `index_name`,
# `article_purchased` and `article_id_count` are NaN.
sample_submission = pd.merge(customers[['customer_id', 'age_group']],
                             purchases_per_category,
                             on='customer_id',
                             how='left')

## Step 2: Remove outdated articles
H&M is (probably) a fast fashion company, and we can remove some products that have not been sold for a long time.

In [None]:
# First and last sale date of every article.
articles_sale_interval = transactions_train.pivot_table(
    index='article_id',
    aggfunc={'t_dat': ['min', 'max']},
).reset_index()

# Flatten the two-level column labels: ('t_dat', 'max') -> 't_dat_max';
# labels whose second level is empty keep only the first level.
flat_names = []
for top, sub in articles_sale_interval.columns:
    if pd.isna(sub) or sub == '':
        flat_names.append(top)
    else:
        flat_names.append(top + '_' + sub)
articles_sale_interval.columns = flat_names

# Articles whose last sale is more than 30 days before the latest transaction.
stale_cutoff = maximum_history_date - pd.Timedelta('30 days')
long_time_have_not_sold = articles_sale_interval[articles_sale_interval['t_dat_max'] < stale_cutoff]

Remove these styles.

In [None]:
# Drop every transaction of an article that has not been sold recently.
len_before = len(transactions_train)
stale_article_ids = long_time_have_not_sold['article_id']
transactions_train = transactions_train[~transactions_train['article_id'].isin(stale_article_ids)]

# The ratio is computed over transaction rows (not over distinct articles),
# so the message reports it as such; skip it for an empty frame to avoid a
# division by zero.
if len_before:
    print('Removed {:.2%} of transactions'.format(1 - (len(transactions_train) / len_before)))

In [None]:
# Keep only transactions from the 15 days preceding the latest remaining date.
recent_cutoff = transactions_train['t_dat'].max() - pd.Timedelta('15 days')
is_recent = transactions_train['t_dat'] > recent_cutoff
transactions_train = transactions_train[is_recent]

## Step 3: Creating recommendation ratings

In [None]:
# Count purchases of every article inside each (age_group, index_name) group.
# The count ends up in the `customer_id` column.
group_keys = ['age_group', 'index_name', 'article_id']
group_recomendation = (
    transactions_train
    .groupby(group_keys)
    .agg({'customer_id': 'count'})
    .reset_index()
)

In [None]:
# Sort (descending, so the most purchased articles of each group come first),
# drop articles with 2 or fewer purchases, and rename the count column.
# `rename` needs `columns=`: with a bare mapping pandas renames index labels,
# so the original call silently left the column called 'customer_id'.
group_recomendation = (
    group_recomendation
    .sort_values(by=['age_group', 'index_name', 'customer_id'], ascending=False)
    .query('customer_id > 2')
    .rename(columns={'customer_id': 'article_raiting'})
)

## Step 4: Add top articles for each group for each client

In [None]:
# For every (age_group, index_name) group, collect its article ids into one
# list (rows keep the order they have in `group_recomendation`).
top_articles_per_group = (
    group_recomendation
    .pivot_table(index=['age_group', 'index_name'],
                 aggfunc={'article_id': aggregate_to_list})
    .reset_index()
    .rename(columns={'article_id': 'top_article_id'})
)

# Attach the group's list to every customer/category row; rows without a
# matching group get NaN in `top_article_id`.
sample_submission = sample_submission.merge(top_articles_per_group,
                                            on=['age_group', 'index_name'],
                                            how='left')

## Step 5: Determine how many articles you need to recommend to each client so that the total is 12

In [None]:
# Split a budget of 12 recommendations across each customer's categories in
# proportion to the number of purchases made in that category.
purchases_per_customer = (
    sample_submission
    .groupby('customer_id')['article_id_count']
    .transform('sum')
)

sample_submission = (
    sample_submission
    .assign(group_possibility=purchases_per_customer)
    # Customers without purchases sum to 0 and are dropped here.
    .query('group_possibility != 0')
    .assign(article_id_count=lambda frame: (
        ((frame['article_id_count'] / frame['group_possibility']) * 12)
        .astype(float)
        .round()
        .astype('Int64')
    ))
    .rename(columns={'article_id_count': 'qty_to_recomend'})
)

In [None]:
sample_submission.info()

In [None]:
# Keep only the categories that were allotted at least one recommendation.
has_quota = sample_submission['qty_to_recomend'].ne(0)
sample_submission = sample_submission[has_quota]

## Step 6: Remove from the top list the styles the client has already bought

In [None]:
def articles_remove(row):
    """Pick the recommendations for one customer/category row.

    Walks the group's top articles in order and keeps those the customer has
    not bought yet, stopping once the row's quota is reached.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row) with the keys
        'article_purchased' - list of article ids already bought, or a
                              non-list placeholder (NaN) when there are none;
        'top_article_id'    - ordered list of the group's top article ids, or
                              a non-list placeholder (NaN) when the group has
                              no top list;
        'qty_to_recomend'   - maximum number of articles to return.

    Returns
    -------
    list
        At most `qty_to_recomend` article ids, in top-list order. Empty when
        there is no top list.
    """
    top_articles = row['top_article_id']
    # A missing top list arrives as NaN after the left merge; nothing to offer.
    if not isinstance(top_articles, (list, tuple)):
        return []

    bought = row['article_purchased']
    # No purchase history means nothing has to be filtered out. A set gives
    # constant-time membership checks for long histories.
    already_bought = set(bought) if isinstance(bought, (list, tuple)) else set()

    quantity = row['qty_to_recomend']
    recomendation = []
    for article in top_articles:
        if len(recomendation) >= quantity:
            break
        # Every top article is eligible, including the last one of the list
        # (the previous implementation always skipped it).
        if article not in already_bought:
            recomendation.append(article)
    return recomendation

In [None]:
sample_submission[sample_submission['customer_id'] == '0118ed570ff6ff085cde55a6e801c6861a4e9ff9a8d9e82ee36b33c8b3af8f59']

In [None]:
# Build the per-row recommendation list by calling `articles_remove` on every
# customer/category row (row-wise apply, so this runs once per row).
sample_submission['recomendation'] = sample_submission.apply(articles_remove, axis=1)

In [None]:
def lists_aggregate_to_list(row):
    """Flatten an iterable of lists into one list, preserving order.

    Used as an aggregation function to concatenate the per-category
    recommendation lists of a customer.
    """
    return [item for chunk in row for item in chunk]

In [None]:
# Collapse to one row per customer by concatenating the per-category lists.
sample_submission = (
    sample_submission
    .groupby('customer_id')
    .agg({'recomendation': lists_aggregate_to_list})
    .reset_index()
)

In [None]:
# Customers that received no personalised recommendation row above; they get
# a shared fallback prediction in the cells that follow.
unknown_customer_id = customers[~customers['customer_id'].isin(sample_submission['customer_id'])]

In [None]:
# Keep only the id column. The explicit copy makes this an independent frame,
# so adding the 'prediction' column afterwards does not write into a slice of
# `customers` (which would otherwise trigger SettingWithCopyWarning).
unknown_customer_id = unknown_customer_id[['customer_id']].copy()

In [None]:
# Fallback for customers without personalised recommendations: the 12 most
# purchased articles in the remaining transactions, joined into one
# space-separated string shared by all of these customers.
article_popularity = (
    transactions_train
    .pivot_table(index='article_id', aggfunc={'customer_id': 'count'})
    .reset_index()
)
ranked_articles = article_popularity.sort_values(by='customer_id', ascending=False)
most_popular_ids = ranked_articles['article_id'].values[:12]
unknown_customer_id['prediction'] = ' '.join(most_popular_ids)

In [None]:
# Join each customer's recommendations into a space-separated string.
# The per-category quotas are rounded independently, so their sum can exceed
# the intended total of 12; keep only the first 12 articles.
sample_submission['prediction'] = sample_submission['recomendation'].str[:12].str.join(' ')

In [None]:
# Stack the personalised rows and the fallback rows into one frame.
# The row index is not reset here; the CSV export below omits it anyway.
sample_submission = pd.concat([sample_submission, unknown_customer_id])

In [None]:
# The list column is no longer needed once 'prediction' holds the joined ids.
sample_submission = sample_submission.drop(columns='recomendation')

In [None]:
# Write the final frame to 'predict.csv' without the row index.
sample_submission.to_csv('predict.csv', index=False)