### Import required libraries and set options

In [1]:
# Set the path for the data files
FILE_PATH = '../data/raw/'

In [2]:
# Import required libraries
import pandas as pd
import numpy as np

### Import required datasets

In [3]:
# Import train data
train_data = pd.read_csv(FILE_PATH + 'train.csv')

# Import test data
test_data = pd.read_csv(FILE_PATH + 'test.csv')

# Import mapping file
map_data = pd.read_csv(FILE_PATH + 'product_attributes.csv')

In [4]:
# Print shape of all the datasets
print('Shape of the train_data is : ', train_data.shape)
print('Shape of the test_data is : ', test_data.shape)
print('Shape of the map_data is : ', map_data.shape)

Shape of the train_data is :  (132551, 4)
Shape of the test_data is :  (2350, 1)
Shape of the map_data is :  (18750, 3)


### Inspect the data

In [5]:
# Inspect how the train data looks like
train_data.head()

Unnamed: 0,UserId,productid,Quantity,OrderDate
0,18075,12322648,1,01/04/18
1,6820,12371370,1,01/04/18
2,6820,12973004,1,01/04/18
3,6820,12657560,1,01/04/18
4,6820,11659914,1,01/04/18


In [6]:
# Inspect how the test data looks like
test_data.head()

Unnamed: 0,UserId
0,2
1,28
2,36
3,38
4,41


In [7]:
# Inspect how the product attributes look like
map_data.head()

Unnamed: 0,productid,attribute_name,attributevalue
0,11145600,Fit,37
1,11145600,Sleeve Length,23
2,11145600,Fabric,16
3,11145600,Color,25
4,11145600,Neckline,51


### Rearrange the map data to make a single row per product

In [8]:
# Pivot the data
map_data = map_data\
           .pivot_table(index='productid', columns='attribute_name', values='attributevalue', fill_value=0)\
           .rename_axis(None, axis=1)\
           .reset_index()

In [9]:
# Inspect the data
map_data.head()

Unnamed: 0,productid,Category,Collection,Color,Fabric,Fit,Material,Neckline,Season,Sleeve Length
0,11139192,1,0,41.0,2.0,0,0,0.0,42,0
1,11139194,1,0,25.0,2.0,0,0,0.0,42,0
2,11139524,1,0,95.0,16.0,14,0,4.0,42,23
3,11139560,1,0,41.0,2.0,14,0,4.0,42,127
4,11139588,1,0,25.0,58.0,0,0,0.0,42,0


**Note that the test data contains user ids only. Which means most of the other features should be in the train data.**

### Map the product attributes to the train data

In [10]:
# Merge train data with map data
train_data = pd.merge(train_data, map_data, how="left", on="productid")

In [11]:
# Inspect the head of the train_data
train_data.head()

Unnamed: 0,UserId,productid,Quantity,OrderDate,Category,Collection,Color,Fabric,Fit,Material,Neckline,Season,Sleeve Length
0,18075,12322648,1,01/04/18,7.0,0.0,17.0,2.0,36.0,0.0,8.0,5.0,3.0
1,6820,12371370,1,01/04/18,7.0,0.0,35.0,2.0,24.0,0.0,8.0,5.0,13.0
2,6820,12973004,1,01/04/18,7.0,0.0,40.0,2.0,24.0,0.0,4.0,6.0,3.0
3,6820,12657560,1,01/04/18,1.0,0.0,32.0,2.0,14.0,0.0,4.0,6.0,3.0
4,6820,11659914,1,01/04/18,1.0,0.0,20.0,2.0,14.0,0.0,8.0,5.0,3.0


### Separate into user-item and item-item matrices

In [12]:
# User item matrix
user_item_mat = train_data[['UserId', 'productid']]\
                .pivot_table(index='UserId', columns='productid', values='productid', aggfunc='size', fill_value=0)\
                .rename_axis(None, axis=1)\
                .reset_index()

In [13]:
# Item item matrix
item_item_mat = train_data.copy().drop(['UserId', 'Quantity', 'OrderDate'], axis=1)