# EDA

#### Loading in dependencies and datasets

In [47]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

import json
from pprint import pprint

# %matplotlib inline

In [48]:
beauty_info_train = pd.read_csv('./dataset/beauty_data_info_train_competition.csv')
beauty_info_val = pd.read_csv('./dataset/beauty_data_info_val_competition.csv')
beauty_profile = pd.read_json('./dataset/beauty_profile_train.json')

fashion_info_train = pd.read_csv('./dataset/fashion_data_info_train_competition.csv')
fashion_info_val = pd.read_csv('./dataset/fashion_data_info_val_competition.csv')
fashion_profile = pd.read_json('./dataset/fashion_profile_train.json')

mobile_info_train = pd.read_csv('./dataset/mobile_data_info_train_competition.csv')
mobile_info_val = pd.read_csv('./dataset/mobile_data_info_val_competition.csv')
mobile_profile = pd.read_json('./dataset/mobile_profile_train.json')

In [49]:
datasets = {
    'beauty': {
        'train': beauty_info_train, 
        'val': beauty_info_val,
        'profile': beauty_profile
    },
    'fashion': {
        'train': fashion_info_train,
        'val': fashion_info_val,
        'profile': fashion_profile
    },
    'mobile':{
        'train': mobile_info_train,
        'val': mobile_info_val,
        'profile': mobile_profile
    }
}

In [50]:
for category in datasets:
    print('\n')
    for table in datasets[category]:
        print('{:7s} {:10s}: {:7s} Rows, {:2s} Columns'.format(category, table, 
                                                         str(datasets[category][table].shape[0]), 
                                                         str(datasets[category][table].shape[1])))



beauty  train     : 286583  Rows, 8  Columns
beauty  val       : 76545   Rows, 3  Columns
beauty  profile   : 470     Rows, 5  Columns


fashion train     : 275142  Rows, 8  Columns
fashion val       : 69498   Rows, 3  Columns
fashion profile   : 70      Rows, 5  Columns


mobile  train     : 160330  Rows, 14 Columns
mobile  val       : 40417   Rows, 3  Columns
mobile  profile   : 2433    Rows, 11 Columns


In [51]:
num_val_rows = (beauty_info_val.shape[0]*beauty_profile.shape[1] 
                + fashion_info_val.shape[0]*fashion_profile.shape[1] 
                + mobile_info_val.shape[0]*mobile_profile.shape[1])

print('Submission file should have: {} rows'.format(num_val_rows))

Submission file should have: 1174802 rows


In [52]:
def get_attributes_lookup(df):
    
    attributes_lookup = {}
    num_total_categories = 0
    
    for attribute in df.columns:
        num_categories_per_attribute = 0
        
        cat_keys = df[attribute][df[attribute].notnull()].values
        cat_values = df[attribute][df[attribute].notnull()].index
        attributes_lookup[attribute] = dict(zip(cat_keys, cat_values))
        
        num_total_categories += len(cat_keys)
        
    if num_total_categories > len(df):
        print('1 profile type falls under more than attribute. Manually check profiles.')
        print('No. of df rows: ', len(df))
        print('No. of categories: ', num_total_categories)
    elif num_total_categories < len(df):
        print('Some rows in original dataframe might be unaccounted for or entirely composed of NaNs. Check input df.')
        print('No. of df rows: ', len(df))
        print('No. of categories: ', num_total_categories)
    elif num_total_categories == len(df):
        print('All good. Number of lookup categories == rows in input df.')
        
    return attributes_lookup

#### Beauty

In [53]:
display(beauty_info_train.head(2))
display(beauty_info_val.head(2))
display(beauty_profile.head(2))

Unnamed: 0,itemid,title,image_path,Benefits,Brand,Colour_group,Product_texture,Skin_type
0,307504,nyx sex bomb pallete natural palette,beauty_image/6b2e9cbb279ac95703348368aa65da09.jpg,1.0,157.0,,,
1,461203,etude house precious mineral any cushion pearl...,beauty_image/20450222d857c9571ba8fa23bdedc8c9.jpg,,73.0,11.0,7.0,


Unnamed: 0,itemid,title,image_path
0,370855998,flormar 7 white cream bb spf 30 40ml,beauty_image/1588591395c5a254bab84042005f2a9f.jpg
1,637234604,maybelline clear smooth all in one bb cream sp...,beauty_image/920985ed9587ea20f58686ea74e20f93.jpg


Unnamed: 0,Benefits,Brand,Colour_group,Product_texture,Skin_type
1 warna,,,1.0,,
10 color,,,39.0,,


In [55]:
beauty_lookup = get_attributes_lookup(beauty_profile)

All good. Number of lookup categories == rows in input df.


#### Fashion

In [56]:
display(fashion_info_train.head(2))
display(fashion_info_val.head(2))
display(fashion_profile.head(10))

Unnamed: 0,itemid,title,image_path,Pattern,Collar Type,Fashion Trend,Clothing Material,Sleeves
0,2282553,retro floral dress,fashion_image/78d17fdb159bba51a4250dc3d583245e,2.0,,3.0,,
1,13822218,dress floral sifon,fashion_image/2f77dac9965bbfdb03cbd3724b3552c5,2.0,,,4.0,


Unnamed: 0,itemid,title,image_path
0,381034175,fashion wanita cardigan drape terbuka lengan p...,new_fashion_image/9ba6bf09ae89c2b9069faf569e7a...
1,396355150,bayar di tempat fashion wanita fg sweater hood...,new_fashion_image/f465cd2e55352e3ed9ab49b16257...


Unnamed: 0,Pattern,Collar Type,Fashion Trend,Clothing Material,Sleeves
basic,,,4.0,,
boat neck,,6.0,,,
bohemian,,,8.0,,
brocade,15.0,,,,
button down,,9.0,,,
camouflage,8.0,,,,
cartoon,17.0,,,,
check,18.0,,,,
chiffon,,,,4.0,
cotton,,,,18.0,


In [57]:
fashion_lookup = get_attributes_lookup(fashion_profile)

All good. Number of lookup categories == rows in input df.


#### Mobile

In [58]:
display(mobile_info_train.head(2))
display(mobile_info_val.head(2))
display(mobile_profile.head(10))

Unnamed: 0,itemid,title,image_path,Operating System,Features,Network Connections,Memory RAM,Brand,Warranty Period,Storage Capacity,Color Family,Phone Model,Camera,Phone Screen Size
0,2346660,apple iphone 4s back glass spare part original...,mobile_image/a9c8f0fdd6587deed197634066cf7eee.jpg,,,,,2.0,,,12.0,1526.0,,
1,2816338,iphone 4s 64gb white,mobile_image/3b9a11608551b11b9330268e0d055e01.jpg,,,,,2.0,,3.0,12.0,,,


Unnamed: 0,itemid,title,image_path
0,1520485457,new promo iphone 5s 16gb gold ex resmi ibox,mobile_image/876d4a1fe29e056855fa6f9643757b1c.jpg
1,1520516704,new promo xiaomi note 5a prime 3 32gb tam,mobile_image/e2b902b7cd35cd50f061d8e2a3ba7178.jpg


Unnamed: 0,Operating System,Features,Network Connections,Memory RAM,Brand,Warranty Period,Storage Capacity,Color Family,Phone Model,Camera,Phone Screen Size
1 month,,,,,,11.0,,,,,
1 mp,,,,,,,,,,4.0,
1 year,,,,,,13.0,,,,,
1.5gb,,,,0.0,,,0.0,,,,
10 months,,,,,,7.0,,,,,
10 years,,,,,,4.0,,,,,
10gb,,,,2.0,,,14.0,,,,
10mp,,,,,,,,,,11.0,
11 months,,,,,,6.0,,,,,
128gb,,,,,,,1.0,,,,


In [60]:
mobile_lookup = get_attributes_lookup(mobile_profile)

1 profile type falls under more than attribute. Manually check profiles.
No. of df rows:  2433
No. of categories:  2443


In [61]:
# Memory RAM and Storage Capacity use the same categories, but they mean different things so must keep both
# 10 rows
mobile_profile[mobile_profile.notnull().sum(axis=1) !=1 ]

Unnamed: 0,Operating System,Features,Network Connections,Memory RAM,Brand,Warranty Period,Storage Capacity,Color Family,Phone Model,Camera,Phone Screen Size
1.5gb,,,,0.0,,,0.0,,,,
10gb,,,,2.0,,,14.0,,,,
16gb,,,,9.0,,,17.0,,,,
1gb,,,,8.0,,,8.0,,,,
2gb,,,,6.0,,,6.0,,,,
3gb,,,,7.0,,,15.0,,,,
4gb,,,,5.0,,,9.0,,,,
512mb,,,,1.0,,,2.0,,,,
6gb,,,,4.0,,,7.0,,,,
8gb,,,,3.0,,,5.0,,,,


In [63]:
mobile_lookup['Memory RAM']

{0.0: '1.5gb',
 2.0: '10gb',
 9.0: '16gb',
 8.0: '1gb',
 6.0: '2gb',
 7.0: '3gb',
 5.0: '4gb',
 1.0: '512mb',
 4.0: '6gb',
 3.0: '8gb'}

In [64]:
mobile_lookup['Storage Capacity']

{0.0: '1.5gb',
 14.0: '10gb',
 1.0: '128gb',
 11.0: '128mb',
 17.0: '16gb',
 8.0: '1gb',
 16.0: '256gb',
 13.0: '256mb',
 6.0: '2gb',
 12.0: '32gb',
 15.0: '3gb',
 9.0: '4gb',
 10.0: '4mb',
 4.0: '512gb',
 2.0: '512mb',
 3.0: '64gb',
 7.0: '6gb',
 5.0: '8gb'}

In [65]:
mobile_info_train[mobile_info_train['Memory RAM'].notnull()][['itemid', 'title', 'Memory RAM', 'Storage Capacity']].head()

Unnamed: 0,itemid,title,Memory RAM,Storage Capacity
7,9503620,iphone 4g 8gb,3.0,
10,26702248,samsung galaxy j5 j 500g 8 gb hitam,3.0,
11,26702291,samsung galaxy j1 mini sm j105 8gb white,3.0,
12,40385323,iphone 5 white 16gb fullset mulus,9.0,
14,46093583,samsung galaxy j1,9.0,12.0


#### Checking on % null values

In [66]:
fashion_info_train.head()

Unnamed: 0,itemid,title,image_path,Pattern,Collar Type,Fashion Trend,Clothing Material,Sleeves
0,2282553,retro floral dress,fashion_image/78d17fdb159bba51a4250dc3d583245e,2.0,,3.0,,
1,13822218,dress floral sifon,fashion_image/2f77dac9965bbfdb03cbd3724b3552c5,2.0,,,4.0,
2,33555935,korean white chiffon collar dress,fashion_image/6dbe2e7cba5ddbb750d2144d8f248f11,,13.0,10.0,4.0,
3,65755120,women s trendy apricot o neck solid chiffon bl...,fashion_image/dc9b21429604148fc0342d12694f3294,,3.0,,4.0,
4,65857438,big sale baju gamis pesta india aysilla pancar...,fashion_image/6c25c578dd8edce742a805f891f1a51f,,,6.0,17.0,


In [67]:
for vertical in datasets:
    print('\n')
    print(vertical.upper())
    for column in datasets[vertical]['train'].columns[3:]:
        print('% null in {:20s}: {:.2f}'.format(column, datasets[vertical]['train'][column].isnull().sum()/len(datasets[vertical]['train'])))



BEAUTY
% null in Benefits            : 0.60
% null in Brand               : 0.17
% null in Colour_group        : 0.58
% null in Product_texture     : 0.15
% null in Skin_type           : 0.80


FASHION
% null in Pattern             : 0.40
% null in Collar Type         : 0.59
% null in Fashion Trend       : 0.47
% null in Clothing Material   : 0.36
% null in Sleeves             : 0.35


MOBILE
% null in Operating System    : 0.72
% null in Features            : 0.75
% null in Network Connections : 0.85
% null in Memory RAM          : 0.64
% null in Brand               : 0.03
% null in Warranty Period     : 0.70
% null in Storage Capacity    : 0.61
% null in Color Family        : 0.47
% null in Phone Model         : 0.47
% null in Camera              : 0.85
% null in Phone Screen Size   : 0.80


#### Exploring Submission set up

In [310]:
# what happens if it's NaN in the training dataset? 
# Are we suppose to know that it's not a relevant category?
sample_submission = pd.read_csv('./dataset/data_info_val_sample_submission.csv')
sample_submission.head(10)

Unnamed: 0,id,tagging
0,370855998_Benefits,3 2
1,370855998_Brand,246 98
2,370855998_Colour_group,29 26
3,370855998_Product_texture,3 4
4,370855998_Skin_type,7 7
5,637234604_Benefits,4 2
6,637234604_Brand,338 248
7,637234604_Colour_group,42 15
8,637234604_Product_texture,7 4
9,637234604_Skin_type,5 0


In [71]:
beauty_att = beauty_info_train.columns[3:].tolist()
fashion_att = fashion_info_train.columns[3:].tolist()
mobile_att = mobile_info_train.columns[3:].tolist()

In [73]:
# creating sample submission form
# beauty-fashion-mobile
sample_submission.tail(2)

Unnamed: 0,id,tagging
1104510,1498091427_Camera,5 13
1104511,1498091427_Phone Screen Size,4 0


In [311]:
def generate_submission_df(dataset='val', *vertical):
    """
    Generates a blank submission df for a given number of verticals ('beauty', 'fashion', 'mobile').
    Returns only concatenated ids, with no tagging column
    
    Parameters:
    -----------
    dataset: string ('train' or 'val')
             'train' takes the ids from the train dataset (with labels), 
             'val' takes ids from validation/test dataset (no labels)
    
    vertical: strings
              'beauty', 'fashion', 'mobile'
              Sample submission file was generated in the above order
    """
    
    submission_ids = []
    
    for vert in vertical:
        for itemid in datasets[vert][dataset]['itemid']:
            for attribute in datasets[vert]['train'].columns[3:]:
                submission_ids.append(str(itemid)+'_'+attribute)
    
    submission_df = pd.DataFrame({
        'id': submission_ids
    })
        
    return submission_df

In [157]:
submission_df = generate_submission_df(dataset='train', 'beauty', 'fashion', 'mobile')
print('No. of rows: ', len(submission_df))
submission_df.head(2)

No. of rows:  4572255


Unnamed: 0,id
0,307504_Benefits
1,307504_Brand


#### Baseline Calculations

In [199]:
nan_labels = {}
for vert in ['beauty', 'fashion', 'mobile']:
    nan_labels[vert] = {}
    for attr in datasets[vert]['train'].columns[3:]:
        nan_label = datasets[vert]['train'][attr].dropna().max() + 1
        nan_labels[vert][attr] = nan_label

pprint(nan_labels)

{'beauty': {'Benefits': 7.0,
            'Brand': 400.0,
            'Colour_group': 45.0,
            'Product_texture': 9.0,
            'Skin_type': 8.0},
 'fashion': {'Clothing Material': 19.0,
             'Collar Type': 16.0,
             'Fashion Trend': 11.0,
             'Pattern': 20.0,
             'Sleeves': 4.0},
 'mobile': {'Brand': 56.0,
            'Camera': 15.0,
            'Color Family': 26.0,
            'Features': 7.0,
            'Memory RAM': 10.0,
            'Network Connections': 4.0,
            'Operating System': 7.0,
            'Phone Model': 2277.0,
            'Phone Screen Size': 6.0,
            'Storage Capacity': 17.0,
            'Warranty Period': 14.0}}


In [161]:
def get_baseline_preds(vertical):
    
    baseline_preds = {}
    df = datasets[vertical]['train']
    
    for attribute in df.columns[3:]:
        top_2 = [str(int(val)) for val in df[attribute].value_counts().index[:2]]
        baseline_preds[attribute] = ' '.join(top_2)
    
    return baseline_preds

def generate_baseline_preds(dataset='val', *vertical):
    
    def extract_att_from_id(df_id):
        idx = df_id.find('_') + 1
        return baseline_preds[df_id[idx:]]

    submission_df = pd.DataFrame()
    
    for vert in vertical:
        
        baseline_preds = get_baseline_preds(vert)
        
        vert_sub_df = generate_submission_df(dataset, vert)
        vert_sub_df['tagging'] = vert_sub_df['id'].apply(extract_att_from_id)
        
        submission_df = pd.concat([submission_df, vert_sub_df])
    
    return submission_df

In [141]:
baseline_submission = generate_baseline_preds('val', 'beauty', 'fashion', 'mobile')
baseline_submission.to_csv('./submissions/baseline-submission.csv', index=False)

In [150]:
# # never tested before!
# def submit_to_kaggle(submission_df, file_name, message, path='./submissions/'):
    
#     assert len(submission_df) == 1174802
#     submission_df.to_csv(path+file_name, index=False)

#     # submit through CLI
#     ! kaggle competitions submit -c ndsc-advanced -f file_name -m message

#### predicting baseline on train set to test MAP@2

In [312]:
def transform_train_df(vert='beauty'):
    """
    Melts train_df in order to get actual labels for itemid-attribute id
    """
    
    df = datasets[vert]['train'].copy().drop(['title', 'image_path'], axis=1)
    df = df.melt(id_vars='itemid')
    df['id'] = df['itemid'].astype(str)+'_'+df['variable']
    df['value_filled'] = df['value'].fillna(df['variable'].map(nan_labels[vert]))
    
    return df

In [151]:
# source: https://www.kaggle.com/pestipeti/explanation-of-map5-scoring-metric
def map_per_image(label, predictions):
    """Computes the precision score of one image.

    Parameters
    ----------
    label : string
            The true label of the image
    predictions : list
            A list of predicted elements (order does matter, 2 predictions allowed per image)

    Returns
    -------
    score : double
    """ 
    label = str(int(label))
    predictions = predictions.split(' ') # own code
    
    try:
        return 1 / (predictions[:2].index(label) + 1)
    except ValueError:
        return 0.0

def map_per_set(labels, predictions):
    """Computes the average over multiple images.

    Parameters
    ----------
    labels : list
             A list of the true labels. (Only one true label per images allowed!)
    predictions : list of list
             A list of predicted elements (order does matter, 2 predictions allowed per image)

    Returns
    -------
    score : double
    """
    return np.mean([map_per_image(l, p) for l,p in zip(labels, predictions)])

In [307]:
val_submission = pd.DataFrame()

for vert in ['beauty', 'fashion', 'mobile']:
    
    actual_labels = transform_train_df(vert)
    baseline_preds = generate_baseline_preds('train', vert)
    
    vert_sub = pd.merge(actual_labels, baseline_preds, on='id')
    val_submission = pd.concat([val_submission, vert_sub])

len(val_submission)

4572255

In [313]:
# check baselines with MAP on validation set
map_per_set(val_submission['value_filled'], val_submission['tagging'])

0.20680517600177592