In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, fname) for fname in files):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z

In [None]:
import os,sys
import pandas as pd

### Load the data

In [None]:
# Training set: tab-separated file (read_table uses '\t' as its default separator)
df_train = pd.read_table('train.tsv')

In [None]:
# Test set: tab-separated file (read_table uses '\t' as its default separator)
df_test = pd.read_table('test.tsv')

In [None]:
# Overview of the training frame: columns, non-null counts and dtypes
df_train.info()

### Column: name

In [None]:
# Summary statistics for the "name" column
df_train['name'].describe()

In [None]:
# Ten most frequent names (value_counts sorts by descending frequency)
df_train['name'].value_counts().iloc[:10]

In [None]:
# Case differences ("Bundle" vs "BUNDLE") split identical names across buckets,
# so normalise to lowercase before counting. The vectorised .str.lower() also
# passes missing values through instead of raising, as x.lower() would on a NaN.
df_train['name'].str.lower().value_counts().head(10)

In [None]:
# Histogram of name lengths (number of characters)
df_train['name'].astype(str).str.len().hist()

In [None]:
# Find different names of similar products: case-insensitive literal substring
# match on "eyeshadow". regex=False treats the needle as plain text and na=False
# excludes rows with a missing name instead of raising (x.lower() fails on NaN).
has_eyeshadow = df_train['name'].str.lower().str.contains('eyeshadow', regex=False, na=False)
df_train[has_eyeshadow]

NOTES:
* Name contains both the NSUBJ (nominal subject: "what" is being sold, e.g. "eyeshadow palette") and modifiers that further describe the item, e.g. "faced sweet peach" => we can try using spaCy to extract the NSUBJ
* In some cases multiple items are offered in the same listing (e.g. "2 NEW eyeshadow palettes"), which affects the price

### Column: "item_condition_id"

In [None]:
# Distinct condition codes, in order of first appearance
pd.unique(df_train['item_condition_id'])

In [None]:
# Number of listings per condition code, most frequent first
df_train['item_condition_id'].value_counts()

1.  Brand new
2.  Like new
3.  Almost not used
4.  Has some wear
5.  Damaged, for parts

In [None]:
# Mean price within each item-condition group
df_train['price'].groupby(df_train['item_condition_id']).mean()

In [None]:
# First few listings with item_condition_id == 5
df_train.loc[df_train['item_condition_id'].eq(5)].head()

Broken items are either electronics or jewelry, which explains the (higher) price

### Column: "category_name"

In [None]:
# Peek at the first few raw category strings
df_train['category_name'].head()

In [None]:
# Summary statistics for the "category_name" column
df_train['category_name'].describe()

In [None]:
# Number of items with a missing "category_name"
df_train['category_name'].isnull().sum()

In [None]:
# Share of entries missing a "category_name", expressed as a percentage
100. * df_train['category_name'].isna().sum() / len(df_train)

In [None]:
# Split "category_name" on '/' into a list of hierarchy levels.
# A missing category becomes an empty list: casting NaN to str first would
# produce the bogus one-level category ['nan'] and hide the missing value.
df_train['category_list'] = df_train['category_name'].apply(
    lambda name: name.split('/') if isinstance(name, str) else []
)

In [None]:
# How many levels are in the hierarchy: distribution of list lengths
df_train['category_list'].map(len).value_counts()

Status:

* 0.42% of items lack "category name"

* There are 3, 4 or 5 levels in the hierarchy 
 in all cases where category_name is present (non-null)

In [None]:
# Create one column per category level (level0 .. level4).
# A level is filled only when the split list is deep enough to contain it and
# is NaN otherwise. level1 was previously guarded by len(x) >= 3, which would
# have discarded the second level of a two-level category.
for depth in range(5):
    df_train[f'level{depth}'] = df_train['category_list'].apply(
        # bind depth as a default so each lambda uses its own level index
        lambda parts, depth=depth: parts[depth] if len(parts) > depth else np.nan
    )

In [None]:
# Print the number of unique categories at each level of the hierarchy
level_cols = [f'level{i}' for i in range(5)]
# One row per distinct (level0, ..., level4) combination, sorted for readability
df_categories = df_train[level_cols].sort_values(level_cols).drop_duplicates()
for level in level_cols:
    print(level, df_categories[level].nunique())

Status:
* we can ignore 'level3' and 'level4'

In [None]:
# Check for overlap between levels 0 and 1. NaN marks an absent level, not a
# category, so it is dropped before intersecting.
set(df_categories['level0'].dropna()) & set(df_categories['level1'].dropna())

In [None]:
# Intersect levels 0 and 2. NaN marks an absent level, not a category, so it is
# dropped before intersecting.
set(df_categories['level0'].dropna()) & set(df_categories['level2'].dropna())

In [None]:
# Intersect levels 1 and 2. NaN marks an absent level, not a category, so it is
# dropped before intersecting (otherwise it can show up as a spurious overlap).
set(df_categories['level1'].dropna()) & set(df_categories['level2'].dropna())

Status:   
    
        * It is sufficient to consider only 3 levels (0,1,2) in the hierarchy
        
        * There is apparently an overlap between different triplets 
          (level0,level1,level2)
        
        * Some categories are included within other categories
          e.g. 'Athletic Apparel' and 'Apparel'
        
        * Some categories COMBINE other categories
            'Backpack'
            'Backpacks & Carriers'
            'Backpacks, Bags & Briefcases'

Idea: Bag of words on categories

    
    * Split all categories by '/' and &
    * Lowercase (and lemmatize?) each word (Books->book)
    * Create a boolean variable 

### Column: "brand_name"

In [None]:
# Summary statistics for the "brand_name" column
df_train['brand_name'].describe()

In [None]:
# Number of listings per brand, most frequent first
df_train['brand_name'].value_counts()

In [None]:
# Number of brands that appear on at least 5 listings
df_train['brand_name'].value_counts().ge(5).sum()

### Column: "price"

In [None]:
# Summary statistics for the target column "price"
df_train["price"].describe()

In [None]:
# Histogram of raw prices (linear scale)
df_train["price"].hist()

In [None]:
# Plot price in the log domain. log1p(x) = log(1 + x) is the transform used by
# RMSLE and is well defined for zero prices; it is also the numerically
# accurate form of np.log(1 + x).
np.log1p(df_train["price"]).hist()

As expected, the plot is skewed in the linear domain and almost symmetric in the logarithmic domain, which is consistent with the recommended error metric (RMSLE)

In [None]:
# Number of listings whose price is zero or negative
df_train["price"].le(0).sum()

Status:

    
    874 items are listed as "free",
    which is likely an error in the data

### Column: "shipping"

In [None]:
# Number of listings per value of the "shipping" flag
df_train['shipping'].value_counts()

In [None]:
# Relationship between shipping and price: mean price for each shipping value
df_train['price'].groupby(df_train['shipping']).mean()

As expected, items where shipping is paid by seller are more expensive (by approx. 8$)

### Column: item description

item_description - the full description of the item. Note that the data has been cleaned to remove text that looks like a price (e.g. $20) to avoid leakage. These removed prices are represented as [rm]

In [None]:
# Most frequent description texts (exact duplicates), most frequent first
df_train['item_description'].value_counts()

In [None]:
# Number of listings with a missing description
df_train['item_description'].isnull().sum()

In [None]:
# Histogram of description lengths (number of characters, after casting to str)
df_train['item_description'].astype(str).str.len().hist()

In [None]:
# Histogram of log(1 + description length). log1p is the numerically accurate
# form of np.log(1 + x); the vectorised .str.len() replaces the per-row lambda.
np.log1p(df_train['item_description'].astype(str).str.len()).hist()

In [None]:
# Let's have a look at one extremely long description (more than 800 characters)
description_length = df_train['item_description'].astype(str).str.len()
# Select rows and column in a single .loc call, then take the first match
df_train.loc[description_length > 800, 'item_description'].iloc[0]

Question:
    What does NIB mean in the phrase above ?
    
Answer:
    NIB = "new in box"
    
    Sometimes 'item_description' consists of SHORTCUTS

    NIB  - new in box
    NWOT - new without tags
    BNWT - brand new with tags
    NWT  - new with tags
    
Actions:

* Detect all acronyms based on the list of eBay acronyms:
        http://www.bnibwt.com/
        OR https://www.ebay.com.sg/pages/help/account/acronyms.html
* Convert acronyms into plain English
* Remove '[rm]' from the description

## Summary

**name**

    * In many cases description is excessively short
    * Short description SOMETIMES is a brand name (i.e. Crocs)
    * Whenever description is short, we can replace it with the 
      first phrase of the "item description"
   
    * Extract the NSUBJ (nominal subject) using spaCy. This is the actual SUBJECT being sold.
    
    * In some cases, multiple items are proposed for a sale 
      "2 Magnetic Mac Eyeshadow Palettes"
      "Two new 12in x 12in scrapbooks"
      "Set of three American Eagle sunglasses"
      "2set of 5T Pjs"
          

**item_condition_id**

    1  Brand new
    2  Like new
    3  Almost not used
    4  Has some wear
    5  Damaged, for parts
    
    [This feature can be translated into text] 

**category_name**

    * It is sufficient to consider only 3 levels (0,1,2) in the hierarchy

    * There is apparently an overlap between different triplets 
      (level0,level1,level2):

      For example:

            'Sports & Outdoors/Apparel/Men'

            likely intersects with 

            'Men/Athletic Apparel/Jackets'
            'Men/Athletic Apparel/Pants'

    * There are categories which COMBINE other categories (logical OR)
      example: Bags & Carriers
    
    * Idea: Bag of words on categories
            (this idea naturally handles logical OR of categories) 

    
**brand_name**

    4810 unique brand names
    around 2000 have 5+ items
    
    Issue:
         Sometimes the SELLER_NAME is put in the BRAND_NAME column 
         (while the product BRAND_NAME is hidden inside "name")
    
        ACTION: 
           *   make a list of KNOWN brand names
           *   search for KNOWN brand names inside text fields
               * in order to REMOVE instances where the seller name appears instead of brand name
               * in order to CORRECT instances where the seller name appears instead of the brand name    
               
price 

    * drop items with 0 or NA as a price

shipping

    binary feature

item_description
    
    Sometimes 'item_description' consists of SHORTCUTS

    NIB  - new in box
    NWOT - new without tags
    BNWT - brand new with tags
    NWT  - new with tags
    
    Action

    Detect all acronyms based on the list of eBay acronyms:
        http://www.bnibwt.com/
        OR https://www.ebay.com.sg/pages/help/account/acronyms.html
    Convert acronyms into plain English
    
    
    * In many cases punctuation is removed and phrases are merged.
      ACTION: Split the text into phrases using NLTK
    
    * Truncate excessively long descriptions 
    * Remove [rm]
    * Remove \x.. and \u..
    * "2 x" means two items => essential when defining the price
    * "iPhone charger 4 all 5's and 6's+"  
       Q: how do we make obvious that these all are iPhone versions ?    
