# Mercari Price Suggestion Challenge

## Links
https://stats.stackexchange.com/questions/270546/how-does-keras-embedding-layer-work

#### Imports

In [36]:
import math

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
# sklearn.cross_validation was removed from scikit-learn; import from
# model_selection and only fall back to the legacy module on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline 


#### Load the data

In [37]:
# Tab-separated input files, given relative to the notebook's working directory.
TRAIN_FOLDER = "Data/train.tsv"
TEST_FOLDER = "Data/test.tsv"

print("Loading data...")
# Read both tables, then report their (rows, columns) shapes one per line.
train, test = (pd.read_table(path) for path in (TRAIN_FOLDER, TEST_FOLDER))
print(train.shape, test.shape, sep="\n")

Loading data...
(1482535, 8)
(693359, 7)


#### Visualize the data

In [38]:
# Display the first three rows of the raw training table to inspect its columns.
train.head(3)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...


#### Handle missing data

In [39]:
print("Handling missing values...")
def handle_missing(dataset):
    """Replace missing text fields with the placeholder string "missing".

    Fills NaN values in the ``category_name``, ``brand_name`` and
    ``item_description`` columns. Other columns are left untouched.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the three columns above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, returned for convenience.
    """
    for column in ("category_name", "brand_name", "item_description"):
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on an attribute-accessed Series is a chained
        # in-place update, which pandas warns about and which no longer
        # reaches the parent frame under Copy-on-Write.
        dataset[column] = dataset[column].fillna("missing")
    return dataset

# Fill the missing text fields in both tables, then confirm the shapes are unchanged.
train, test = handle_missing(train), handle_missing(test)
print(train.shape, test.shape, sep="\n")

Handling missing values...
(1482535, 8)
(693359, 7)


#### Process categorical data

In [40]:
print("Handling categorical variables...")
label_encoder = LabelEncoder()

# Integer-encode the category and the brand. The single encoder is refit for
# each column on the combined train + test values, so every label present in
# either table receives a code.
for column in ("category_name", "brand_name"):
    label_encoder.fit(np.hstack([train[column], test[column]]))
    train[column] = label_encoder.transform(train[column])
    test[column] = label_encoder.transform(test[column])
del label_encoder, column


Handling categorical variables...


#### Process text

In [41]:
print("Text to seq process...")
from keras.preprocessing.text import Tokenizer
# The vocabulary is built from the training descriptions and names only.
raw_text = np.hstack([train.item_description.str.lower(), train.name.str.lower()])

print("   Fitting tokenizer...")
tok_raw = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    char_level=False,
)
tok_raw.fit_on_texts(raw_text)
print("Transforming text to seq...")

# Vectorize the text: each description and name becomes a list of word indices.
for frame in (train, test):
    frame["seq_item_description"] = tok_raw.texts_to_sequences(frame.item_description.str.lower())
    frame["seq_name"] = tok_raw.texts_to_sequences(frame.name.str.lower())
del frame
train.head(3)

Text to seq process...
   Fitting tokenizer...
Transforming text to seq...


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,seq_item_description,seq_name
0,0,MLB Cincinnati Reds T Shirt Size XL,3,829,5265,10.0,1,No description yet,"[12, 68, 79]","[3852, 8823, 6896, 208, 84, 6, 155]"
1,1,Razer BlackWidow Chroma Keyboard,3,86,3889,52.0,0,This keyboard is in great condition and works ...,"[29, 2627, 10, 7, 39, 17, 1, 207, 51, 19, 1113...","[10760, 25565, 16369, 2627]"
2,2,AVA-VIV Blouse,1,1277,4588,10.0,1,Adorable top with a hint of lace and a key hol...,"[604, 60, 9, 4, 5347, 11, 192, 1, 4, 886, 1290...","[7634, 10563, 666]"


#### Rename dataframe columns

In [45]:
# Shorter column names, assigned by position.
train.columns = [
    'id', 'name', 'condition', 'category', 'brand', 'price',
    'shipping', 'description', 'seq_description', 'seq_name',
]
# The test table gets the same names minus the 'price' target.
test.columns = [column for column in train.columns if column != 'price']


#### Create dummy variables for item condition

In [49]:
#Crear dummies para la variable de condición
# One-hot encode the item condition in both tables (columns condition_<value>).
train, test = (
    pd.get_dummies(frame, columns=["condition"]) for frame in (train, test)
)

#### Extract train and validation

In [52]:
#Separar base de datos de entrenamiento
# Keep 99% of the labelled rows for training and hold out the rest for
# validation; the fixed random_state makes the split repeatable.
# NOTE: `train` is overwritten, so re-running this cell splits the already
# reduced training set again.
train, validation = train_test_split(train, train_size=0.99, random_state=123)
print(train.shape, validation.shape, sep="\n")

(1467709, 14)
(14826, 14)


#### Keras data definition

In [53]:
from keras.preprocessing.sequence import pad_sequences

def get_keras_data(dataset, max_name_len=None, max_desc_len=None):
    """Build the dict of named input arrays for one data split.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with the columns ``seq_name``, ``seq_description``, ``brand``,
        ``category``, ``condition_1`` .. ``condition_5`` and ``shipping``.
    max_name_len : int, optional
        Length every name sequence is padded/truncated to. ``None`` keeps the
        ``pad_sequences`` default of padding to the longest sequence found in
        ``dataset``.
    max_desc_len : int, optional
        Same as ``max_name_len`` but for the description sequences.

    Returns
    -------
    dict
        Maps each input name to its array: ``name`` and ``description`` are
        padded 2-D arrays, the remaining entries are 1-D arrays.
    """
    X = {
        'name': pad_sequences(dataset.seq_name, maxlen=max_name_len),
        'description': pad_sequences(dataset.seq_description, maxlen=max_desc_len),
        'brand': np.array(dataset.brand),
        'category': np.array(dataset.category),
        'condition_1': np.array(dataset.condition_1),
        'condition_2': np.array(dataset.condition_2),
        'condition_3': np.array(dataset.condition_3),
        'condition_4': np.array(dataset.condition_4),
        'condition_5': np.array(dataset.condition_5),
        'shipping': np.array(dataset.shipping),
    }
    return X

# Pad every split to the same sequence lengths. Without a shared maxlen each
# split is padded to its own longest sequence, so the validation and test
# arrays can end up with a different width than the training arrays.
MAX_NAME_SEQ = int(max(frame.seq_name.map(len).max() for frame in (train, validation, test)))
MAX_DESC_SEQ = int(max(frame.seq_description.map(len).max() for frame in (train, validation, test)))

X_train = get_keras_data(train, MAX_NAME_SEQ, MAX_DESC_SEQ)
X_valid = get_keras_data(validation, MAX_NAME_SEQ, MAX_DESC_SEQ)
X_test = get_keras_data(test, MAX_NAME_SEQ, MAX_DESC_SEQ)

#### Model definition

In [None]:
#KERAS MODEL DEFINITION
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras import backend as K

def get_callbacks(filepath, patience=2):
    """Return the training callbacks: early stopping followed by checkpointing.

    Parameters
    ----------
    filepath : str
        Path handed to ``ModelCheckpoint`` for saving the model.
    patience : int, default 2
        Passed to ``EarlyStopping``, which monitors ``val_loss`` in "min" mode.

    Returns
    -------
    list
        ``[EarlyStopping, ModelCheckpoint]``; the checkpoint is created with
        ``save_best_only=True``.
    """
    return [
        EarlyStopping(monitor='val_loss', patience=patience, mode="min"),
        ModelCheckpoint(filepath, save_best_only=True),
    ]

def rmsle_cust(y_true, y_pred):
    """Root mean squared logarithmic error between two tensors.

    Both tensors are clipped below at ``K.epsilon()`` before ``log(x + 1)`` is
    taken, so negative predictions do not produce NaNs.

    Parameters
    ----------
    y_true : tensor
        Target values.
    y_pred : tensor
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    tensor
        Scalar: the square root of the mean squared log difference over all
        elements.
    """
    first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
    second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
    # Average over every element before the square root. Reducing over
    # axis=-1 only would, for a single-unit output, take the root of each
    # sample's own squared error, i.e. the absolute log error, not the RMSLE.
    return K.sqrt(K.mean(K.square(first_log - second_log)))

def get_model():
    """Build and compile the price-regression network.

    The inputs are named after the keys of the dicts returned by
    ``get_keras_data`` (``name``, ``description``, ``brand``, ``category``,
    ``condition_1`` .. ``condition_5``, ``shipping``), so those dicts can be
    fed to the model directly. Sequence lengths are read from ``X_train``.
    Embedding sizes are derived from the fitted tokenizer ``tok_raw`` and
    from the largest code found in ``X_train``, ``X_valid`` and ``X_test``.

    Returns
    -------
    keras.models.Model
        Model compiled with MSE loss, the Adam optimizer, and the ``mae`` and
        ``rmsle_cust`` metrics.
    """
    #params
    dr_r = 0.1
    # Already-numeric 0/1 features are fed to the dense layers as they are.
    binary_features = ["condition_1", "condition_2", "condition_3",
                       "condition_4", "condition_5", "shipping"]
    all_splits = (X_train, X_valid, X_test)
    # Tokenizer word indices start at 1 (0 is the padding value), so the
    # embedding needs one row more than the vocabulary size.
    max_text = len(tok_raw.word_index) + 1
    # Label codes start at 0; take the maximum over every split because the
    # highest code is not guaranteed to appear in the training rows.
    max_brand = int(max(split["brand"].max() for split in all_splits)) + 1
    max_category = int(max(split["category"].max() for split in all_splits)) + 1

    #Inputs
    name = Input(shape=[X_train["name"].shape[1]], name="name")
    description = Input(shape=[X_train["description"].shape[1]], name="description")
    brand = Input(shape=[1], name="brand")
    category = Input(shape=[1], name="category")
    binary_inputs = [Input(shape=[1], name=feature) for feature in binary_features]

    #Embeddings layers
    emb_name = Embedding(max_text, 50)(name)
    emb_description = Embedding(max_text, 50)(description)
    emb_brand = Embedding(max_brand, 10)(brand)
    emb_category = Embedding(max_category, 10)(category)

    #rnn layer
    rnn_layer1 = GRU(16)(emb_description)
    rnn_layer2 = GRU(8)(emb_name)

    #main layer
    main_l = concatenate(
        [Flatten()(emb_brand), Flatten()(emb_category), rnn_layer1, rnn_layer2]
        + binary_inputs
    )
    main_l = Dropout(dr_r)(Dense(128)(main_l))
    main_l = Dropout(dr_r)(Dense(64)(main_l))

    #output
    output = Dense(1, activation="linear")(main_l)

    #model
    model = Model([name, description, brand, category] + binary_inputs, output)
    model.compile(loss="mse", optimizer="adam", metrics=["mae", rmsle_cust])

    return model

    
# Build the compiled network and print its layer-by-layer summary.
model = get_model()
model.summary()