In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Read the tab-separated training and test tables and report their dimensions.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../input/mercar/train.tsv',
                     '../input/mercari-price-suggestion-challenge/test_stg2.tsv')
)
print(train.shape, test.shape, sep='\n')

In [5]:
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between two value collections.

    Parameters
    ----------
    y : array-like
        Ground-truth values (must be > -1).
    y_pred : array-like
        Predicted values (must be > -1), same length as ``y``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((log(1 + y_pred) - log(1 + y)) ** 2))``.
    """
    # Coerce first so plain lists/tuples work and so two pandas objects are
    # compared positionally rather than being aligned on their indexes.
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # log1p is the numerically stable form of log(x + 1) for small x.
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y)) ** 2))

In [6]:
def preprocessing_data(data):
    """Fill gaps in the text/categorical columns with the literal 'missing'.

    The columns ``category_name``, ``brand_name`` and ``item_description``
    are updated on ``data`` itself, and the same frame is returned so the
    call can be used in an assignment.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the three columns listed above.

    Returns
    -------
    pandas.DataFrame
        The very same ``data`` object, with no nulls left in those columns.
    """
    for column in ('category_name', 'brand_name', 'item_description'):
        # Assign back instead of `data.col.fillna(..., inplace=True)`: the
        # in-place call acts on a temporary Series and is a no-op when
        # pandas Copy-on-Write is active.
        data[column] = data[column].fillna('missing')
    return data

In [7]:
# Fill the missing text/categorical values in both data sets (train first).
train, test = preprocessing_data(train), preprocessing_data(test)

In [8]:
# Integer-encode the two categorical columns. The encoder is fitted on the
# union of train and test values so both frames share one code book per column.
# The single encoder is re-fitted per column, so after this cell `le` holds the
# brand_name mapping.
le = LabelEncoder()
for _col in ('category_name', 'brand_name'):
    le.fit(np.hstack([train[_col], test[_col]]))
    train[_col] = le.transform(train[_col])
    test[_col] = le.transform(test[_col])

In [9]:
from keras.preprocessing.text import Tokenizer
# The tokenizer vocabulary is learned from the lower-cased training
# descriptions and names only; test text is merely converted with it.
text_raw = np.hstack([train[text_col].str.lower()
                      for text_col in ('item_description', 'name')])

tok = Tokenizer()
tok.fit_on_texts(text_raw)

# Turn each text column into a list of token ids, for train and then test.
for _frame in (train, test):
    _frame['seq_item_description'] = tok.texts_to_sequences(_frame['item_description'].str.lower())
    _frame['seq_name'] = tok.texts_to_sequences(_frame['name'].str.lower())

In [10]:
# Length of the longest tokenised name / description over train and test.
max_name_seq = np.max([frame.seq_name.apply(len).max()
                       for frame in (train, test)])
max_seq_item_des = np.max([frame.seq_item_description.apply(len).max()
                           for frame in (train, test)])

In [11]:
# Fixed lengths the name / description token sequences are padded or cut to.
MAX_NAME_SEQ = 17
MAX_ITEM_DESCRIPTION = 70


def _max_token_id(*seq_columns):
    """Return the largest token id found in any sequence of the given columns (0 if none)."""
    return max((max(seq, default=0) for column in seq_columns for seq in column),
               default=0)


# Size of the text embedding table: it must exceed every token id fed to it.
# `Series.max()` on a column of lists returns the lexicographically largest
# list, not the list holding the largest id, so the maximum is taken over
# every sequence explicitly.
MAX_TEXT = _max_token_id(train.seq_name, test.seq_name,
                         train.seq_item_description, test.seq_item_description) + 2
# Sizes of the categorical embedding tables: largest code + 1.
MAX_BRAND_NAME = np.max([train.brand_name.max(), test.brand_name.max()]) + 1
MAX_CATEGORY_NAME = np.max([train.category_name.max(), test.category_name.max()]) + 1
MAX_CONDITION = np.max([train.item_condition_id.max(), test.item_condition_id.max()]) + 1

In [12]:
# Regression target: log(price + 1), then rescaled into [-1, 1].
train['target'] = np.log(train.price + 1)
target_scaler = MinMaxScaler(feature_range=(-1, 1))
# The scaler needs a 2-D (n, 1) array. A pandas Series has no usable
# `reshape`, so reshape the underlying ndarray and flatten the result back to
# one dimension before storing it as a column.
train['target'] = target_scaler.fit_transform(
    train['target'].values.reshape(-1, 1)
).ravel()

# Hold out 5% of the rows for validation; the seed keeps the split repeatable.
dtrain, dvalid = train_test_split(train, random_state=42, train_size=0.95)

In [13]:
from keras.preprocessing.sequence import pad_sequences
    
def get_keras_data(data):
    """Assemble the dict of named input arrays for the Keras model.

    The token sequences in ``seq_name`` / ``seq_item_description`` are padded
    to ``MAX_NAME_SEQ`` / ``MAX_ITEM_DESCRIPTION``; the remaining entries are
    array copies of the corresponding columns of ``data``.

    Returns a dict whose keys match the ``name=`` of the model's Input layers.
    """
    X = {}
    # Text inputs.
    X['name'] = pad_sequences(data.seq_name, maxlen=MAX_NAME_SEQ)
    X['item_desc'] = pad_sequences(data.seq_item_description, maxlen=MAX_ITEM_DESCRIPTION)
    # Categorical inputs.
    for column in ('brand_name', 'category_name'):
        X[column] = np.array(data[column])
    X['item_condition'] = np.array(data['item_condition_id'])
    # Double brackets keep this a 2-D, single-column array.
    X['num_vars'] = np.array(data[['shipping']])
    return X
    
# Model inputs for the training split, the validation split and the test set.
X_train, X_valid, X_test = map(get_keras_data, (dtrain, dvalid, test))

In [None]:
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras import backend

def def_get_callback(filepath, patience=2):
    """Build the early-stopping and checkpoint callbacks.

    Parameters
    ----------
    filepath : str
        Location handed to ``ModelCheckpoint``.
    patience : int, default 2
        Forwarded to ``EarlyStopping`` as its ``patience``.

    Returns
    -------
    list
        ``[EarlyStopping, ModelCheckpoint]``, both configured here.
    """
    # The keyword and the variable were both misspelled "partience", which
    # raised NameError as soon as the function was called.
    es = EarlyStopping('val_loss', patience=patience, mode='min')
    msave = ModelCheckpoint(filepath, save_best_only=True)
    return [es, msave]

def rmsle_cust(y_true, y_pred):
    """Backend (tensor) version of a root-mean-squared-log-error metric.

    Both tensors are clipped from below at ``backend.epsilon()`` before
    ``log(x + 1)`` is taken; the squared difference is averaged over the last
    axis and square-rooted.
    """
    floor = backend.epsilon()
    log_pred = backend.log(backend.clip(y_pred, floor, None) + 1)
    log_true = backend.log(backend.clip(y_true, floor, None) + 1)
    squared_gap = backend.square(log_pred - log_true)
    return backend.sqrt(backend.mean(squared_gap, axis=-1))

def get_model():
    """Build and compile the multi-input regression network.

    Inputs are the six named arrays produced by ``get_keras_data``; input
    widths for the text and numeric branches are read from the global
    ``X_train``. Text and categorical ids are embedded, the embeddings are
    passed through GRU layers (brand and category are also used flattened),
    everything is concatenated with the numeric input and fed through two
    dense+dropout stages to a single linear output.

    Returns the compiled ``Model`` (loss ``mse``, optimizer ``adam``).
    """
    # ---- inputs -----------------------------------------------------------
    in_name = Input(shape=[X_train['name'].shape[1]], name='name')
    in_desc = Input(shape=[X_train['item_desc'].shape[1]], name='item_desc')
    in_brand = Input(shape=[1], name='brand_name')
    in_category = Input(shape=[1], name='category_name')
    in_condition = Input(shape=[1], name='item_condition')
    in_numeric = Input(shape=[X_train['num_vars'].shape[1]], name='num_vars')

    # ---- embeddings ---------------------------------------------------------
    name_vecs = Embedding(MAX_TEXT, 30)(in_name)
    desc_vecs = Embedding(MAX_TEXT, 30)(in_desc)
    brand_vecs = Embedding(MAX_BRAND_NAME, 10)(in_brand)
    category_vecs = Embedding(MAX_CATEGORY_NAME, 10)(in_category)
    condition_vecs = Embedding(MAX_CONDITION, 5)(in_condition)

    # ---- recurrent encoders ------------------------------------------------
    desc_state = GRU(40)(desc_vecs)
    name_state = GRU(20)(name_vecs)
    brand_state = GRU(10)(brand_vecs)
    category_state = GRU(10)(category_vecs)

    # ---- merge + dense head ------------------------------------------------
    brand_flat = Flatten()(brand_vecs)
    category_flat = Flatten()(category_vecs)
    condition_flat = Flatten()(condition_vecs)
    merged = concatenate([
        brand_flat, category_flat, condition_flat,
        desc_state, name_state, brand_state, category_state,
        in_numeric,
    ])
    for units in (512, 64):
        merged = Dense(units, activation='relu')(merged)
        merged = Dropout(0.1)(merged)

    prediction = Dense(1, activation='linear')(merged)

    net = Model([in_name, in_desc, in_brand, in_category, in_condition, in_numeric],
                prediction)
    net.compile(loss='mse', optimizer='adam', metrics=['mae', rmsle_cust])
    return net


model = get_model()

In [None]:
# Training schedule; `batch_size` is reused by the prediction cells below.
batch_size, epochs = 20000, 5

# Fit on the training split and monitor the held-out split each epoch.
model.fit(
    X_train,
    dtrain.target,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_valid, dvalid.target),
)

In [None]:
# Predict the scaled log-target for the validation split, then undo the two
# target transforms in reverse order: MinMax scaling first, then log(price + 1).
val_preds = model.predict(X_valid)
val_preds = target_scaler.inverse_transform(val_preds)
# The inverse of log(price + 1) is exp(x) - 1; the previous "+ 1" shifted every
# predicted price up by 2 and distorted the reported score.
val_preds = np.exp(val_preds) - 1

y_true = np.array(dvalid.price.values)
y_pred = val_preds[:, 0]
rmsle(y_true, y_pred)

In [None]:
# Network price estimates for the training split, mapped back to the price
# scale (undo MinMax scaling, then undo log(price + 1)) and stored as a feature.
preds_train = np.exp(
    target_scaler.inverse_transform(model.predict(X_train, batch_size=batch_size))
) - 1
dtrain['price_rnn'] = preds_train

# Same for the validation split.
preds_valid = np.exp(
    target_scaler.inverse_transform(model.predict(X_valid, batch_size=batch_size))
) - 1
dvalid['price_rnn'] = preds_valid

In [None]:
from xgboost import XGBRegressor

In [None]:
# Second-stage features: the encoded tabular columns plus the network's
# price estimate. The target is the raw price.
_xgb_feature_cols = ['item_condition_id', 'category_name', 'brand_name', 'shipping', 'price_rnn']

X_train_xgb, y_train_xgb = dtrain[_xgb_feature_cols], dtrain.price
X_valid_xgb, y_valid_xgb = dvalid[_xgb_feature_cols], dvalid.price

In [None]:
# Gradient-boosted trees stacked on top of the network's price estimate.
# `learning_rate` is the scikit-learn wrapper's name for the booster's `eta`;
# passing `eta=` is not reliably honoured across xgboost versions, so the
# 0.05 step size is given under the documented name.
model_xgb = XGBRegressor(booster='gbtree', max_depth=13, n_estimators=250,
                         learning_rate=0.05, reg_lambda=4, reg_alpha=2)
model_xgb.fit(X_train_xgb, y_train_xgb)

# Score the stacked model on the held-out split.
pred = model_xgb.predict(X_valid_xgb)

print(rmsle(y_valid_xgb, pred))

In [None]:
# Network price estimates for the test set, mapped back to the price scale
# (undo MinMax scaling, then undo log(price + 1)) and stored as a feature.
preds_test = np.exp(
    target_scaler.inverse_transform(model.predict(X_test, batch_size=batch_size))
) - 1
test['price_rnn'] = preds_test

In [None]:
# Final prediction: run the stacked XGBoost model on the test features.
test_xgb = test[['item_condition_id', 'category_name', 'brand_name', 'shipping', 'price_rnn']]
preds = model_xgb.predict(test_xgb)

# Two-column submission: a running row number as test_id, then the price.
index = pd.Series(np.arange(len(test_xgb)), name='test_id')
price = pd.Series(preds, name='price')
submission = index.to_frame().join(price)
submission.to_csv('submission.csv', index=False)