In [None]:
# Install the p7zip wrapper used below to unpack the .7z competition archives.
# -y answers the confirmation prompt automatically: the notebook shell has no
# interactive stdin, so a prompt would otherwise abort the install.
!apt-get install -y p7zip

In [None]:
# Unpack the training archive. Per the p7zip wrapper's usage text: -d = decompress,
# -f = force, -k = keep the input archive.
# NOTE(review): the later read of 'train.tsv' assumes the extracted file lands in the
# notebook's working directory — confirm.
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z

In [None]:
# Unpack the stage-1 test archive with the same flags as the training archive.
# NOTE(review): no cell visible here reads test.tsv (only test_stg2.tsv is loaded) —
# confirm whether this extraction is still needed.
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z

In [None]:
# Unpack the stage-2 test set; 'test_stg2.tsv' is read from the working directory
# further down when the submission is produced.
!unzip ../input/mercari-price-suggestion-challenge/test_stg2.tsv.zip

In [None]:
import os,sys
import pandas as pd
import numpy as np

In [None]:
# Working directory of the notebook kernel; base path for model_dir below.
root_dir = os.getcwd()

In [None]:
# Path '<working dir>/model'. Only the path string is built here; none of the cells
# visible in this notebook create the directory or write to it.
model_dir = os.path.join(root_dir,'model')

In [None]:
# Load the unpacked, tab-separated training set from the working directory.
df_train = pd.read_csv(
    'train.tsv',
    sep='\t',
)

In [None]:
# (rows, columns) of the raw training frame — quick sanity check after loading.
df_train.shape

In [None]:
# Build the single lowercase free-text feature that is fed to the tokenizer.
# Vectorized string operations are used instead of row-wise DataFrame.apply /
# per-element lambdas: the result is the same, but the full training frame is
# processed far faster.

# Force the three text fields to str so they can be concatenated; missing values,
# if any, end up as their string form.
df_train['name'] = df_train['name'].astype(str)
df_train['category_name'] = df_train['category_name'].astype(str)
df_train['item_description'] = df_train['item_description'].astype(str)

# Category levels are '/'-separated; turn each separator into a space.
df_train['category_text'] = df_train['category_name'].str.replace('/', ' ', regex=False)

# NOTE(review): the three fields are concatenated with NO separator, so the last word
# of one field is fused with the first word of the next. This is kept as-is so that
# the text stays consistent with the test-set preprocessing and with whatever text
# the tokenizer model was trained on — confirm before changing.
df_train['text'] = (
    df_train['name'] + df_train['category_text'] + df_train['item_description']
).str.lower()

### Load tokenizer

In [None]:
import sentencepiece as spm
# Empty SentencePiece processor; the trained model file is loaded into it in the next cell.
sp = spm.SentencePieceProcessor()

In [None]:
# Load the pre-trained SentencePiece model from the attached Kaggle dataset.
# NOTE(review): judging by the file name this is a 10k-piece BPE model trained on
# lowercased text — confirm; max_features further down assumes 10,000 token ids.
sp.Load('../input/bpe-model-text-lower-10k/bpe_model_text_lower_10k.model')

### Apply tokenizer

In [None]:
# Every SentencePiece id is shifted up by `offset`, so the tokenizer's id 0 becomes 1.
# NOTE(review): presumably this frees id 0 for the padding added by pad_sequences — confirm.
offset = 1


def _encode_train_text(text):
    """Return the SentencePiece ids of `text`, each increased by `offset`."""
    return [piece_id + offset for piece_id in sp.EncodeAsIds(text)]


df_train['tokenized'] = df_train['text'].apply(_encode_train_text)

In [None]:
# Number of tokens per listing, used below to pick a sequence length.
df_train['token_cnt'] = df_train['tokenized'].map(len)

### Decide about sequence length

In [None]:
# 90th percentile of tokens per listing: 90% of the listings are at most this long.
df_train['token_cnt'].quantile(.90)

### Prepare Keras model

In [None]:
import os,sys
import pandas as pd
import pickle
import numpy as np

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.core import Dropout
from keras.layers.core import Activation
from keras.layers.embeddings import Embedding
from keras.layers import LSTM

### Convert data into (padded) numpy arrays

In [None]:
# Fixed sequence length: passed as `maxlen` to pad_sequences and as `input_length`
# to the Embedding layer.
# NOTE(review): presumably chosen from the 90th-percentile token count computed above — confirm.
token_truncate_len = 90

In [None]:
# Turn the ragged per-listing token lists into one fixed-width integer matrix.
# Sequences longer than token_truncate_len lose their tail (truncating='post').
train_np = pad_sequences(
    df_train['tokenized'].values,
    maxlen=token_truncate_len,
    truncating='post',
)

In [None]:
# Sanity check of the padded matrix dimensions.
train_np.shape

### Prepare X/Y numpy arrays

In [None]:
# Model input: the padded token-id matrix (same object as train_np, not a copy).
X_train = train_np

In [None]:
# Regression target: the raw 'price' column (no log transform is applied here;
# the model below is compiled with a mean-squared-logarithmic-error loss).
y_train = df_train['price'].values

In [None]:
# Input dimension of the Embedding layer: 10,000 token ids plus one extra slot,
# because every token id was shifted up by `offset` (= 1) during tokenization.
# NOTE(review): 10000 presumably matches the tokenizer's vocabulary size — confirm.
max_features = 10000+1

In [None]:
# Size of each token embedding vector; the same value is reused as the LSTM unit
# count when the model is built.
# Arbitrary choice, needs to be refined.
embedding_dimension = 20

In [None]:
from keras.optimizers import Adam
from keras.losses import mean_squared_logarithmic_error
from keras.layers import Bidirectional

In [None]:
# Bidirectional-LSTM price regressor:
#   token ids -> embedding -> BiLSTM -> dropout -> one linear output unit.
# (A CuDNNLSTM variant of the recurrent layer was previously tried in place of LSTM.)
model = Sequential([
    Embedding(
        max_features,
        embedding_dimension,
        input_length=token_truncate_len,
    ),
    Bidirectional(LSTM(embedding_dimension)),
    Dropout(0.5),
    Dense(1),
    Activation('linear'),
])

# Adam with a small learning-rate decay; `optimizer` is kept as an alias of `opt`.
opt = Adam(lr=1e-3, decay=1e-3 / 200)
optimizer = opt

# Train against mean squared logarithmic error on the raw prices.
model.compile(loss=mean_squared_logarithmic_error, optimizer=opt)



In [None]:
%%time
# Train for a single epoch on the full training set (no validation split is passed);
# %%time reports how long the cell took.
model.fit(
    X_train,
    y_train,
    epochs=1
)

### Export the model

In [None]:
# Export the trained model to an .h5 file in the working directory.
model.save('mercari_bilstm_model.h5')

### Apply model to the train data

In [None]:
# In-sample predictions: score the same rows the model was trained on.
z_train = model.predict(X_train)

In [None]:
# Flatten the predictions to a 1-D array so they can be compared element-wise with y_train.
z_train = z_train.reshape(-1,)

### Compute prediction error on the train data

In [None]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between two equally long sequences.

    Computes sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2)).

    ``np.log1p(x)`` already evaluates ``log(1 + x)``, so no extra ``+ 1`` is added
    to the inputs; adding one would measure log(x + 2) instead and give a value
    that is not the RMSLE.

    Parameters
    ----------
    y_true : array-like of numbers
        Ground-truth values.
    y_pred : array-like of numbers
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        The RMSLE. Inputs at or below -1 are outside the domain of ``log1p`` and
        yield ``-inf``/``nan`` in the result.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    assert len(y_true) == len(y_pred)
    # Accept plain Python sequences as well as NumPy arrays.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    log_error = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(log_error)))

In [None]:
# Error on the training data itself; it is measured on rows the model has already
# seen, so it is an optimistic estimate rather than a validation score.
rmsle(y_train,z_train)

### Load the test data 

In [None]:
# Load the unpacked, tab-separated stage-2 test set from the working directory.
df_test = pd.read_csv(
    'test_stg2.tsv',
    sep='\t',
)

In [None]:
# Build the lowercase free-text feature for the test set with exactly the same steps
# as for the training set. Vectorized string operations replace the row-wise
# DataFrame.apply / per-element lambdas: same result, much faster on the full frame.

# Force the three text fields to str so they can be concatenated; missing values,
# if any, end up as their string form.
df_test['name'] = df_test['name'].astype(str)
df_test['category_name'] = df_test['category_name'].astype(str)
df_test['item_description'] = df_test['item_description'].astype(str)

# Category levels are '/'-separated; turn each separator into a space.
df_test['category_text'] = df_test['category_name'].str.replace('/', ' ', regex=False)

# NOTE(review): the three fields are concatenated with NO separator, matching the
# training-set preprocessing; keep the two in sync if this is ever changed.
df_test['text'] = (
    df_test['name'] + df_test['category_text'] + df_test['item_description']
).str.lower()

In [None]:
# Same id shift as for the training data, so train and test share one id space.
offset = 1


def _encode_test_text(text):
    """Return the SentencePiece ids of `text`, each increased by `offset`."""
    return [piece_id + offset for piece_id in sp.EncodeAsIds(text)]


df_test['tokenized'] = df_test['text'].apply(_encode_test_text)

In [None]:
# Pad/truncate the test token lists to the same fixed width as the training matrix.
X_test = pad_sequences(
    df_test['tokenized'].values,
    maxlen=token_truncate_len,
    truncating='post',
)

In [None]:
# Sanity check of the padded test matrix dimensions.
X_test.shape

In [None]:
# Score the test set and flatten the predictions to a 1-D array in one step.
z_test = model.predict(X_test).reshape(-1,)

### Export test scores for submission

In [None]:
# Assemble the submission table: one predicted price per test row.
# NOTE(review): 'test_id' is taken from the DataFrame's row index, not from a column
# of the test file; this is only correct if the file's ids equal the default
# 0..n-1 index — confirm against the test data / sample submission.
test_result_df = pd.DataFrame(data={'test_id':df_test.index,'price':z_test})

In [None]:
# Write the submission file to the working directory; index=False keeps the
# DataFrame index out of the CSV so only 'test_id' and 'price' are written.
test_result_df.to_csv("submission.csv", index = False)