In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, fname) for fname in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
/kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z
/kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
/kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip
/kaggle/input/mercari-price-suggestion-challenge/sample_submission.csv.7z


In [2]:
# Make sure the p7zip command-line tool is present; it is needed for the .7z archives.
!apt-get install p7zip
# p7zip wrapper flags (per its usage text): -d decompress, -f force, -k keep the source archive.
!p7zip -d -f -k ../input/mercari-price-suggestion-challenge/train.tsv.7z
# unzip -o overwrites already-extracted files without prompting, so the cell can be re-run.
!unzip -o ../input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip -o ../input/mercari-price-suggestion-challenge/test_stg2.tsv.zip
# The loading cell reads 'train.tsv' and 'test.tsv' by bare filename, so both archives must be unpacked.
!p7zip -d -f -k ../input/mercari-price-suggestion-challenge/test.tsv.7z




p7zip is already the newest version (16.02+dfsg-7build1).
p7zip set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.

7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=C.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.00GHz (50653),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan ../input/mercari-price-suggestion-challenge/                                                      1 file, 77912192 bytes (75 MiB)

Extracting archive: ../input/mercari-price-suggestion-challenge/train.tsv.7z
--
Path = ../input/mercari-price-suggestion-challenge/train.tsv.7z
Type = 7z
Physical Size = 77912192
Headers Size = 122
Method = LZMA2:24
Solid = -
Blocks = 1

  0%      3% - train.tsv                  7% - train.tsv

In [3]:
%%time 
#preprocessing
#show time consumed
from datetime import datetime
import pandas as pd

# Wall-clock start of the run; the final cell subtracts this to report total elapsed time.
start_real = datetime.now()

# Load the tab-separated training and testing data by bare filename.
train_df = pd.read_csv('train.tsv', sep='\t')
test_df = pd.read_csv('test.tsv', sep='\t')
print(train_df.shape, test_df.shape)

(1482535, 8) (693359, 7)
CPU times: user 7.35 s, sys: 825 ms, total: 8.17 s
Wall time: 8.39 s


In [4]:
# Remove training rows whose price is below 3.0, then show the remaining shape.
below_min_price = train_df.index[train_df['price'] < 3.0]
train_df = train_df.drop(index=below_min_price)
train_df.shape

(1481661, 8)

In [5]:
%%time
#confirm word count of item's name and description

def wordCount(text):
    """
    Count the space-separated pieces of an item's name or description.

    Parameter
        text(str): item's name or description. Non-string values
            (e.g. NaN for a missing description) are accepted.

    Returns
        int: 0 when text is not a string or equals the placeholder
            'No description yet'; otherwise the number of pieces produced
            by splitting text on a single space.
    """
    # Missing values arrive as non-strings (e.g. float NaN). Handle them
    # explicitly instead of relying on a bare `except:` that would also hide
    # unrelated errors.
    if not isinstance(text, str):
        return 0

    if text == 'No description yet':
        return 0 #placeholder text carries no words

    # Lower-casing does not change the number of pieces, so it is not needed.
    # Splitting on ' ' (not on arbitrary whitespace) is kept so that existing
    # counts stay the same: consecutive spaces produce empty pieces that count.
    return len(text.split(' '))

#record word counts for both frames:
#  'name'             -> 'name_len'
#  'item_description' -> 'desc_len'
for frame in (train_df, test_df):
    frame['name_len'] = frame['name'].apply(wordCount)
    frame['desc_len'] = frame['item_description'].apply(wordCount)

CPU times: user 10.3 s, sys: 49.8 ms, total: 10.4 s
Wall time: 10.4 s


In [6]:
%%time 
import numpy as np

#training target: log(1 + price); predictions are mapped back with expm1 later
train_df['target'] = np.log1p(train_df['price'])

CPU times: user 28.5 ms, sys: 0 ns, total: 28.5 ms
Wall time: 27.9 ms


In [7]:
%%time

def split_cat(text):
    """
    Split a '/'-separated category path into its levels.

    Parameters:
        text(str): category's name, e.g. 'Men/Tops/T-shirts'.
            A non-string value (such as NaN for a missing category)
            is treated as "no category".

    Returns:
        list of str: the levels of the path, padded with 'No Label' so that
            at least three entries are present (longer paths are returned
            in full), or
        tuple: ('No Label', 'No Label', 'No Label') when text is not a string.
    """
    # Explicit type check replaces the former bare `except:`, which would
    # have hidden any unrelated error as well.
    if not isinstance(text, str):
        return ('No Label', 'No Label', 'No Label')

    levels = text.split('/')
    # Callers unpack three columns via zip(*...); zip stops at the shortest
    # row, so a single path with fewer than three levels would break the
    # unpacking for the whole frame. Pad short paths up to three levels.
    levels.extend(['No Label'] * (3 - len(levels)))
    return levels

#split 'category_name' into 'subcat_0' / 'subcat_1' / 'subcat_2'
#for the training data first, then the test data
for frame in (train_df, test_df):
    frame['subcat_0'], frame['subcat_1'], frame['subcat_2'] = zip(
        *frame['category_name'].apply(split_cat))

CPU times: user 6.72 s, sys: 385 ms, total: 7.1 s
Wall time: 7.1 s


In [8]:
%%time

#concat train and test
full_set = pd.concat([train_df, test_df])

#collect every known brand name (missing values are excluded so that NaN
#does not end up as a member of the lookup set)
all_brands = set(full_set['brand_name'].dropna().values)

#replace missing values in 'brand_name' (NaN) by 'missing'
#Assign the result back instead of calling fillna(inplace=True) on a column
#selection: the chained in-place form is deprecated and stops modifying the
#frame once pandas Copy-on-Write is active.
train_df['brand_name'] = train_df['brand_name'].fillna('missing')
test_df['brand_name'] = test_df['brand_name'].fillna('missing')

#number of rows without a brand in the training data (before the name lookup)
train_premissing = int((train_df['brand_name'] == 'missing').sum())

#number of rows without a brand in the test data (before the name lookup)
test_premissing = int((test_df['brand_name'] == 'missing').sum())

def brandFinder(line, brands=None):
    """
    Recover a brand for one row from the words of its item name.

    Parameters:
        line: two-element row (brand name, item name); a pandas Series,
            list or tuple all work because the row is unpacked by position.
        brands: optional collection of known brand names. Defaults to the
            notebook-level `all_brands` set.

    Returns:
        - the first word of the item name that is a known brand, when the
          brand is 'missing';
        - the item name itself, when the whole name is a known brand;
        - otherwise the brand unchanged.
    """
    if brands is None:
        brands = all_brands

    # Unpack by position rather than line[0] / line[1]: integer keys on a
    # label-indexed Series are deprecated as positional access.
    brand, name = line

    # A missing item name cannot be searched; keep the brand as it is.
    if not isinstance(name, str):
        return brand

    if brand == 'missing':
        for word in name.split(' '): #check the item name word by word
            if word in brands:
                # Return the matched brand word. Returning the full item
                # name here would create a distinct "brand" for almost
                # every listing.
                return word

    if name in brands: #the whole item name is itself a brand
        return name

    return brand #nothing found: keep the original value

#replace brand name row by row, using the (brand, item name) pair
brand_cols = ['brand_name', 'name']
train_df['brand_name'] = train_df[brand_cols].apply(brandFinder, axis=1)
test_df['brand_name'] = test_df[brand_cols].apply(brandFinder, axis=1)

#rows still 'missing' after the replacement, and how many were recovered
train_len = len(train_df[train_df['brand_name'] == 'missing'])
test_len = len(test_df[test_df['brand_name'] == 'missing'])
train_found = train_premissing - train_len
test_found = test_premissing - test_len

#report: missing before / recovered, for train and then test
for stat in (train_premissing, train_found, test_premissing, test_found):
    print(stat)

632336
137342
295525
64154
CPU times: user 31 s, sys: 502 ms, total: 31.5 s
Wall time: 31.6 s


In [9]:
%%time
#separate train and test dataframe in 99:1
#since 1% is already large enough (over 10k)

from sklearn.model_selection import train_test_split
import gc #garbage collection

#hold out 1% of the training rows for validation (fixed seed)
train_dfs, dev_dfs = train_test_split(
    train_df,
    train_size=0.99,
    test_size=0.01,
    random_state=123,
)

n_trains = len(train_dfs) #number of training rows
n_devs = len(dev_dfs)     #number of validation rows
n_tests = len(test_df)    #number of test rows

print('Training: \t', n_trains, 'examples')
print('Validating \t:', n_devs, 'examples')
print('Testing: \t', n_tests, 'examples')

#the unsplit frame is no longer needed
del train_df
gc.collect()

Training: 	 1466844 examples
Validating 	: 14817 examples
Testing: 	 693359 examples
CPU times: user 2.16 s, sys: 169 ms, total: 2.33 s
Wall time: 2.6 s


0

In [10]:
%%time
#Stack training, validation and test rows (in that order) into one frame.
#The order matters: a later cell slices the frame back apart by row position
#using n_trains and n_devs.
full_df = pd.concat([train_dfs, dev_dfs, test_df])

def fill_missing_values(df):
    """
    Replace missing text fields with the placeholder 'missing'.

    Parameters:
        df: DataFrame with the columns 'category_name', 'brand_name'
            and 'item_description'.

    Returns:
        The same DataFrame object, modified in place: NaN in the three
        columns becomes 'missing', and the description placeholder
        'No description yet' becomes 'missing' as well.
    """
    # Assign each filled column back explicitly. Calling
    # fillna/replace(inplace=True) on an attribute-accessed column is a
    # chained in-place update, which is deprecated and no longer reaches the
    # frame once pandas Copy-on-Write is active.
    for column in ('category_name', 'brand_name', 'item_description'):
        df[column] = df[column].fillna('missing')

    #description (No description yet -> missing)
    df['item_description'] = df['item_description'].replace(
        'No description yet', 'missing')
    return df

#fill the text columns of the combined frame (the function returns the frame it was given)
full_df = fill_missing_values(full_df)

CPU times: user 2.16 s, sys: 116 ms, total: 2.28 s
Wall time: 2.28 s


In [11]:
from sklearn.preprocessing import LabelEncoder

print('Prcessing categorical data')

#create LabelEncoder (refitted for every column below)
le = LabelEncoder()

#encode the full category path into a new integer column 'category';
#'category_name' itself stays as text because the tokenizer cell still reads it
full_df['category'] = le.fit_transform(full_df.category_name)

#encode brand and the three sub-category levels, replacing the text columns.
#fit_transform is called once per column; previously its result was thrown
#away and every sub-category column was transformed a second time.
for column in ('brand_name', 'subcat_0', 'subcat_1', 'subcat_2'):
    full_df[column] = le.fit_transform(full_df[column])


del le
gc.collect()

Prcessing categorical data


0

In [12]:
%%time
#apply label encoding to the concatenated df
from tensorflow.keras.preprocessing.text import Tokenizer

#build one flat array of all lower-cased texts
#(item description, item name, item category_name) to fit the tokenizer on
print("Transforming text data to sequence data")
desc_lower = full_df['item_description'].str.lower()
name_lower = full_df['name'].str.lower()
category_lower = full_df['category_name'].str.lower()
raw_text = np.hstack([desc_lower, name_lower, category_lower])

print('Sequences shape, ',raw_text.shape)

print("Fitting tokenizer")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)

#convert description and name into lists of integer token ids
print("Transforming text to sequences")
full_df['seq_item_description'] = tok_raw.texts_to_sequences(desc_lower)
full_df['seq_name'] = tok_raw.texts_to_sequences(name_lower)

#release the tokenizer and the temporary lower-cased copies
del tok_raw, desc_lower, name_lower, category_lower
gc.collect()

Transforming text data to sequence data
Sequences shape,  (6525060,)
Fitting tokenizer
Transforming text to sequences
CPU times: user 3min 19s, sys: 2.15 s, total: 3min 21s
Wall time: 3min 24s


0

In [13]:
#unify the size of item name, item description and item category
MAX_NAME_SEQ = 10       #item name size limit
MAX_ITEM_DESC_SEQ = 75  #item description size limit
MAX_CATEGORY_SEQ = 8    #item category size limit (not referenced in the visible cells)

#define embedding layer input size

#item name and item description vocabulary size: largest token id + 100
#Series.max() on a column of lists compares the lists lexicographically and
#returns a single list, so taking np.max of that result does NOT give the
#largest token id in the column. Scan every sequence instead so that the
#embedding table is guaranteed to cover all ids. Empty sequences are skipped.
MAX_TEXT = max(
    max((max(seq) for seq in full_df.seq_name if seq), default=0),
    max((max(seq) for seq in full_df.seq_item_description if seq), default=0),
) + 100

#every size below is "largest encoded value + 1" so that the value itself
#is a valid embedding index

#item category
MAX_CATEGORY = full_df.category.max() + 1

#brand
MAX_BRAND = full_df.brand_name.max() + 1

#item condition
MAX_CONDITION = full_df.item_condition_id.max() + 1

#description word count
MAX_DESC_LEN = full_df.desc_len.max() + 1

#item name word count
MAX_NAME_LEN = full_df.name_len.max() + 1

#sub-categories
MAX_SUBCAT_0 = full_df.subcat_0.max() + 1
MAX_SUBCAT_1 = full_df.subcat_1.max() + 1
MAX_SUBCAT_2 = full_df.subcat_2.max() + 1

In [14]:
%%time
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_rnn_data(dataset):
    """
    Build the dictionary of model inputs for one partition of the data.

    Parameter:
        dataset: DataFrame holding the engineered columns (token sequences,
            encoded categoricals, length counts and 'shipping').

    Returns:
        dict mapping input name -> numpy array, keyed by
        'name', 'item_desc', 'brand_name', 'item_condition', 'num_vars',
        'desc_len', 'name_len', 'subcat_0', 'subcat_1', 'subcat_2'.
    """
    X = {}

    #token sequences, padded/truncated to a fixed length
    X['name'] = pad_sequences(dataset['seq_name'], maxlen=MAX_NAME_SEQ)
    X['item_desc'] = pad_sequences(dataset['seq_item_description'],
                                   maxlen=MAX_ITEM_DESC_SEQ)

    #encoded categoricals taken as plain columns
    X['brand_name'] = np.array(dataset['brand_name'])
    X['item_condition'] = np.array(dataset['item_condition_id'])

    #columns selected as one-column frames, giving one column per row
    X['num_vars'] = np.array(dataset[['shipping']])
    X['desc_len'] = np.array(dataset[['desc_len']])
    X['name_len'] = np.array(dataset[['name_len']])

    #item sub-categories
    for level in ('subcat_0', 'subcat_1', 'subcat_2'):
        X[level] = np.array(dataset[level])

    return X
    
#row boundaries of the three partitions inside full_df
#(rows were stacked as train, validation, test)
train_end = n_trains
dev_end = n_trains + n_devs

train = full_df.iloc[:train_end]        #training rows
dev = full_df.iloc[train_end:dev_end]   #validation rows
test = full_df.iloc[dev_end:]           #testing rows

#input dictionaries for the three partitions
X_train = get_rnn_data(train)
X_dev = get_rnn_data(dev)
X_test = get_rnn_data(test)

#targets as column vectors, e.g. (1466844,) -> (1466844, 1)
Y_train = train['target'].values.reshape(-1, 1)
Y_dev = dev['target'].values.reshape(-1, 1)

#the combined frame is not needed any more
del full_df
gc.collect()

CPU times: user 22.5 s, sys: 334 ms, total: 22.9 s
Wall time: 22.9 s


50

In [15]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dropout, Dense, Embedding, Flatten
from tensorflow.keras.layers import concatenate, GRU
from tensorflow.keras.optimizers import Adam

#NOTE(review): this seeds NumPy's global RNG only; no TensorFlow seed is set in the visible cells - confirm whether that is intended
np.random.seed(123) #fix NumPy's global random seed
#error metric used for validation
#the targets are already log1p-transformed, so a plain RMSE on them is the
#RMSLE of the original prices
def rmsle(Y, Y_pred):
    """
    Root mean squared error between two arrays of equal shape.

    Parameters:
        Y: true values (array-like).
        Y_pred: predicted values (array-like), same shape as Y.

    Returns:
        float: sqrt(mean((Y_pred - Y) ** 2)).

    Raises:
        AssertionError: if the two shapes differ.
    """
    Y = np.asarray(Y)
    Y_pred = np.asarray(Y_pred)
    # Compare shape with shape. The former check compared Y.shape with the
    # prediction array itself, and np.Square does not exist (np.square does).
    assert Y.shape == Y_pred.shape, 'Y and Y_pred must have the same shape'
    return np.sqrt(np.mean(np.square(Y_pred - Y)))

def new_rnn_model(lr = 0.001, decay = 0.0):
    """
    Create and compile the RNN price model.

    The model embeds every categorical / token input, runs the name and
    description embeddings through GRU layers, concatenates everything with
    the numeric 'num_vars' input and feeds the result through a stack of
    Dense + Dropout layers to a single linear output.

    Parameters:
        lr: learning rate passed to the Adam optimizer
        decay: decay of learning rate passed to the Adam optimizer

    Returns:
        Compiled Keras Model with 'mse' loss.

    Reads the notebook-level `X_train` (for input widths) and the `MAX_*`
    size constants.
    """

    #input layers
    #item name, description, brand name, item condition, num_vars
    name        = Input(shape = [X_train['name'].shape[1]],
                                 name = 'name')
    item_desc   = Input(shape = [X_train['item_desc'].shape[1]],
                                 name = 'item_desc')
    brand_name  = Input(shape = [1], name = 'brand_name')
    item_condition = Input(shape = [1], name = 'item_condition')
    num_vars    = Input(shape = [X_train['num_vars'].shape[1]],
                                 name = 'num_vars')
    #word counts of item name and item description
    name_len    = Input(shape = [1], name = 'name_len')
    desc_len    = Input(shape = [1], name = 'desc_len')

    #item subcat
    subcat_0    = Input(shape = [1], name = 'subcat_0')
    subcat_1    = Input(shape = [1], name = 'subcat_1')
    subcat_2    = Input(shape = [1], name = 'subcat_2')

    #embedding layers: Embedding(number of distinct ids, vector size)
    ebd_name = Embedding(MAX_TEXT, 20)(name)
    ebd_item_desc = Embedding(MAX_TEXT, 60)(item_desc)
    ebd_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
    ebd_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    ebd_name_len = Embedding(MAX_NAME_LEN, 5)(name_len)
    ebd_desc_len = Embedding(MAX_DESC_LEN, 5)(desc_len)
    ebd_subcat_0 = Embedding(MAX_SUBCAT_0, 10)(subcat_0)
    ebd_subcat_1 = Embedding(MAX_SUBCAT_1, 10)(subcat_1)
    ebd_subcat_2 = Embedding(MAX_SUBCAT_2, 10)(subcat_2)

    #Gated Recurrent Units summarise the two token sequences
    rnn_layer1 = GRU(16)(ebd_item_desc)
    rnn_layer2 = GRU(8)(ebd_name)

    #flatten the single-step embeddings and join all features
    main_l = concatenate([Flatten()(ebd_brand_name),
                          Flatten()(ebd_item_condition),
                          Flatten()(ebd_desc_len),
                          Flatten()(ebd_name_len),
                          Flatten()(ebd_subcat_0),
                          Flatten()(ebd_subcat_1),
                          Flatten()(ebd_subcat_2),
                          rnn_layer1,   #item description
                          rnn_layer2,   #item name
                          num_vars])

    #fully connected stack: Dense(relu) followed by 10% dropout, four times
    for units in (512, 256, 128, 64):
        main_l = Dropout(0.1)(Dense(units,
                                    kernel_initializer = 'normal',
                                    activation = 'relu')(main_l))

    #output layer: one linear unit (the log-price prediction)
    output = Dense(1,
                   activation = 'linear')(main_l)

    #assemble the model; inputs are matched to the data dict by layer name
    model = Model(  inputs = [  name,
                                item_desc,
                                brand_name,
                                item_condition,
                                num_vars,
                                desc_len,
                                name_len,
                                subcat_0,
                                subcat_1,
                                subcat_2
                            ],
                    outputs = output)

    #compile with Adam; `learning_rate` replaces the deprecated `lr` keyword
    #NOTE(review): `decay` is only accepted by the older Keras optimizers -
    #confirm against the installed TensorFlow version before upgrading
    model.compile(  loss = 'mse',
                    optimizer = Adam(   learning_rate = lr,
                                        decay = decay))
    return model

#build a throwaway model only to print its architecture
model = new_rnn_model()
model.summary()

#free it again; the model that is actually trained is created in the training cell
del model
gc.collect()

2022-07-21 14:25:59.501482: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-21 14:25:59.610015: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-21 14:25:59.610850: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-21 14:25:59.612998: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
brand_name (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_condition (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
desc_len (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
name_len (InputLayer)           [(None, 1)]          0                                            
______________________________________________________________________________________________

5513

In [16]:
%%time
#training configuration
BATCH_SIZE = 1024  #512 * 2
epochs = 3

#lr decay
def exp_decay(init, fin, steps):
    """Return (init / fin) ** (1 / (steps - 1)) - 1, the value passed to the optimizer as `decay`."""
    return (init / fin)**(1/(steps-1))-1

#total number of full batches over all epochs
steps = (len(X_train['name']) // BATCH_SIZE) * epochs
lr_init = 0.005
lr_fin = 0.001
lr_decay = exp_decay(lr_init, lr_fin, steps)

#create model
rnn_model = new_rnn_model(lr=lr_init, decay=lr_decay)

#training model, validating on the held-out dev split after every epoch
rnn_model.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    epochs=epochs,
    validation_data=(X_dev, Y_dev),
    verbose=1,
)

2022-07-21 14:26:04.009258: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 440053200 exceeds 10% of free system memory.
2022-07-21 14:26:04.485524: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/3


2022-07-21 14:26:08.853594: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/3
Epoch 3/3
CPU times: user 1min 59s, sys: 10.5 s, total: 2min 10s
Wall time: 2min 26s


<keras.callbacks.History at 0x7f8e3896e290>

In [17]:
#Evaluate the results and measure error
%%time
print('Evaluating the model on validation data')
#predict the data using the trained model
Y_dev_preds_rnn = rnn_model.predict(X_dev,
                                    batch_size = BATCH_SIZE)

#use rmlse() to find error
print('RMSLE error:', rmsle(Y_dev,
                            Y_dev_preds_rnn))

UsageError: Line magic function `%%time` not found.


In [None]:
#predict the test set and undo the log1p transform in one step
#(expm1 is the inverse of the log1p applied to the training prices)
rnn_preds = np.expm1(
    rnn_model.predict(X_test, batch_size=BATCH_SIZE, verbose=1))

#the trained model is no longer needed
del rnn_model
gc.collect()

#total wall-clock time since the data-loading cell started the timer
stop_real = datetime.now()
execution_time_real = stop_real - start_real
print(execution_time_real)