### Mercari Price 
The files consist of a list of product listings. These files are tab-delimited.

Fields:
- train_id or test_id - the id of the listing

- name - the title of the listing. Note that we have cleaned the data to remove text that look like prices (e.g. $20) to avoid  leakage. These removed prices are represented as [rm]

- item_condition_id - the condition of the items provided by the seller

- category_name - category of the listing

- brand_name

- price - the price that the item was sold for. This is the target variable that you will predict. The unit is USD. This column doesn't exist in test.tsv since that is what you will predict.

- shipping - 1 if shipping fee is paid by seller and 0 by buyer

- item_description - the full description of the item. Note that we have cleaned the data to remove text that look like prices (e.g. $20) to avoid leakage. These removed prices are represented as [rm]

In [1]:
import os

In [2]:
# check if in colab
RunningInCOLAB = 'google.colab' in str(get_ipython())
if RunningInCOLAB:
    print("Running in colab")
    from google.colab import drive
    drive.mount('/content/gdrive', force_remount=True)
    colab_root = '/content/drive'
    root_dir = "/content/gdrive/My Drive/"
    base_dir = root_dir + 'project-mercari-price/'
    if not os.path.isdir(base_dir):
        os.mkdir(base_dir)
else:
    root_dir= os.getcwd()
    base_dir = root_dir
    
os.chdir(base_dir)

Running in colab
Mounted at /content/gdrive


In [3]:
dataset_downloaded_path = os.path.join(base_dir, "dataset_downloaded.ignore")
dataset_downloaded = os.path.isfile(dataset_downloaded_path)
dataset_downloaded

True

In [4]:
if not dataset_downloaded:
  # install kaggle to download dataset
  ! pip install kaggle python-dotenv

In [5]:
# set to True if you want to save kaggle credentials into a .env file
persist_credentials = False

if not dataset_downloaded:
  # create .env file containing KAGGLE_USER and KAGGLE_KEY
  kaggle_env = os.path.join(base_dir, '.env')
  if not os.path.isfile(kaggle_env):
    with open(kaggle_env, 'w') as envfile:
      kaggle_user = input("Insert kaggle username")
      kaggle_key = input("Insert kaggle key; generate one from kaggle account")
      if persist_credentials:
        envfile.write(f"""
        KAGGLE_USERNAME={kaggle_user}
        KAGGLE_KEY={kaggle_key}
        """)

      # set env vars
      os.environ["KAGGLE_USERNAME"] = kaggle_user
      os.environ["KAGGLE_KEY"] = kaggle_key

      del kaggle_user
      del kaggle_key

In [6]:
if not dataset_downloaded:
  # loading env vars if .env file exists
  if os.path.isfile(kaggle_env):
    from dotenv import load_dotenv
    load_dotenv(dotenv_path=kaggle_env)
  print(os.environ.get("KAGGLE_USERNAME"))

In [7]:
if not dataset_downloaded:
  # download and extract dataset
  ! kaggle competitions download -c mercari-price-suggestion-challenge

  # create file so that we know we already downloaded
  with open(dataset_downloaded_path, 'w') as dd_file:
    dataset_downloaded = True
    dd_file.write("")

  print('cwd: ', os.getcwd())
  os.listdir()

In [8]:
if not dataset_downloaded:
  ! 7z x train.tsv.7z
  ! 7z x test.tsv.7z

In [9]:
os.listdir()

['train.tsv',
 'test.tsv',
 '.env',
 'sample_submission.csv.7z',
 'test_stg2.tsv.zip',
 'train.tsv.7z',
 'test.tsv.7z',
 'sample_submission_stg2.csv.zip',
 'dataset_downloaded',
 '.git',
 '.gitignore',
 'Notebook.Rmd',
 'Readme.md',
 'requirements.txt',
 'Untitled.ipynb',
 'git',
 'dataset_downloaded.ignore']

In [10]:
import pandas as pd
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow import keras

In [11]:
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  """Entry point for launching an IPython kernel.


In [12]:
dtypes={
    'name': 'string',
    'item_condition_id': 'byte',
    'category_name': 'string',
    'brand_name': 'string',
    'price': 'float',
    'shipping': 'boolean',
    'item_description': 'string'
}
data = pd.read_csv("train.tsv", sep='\t', index_col="train_id", dtype=dtypes)
data

  mask |= (ar1 == a)


Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,True,No description yet
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,False,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,True,Adorable top with a hint of lace and a key hol...
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,True,New with tags. Leather horses. Retail for [rm]...
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,False,Complete with certificate of authenticity
...,...,...,...,...,...,...,...
1482530,Free People Inspired Dress,2,Women/Dresses/Mid-Calf,Free People,20.0,True,"Lace, says size small but fits medium perfectl..."
1482531,Little mermaid handmade dress,2,Kids/Girls 2T-5T/Dresses,Disney,14.0,False,Little mermaid handmade dress never worn size 2t
1482532,21 day fix containers and eating plan,2,Sports & Outdoors/Exercise/Fitness accessories,,12.0,False,"Used once or twice, still in great shape."
1482533,World markets lanterns,3,Home/Home Décor/Home Décor Accents,,45.0,True,There is 2 of each one that you see! So 2 red ...


In [13]:
data.dtypes

name                  string
item_condition_id       int8
category_name         string
brand_name            string
price                float64
shipping             boolean
item_description      string
dtype: object

In [14]:
data.shape

(1482535, 7)

In [15]:
for column in data.columns:
    print("number of null value in {} : {}".format(column,data[column].isnull().sum()))

number of null value in name : 0
number of null value in item_condition_id : 0
number of null value in category_name : 6327
number of null value in brand_name : 632682
number of null value in price : 0
number of null value in shipping : 0
number of null value in item_description : 4


In [16]:
data = data[data["item_description"].notna()]
data.shape

(1482531, 7)

In [17]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [18]:
'''max_features = 20000
embedding_dim = 128
sequence_length = 500

vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)'''

'max_features = 20000\nembedding_dim = 128\nsequence_length = 500\n\nvectorize_layer = TextVectorization(\n    max_tokens=max_features,\n    output_mode="int",\n    output_sequence_length=sequence_length,\n)'

In [19]:
#description= data["item_description"].to_numpy()

In [20]:
# call `adapt` on a text-only dataset to create the vocabulary
#vectorize_layer.adapt(description)

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
type(data["item_description"].values[0])

str

In [23]:
sentences = data["item_description"].values

In [24]:
len(sentences)

1482531

In [25]:
type(sentences[0])

str

In [26]:
y = data["price"].values
sentences_train, sentences_validation, y_train, y_validation = train_test_split(sentences, y, test_size=0.25, random_state=1000)

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [28]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

In [29]:
X_train = tokenizer.texts_to_sequences(sentences_train)
X_validation = tokenizer.texts_to_sequences(sentences_validation)

In [30]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

178233

In [31]:
len(tokenizer.word_index)

178232

In [32]:
[(key,value) for key,value in tokenizer.word_index.items() if value in range(1,10)]

[('and', 1),
 ('the', 2),
 ('a', 3),
 ('for', 4),
 ('in', 5),
 ('new', 6),
 ('to', 7),
 ('with', 8),
 ('size', 9)]

In [33]:
len(sentences_train)

1111898

In [34]:
len(X_train)

1111898

In [35]:
sentences_train[0]

'For Clara only. Red boyfriend tshirt and american eagle shorts.'

In [36]:
X_train[0]

[4, 49, 139, 1808, 1350, 1, 344, 591, 222]

In [37]:
data[data["item_description"] == sentences_train[0]]

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1363265,Clara's bundle.,3,Women/Athletic Apparel/Shirts & Tops,,13.0,False,For Clara only. Red boyfriend tshirt and ameri...


In [38]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_validation = pad_sequences(X_validation, padding='post', maxlen=maxlen)

print(X_train[0, :])

[   4   49  139 1808 1350    1  344  591  222    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [39]:
len(X_train[0])

100

In [46]:
import keras.backend as K
msle = tf.keras.losses.MeanSquaredLogarithmicError()

def root_mean_squared_logarithmic_error(y_true, y_pred):
    return K.sqrt(msle(y_true, y_pred))


In [47]:
from keras.models import Sequential
from keras import layers

embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='linear'))
model.compile(optimizer='adam',
              loss='mse',
              metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mean_squared_logarithmic_error', root_mean_squared_logarithmic_error])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 50)           8911650   
_________________________________________________________________
flatten_2 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 10)                50010     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 8,961,671
Trainable params: 8,961,671
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(X_train, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_validation, y_validation),
                    batch_size=256)

Epoch 1/5
Epoch 2/5
Epoch 3/5

In [44]:
#history1= history

In [None]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=True)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)