### Mercari Price Suggestion Challenge
The files consist of a list of product listings. These files are tab-delimited.

Fields:
- train_id or test_id - the id of the listing

- name - the title of the listing. Note that we have cleaned the data to remove text that looks like prices (e.g. $20) to avoid leakage. These removed prices are represented as [rm]

- item_condition_id - the condition of the items provided by the seller

- category_name - category of the listing

- brand_name

- price - the price that the item was sold for. This is the target variable that you will predict. The unit is USD. This column doesn't exist in test.tsv since that is what you will predict.

- shipping - 1 if the shipping fee is paid by the seller, 0 if it is paid by the buyer

- item_description - the full description of the item. Note that we have cleaned the data to remove text that looks like prices (e.g. $20) to avoid leakage. These removed prices are represented as [rm]

In [None]:
import os

In [None]:
# check if in colab: the string form of the IPython shell object mentions
# 'google.colab' when the notebook runs there
RunningInCOLAB = 'google.colab' in str(get_ipython())
if RunningInCOLAB:
    print("Running in colab")
    from google.colab import drive
    drive.mount('/content/gdrive', force_remount=True)
    # NOTE(review): colab_root is not used in this cell and does not match the
    # '/content/gdrive' mount point above -- confirm whether it is still needed.
    colab_root = '/content/drive'
    root_dir = "/content/gdrive/My Drive/"
    base_dir = root_dir + 'project-mercari-price/'
    # exist_ok=True replaces the separate isdir() check (no check-then-create
    # race) and also creates missing parent directories.
    os.makedirs(base_dir, exist_ok=True)
else:
    # local run: work in the directory the notebook was started from
    root_dir = os.getcwd()
    base_dir = root_dir

# relative paths used by the following cells resolve against base_dir
os.chdir(base_dir)

In [None]:
# An empty marker file inside base_dir records that the dataset was already
# downloaded; the cells below are skipped when it exists.
marker_name = "dataset_downloaded.ignore"
dataset_downloaded_path = os.path.join(base_dir, marker_name)
dataset_downloaded = os.path.isfile(dataset_downloaded_path)
dataset_downloaded

In [None]:
# Only needed while the dataset marker file is absent.
if not dataset_downloaded:
  # install the kaggle CLI (used below to download the dataset) and
  # python-dotenv (used below to load credentials from a .env file)
  ! pip install kaggle python-dotenv

In [None]:
# set to True if you want to save kaggle credentials into a .env file
persist_credentials = False

if not dataset_downloaded:
  # .env file that may already hold KAGGLE_USERNAME and KAGGLE_KEY
  kaggle_env = os.path.join(base_dir, '.env')
  if not os.path.isfile(kaggle_env):
    # getpass does not echo the key, so it stays out of the notebook output
    from getpass import getpass

    kaggle_user = input("Insert kaggle username")
    kaggle_key = getpass("Insert kaggle key; generate one from kaggle account")

    # Create the file only when persistence was requested. Opening it
    # unconditionally would leave an empty .env behind, and every later run
    # would then skip this prompt and end up without credentials.
    if persist_credentials:
      with open(kaggle_env, 'w') as envfile:
        envfile.write(f"KAGGLE_USERNAME={kaggle_user}\nKAGGLE_KEY={kaggle_key}\n")

    # set env vars for the current session
    os.environ["KAGGLE_USERNAME"] = kaggle_user
    os.environ["KAGGLE_KEY"] = kaggle_key

    # do not keep the credentials around as notebook variables
    del kaggle_user
    del kaggle_key

In [None]:
if not dataset_downloaded:
  # Pull credentials from the .env file when one is present.
  env_file_present = os.path.isfile(kaggle_env)
  if env_file_present:
    from dotenv import load_dotenv
    load_dotenv(dotenv_path=kaggle_env)
  # Show the configured Kaggle user name (None when it is not set).
  configured_user = os.environ.get("KAGGLE_USERNAME")
  print(configured_user)

In [None]:
if not dataset_downloaded:
  # download and extract dataset
  ! kaggle competitions download -c mercari-price-suggestion-challenge

  # create file so that we know we already downloaded
  with open(dataset_downloaded_path, 'w') as dd_file:
    dataset_downloaded = True
    dd_file.write("")

  print('cwd: ', os.getcwd())
  os.listdir()

In [None]:
if not dataset_downloaded:
  ! 7z x train.tsv.7z
  ! 7z x test.tsv.7z

In [None]:
# List the entries of the current working directory.
os.listdir()

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow import keras

In [None]:
# tqdm._tqdm_notebook is a deprecated private module; tqdm.notebook is the
# public location of the same class.
from tqdm.notebook import tqdm_notebook
# Registers progress_apply on pandas objects (used by the cells below).
tqdm_notebook.pandas()

In [None]:
# Explicit dtype for every column read from the tab-separated training file.
dtypes = dict(
    name='string',
    item_condition_id='byte',
    category_name='string',
    brand_name='string',
    price='float',
    shipping='boolean',
    item_description='string',
)
# train_id becomes the index of the frame.
data = pd.read_csv(
    "train.tsv",
    sep='\t',
    index_col="train_id",
    dtype=dtypes,
)
data

In [None]:
# Column dtypes of the loaded frame.
data.dtypes

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Count the missing values of every column in one pass, then report them.
null_counts = data.isnull().sum()
for column, n_missing in null_counts.items():
    print("number of null value in {} : {}".format(column, n_missing))

In [None]:
# Keep only listings that have a description. The explicit .copy() makes
# `data` an independent frame, so adding columns to it in later cells does not
# trigger pandas' SettingWithCopyWarning.
data = data[data["item_description"].notna()].copy()
data.shape

In [None]:
# Imported from tensorflow.keras, like the Tokenizer import further down, so
# the notebook does not mix the standalone keras package with tf.keras.
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Example usage, kept for reference:
# # define the document
# text = data["item_description"].iloc[1]
# print(text)
# # tokenize the document
# result = text_to_word_sequence(text)
# print(result)

#### Split words on spaces, filter out punctuation and convert the text to lowercase

In [None]:
# Turn every description into a list of word tokens, with a progress bar.
descriptions = data["item_description"]
data["item_description_tokens"] = descriptions.progress_apply(text_to_word_sequence)

In [None]:
import nltk

# Fetch the stop-word corpus if it is not installed yet; without it the
# lookup below raises LookupError.
nltk.download('stopwords', quiet=True)

stop_words=nltk.corpus.stopwords.words('english')
# Set copy for O(1) membership tests inside the per-token filter.
stop_word_set = set(stop_words)

# Filter the token lists built in the previous cell. Applying the filter to the
# raw "item_description" strings would iterate over single characters instead
# of words and overwrite the tokens with lists of letters.
data["item_description_tokens"] = data["item_description_tokens"].progress_apply(
    lambda tokens: [word for word in tokens if word not in stop_word_set]
)

In [None]:
import matplotlib.pyplot as plt
import itertools
from collections import Counter
import seaborn as sns


def flat_list(l):
    """Return one list with the items of every sub-iterable of `l`, in order."""
    return list(itertools.chain.from_iterable(l))


def plot_common_tokens(tokens, title, n=20):
    """Draw a bar chart of the `n` most frequent tokens.

    Args:
        tokens: iterable of token lists (one list per document).
        title: title shown above the chart.
        n: how many of the most common tokens to plot.
    """
    # Count straight from a lazy chain: no intermediate flattened list is
    # built, which matters when `tokens` holds many documents.
    counts = Counter(itertools.chain.from_iterable(tokens))
    # Compute the ranking once and split it into labels and heights.
    top_tokens = counts.most_common(n)
    common_words = [word for word, _ in top_tokens]
    common_counts = [count for _, count in top_tokens]
    plt.figure(figsize=(18, 6))
    sns.barplot(x=common_words, y=common_counts)
    plt.title(title)
    plt.show()


In [None]:
# Bar chart of the most frequent description tokens (default: top 20).
plot_common_tokens(
    tokens=data["item_description_tokens"],
    title="Most Common Tokens from Descriptions",
)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Python type of a single description value.
type(data["item_description"].values[0])

In [None]:
# Description texts; these are split into train/validation and tokenized below.
sentences = data["item_description"].values

In [None]:
# Number of descriptions.
len(sentences)

In [None]:
# Type of one element of `sentences`.
type(sentences[0])

In [None]:
# Regression target: the price column.
y = data["price"].values
# Hold out 25% of the rows for validation; the fixed random_state keeps the
# split reproducible.
(sentences_train,
 sentences_validation,
 y_train,
 y_validation) = train_test_split(
    sentences, y, test_size=0.25, random_state=1000
)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

In [None]:
X_train = tokenizer.texts_to_sequences(sentences_train)
X_validation = tokenizer.texts_to_sequences(sentences_validation)

In [None]:
# word_index holds every word seen while fitting, but texts_to_sequences only
# emits ids below tokenizer.num_words. Sizing the embedding by the full index
# would allocate rows that can never be looked up.
full_vocab_size = len(tokenizer.word_index) + 1  # +1: id 0 is reserved (padding)
if tokenizer.num_words is None:
    vocab_size = full_vocab_size
else:
    vocab_size = min(tokenizer.num_words, full_vocab_size)
vocab_size

In [None]:
# Number of entries in the tokenizer's word -> id mapping.
len(tokenizer.word_index)

In [None]:
# (word, id) pairs whose id is between 1 and 9.
[(key,value) for key,value in tokenizer.word_index.items() if value in range(1,10)]

In [None]:
# Number of training texts ...
len(sentences_train)

In [None]:
# ... and number of encoded training sequences.
len(X_train)

In [None]:
# First training description as text ...
sentences_train[0]

In [None]:
# ... and the same description as a sequence of word ids.
X_train[0]

In [None]:
# Rows of the frame whose description equals the first training description.
data[data["item_description"] == sentences_train[0]]

In [None]:
# Imported from tensorflow.keras, like the Tokenizer, so the notebook does not
# mix the standalone keras package with tf.keras.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Target length of every encoded description.
maxlen = 100

# padding='post' appends the padding zeros after the word ids.
# NOTE(review): truncating is left at its default -- confirm which end of an
# over-long description should be cut.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_validation = pad_sequences(X_validation, padding='post', maxlen=maxlen)

print(X_train[0, :])

In [None]:
# Length of the first padded training sequence.
len(X_train[0])

In [None]:
# K is taken from tensorflow.keras so that it matches the tf.keras loss object
# below; the name stays bound in case other cells use it.
from tensorflow.keras import backend as K

# Loss object created once and reused on every metric call.
msle = tf.keras.losses.MeanSquaredLogarithmicError()


def root_mean_squared_logarithmic_error(y_true, y_pred):
    """Return the square root of the mean squared logarithmic error.

    Args:
        y_true: ground-truth target values.
        y_pred: predicted values.

    Returns:
        A scalar tensor, sqrt(MSLE(y_true, y_pred)).
    """
    # tf.sqrt keeps the whole computation inside TensorFlow instead of going
    # through the standalone keras backend.
    return tf.sqrt(msle(y_true, y_pred))


In [None]:
# Model classes come from tensorflow.keras so they match the tf.keras metric
# objects passed to compile(); mixing them with the standalone keras package
# can fail when the two installations differ.
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# Size of each learned word vector.
embedding_dim = 50

model = Sequential()
# Word id -> dense vector, for sequences of length maxlen.
model.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen))
# Concatenate the word vectors of a description into one feature vector.
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
# Single linear unit: the predicted price.
model.add(layers.Dense(1, activation='linear'))
# Trained on MSE; RMSE, MSLE and RMSLE are tracked as additional metrics.
model.compile(optimizer='adam',
              loss='mse',
              metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mean_squared_logarithmic_error', root_mean_squared_logarithmic_error])
model.summary()

In [None]:
# Train for 5 epochs in batches of 256 and evaluate on the validation split.
fit_options = dict(
    epochs=5,
    verbose=True,
    validation_data=(X_validation, y_validation),
    batch_size=256,
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
#history1= history

In [None]:
# evaluate() returns the loss plus one value per compiled metric, so the
# results cannot be unpacked into two names; request them as a dict instead.
# This is a regression model, so the values are errors, not accuracies.
train_scores = model.evaluate(X_train, y_train, verbose=True, return_dict=True)
print("Training:   " + ", ".join(
    "{}: {:.4f}".format(name, value) for name, value in train_scores.items()))

# X_test / y_test are not defined in the visible notebook; the held-out data
# produced by train_test_split is X_validation / y_validation.
validation_scores = model.evaluate(X_validation, y_validation, verbose=False, return_dict=True)
print("Validation: " + ", ".join(
    "{}: {:.4f}".format(name, value) for name, value in validation_scores.items()))

# NOTE(review): plot_history is not defined in the visible part of the
# notebook -- confirm it is provided elsewhere before running this cell.
plot_history(history)