# Mercari Price Suggestion Challenge

## Import Packages

In [16]:
import nltk
import re
import pickle

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack
from sklearn.metrics import mean_squared_error

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Utility Functions

In [3]:
def text_length(text, no_desc_string):

  '''Function to compute the text length only for items with descriptions'''
  
  try:
    if text in no_desc_string:
      return 0
    else:
      return len(text.split())
  except:
    return 0

In [4]:
def categ_split(text):
  
  '''Function to split the category into three parts'''

  if len(text.split('/')) == 1:
    return 'missing', 'missing', 'missing'
  else:    
    main_categ, sub_categ_one, sub_categ_two = text.split('/')[:3]
    return main_categ, sub_categ_one, sub_categ_two

In [None]:
brand_set = pickle.load(open("brand_set.pkl", 'rb'))

In [5]:
# https://www.kaggle.com/valkling/mercari-rnn-2ridge-models-with-notes-0-42755
def missing_brand(features):

  '''Function to fill the missing brands with words from the name feature'''
  
  brand = features[0]
  name = features[1]
  if brand == 'missing':
    for word in name.split():
      if word in brand_set:        
        return word
  if name in brand_set:    
    return name  
  return brand

In [6]:
eng_stopwords = stopwords.words('english')

In [7]:
def text_preprocess(text, eng_stopwords):

  '''Function to perform text preprocessing'''

  text = decontracted(text) #Funtion to perform decontractions
  text = re.sub("[\-\\\n\t]", " ", text)  #Regex to remove all \n, \t, - and \
  text = re.sub("[^A-Za-z0-9]", " ", text)  #Regex to remove all the words except A-Za-z0-9
  text = re.sub('\s\s+', ' ', str(text))  #Regex to remove all the extra spaces
  text = text.lower() #Converts everything to lower case
  text = " ".join([word for word in text.split() if word not in eng_stopwords]) #Remove stopwords
  return text

In [8]:
# https://stackoverflow.com/a/47091490/4084039
def decontracted(phrase):

    '''Function to perform decontraction'''
    
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

## Function 1

In [90]:
def final_fun_1(datapoint):

  # **********Text Preprocessing**********
  
  no_desc_string = 'No description yet'  
  datapoint['item_desc_length'] = datapoint['item_description'].apply(lambda x : text_length(x, no_desc_string))
  datapoint['name_length'] = datapoint['name'].apply(lambda x : text_length(x, no_desc_string))
  datapoint['category_name'].fillna(value='missing', inplace=True)
  datapoint['main_categ'], datapoint['sub_categ_one'], datapoint['sub_categ_two'] = zip(
      *datapoint['category_name'].apply(lambda x : categ_split(x)))
  datapoint['brand_name'].fillna(value='missing', inplace=True)  
  datapoint['brand'] = datapoint[['brand_name', 'name']].apply(missing_brand, axis=1)
  datapoint['item_description'].fillna(value='missing', inplace=True)
  datapoint['item_description'] = datapoint['item_description'].replace(no_desc_string, 'missing')
  datapoint['item_desc_preprocess'] = datapoint['item_description'].apply(lambda x : text_preprocess(x, eng_stopwords))
  datapoint['name_preprocess'] = datapoint['name'].apply(lambda x : text_preprocess(x, eng_stopwords))
  datapoint.drop(columns=['train_id', 'name', 'category_name', 'brand_name', 'price', 'item_description'], axis=1, inplace=True)

  # **********Feature Transformation**********  
  
  item_cond_vectorizer, ship_vectorizer, main_categ_vectorizer, sub_categ_one_vectorizer, sub_categ_two_vectorizer, 
  brand_vectorizer, item_desc_length_minmax_scaler, name_length_minmax_scaler, name_preprocess_vectorizer, 
  item_desc_preprocess_vectorizer = pickle.load(open("train_fit_transform.pkl", 'rb'))
  
  item_condition = item_cond_vectorizer.transform(datapoint['item_condition_id'].values.reshape(-1,1))  
  shipping = ship_vectorizer.transform(datapoint['shipping'].values.reshape(-1,1))  
  main_categ = main_categ_vectorizer.transform(datapoint['main_categ'].astype(str))  
  sub_categ_one = sub_categ_one_vectorizer.transform(datapoint['sub_categ_one'].astype(str))  
  sub_categ_two = sub_categ_two_vectorizer.transform(datapoint['sub_categ_two'].astype(str))  
  brand = brand_vectorizer.transform(datapoint['brand'].astype(str))  
  item_desc_length = item_desc_length_minmax_scaler.transform(datapoint['item_desc_length'].values.reshape(-1,1))  
  name_length = name_length_minmax_scaler.transform(datapoint['name_length'].values.reshape(-1,1))  
  name = name_preprocess_vectorizer.transform(datapoint['name_preprocess'].values)  
  item_desc_tfidf = item_desc_preprocess_vectorizer.transform(datapoint['item_desc_preprocess'].values)

  # **********Feature Merging**********

  feature_merged_tfidf = hstack((item_condition, shipping, main_categ, sub_categ_one, sub_categ_two, 
                         brand, item_desc_length, name_length, name, 
                         item_desc_tfidf)).tocsr()

  # **********Model Prediction**********

  lgbm_model = pickle.load(open("lgbm_model_tfidf_modif_0_4320.pkl", 'rb'))
  y_pred = lgbm_model.predict(feature_merged_tfidf)
  
  return y_pred

In [91]:
df = pd.read_csv('train.tsv', delimiter='\t')
datapoint = df.loc[[0]]
y_pred = final_fun_1(datapoint)

In [92]:
print("y_pred: ", y_pred)

y_pred:  [2.37330328]


In [96]:
print("y_target: ", df.loc[0][5])
print("y_pred: ", np.exp(2.37330328))

y_target:  10.0
y_pred:  10.732787193558792


## Function 2

In [25]:
def final_fun_2(datapoint, y_target):

  # **********Text Preprocessing**********
  
  no_desc_string = 'No description yet'  
  datapoint['item_desc_length'] = datapoint['item_description'].apply(lambda x : text_length(x, no_desc_string))
  datapoint['name_length'] = datapoint['name'].apply(lambda x : text_length(x, no_desc_string))
  datapoint['category_name'].fillna(value='missing', inplace=True)
  datapoint['main_categ'], datapoint['sub_categ_one'], datapoint['sub_categ_two'] = zip(
      *datapoint['category_name'].apply(lambda x : categ_split(x)))
  datapoint['brand_name'].fillna(value='missing', inplace=True)
  datapoint['brand'] = datapoint[['brand_name', 'name']].apply(missing_brand, axis=1)
  datapoint['item_description'].fillna(value='missing', inplace=True)
  datapoint['item_description'] = datapoint['item_description'].replace(no_desc_string, 'missing')
  datapoint['item_desc_preprocess'] = datapoint['item_description'].apply(lambda x : text_preprocess(x, eng_stopwords))
  datapoint['name_preprocess'] = datapoint['name'].apply(lambda x : text_preprocess(x, eng_stopwords))
  datapoint.drop(columns=['train_id', 'name', 'category_name', 'brand_name', 'price', 'item_description'], axis=1, inplace=True)

  # **********Feature Transformation**********  

  item_cond_vectorizer, ship_vectorizer, main_categ_vectorizer, sub_categ_one_vectorizer, sub_categ_two_vectorizer, 
  brand_vectorizer, item_desc_length_minmax_scaler, name_length_minmax_scaler, name_preprocess_vectorizer, 
  item_desc_preprocess_vectorizer = pickle.load(open("train_fit_transform.pkl", 'rb'))
  
  item_condition = item_cond_vectorizer.transform(datapoint['item_condition_id'].values.reshape(-1,1))  
  shipping = ship_vectorizer.transform(datapoint['shipping'].values.reshape(-1,1))  
  main_categ = main_categ_vectorizer.transform(datapoint['main_categ'].astype(str))  
  sub_categ_one = sub_categ_one_vectorizer.transform(datapoint['sub_categ_one'].astype(str))  
  sub_categ_two = sub_categ_two_vectorizer.transform(datapoint['sub_categ_two'].astype(str))  
  brand = brand_vectorizer.transform(datapoint['brand'].astype(str))  
  item_desc_length = item_desc_length_minmax_scaler.transform(datapoint['item_desc_length'].values.reshape(-1,1))  
  name_length = name_length_minmax_scaler.transform(datapoint['name_length'].values.reshape(-1,1))  
  name = name_preprocess_vectorizer.transform(datapoint['name_preprocess'].values)  
  item_desc_tfidf = item_desc_preprocess_vectorizer.transform(datapoint['item_desc_preprocess'].values)

  # **********Feature Merging**********

  feature_merged_tfidf = hstack((item_condition, shipping, main_categ, sub_categ_one, sub_categ_two, 
                         brand, item_desc_length, name_length, name, 
                         item_desc_tfidf)).tocsr()

  # **********Model Prediction**********

  lgbm_model = pickle.load(open("lgbm_model_tfidf_modif_0_4320.pkl", 'rb'))
  y_pred = lgbm_model.predict(feature_merged_tfidf)
  y_pred = y_pred[0]  

  # **********RMSLE**********
  
  rmsle = np.sqrt(mean_squared_error([y_target], [y_pred]))

  return rmsle

In [26]:
datapoint = df.loc[[0]]
y_target = np.float(np.log(datapoint['price'].values+1))
rmsle = final_fun_2(datapoint, y_target)

In [27]:
print("RMSLE: ", rmsle)

RMSLE:  0.024591988306498003
