# Mercari Prediction - From Median and Beyond
A very simple model that uses only median prices reached the middle of the competition leaderboard. Here is the code:

In [10]:
# import libraries
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

from subprocess import check_output
# Show which files are available in the competition's input directory
print(str(check_output(["ls", "../input"]), "utf8"))

In [11]:
# Load the competition files and check their dimensions.
# Only the stage-2 test set and sample submission are read here; the stage-1
# files (test.tsv and sample_submission.csv) are intentionally left out.
train = pd.read_csv('../input/train.tsv', delimiter='\t')
test2 = pd.read_csv('../input/test_stg2.tsv', delimiter='\t')
submit2 = pd.read_csv('../input/sample_submission_stg2.csv')
print(*(frame.shape for frame in (train, test2, submit2)))

Some columns contain NaN values. It is better to fill them before moving on to the next step.

In [12]:
# Clean data
# Each filled column is assigned back to its frame. The chained form
# `df[col].fillna(value, inplace=True)` acts on a temporary Series: recent
# pandas versions warn about it, and with Copy-on-Write enabled the frame is
# left unchanged, so the NaN values would silently survive.
train['item_condition_id'] = train['item_condition_id'].astype('category')

# Missing categories and brands get the same placeholder in train and test so
# that the later merges on these two columns can still match such rows.
train['category_name'] = train['category_name'].fillna('No data/No data/No data')
test2['category_name'] = test2['category_name'].fillna('No data/No data/No data')
train['brand_name'] = train['brand_name'].fillna('None')
test2['brand_name'] = test2['brand_name'].fillna('None')
train['item_description'] = train['item_description'].fillna('No description yet')

# Remaining null count per training column (displayed as the cell output)
train.isnull().sum()

In [13]:
# Draw five random listings from the training set to eyeball the data
train.sample(n=5)

In [14]:
# Summary statistics of the price column, rendered as fixed-point strings
train['price'].describe().map('{:f}'.format)

Here we compute the median price for each combination of category and brand name, and use that median directly as the predicted price for other products with the same category and brand name.

If a product in the test set has a brand name that was not seen within the same category in the training set, the median price of that category is used instead.

In [15]:
# Median price per (category, brand) pair, and per category as a fallback.
# 'price' is selected *before* aggregating: calling .median() on the whole
# grouped frame also tries to aggregate the text columns, which raises a
# TypeError on pandas >= 2.0 (older versions only worked by dropping them).
ref = train.groupby(['category_name', 'brand_name'])['price'].median().reset_index()
ref2 = train.groupby(['category_name'])['price'].median().reset_index()

# Attach the pair-level median to the test set as its predicted price.
# Dropping any 'price' column left by a previous run keeps this cell safe to
# re-run; otherwise a second merge would produce price_x / price_y instead.
test2 = pd.merge(test2.drop(columns='price', errors='ignore'), ref,
                 how='left', on=['category_name', 'brand_name'])

# Rows whose 'price' is still null had a (category, brand) pair that never
# occurs in the training set.
test2.isnull().sum()

There are 9509 products in the test set whose **category AND brand** combination is not seen in the training set. For these, the category median price is imputed. To do this, we split the test set in two (by whether a price is already available), assign a price to those 9509 products, and then merge the two parts back together.

In [16]:
# Rows that already received a (category, brand) median are final predictions.
submit_a = test2.loc[test2['price'].notnull(), ['test_id', 'price']]

# The remaining rows fall back to the median price of their category.
test_b = test2.loc[test2['price'].isnull(), :].drop('price', axis=1)
test_b = pd.merge(test_b, ref2, how='left', on=['category_name'])

# A category that never occurs in training still has no price; use the overall
# training median for it. The result is assigned back because
# `test_b.price.fillna(..., inplace=True)` acts on a temporary Series and
# leaves test_b unchanged when pandas Copy-on-Write is enabled.
test_b['price'] = test_b['price'].fillna(train['price'].median())

# The 'count' shown here should equal the number of rows in test_b.
test_b['price'].describe()

In the output of describe(), the count is 9509, which indicates that all of these products now have a price.

In [17]:
# Stack the two partial predictions into the final submission frame and
# display its dimensions.
submit_b = test_b[['test_id', 'price']]
submit_q = pd.concat((submit_a, submit_b), axis=0)
submit_q.shape

The dimensions of the submission match the test set. Now we export it to a CSV file for submission:

In [18]:
# Write the submission file, leaving out the DataFrame index
submit_q.to_csv(path_or_buf='submit_q_late.csv', index=False)

This surprisingly simple model (with no machine learning at all) achieved a score of 0.59062 (RMSLE) and ranked in the middle of the private leaderboard, while 769 hard-working competitors had their models fail in stage 2 of the competition and received a score of 99 (I feel sorry for them).

# Now, it's time to add a bit of complexity...
The median prices we computed earlier become one feature, and we then add item condition and shipping to the model.
(The following code was written after the competition closed. For demonstration purposes, I evaluate the metric on a validation set.)

In [None]:
# Merge the (category, brand) median prices back into the training set as an
# input feature called 'med_price'.
# ref's 'price' column is renamed before the merge so the target column keeps
# its own name, and a 'med_price' column left by a previous run is dropped
# first so that re-running this cell cannot create duplicate columns.
# NOTE(review): the medians are computed on the full training set, so rows
# later held out for validation have contributed to their own feature value.
train = pd.merge(
    train.drop(columns='med_price', errors='ignore'),
    ref.rename(columns={'price': 'med_price'}),
    how='left', on=['category_name', 'brand_name'])
train.head()

In [None]:
# Assemble the model target and inputs; the target and the median-price
# feature are both put on the log(price + 1) scale.
y = np.log(train['price'] + 1)
X = train[['item_condition_id', 'shipping']].assign(
    med_price=np.log(train['med_price'] + 1))

We use random forest regressor below.

In [None]:
# Hold out 20% of the rows for validation and fit a random forest on the rest.
# The code sits inside a string literal, so it does not execute; as a result
# Xtr, Xv, ytr, yv, model and yv_pred are never defined by this notebook.
'''Temporarily disabled
Xtr, Xv, ytr, yv = train_test_split(X, y, test_size=0.2, random_state=167)
model = RandomForestRegressor(random_state=100)
model.fit(Xtr,ytr)
yv_pred = model.predict(Xv)
'''

Let's visualize the actual price and the predicted price in the validation set.

In [None]:
# Scatter plot of predicted vs. actual log price on the validation set.
# Disabled (string literal): it needs yv and yv_pred, which are only assigned
# inside the disabled random-forest training cell.
'''plt.figure(figsize=(10,7))
plt.scatter(yv_pred, yv, s=10)
plt.xlabel('Log Predicted Price')
plt.ylabel('Log Actual Price')
plt.title('Actual vs Predicted Price in Validation Set')
plt.show()
'''

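The competition uses root mean squared logarithmic error (RMSLE) as its evaluation metric. We define the function below and use it to evaluate our models. Because the targets and predictions here are already on the log(price + 1) scale, the function only needs to compute the root mean squared error between them.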

In [None]:
def rmsle(y_true, y_pred):
    """Return the root mean squared error between two equal-length sequences.

    No logarithm is taken here: the result is an RMSLE only when both
    arguments are already on a log scale, as the log(price + 1) target and
    features built in this notebook are.

    Parameters
    ----------
    y_true : sequence of numbers (list, numpy array or pandas Series)
        Reference values.
    y_pred : sequence of numbers (list, numpy array or pandas Series)
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    assert len(y_true) == len(y_pred), "y_true and y_pred must have the same length"
    # Converting to plain arrays makes the subtraction positional: Python lists
    # are accepted, and two pandas Series are compared element by element
    # instead of being re-aligned on their index labels (which yields NaN or
    # mismatched pairs when the indexes differ).
    errors = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    return np.square(errors).mean() ** 0.5

In [None]:
# Both lines are commented out because yv, yv_pred and Xv only exist when the
# disabled random-forest training cell is run. Xv.med_price is already
# log(med_price + 1), so the second line would score the plain median
# prediction on the same log scale as yv.
# print ("RMSL error of the model is {:.4f}".format(rmsle(yv, yv_pred)))
# print ("RMSL error of the simplest median is {:.4f}".format(rmsle(yv, Xv.med_price)))

# Now, let's work on some NLP...

In [None]:
# Tfidf Vectorizer on names
# Disabled (string literal). When enabled, it fits a unigram-only
# (ngram_range=(1, 1)) TF-IDF vectorizer on the training item names and prints
# the shape of the resulting matrix.
'''
tfidf_obj = TfidfVectorizer(ngram_range = (1,1))
tfidf_train = tfidf_obj.fit_transform(train['name'].values.tolist())
print (tfidf_train.shape)
'''

In [None]:
# TruncatedSVD to reduce the name data into 10 dimensions
# Disabled (string literal); it depends on tfidf_train from the disabled TF-IDF
# cell. When enabled, it appends the columns svd_name_0 ... svd_name_9 to train.
# NOTE(review): no random_state is passed to TruncatedSVD; confirm whether the
# components are reproducible between runs.
'''
n_comp = 10
svd_obj = TruncatedSVD(n_components=n_comp, algorithm = 'arpack')
train_svd = pd.DataFrame(svd_obj.fit_transform(tfidf_train))    
train_svd.columns = ['svd_name_'+str(i) for i in range(n_comp)]

train = pd.concat([train, train_svd], axis=1)
train.head()
'''

In [None]:
# Disabled: would append the last 10 columns of train to X. Those columns are
# the SVD name features only if the disabled TruncatedSVD cell has been run.
#X = pd.concat([X, train.iloc[:,-10:]], axis=1)

In [None]:
# Train a model
# Disabled (string literal). Uses a different split seed (369) from the first
# model (167), so the two validation sets are not the same rows.
# NOTE(review): model2 is created without a random_state, unlike the first
# model, so its score may vary between runs.
# X only contains the SVD name features if the disabled cells that build and
# append them have been run.
'''Temporarily disabled
Xtr, Xv, ytr, yv = train_test_split(X, y, test_size=0.2, random_state=369)
model2 = RandomForestRegressor()
model2.fit(Xtr, ytr)
yv_pred2 = model2.predict(Xv)

print ("RMSL error of the model is {:.4f}".format(rmsle(yv, yv_pred2)))
'''

This is an improvement over the simplest model.

**To be continued...**