
# Mercari Price Suggestion Algorithm

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import category_encoders as ce

In [None]:
# Install 7zip and unzip all files
# The competition files are stored as .7z/.zip archives under ../input.
# `7z x` extracts each archive (here into the current directory, where the
# later read_csv calls look for the .tsv files); `-y` answers yes to prompts.
!apt install --assume-yes p7zip-full
!7z x ../input/mercari-price-suggestion-challenge/train.tsv.7z -y
!7z x ../input/mercari-price-suggestion-challenge/test.tsv.7z -y
!7z x ../input/mercari-price-suggestion-challenge/test_stg2.tsv.zip -y
!7z x ../input/mercari-price-suggestion-challenge/sample_submission.csv.7z -y
!7z x ../input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip -y

In [None]:
# Load the extracted training data; the file is tab-separated, hence sep='\t'.
train = pd.read_csv("train.tsv",sep = '\t')

In [None]:
# (rows, columns) of the raw training data.
train.shape

In [None]:
# Peek at the first rows to see which columns are available.
train.head()

## Exploring Price

Hypothesis: Price will follow a log-normal distribution

In [None]:
# log1p(price) = log(1 + price), so a price of 0 maps to 0 instead of -inf.
# np.expm1 is the inverse and is used later to turn predictions back into prices.
train['log_price'] = np.log1p(train.price)

In [None]:
from scipy.stats import norm

# Three side-by-side histograms: raw price, prices below 75, and log1p(price).
fig, axes = plt.subplots(1, 3, figsize=(10, 4))
raw_ax, low_ax, log_ax = axes

sns.distplot(train['price'], ax=raw_ax, kde=False)
raw_ax.set_title('Price')

under_75 = train.loc[train['price'] < 75, 'price']
sns.distplot(under_75, ax=low_ax, kde=False)
low_ax.set_title('Price < 75')

# fit=norm overlays a fitted normal curve so the log-normal hypothesis can be
# judged by eye.
sns.distplot(train['log_price'], ax=log_ax, fit=norm, kde=False)
log_ax.set_title('log(Price + 1)')
log_ax.set_xticks(range(0, 9))

for panel in axes:
    panel.grid()

fig.tight_layout()

The price follows a log-normal distribution with some spikes around round numbers such as 10, 25, 30, and 40 dollars.

## Exploring Data Attributes

In [None]:
def plot_distribution_and_violin(variable):
    """Plot value counts of a column and the log-price distribution per value.

    Draws two stacked panels that share the x axis: a count plot of
    ``variable`` on top and a violin plot of ``log_price`` for each value of
    ``variable`` below. Reads the notebook-level ``train`` DataFrame.

    Parameters
    ----------
    variable : str
        Name of the ``train`` column to put on the x axis.
    """
    fig, (count_ax, violin_ax) = plt.subplots(2, 1, figsize=(5, 6), sharex=True)
    count_ax.set_title(variable)
    sns.countplot(x=variable, data=train, palette="ch:.25", color="c", ax=count_ax)
    sns.violinplot(x=variable, y='log_price', palette="ch:.25", data=train, ax=violin_ax)
    fig.tight_layout()

Hypothesis: Items in better condition will have a higher average price

In [None]:
# Counts and log-price spread per item_condition_id, to check the hypothesis above.
plot_distribution_and_violin('item_condition_id')

Hypothesis: Items where shipping is paid by the seller (1) will be priced higher on average than items where shipping is paid by the buyer (0)

In [None]:
# Counts and log-price spread per shipping flag; the print acts as a legend
# for the 0/1 values on the x axis.
plot_distribution_and_violin('shipping')
print('shipping - 1 if shipping fee is paid by seller and 0 by buyer')

In [None]:
# Mean log price per shipping flag (index label 0 = buyer pays, 1 = seller pays).
avg_log_price = train.groupby('shipping')['log_price'].mean()
# expm1 of a mean of log1p values is not the arithmetic mean price, so the
# message says "typical" rather than "average".
diff = np.expm1(avg_log_price.loc[0]) - np.expm1(avg_log_price.loc[1])
# .2f prints dollars and cents; the previous .3 format fell back to scientific
# notation for large values.
print(f'Typical price when shipping is paid by the buyer is ${diff:.2f} greater than when it is paid by the seller')

Actually, items where the buyer pays shipping were priced higher. Maybe that means "free shipping" is more commonly used for cheaper items.

#### Decompose category

Category can be decomposed into three levels (e.g. Men/Tops/T-shirts). In the data the levels are delimited by forward slashes. Some of the third-level categories have slashes in the name, but it is still reliable to split on the first two slashes.

In [None]:
# Categories whose name does not contain exactly two '/' separators, with the
# per-column non-null counts for each such category.
not_three_levels = train['category_name'].str.count('/') != 2
train.loc[not_three_levels].groupby('category_name').count()

In [None]:
def transform_split_category_name(df):
    """Split ``category_name`` into three level columns.

    The name is split on the first two ``/`` characters only, so any further
    slashes stay inside the third level. The results are written to
    ``cat_level_0``, ``cat_level_1`` and ``cat_level_2``. Levels that are
    absent for a row (fewer than two slashes, or a missing category) are left
    as missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``category_name`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns added.
    """
    category_split = (
        df['category_name']
        .str.split('/', n=2, expand=True)
        # expand=True only creates as many columns as the deepest row needs;
        # reindex guarantees columns 0..2 exist so the loop never hits a
        # KeyError on shallow data.
        .reindex(columns=range(3))
    )
    for level in range(3):
        df['cat_level_' + str(level)] = category_split[level]
    return df

In [None]:
# Adds cat_level_0/1/2 to train; the function modifies its argument and returns it.
train = transform_split_category_name(train)

In [None]:
# Check that the new cat_level_* columns look right.
train.head()

#### Analyze Description
Hypothesis: There will be some very common descriptions that can be treated as missing values.

In [None]:
# Count listings per exact description text, then show the descriptions
# shared by more than 500 listings, most frequent first.
gb = train.groupby('item_description')['train_id'].count()
gb[gb>500].sort_values(ascending=False)

There are 82,489 items with 'No description yet' which can be considered a missing value, but other very common short phrases are also duplicated often. 

The length of the description may have some predictive power.

## Analyze Brands

In [None]:
def transform_missing_brand(df):
    """Add a boolean ``missing_brand`` column flagging rows with no brand.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``brand_name`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``missing_brand`` set to True where
        ``brand_name`` is null.
    """
    brand_is_null = df['brand_name'].isnull()
    df['missing_brand'] = brand_is_null
    return df

In [None]:
# Adds the boolean missing_brand column to train.
train = transform_missing_brand(train)

In [None]:
# Compare log-price distributions for listings with and without a brand.
plot_distribution_and_violin('missing_brand')

In [None]:
# Most common brands: number of listings per brand_name, top 40.
# groupby leaves out rows whose brand_name is missing.
train.groupby('brand_name')['train_id'].count().sort_values(ascending=False).head(40)

# Baseline Models

Here is a placeholder baseline model. It currently predicts 0 for every row; the goal is a baseline that predicts based on the average price of one field.

In [None]:
class baseline_model():
    """Placeholder regressor that ignores its input and predicts 0 for every row."""

    def fit(self):
        """Do nothing; this model has no parameters to learn."""
        return None

    def predict(self, X):
        """Return a float array of zeros with one entry per element of ``X``."""
        n_rows = len(X)
        return np.zeros(n_rows)

In [None]:
# Smoke test: two input rows should produce two zero predictions.
m = baseline_model()
m.fit()
m.predict([[1,2,3],
           [4,5,6]])

## Test Train Split

In [None]:
# Features are every column except the two price columns, so the target
# cannot leak into X; the model is trained on log price.
is_target_column = train.columns.isin(['price', 'log_price'])
X = train.loc[:, ~is_target_column]
y = train['log_price']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Initial Model

- Linear Regression
- Target encoded categories
- Target encoded brand_name
- Item condition
- Shipping

In [None]:
# Select necessary columns (the same four for both splits).
model_columns = ['item_condition_id', 'category_name', 'brand_name', 'shipping']
X_train, X_test = X_train[model_columns], X_test[model_columns]

In [None]:
# Fit the encoder on the training split only, so held-out targets do not
# influence the encoding, then apply it to both splits.
# NOTE(review): no `cols` given, so the encoder chooses which columns to
# encode - presumably the string-valued category_name and brand_name; confirm.
te = ce.TargetEncoder()
te.fit(X_train, y_train)
X_train, X_test = te.transform(X_train), te.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Plain linear regression with scikit-learn's default settings.
lr = LinearRegression()

In [None]:
# y_train is log_price, so the model is fitted in log space.
lr.fit(X_train, y_train)

In [None]:
# Held-out predictions, in the same log1p(price) units as y_test.
predictions = lr.predict(X_test)

In [None]:
# Inspect the raw predicted values.
predictions

In [None]:
# Predicted vs. actual log price on the held-out split, as a hexbin joint plot.
# x and y are passed by keyword: newer seaborn releases make them keyword-only
# and raise a TypeError when they are given positionally.
sns.jointplot(x=predictions, y=y_test, kind="hex")

In [None]:
## Generate submissions
# Load the stage-2 test set (extracted from test_stg2.tsv.zip above); tab-separated.
kaggle_test = pd.read_csv("test_stg2.tsv",sep = '\t')

In [None]:
# Prepare the test set the same way as the training features: add the category
# levels, keep the four model columns, and apply the encoder fitted on X_train.
kaggle_test = transform_split_category_name(kaggle_test)
submission_columns = ['item_condition_id', 'category_name', 'brand_name', 'shipping']
X_kaggle_test = te.transform(kaggle_test[submission_columns])

In [None]:
# Sanity-check the encoded features before predicting.
X_kaggle_test.head()

In [None]:
# Predictions are in log1p(price) units because the model was trained on log_price.
output = lr.predict(X_kaggle_test)

In [None]:
# expm1 undoes the log1p applied to the target, giving predictions in price units.
submission = np.expm1(output)

In [None]:
# One row per test item: its test_id and the predicted price.
# NOTE(review): presumably matches the sample_submission_stg2.csv layout - confirm.
submission_df = pd.DataFrame({'test_id': kaggle_test.test_id, 'price': submission})

In [None]:
# index=False keeps the DataFrame index out of the CSV.
submission_df.to_csv('submission.csv', index=False)

In [None]:
# Expect one row per test item and two columns (test_id, price).
submission_df.shape

Ideas:

- Reduce the dimensionality of the brand column by encoding similar brands to the same value, e.g. (gucci, louis) → brand_1, (adidas, nike) → brand_2
- Binary feature: is the brand mentioned in the title?
- Identify certain n-grams that change value ('smoke free', 'minor damage')
- Remove tag words? Could be challenging...
- Look for typos in brand_name (probably not many because the field autofills)