# Competition

[Mercari Price Suggestion Challenge](https://www.kaggle.com/c/mercari-price-suggestion-challenge)

# Overview

メルカリに出品されている商品データ（カテゴリーやブランドなど）と価格を学習データとして、<br>
商品データから価格を予測するモデルを作成する。

# Module

In [None]:
import gc
import sys
import scipy
import optuna
import datetime
import warnings
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from pandas.api.types import is_categorical_dtype
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Decompress

Kaggleで用意されたデータが、7z形式になっているため<br>
解凍の処理から行っていく。

In [None]:
import os

# Print the full path of every file shipped under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

下記は、SettingsのInternetをONにして実行しないと、エラーになる。

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/sample_submission.csv.7z

In [None]:
!unzip /kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip

# Datasets

In [None]:
# Load the tab-separated training set from the working directory.
train = pd.read_table('train.tsv')

print(train.shape)
train.head(10)

In [None]:
# Data dictionary for train.tsv, rendered as a two-column table
# (column name, meaning).
pd.DataFrame(
    list({
        'train_id': '商品ID',
        'name': '商品名',
        'item_condition_id': '商品の状態',
        'category_name': 'カテゴリー名',
        'brand_name': 'ブランド名',
        'price': '価格（単位：ドル）',
        'shipping': '送料が売手負担かどうか',
        'item_description': '商品説明',
    }.items()),
    columns=['カラム', '意味'],
)

testデータには、test.tsvとtest_stg2.tsvの2種類がある。<br>
submitのときには、test_stg2.tsvを使用する。

In [None]:
# Load the stage-1 test file; it is only inspected here.
test_dummy = pd.read_table('test.tsv')

print(test_dummy.shape)
test_dummy.head(10)

In [None]:
# Data dictionary for test.tsv, rendered as a two-column table
# (column name, meaning).
pd.DataFrame(
    list({
        'test_id': '商品ID',
        'name': '商品名',
        'item_condition_id': '商品の状態',
        'category_name': 'カテゴリー名',
        'brand_name': 'ブランド名',
        'shipping': '送料が売手負担かどうか',
        'item_description': '商品説明',
    }.items()),
    columns=['カラム', '意味'],
)

In [None]:
# Load the stage-2 test file that the submission is built from.
test = pd.read_table('test_stg2.tsv')

print(test.shape)
test.head(10)

In [None]:
# Data dictionary for test_stg2.tsv, rendered as a two-column table
# (column name, meaning).
pd.DataFrame(
    list({
        'test_id': '商品ID',
        'name': '商品名',
        'item_condition_id': '商品の状態',
        'category_name': 'カテゴリー名',
        'brand_name': 'ブランド名',
        'shipping': '送料が売手負担かどうか',
        'item_description': '商品説明',
    }.items()),
    columns=['カラム', '意味'],
)

In [None]:
# The stage-1 test frame was loaded only for inspection; drop it and trigger
# a garbage-collection pass to reclaim its memory.
del test_dummy
gc.collect()

# Analytics

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Number of distinct values per column.
train.nunique()

In [None]:
# Missing-value count per column.
train.isna().sum()

In [None]:
# Pairwise correlation of the numeric columns only. The frame also holds text
# columns (name, category_name, brand_name, item_description), and
# DataFrame.corr() raises on those in pandas >= 2.0 instead of silently
# skipping them, so the numeric/bool subset is selected explicitly.
train.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Heatmap of the numeric-column correlations, annotated to two decimals.
# The text columns are excluded up front because DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0.
sns.heatmap(train.select_dtypes(include=['number', 'bool']).corr(), annot=True, fmt='.2f')

In [None]:
# Summary statistics of the training frame.
train.describe()

# Visualization

In [None]:
# Apply seaborn's default theme to all following plots.
sns.set()

In [None]:
# Remove every listing priced at 200 or more. `train` itself is rebound, so
# all later cells (including model training) work on the filtered frame.
high_price_index = train.index[train['price'] >= 200]
train = train.drop(index=high_price_index)

# Histogram of the remaining prices.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data=train, x='price', bins=50, ax=ax)
plt.show()

In [None]:
# Temporary log1p(price) column, used only for the histogram below.
train['log_price'] = np.log1p(train['price'])

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data=train, x='log_price', bins=50, ax=ax)
plt.show()

In [None]:
# Drop the plotting-only column again and reclaim memory.
train = train.drop(columns='log_price')
gc.collect()

In [None]:
# Price density, one curve per value of the shipping flag.
# `fill=True` is the current spelling of the deprecated `shade=True` kdeplot
# argument, which newer seaborn releases warn about and plan to remove.
grid = sns.FacetGrid(train, hue='shipping', height=4, aspect=2)
grid.map(sns.kdeplot, 'price', fill=True)
plt.legend()

In [None]:
# Price density, one curve per item condition.
# `fill=True` is the current spelling of the deprecated `shade=True` kdeplot
# argument, which newer seaborn releases warn about and plan to remove.
grid = sns.FacetGrid(train, hue='item_condition_id', height=4, aspect=2)
grid.map(sns.kdeplot, 'price', fill=True)
plt.legend()

# Preprocess

In [None]:
# Number of training rows: the boundary used later to split the stacked
# train+test feature matrix back apart.
train_row = len(train)
# Keep the test ids for the submission file before the column is dropped.
test_id = test['test_id']

In [None]:
# Train on log1p(price); predictions are mapped back with expm1 at submit time.
target = np.log1p(train['price'])
# Keep every column except the raw price as model input.
train = train.drop(columns='price')

print(train.shape, target.shape, sep='\n')

In [None]:
# Row identifiers are not features; remove them from both frames.
train = train.drop(columns='train_id')
test = test.drop(columns='test_id')

In [None]:
# Stack train on top of test so both go through identical encoders.
X = pd.concat([train, test])

print(X.shape)
X.head()

In [None]:
# Missing-value count per column of the stacked frame.
X.isna().sum()

In [None]:
# Replace missing values in these three text columns with the literal
# placeholder 'missing'.
for text_col in ('category_name', 'brand_name', 'item_description'):
    X[text_col] = X[text_col].fillna('missing')

In [None]:
# Bag-of-words token counts for the item name, fitted on train+test together.
cv = CountVectorizer()

name_data = cv.fit_transform(X['name'])
name_data

In [None]:
# TF-IDF weighted token features for the item description.
tv = TfidfVectorizer()

description_data = tv.fit_transform(X['item_description'])
description_data

In [None]:
# Binarize the brand labels; sparse_output=True keeps the result as a sparse matrix.
lb = LabelBinarizer(sparse_output=True)

brand_data = lb.fit_transform(X['brand_name'])
brand_data

In [None]:
# Reuses the same `lb` instance; fit_transform refits it on the category
# labels, so the brand classes it held before are replaced.
category_data = lb.fit_transform(X['category_name'])
category_data

In [None]:
# Dummy-encode the item condition and store the result as a CSR matrix.
condition_data = csr_matrix(
    pd.get_dummies(X['item_condition_id'], sparse=True).to_numpy()
)
condition_data

In [None]:
# Dummy-encode the shipping flag and store the result as a CSR matrix.
shipping_data = csr_matrix(
    pd.get_dummies(X['shipping'], sparse=True).to_numpy()
)
shipping_data

In [None]:
# Concatenate all feature groups column-wise into a single CSR matrix.
X_sparse = hstack(
    [
        name_data,
        brand_data,
        description_data,
        category_data,
        condition_data,
        shipping_data,
    ],
    format='csr',
)
X_sparse

In [None]:
# The first `train_row` rows are the training rows; the remainder is the test set.
train, test = X_sparse[:train_row], X_sparse[train_row:]

In [None]:
# Release the stacked frame, the per-feature matrices and the fitted encoders
# now that `train` and `test` hold the final feature matrices.
del X, X_sparse, name_data, description_data, brand_data, category_data, condition_data, shipping_data, train_row, cv, tv, lb
gc.collect()

In [None]:
# Sanity check: the feature matrix and the target should have the same row count.
print(train.shape, target.shape, sep='\n')

# Modeling

In [None]:
# Disabled Optuna hyperparameter search, kept as a bare string literal so that
# none of it executes. When enabled it holds out 20% of the training rows,
# runs 40 trials of LGBMRegressor, and minimises the validation RMSE on the
# log1p(price) target, then prints the best parameter set.
# NOTE(review): 'rsme' passed as `metric` below is presumably a typo for
# LightGBM's 'rmse' — confirm before re-enabling.
# NOTE(review): min_child_samples/min_data_in_leaf, subsample/bagging_fraction
# and colsample_bytree/feature_fraction look like LightGBM alias pairs, so the
# search would tune two knobs for one effective parameter — confirm against
# the LightGBM parameter documentation.
'''
X_tr, X_val, y_tr, y_val = train_test_split(train, target, test_size=0.2, random_state=666)

def create_model(trial):
    num_leaves = trial.suggest_int('num_leaves', 2, 30)
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    min_child_samples = trial.suggest_int('min_child_samples', 100, 1200)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 5, 90)
    bagging_freq = trial.suggest_int('bagging_freq', 1, 7)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.0001, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0.0001, 1.0)
    subsample = trial.suggest_uniform('subsample', 0.1, 1.0)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.1, 1.0)
    
    model = lgb.LGBMRegressor(
        num_leaves=num_leaves,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth, 
        min_child_samples=min_child_samples, 
        min_data_in_leaf=min_data_in_leaf,
        bagging_freq=bagging_freq,
        bagging_fraction=bagging_fraction,
        feature_fraction=feature_fraction,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        metric='rsme',
        random_state=666)
    return model

def objective(trial):
    model = create_model(trial)
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_val)
    rsme = np.sqrt(mean_squared_error(y_pred, y_val))
    return rsme

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=40)
params = study.best_params
print(params)
'''

In [None]:
# Hyperparameters for the final LGBMRegressor (presumably the best trial of
# the Optuna search).
#
# The entries 'min_child_samples' (734), 'subsample' (0.41489303247984344) and
# 'colsample_bytree' (0.10712399681871226) were removed. They are LightGBM
# aliases of min_data_in_leaf, bagging_fraction and feature_fraction, and when
# both the primary name and its alias are supplied LightGBM keeps the primary
# name and ignores the alias. Those three values therefore never took effect;
# dropping them leaves the trained model unchanged and removes the
# contradictory settings.
params = {
    'num_leaves': 26,
    'n_estimators': 235,
    'learning_rate': 0.5826338528057535,
    'max_depth': 9,
    'min_data_in_leaf': 69,
    'bagging_freq': 1,
    'bagging_fraction': 0.9765236276263333,
    'feature_fraction': 0.6665769584205234,
    'random_state': 666,
}

In [None]:
# msl:0.49
# {'num_leaves': 19, 'n_estimators': 294, 'learning_rate': 0.30629521899319057, 'max_depth': 8, 'min_child_samples': 836, 'min_data_in_leaf': 32, 'bagging_freq': 4, 'bagging_fraction': 0.1460764432004789, 'feature_fraction': 0.8707981099370118, 'subsample': 0.3296674411363768, 'colsample_bytree': 0.8999093468212471}

In [None]:
# Fit the final regressor on the full training matrix against the log1p(price)
# target. Note that `cls` is a regressor here, not a class object.
cls = lgb.LGBMRegressor(**params)
cls.fit(train, target)

In [None]:
# The training inputs are no longer needed once the model is fitted.
del train, target, params
gc.collect()

# Submit

In [None]:
# The model was fitted on log1p(price); expm1 maps the predictions back to prices.
prediction = np.expm1(cls.predict(test))

In [None]:
# Predictions are computed; release the test feature matrix.
del test
gc.collect()

In [None]:
# Two-column submission frame: the saved test ids next to the predicted prices.
submit = pd.DataFrame({'test_id': test_id, 'price': prediction})

submit.head(10)

In [None]:
# Both values now live in `submit`; release the standalone copies.
del test_id, prediction
gc.collect()

Submissionファイルは、/kaggle下ではなく、/kaggle/working下に作成する。<br>
また、submissionのCSVファイルについて、index=Falseにする。

In [None]:
# Write the submission into the current working directory; index=False keeps
# the DataFrame index out of the CSV.
submit.to_csv('./submission.csv', index=False)