In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/sample_submission.csv.7z

In [None]:
!unzip /kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip

# Data preparation

In [None]:
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the tab-separated training file (path is relative to the working
# directory) and display its (rows, columns) shape.
train_df = pd.read_csv('train.tsv', sep='\t')
train_df.shape

In [None]:
# Load the tab-separated stage-2 test file (path is relative to the working
# directory) and display its (rows, columns) shape.
test_df = pd.read_csv('test_stg2.tsv', sep='\t')
test_df.shape

In [None]:
# Show the first rows of the training frame.
train_df.head()

In [None]:
# Print the column summary (dtypes, non-null counts, memory usage).
train_df.info()

# Target's distribution

In [None]:
# Histogram of the raw price target on a 6x4 inch figure; kde=False turns off
# the density curve.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm
# the installed version still provides it.
plt.figure(figsize=(6, 4))
sns.distplot(train_df['price'], kde=False)

In [None]:
# Histogram of log1p(price), for comparison with the raw-price histogram.
sns.distplot(np.log1p(train_df['price']), kde=False)

## Apply log1p to price

In [None]:
# Replace the price column with log1p(price); later cells use this column as the target.
# WARNING: this overwrites the column in place, so running the cell a second
# time applies log1p twice. Reload train_df before re-running.
train_df['price'] = np.log1p(train_df['price'])

# Overview of other features

In [None]:
# Frequency of each value in the shipping column.
train_df['shipping'].value_counts()

In [None]:
# Frequency of each item_condition_id value.
train_df['item_condition_id'].value_counts()

In [None]:
# Frequency of each distinct item_description text, most common first.
train_df['item_description'].value_counts()

## 'No description yet'

In [None]:
# Count the rows whose description is exactly the placeholder text.
is_placeholder = train_df['item_description'] == 'No description yet'
train_df.loc[is_placeholder, 'item_description'].count()

# category_name

In [None]:
def split_cat(category_name):
    """Split a '/'-delimited category path into its hierarchy levels.

    Parameters
    ----------
    category_name : str or missing value
        Category path such as ``'Men/Tops/T-shirts'``. Any non-string input
        (for example the float NaN pandas uses for missing values, or None)
        is treated as a missing category.

    Returns
    -------
    list of str
        The path components in order. A missing category yields three
        ``'Other_Null'`` placeholders. Paths with fewer than three components
        are padded with ``'Other_Null'`` so callers can always unpack at least
        three levels. Longer paths are returned in full.
    """
    # Explicit type check instead of a bare ``except:``, which would also
    # swallow unrelated errors such as KeyboardInterrupt.
    if not isinstance(category_name, str):
        return ['Other_Null', 'Other_Null', 'Other_Null']

    levels = category_name.split('/')
    # Multiplying a list by a non-positive number gives [], so paths that
    # already have three or more levels are left untouched.
    levels.extend(['Other_Null'] * (3 - len(levels)))
    return levels

In [None]:
# Split category_name in both frames and store the first, second and third
# path components as cat_dae, cat_jung and cat_so. zip(*...) transposes the
# per-row lists into per-level tuples and stops at the shortest list, so any
# components beyond the shortest path's length are dropped.
for frame in (train_df, test_df):
    level_lists = frame['category_name'].apply(split_cat)
    frame['cat_dae'], frame['cat_jung'], frame['cat_so'] = zip(*level_lists)

In [None]:
# Number of distinct values in each of the three category-level columns.
for level_col in ('cat_dae', 'cat_jung', 'cat_so'):
    print(train_df[level_col].nunique())

In [None]:
# Replace missing values in the three text columns with the 'Other_Null'
# placeholder, in the training frame first and then the test frame.
for frame in (train_df, test_df):
    for column in ('brand_name', 'category_name', 'item_description'):
        frame[column] = frame[column].fillna(value='Other_Null')

In [None]:
# Remaining null count per column in the training frame.
train_df.isnull().sum()

In [None]:
# Remaining null count per column in the test frame.
test_df.isnull().sum()

# brand_name

In [None]:
# Number of distinct brand_name values.
train_df['brand_name'].nunique()

In [None]:
# The five most frequent brand_name values.
train_df['brand_name'].value_counts()[:5]

# name

In [None]:
# Number of distinct item names.
train_df['name'].nunique()

In [None]:
# The ten most frequent item names.
train_df['name'].value_counts()[:10]

# item_description

In [None]:
# Mean character length of item_description.
train_df['item_description'].str.len().mean()

In [None]:
# Show the first two descriptions.
train_df['item_description'][:2]

# Vectorize name, item_description

In [None]:
# Bag-of-words counts for the item name. The vocabulary is learned from the
# training names only, then reused to encode the test names.
cnt_vec = CountVectorizer()

X_train_name = cnt_vec.fit_transform(train_df['name'])
X_test_name = cnt_vec.transform(test_df['name'])

In [None]:
# Shapes of the name count matrices, train first and then test.
for name_matrix in (X_train_name, X_test_name):
    print(name_matrix.shape)

In [None]:
# TF-IDF features for the description: unigrams to trigrams, English stop
# words removed, vocabulary capped at 50,000 terms. Fitted on the training
# descriptions only, then applied to the test descriptions.
tfidf_descp = TfidfVectorizer(max_features=50000, ngram_range=(1, 3), stop_words='english')

X_train_descp = tfidf_descp.fit_transform(train_df['item_description'])
X_test_descp = tfidf_descp.transform(test_df['item_description'])

In [None]:
# Shapes of the description TF-IDF matrices, train first and then test.
for descp_matrix in (X_train_descp, X_test_descp):
    print(descp_matrix.shape)

# One-hot encoding via LabelBinarizer (for CSR sparse matrix)

In [None]:
from sklearn.preprocessing import LabelBinarizer

def _fit_sparse_binarizer(column):
    """Fit a sparse-output LabelBinarizer on train_df[column] and encode both frames.

    Returns the fitted encoder, the encoded training matrix and the encoded
    test matrix, in that order.
    """
    encoder = LabelBinarizer(sparse_output=True)
    train_encoded = encoder.fit_transform(train_df[column])
    test_encoded = encoder.transform(test_df[column])
    return encoder, train_encoded, test_encoded


# One encoder per categorical column; each is fitted on the training values only.
lb_brand_name, X_train_brand, X_test_brand = _fit_sparse_binarizer('brand_name')
lb_item_cond_id, X_train_item_condition_id, X_test_item_condition_id = _fit_sparse_binarizer('item_condition_id')
lb_shipping, X_train_shipping, X_test_shipping = _fit_sparse_binarizer('shipping')
lb_cat_dae, X_train_cat_dae, X_test_cat_dae = _fit_sparse_binarizer('cat_dae')
lb_cat_jung, X_train_cat_jung, X_test_cat_jung = _fit_sparse_binarizer('cat_jung')
lb_cat_so, X_train_cat_so, X_test_cat_so = _fit_sparse_binarizer('cat_so')

In [None]:
# Container types of three of the encoded matrices, train row then test row.
print(*(type(m) for m in (X_train_brand, X_train_item_condition_id, X_train_shipping)))
print(*(type(m) for m in (X_test_brand, X_test_item_condition_id, X_test_shipping)))

# Label/matrix pairs so every shape is reported by the same loop.
train_shape_report = (
    ('X_train_brand shape:', X_train_brand),
    ('X_train_item_cond_id shape:', X_train_item_condition_id),
    ('X_train_shipping shape:', X_train_shipping),
    ('X_train_cat_dae shape:', X_train_cat_dae),
    ('X_train_cat_jung shape:', X_train_cat_jung),
    ('X_train_cat_so shape:', X_train_cat_so),
)
test_shape_report = (
    ('X_test_brand shape:', X_test_brand),
    ('X_test_item_cond_id shape:', X_test_item_condition_id),
    ('X_test_shipping shape:', X_test_shipping),
    ('X_test_cat_dae shape:', X_test_cat_dae),
    ('X_test_cat_jung shape:', X_test_cat_jung),
    ('X_test_cat_so shape:', X_test_cat_so),
)
for shape_label, encoded in train_shape_report + test_shape_report:
    print(shape_label, encoded.shape)

## Preview X_train

In [None]:
from scipy.sparse import hstack
import gc

# Stack every training feature block column-wise into one CSR matrix, only to
# inspect its type and overall shape.
sparse_matrix_list = (X_train_name, X_train_descp, X_train_brand, X_train_item_condition_id, X_train_shipping, X_train_cat_dae, X_train_cat_jung, X_train_cat_so)

X_train = hstack(sparse_matrix_list).tocsr()
print(type(X_train), X_train.shape)

# The stacked matrix was only needed for the preview; drop it and collect garbage.
del X_train
gc.collect()

# Evaluation

In [None]:
def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error between `y` and `y_pred`.

    Both inputs are passed through log1p, and the result is the square root of
    the mean squared difference of those logs.
    """
    log_gap = np.log1p(y) - np.log1p(y_pred)
    mean_squared_gap = np.mean(log_gap ** 2)
    return np.sqrt(mean_squared_gap)

def evaluate_orig_price(y_test, preds):
    """Score predictions after mapping both inputs back through expm1.

    `y_test` and `preds` are each transformed with expm1 (the inverse of
    log1p), and the transformed values are passed to `rmsle`.
    """
    actual_price = np.expm1(y_test)
    predicted_price = np.expm1(preds)
    return rmsle(actual_price, predicted_price)

In [None]:
def model_train_predict(model, matrix_list, test_size=0.2, random_state=42):
    """Fit `model` on a train split of the stacked features and predict the hold-out rows.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    matrix_list : sequence of sparse matrices
        Feature blocks, stacked column-wise with ``hstack``. The stacked matrix
        is split together with ``train_df['price']``, so it needs one row per
        row of ``train_df``.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int or None, default 42
        Seed for the split. A fixed seed makes the score reproducible and
        means successive calls are evaluated on the same hold-out rows, so
        different models or feature sets can be compared fairly. Pass ``None``
        for an unseeded random split.

    Returns
    -------
    preds : array-like
        Predictions of `model` for the hold-out rows.
    y_test : pandas.Series
        The matching hold-out values of ``train_df['price']``.
    """
    X = hstack(matrix_list).tocsr()
    X_train, X_test, y_train, y_test = train_test_split(
        X, train_df['price'], test_size=test_size, random_state=random_state
    )

    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Release the large intermediates before returning; only the predictions
    # and the hold-out target are needed by the caller.
    del X, X_train, X_test, y_train
    gc.collect()

    return preds, y_test

In [None]:
linear_model = Ridge(solver='lsqr', fit_intercept=False)

# Feature blocks shared by both runs. Only the description block differs.
categorical_blocks = (X_train_brand, X_train_item_condition_id, X_train_shipping, X_train_cat_dae, X_train_cat_jung, X_train_cat_so)

# (report label, feature blocks): first without the description features, then with them.
ridge_experiments = (
    ('Item Description을 제외했을 때 rmsle 값:', (X_train_name,) + categorical_blocks),
    ('Item Description을 포함한 rmsle 값:', (X_train_name, X_train_descp) + categorical_blocks),
)

for report_label, sparse_matrix_list in ridge_experiments:
    linear_preds, y_test = model_train_predict(model=linear_model, matrix_list=sparse_matrix_list)
    print(report_label, evaluate_orig_price(y_test, linear_preds))

In [None]:
from lightgbm import LGBMRegressor

# Hold-out evaluation of a LightGBM regressor on the full feature set
# (200 trees, learning rate 0.5, up to 125 leaves per tree).
lgbm_model = LGBMRegressor(n_estimators=200, learning_rate=0.5, num_leaves=125)

sparse_matrix_list = (X_train_name, X_train_descp, X_train_brand, X_train_item_condition_id, X_train_shipping, X_train_cat_dae, X_train_cat_jung, X_train_cat_so)
lgbm_preds, y_test = model_train_predict(model=lgbm_model, matrix_list=sparse_matrix_list)
print('LightGBM rmsle 값:', evaluate_orig_price(y_test, lgbm_preds))

# Prediction

In [None]:
# Stack every training feature block into the final CSR training matrix and display its summary.
sparse_matrix_list = (X_train_name, X_train_descp, X_train_brand, X_train_item_condition_id, X_train_shipping, X_train_cat_dae, X_train_cat_jung, X_train_cat_so)
X_train = hstack(sparse_matrix_list).tocsr()
X_train

In [None]:
# Training target: the price column, which an earlier cell replaced with log1p(price).
y_train = train_df['price']
y_train

In [None]:
# Fit a fresh LightGBM regressor on all training rows (no hold-out), with the
# same hyper-parameters as the hold-out evaluation.
lgbm_model = LGBMRegressor(n_estimators=200, learning_rate=0.5, num_leaves=125)
lgbm_model.fit(X_train, y_train)

In [None]:
# Stack the test feature blocks in the same order as the training blocks so
# the columns line up with what the model was fitted on.
sparse_matrix_list = (X_test_name, X_test_descp, X_test_brand, X_test_item_condition_id, X_test_shipping, X_test_cat_dae, X_test_cat_jung, X_test_cat_so)
X_test = hstack(sparse_matrix_list).tocsr()

In [None]:
# Predict the test rows. The model was fitted on the log1p-transformed price,
# so these predictions are on that scale.
preds = lgbm_model.predict(X_test)
preds

In [None]:
# Undo the log1p transform with expm1.
# WARNING: this reassigns preds from itself, so running the cell a second time
# applies expm1 twice. Re-run the prediction cell first.
preds = np.expm1(preds)
preds

# Submission

In [None]:
# Load the stage-2 sample submission from the working directory to use as the output template.
submission = pd.read_csv('sample_submission_stg2.csv')
submission

In [None]:
# Write the predictions into the price column for every row.
# Assumes preds is in the same row order as the submission template — TODO confirm.
submission.loc[:, 'price'] = preds
submission

In [None]:
# Save the submission to the working directory without the DataFrame index column.
submission.to_csv('submission.csv', index=False)