# Mercari Price Suggestion Challenge GBM Regressor

## Imports & Inits

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../')

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import pdb
import re
import pickle
import string
import pandas as pd
import numpy as np
import lightgbm as lgb
np.set_printoptions(precision=4)

from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from pathlib import Path

from utils.data_utils import set_two_splits
from utils.plots import *

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
sns.set_style("darkgrid")
%matplotlib inline

In [4]:
from args import args
vars(args)

{'path': PosixPath('data'),
 'workdir': PosixPath('data/workdir'),
 'train_tsv': PosixPath('data/train.tsv'),
 'test_tsv': PosixPath('data/test.tsv'),
 'test2_tsv': PosixPath('data/test_stg2.tsv'),
 'modeldir': PosixPath('data/workdir/models'),
 'figdir': PosixPath('data/workdir/figures')}

In [None]:
from tqdm import trange

In [None]:
# Load only the 'text' and 'price' columns of train_df.csv; the seed loop
# below re-splits a copy of this frame once per seed.
desc_df = pd.read_csv(args.path/'train_df.csv', usecols=['text', 'price'])

In [None]:
# For every seed in 127..136: re-split the data, fit a uni+bigram TF-IDF
# vectorizer on the training rows only, transform both splits, and cache the
# fitted vectorizer together with both matrices in one pickle file per seed.
# NOTE(review): the `vars(args)` output shown earlier in this notebook lists no
# 'vectordir' entry — confirm args.vectordir is defined and the directory exists.
for seed in trange(127, 137):
  df = set_two_splits(desc_df.copy(), 'valid', seed=seed)
  is_train = df['split'] == 'train'
  is_valid = df['split'] == 'valid'
  train_df = df.loc[is_train, ['text', 'price']]
  valid_df = df.loc[is_valid, ['text', 'price']]

  # The vocabulary is learned from the training split; validation is only transformed.
  vectorizer = TfidfVectorizer(max_features=60_000, ngram_range=(1, 2))
  x_train = vectorizer.fit_transform(train_df['text'].values.astype('U'))
  x_valid = vectorizer.transform(valid_df['text'].values.astype('U'))

  with open(args.vectordir/f'bigram_{seed}.pkl', 'wb') as f:
    # Objects are written back-to-back; readers must load them in this order.
    for obj in (vectorizer, x_train, x_valid):
      pickle.dump(obj, f)

In [None]:
# Notebook-wide switch: cells that check `if save:` only write their figure
# to disk when this is True.
save = True

## Functions

In [None]:
def wordcloud(fi, idx2tok, min_len=5, n_tokens=50):
  """Build a word cloud from the highest-scoring entries of `fi`.

  Args:
    fi: per-feature scores; it is indexed with an index array, so a NumPy
      array is expected.
    idx2tok: mapping from feature index to its token string.
    min_len: accepted for interface compatibility; not used by this function.
    n_tokens: number of top-scoring features to keep (also passed to
      WordCloud as `max_words`).

  Returns:
    A `(d, cloud)` tuple: `d` maps each kept token to its score divided by
    the sum of the kept scores, and `cloud` is the WordCloud generated from `d`.
  """
  # argsort is ascending, so the tail holds the n_tokens largest scores.
  top_idxs = np.argsort(fi)[-n_tokens:]
  top_scores = fi[top_idxs]
  weights = top_scores / top_scores.sum()
  d = {idx2tok[ix]: w for ix, w in zip(top_idxs, weights)}

  cloud = WordCloud(
    width=400,
    height=400,
    background_color='white',
    max_words=n_tokens,
    max_font_size=40,
    relative_scaling=0.5,
  )
  return d, cloud.generate_from_frequencies(d)

## Train and Validation

In [None]:
# Reload the 'text' and 'price' columns so this section can be run on its own,
# then display the first rows as a quick sanity check.
desc_df = pd.read_csv(args.path/'train_df.csv', usecols=['text', 'price'])
desc_df.head()

In [None]:
# Single fixed-seed train/validation split used for the experiment below.
seed = 643
df = set_two_splits(desc_df.copy(), 'valid', seed=seed)

# Row masks taken from the 'split' column that set_two_splits adds.
is_train = df['split'] == 'train'
is_valid = df['split'] == 'valid'
train_df = df.loc[is_train, ['text', 'price']]
valid_df = df.loc[is_valid, ['text', 'price']]

# Regression targets for each split.
y_train, y_valid = train_df['price'], valid_df['price']

df.shape, train_df.shape, valid_df.shape, y_train.shape, y_valid.shape

In [None]:
# vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=60_000)
# x_train = vectorizer.fit_transform(train_df['text'].values.astype('U'))
# x_valid = vectorizer.transform(valid_df['text'].values.astype('U'))

# with open(args.vectordir/'bigram_643.pkl', 'wb') as f:
#   pickle.dump(vectorizer, f)
#   pickle.dump(x_train, f)
#   pickle.dump(x_valid, f)

In [None]:
# Restore the cached TF-IDF artefacts for seed 643. The file holds three
# consecutive pickles, read back in the order they were written:
# vectorizer, training matrix, validation matrix.
with open(args.vectordir/'bigram_643.pkl', 'rb') as f:
  vectorizer, x_train, x_valid = (pickle.load(f) for _ in range(3))

# Invert the vocabulary (token -> column index) into column index -> token.
idx2tok = {col: tok for tok, col in vectorizer.vocabulary_.items()}

In [None]:
# Hyper-parameters handed to lgb.train for the validation experiment.
lgb_params = dict(
  num_leaves=400,
  learning_rate=0.05,
  feature_fraction=0.9,
  bagging_fraction=0.7,
  bagging_freq=5,
  metric='rmse',
  num_threads=32,
  max_bin=32,
  objective='regression',
)

# The validation dataset references the training dataset so both share its binning.
lgb_train = lgb.Dataset(x_train, y_train)
lgb_valid = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

In [None]:
# Train for at most 600 boosting rounds while evaluating on both the training
# and validation sets, stopping early after 10 rounds without improvement.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword was removed from lgb.train() in
# LightGBM 4.0, whereas the callback is accepted by older releases as well.
gbm = lgb.train(
  lgb_params,
  lgb_train,
  num_boost_round=600,
  valid_sets=[lgb_train, lgb_valid],
  callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

In [None]:
# Word cloud of the 500 tokens with the highest gain-based feature importance.
fig, ax = plt.subplots(figsize=(20, 20))
gain_importance = gbm.feature_importance(importance_type='gain')
d, wc = wordcloud(gain_importance, idx2tok, min_len=5, n_tokens=500)
ax.imshow(wc)
ax.axis('off')

# Only persist the figure when the notebook-wide `save` switch is on.
if save:
  fig.savefig(
    args.figdir/'desc_wc.pdf',
    dpi=300,
    bbox_inches='tight',
    pad_inches=0,
  )

In [None]:
# Validation RMSE, rounded to three decimals for display.
preds = gbm.predict(x_valid)
valid_mse = mean_squared_error(y_valid, preds)
np.round(np.sqrt(valid_mse), 3)

## Full Model Training

In [None]:
# Load the 'text' and 'price' columns of the whole training file (no
# train/validation split here) and display its shape.
train_df = pd.read_csv(args.path/'train_df.csv', usecols=['text', 'price'])
train_df.shape

In [None]:
# Fit the uni+bigram TF-IDF vectorizer on every training row.
vectorizer = TfidfVectorizer(max_features=60_000, ngram_range=(1, 2))
x_train = vectorizer.fit_transform(train_df['text'].values.astype('U'))

# Cache the fitted vectorizer and the matrix as two consecutive pickles.
with open(args.vectordir/'default_bi_all.pkl', 'wb') as f:
  for obj in (vectorizer, x_train):
    pickle.dump(obj, f)

y_train = train_df['price']
x_train.shape, y_train.shape

In [None]:
# Restore the full-data TF-IDF artefacts; the file holds two consecutive
# pickles, read back in write order: vectorizer first, then the matrix.
with open(args.vectordir/'default_bi_all.pkl', 'rb') as f:
  vectorizer, x_train = (pickle.load(f) for _ in range(2))

# Column index -> token lookup, obtained by inverting the vocabulary.
idx2tok = {col: tok for tok, col in vectorizer.vocabulary_.items()}

In [None]:
# Hyper-parameters for the full-data model; the values match the ones used in
# the validation experiment above.
lgb_params = dict(
  num_leaves=400,
  learning_rate=0.05,
  feature_fraction=0.9,
  bagging_fraction=0.7,
  bagging_freq=5,
  metric='rmse',
  num_threads=32,
  max_bin=32,
  objective='regression',
)

# All training rows go into a single dataset; no validation set is built here.
lgb_train = lgb.Dataset(x_train, y_train)

In [None]:
# Train the final model on all training rows for up to 1000 boosting rounds.
# The `early_stopping_rounds=` and `verbose_eval=` keywords were removed from
# lgb.train() in LightGBM 4.0, so early stopping goes through the callback
# API, which older releases accept too. Older releases still log every
# iteration by default; on 4.x add `lgb.log_evaluation()` to the callbacks if
# per-iteration logging is wanted.
# NOTE(review): only the training set is evaluated here, so early stopping has
# no held-out signal to act on — confirm that is intended.
gbm = lgb.train(
  lgb_params,
  lgb_train,
  num_boost_round=1000,
  valid_sets=[lgb_train],
  callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Use a context manager so the model file is flushed and closed deterministically.
with (args.modeldir/'gbm_desc_all.pkl').open('wb') as f:
  pickle.dump(gbm, f)

## Word Cloud

In [None]:
# Reload the trained full-data model; the context manager closes the file
# handle instead of leaving it to the garbage collector.
with (args.modeldir/'gbm_desc_all.pkl').open('rb') as f:
  gbm = pickle.load(f)

# Reload the matching TF-IDF artefacts (vectorizer first, then the matrix,
# in the order they were pickled).
with open(args.vectordir/'default_bi_all.pkl', 'rb') as f:
  vectorizer = pickle.load(f)
  x_train = pickle.load(f)

# Column index -> token lookup, obtained by inverting the vocabulary.
idx2tok = {v: k for k, v in vectorizer.vocabulary_.items()}

In [None]:
# Word cloud of the 500 tokens with the highest gain-based importance in the
# full-data model; this cell always writes the figure to disk.
fig, ax = plt.subplots(figsize=(20, 20))
gain_importance = gbm.feature_importance(importance_type='gain')
d, wc = wordcloud(gain_importance, idx2tok, min_len=5, n_tokens=500)
ax.imshow(wc)
ax.axis('off')
fig.savefig(args.figdir/'gbm_desc_wc.pdf', dpi=300)

In [None]:
# Score the test set with the full-data model and write the submission file.
test_df = pd.read_csv(args.path/'test_df.csv', usecols=['test_id', 'text'])
x_test = vectorizer.transform(test_df['text'].values.astype('U'))

# NOTE(review): expm1 implies the model was trained on log1p(price) — confirm
# against whatever preprocessing produced train_df.csv.
test_prices = np.expm1(gbm.predict(x_test))
preds = pd.DataFrame({'test_id': test_df['test_id'], 'price': test_prices})
preds.to_csv(args.path/'gbm_desc_submission.csv', index=False)

In [None]:
# Same scoring pipeline applied to test2_df.csv, written to the stage-2 submission file.
test2_df = pd.read_csv(args.path/'test2_df.csv', usecols=['test_id', 'text'])
x_test2 = vectorizer.transform(test2_df['text'].values.astype('U'))

# NOTE(review): expm1 implies the model was trained on log1p(price) — confirm
# against whatever preprocessing produced train_df.csv.
test2_prices = np.expm1(gbm.predict(x_test2))
preds_test2 = pd.DataFrame({'test_id': test2_df['test_id'], 'price': test2_prices})
preds_test2.to_csv(args.path/'gbm_desc_submission_stg2.csv', index=False)