In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/sample_submission.csv.7z

In [None]:
train_data = pd.read_csv('train.tsv', sep='\t')

In [None]:
# Work on a 300,000-row random subsample of the training file.
# random_state pins the draw so a fresh "Restart & Run All" selects the same
# rows and every downstream count, split and score is reproducible.
train_data = train_data.sample(300000, random_state=42)

In [None]:
train_data.info()

Our understanding of the problem means we will not drop `brand_name`; however, its many null values are an incentive to use additional features alongside it.

In [None]:
train_data.head()

In [None]:
train_data.nunique()

# Checking target values

In [None]:
train_data.price.describe()

In [None]:
train_data.price.hist()

In [None]:
plt.hist(np.log1p(train_data.price))

Due to the large skewness in the data we will use the log for the evaluation of the target.

# Pipeline

Our preprocessing will include the following steps:

1. **imputation**  
1. **split category**
1. **minmax scaler**  
1. **onehot encoder** 
1. **text preprocessing**

## brand_name and category_name

Normalize the capitalization of brand names and categories by lower-casing them.

In [None]:
train_data['category_name'] = train_data['category_name'].str.lower()

In [None]:
train_data['brand_name'] = train_data['brand_name'].str.lower()

In [None]:
train_data['brand_name'].value_counts()

We can see that the category names consist of 3 hierarchical levels.

In [None]:
train_data['category_name'].str.split('/', expand=True).info()

In [None]:
train_data['category_name'].str.split('/',n=2, expand=True).info()

When it comes to brand_name, we can see that there are a lot of infrequent brands that would cause a feature explosion under one-hot encoding, so we will keep only the most frequent ones.

In [None]:
(train_data['brand_name'].value_counts() > 100).sum()

In [None]:
# Brands that occur more than 100 times in the sample (missing brands are not
# counted by value_counts).
brand_counts = train_data['brand_name'].value_counts()
frequent_brands = brand_counts.index[brand_counts > 100].tolist()

In [None]:
train_data['price'].describe(percentiles=[0.75,0.9,0.95])

In [None]:
train_data[train_data['brand_name'].isin(frequent_brands)].query('price<50 & price>30').nunique()

So frequent brands are well represented in the 30 to 50 price range.

In [None]:
train_data[train_data['brand_name'].isin(frequent_brands)].query('price>250').nunique()

In [None]:
train_data[~train_data['brand_name'].isin(frequent_brands)].query('price>250').nunique()

But it seems that some other, less frequent brands appear in the higher price ranges, so we will add those brands to the list as well.

In [None]:
# Also keep every known (non-missing) brand that is not frequent yet but has
# at least one listing priced above 250.
is_rare_known_brand = ~train_data['brand_name'].isin(frequent_brands) & ~train_data['brand_name'].isna()
premium_rare_brands = train_data[is_rare_known_brand].query('price>250')['brand_name'].unique()
frequent_brands = frequent_brands + list(premium_rare_brands)

In [None]:
# Collapse every brand that is not in frequent_brands (missing values
# included) into the single bucket 'Others'.
# isin()/where() are vectorised; the previous per-row `x in frequent_brands`
# scanned the whole Python list once for each of the sampled rows.
is_kept_brand = train_data['brand_name'].isin(frequent_brands)
train_data['brand_name'] = train_data['brand_name'].where(is_kept_brand, 'Others')

In [None]:
len(frequent_brands)

In [None]:
train_data

When inspecting the categories, we split them into 3 main levels, but we will fill the unknown values first.

In [None]:
# Placeholder used for each text/categorical column when the value is missing.
# The category placeholder has three '/'-separated parts so that it splits into
# the same three levels as a real category path.
# NOTE: brand_name has normally had its missing values mapped to 'Others' by
# the brand-collapsing cell above, so its entry here acts only as a safeguard.
missing_value_fill = {'category_name':'Unknown/Unknown/Unknown', 'brand_name':'Unknown', 'name':'Unknown', 'item_description':'Unknown'}
train_data = train_data.fillna(missing_value_fill)

In [None]:
train_data

In [None]:
train_data = pd.concat([train_data.drop('category_name', axis=1),train_data['category_name'].str.split('/',n=2, expand=True)], axis=1)

In [None]:
train_data.rename({0: 'cat_0', 1: 'cat_1', 2: 'cat_2'}, axis=1, inplace=True)

In [None]:
train_data

In [None]:
train_data['cat_2'].value_counts()

In [None]:
train_data['cat_2'].value_counts()[train_data['cat_2'].value_counts() > 100]

In [None]:
train_data[train_data['cat_2'].isin(train_data['cat_2'].value_counts()[train_data['cat_2'].value_counts() < 100].index)].price.describe()

Since the price distribution of the infrequent cat_2 values does not stand out, we will not keep them as separate categories and will group them under 'Others'.

In [None]:
# Third-level categories that occur more than 100 times in the sample.
cat2_counts = train_data['cat_2'].value_counts()
frequent_cat2 = cat2_counts.index[cat2_counts > 100].tolist()

In [None]:
frequent_cat2

In [None]:
# Collapse every third-level category that is not in frequent_cat2 into
# 'Others'. Vectorised isin()/where() replaces the per-row Python list scan
# and matches how brand_name is collapsed.
is_kept_cat2 = train_data['cat_2'].isin(frequent_cat2)
train_data['cat_2'] = train_data['cat_2'].where(is_kept_cat2, 'Others')

In [None]:
train_data['cat_1'].value_counts()

In [None]:
# Keep the log1p-transformed price as the regression target, then remove the
# raw price and the row identifier from the feature frame.
log_prices = np.log1p(train_data['price'])
train_data = train_data.drop(columns=['price', 'train_id'])

In order to get an accurate representation for the test data, we will use stratified sampling based on the subcategories.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One shuffled 90/10 split, stratified on the second category level (cat_1).
# With n_splits=1 the generator yields exactly one (train, test) pair of
# positional index arrays, so it is taken directly with next().
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_index, test_index = next(split.split(train_data, train_data['cat_1']))
train_idx, test_idx = train_index, test_index

## Text manipulation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer


In [None]:
train_data['name'].value_counts()

In [None]:
train_data['name'].str.len().describe()

In [None]:
train_data['item_description'].value_counts()

In [None]:
train_data['item_description'].str.len().describe()

In [None]:
train_data

In [None]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
import re
import string

In [None]:
ps = PorterStemmer()

In [None]:
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
st = ' '.join(word_tokenize("KYLIE(TRUE BROWN K) MATTE LIPSTICK&LINER"))
st = re_punc.sub('', st)
st = ' '.join([ps.stem(w) for w in st.split()])
vecto = CountVectorizer(lowercase=True,stop_words='english')
cntrr = vecto.fit_transform([st])

In [None]:
st

In [None]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name only on releases that
# do not provide the new one yet.
vecto.get_feature_names_out() if hasattr(vecto, 'get_feature_names_out') else vecto.get_feature_names()

In [None]:
ps = PorterStemmer()
re_punc = re.compile('[%s]' % re.escape(string.punctuation))


def preprocess_text(X):
    """Normalize one text value for vectorization.

    Steps, in order: lower-case the string, tokenize it with NLTK's
    ``word_tokenize`` and re-join the tokens with single spaces, delete every
    character listed in ``string.punctuation``, then Porter-stem each
    whitespace-separated word.

    Parameters
    ----------
    X : str
        Raw text (an item name or description). Must be a string; missing
        values have to be filled before calling this.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    tokenized = ' '.join(word_tokenize(X.lower()))
    without_punctuation = re_punc.sub('', tokenized)
    stems = (ps.stem(word) for word in without_punctuation.split())
    return ' '.join(stems)

In [None]:
preprocess_text("KYLIE(TRUE BROWN K) MATTE LIPSTICK&LINER")

In [None]:
train_data['item_description'] = train_data['item_description'].apply(preprocess_text)

In [None]:
train_data['name'] = train_data['name'].apply(preprocess_text)

In [None]:
train_data['item_description']

In [None]:
train_data['name']

## Final preprocessed data representation

In the names we care about which product-type words occur, regardless of how often they appear across the corpus (the item list), so we use CountVectorizer for the name. For the description we use TF-IDF instead, to emphasize the terms that are distinctive for each item.

In [None]:
# Description: unigram TF-IDF capped at 50,000 features, English stop words
# removed, rows L2-normalised.
tfidf_desc = TfidfVectorizer(max_features=50000, ngram_range=(1, 1), stop_words='english', norm='l2',lowercase=True)
# Name: plain term counts with English stop words removed (no feature cap).
name_vectorizer = CountVectorizer(stop_words='english')

# NOTE(review): both vectorizers are fitted on every row of train_data, i.e.
# also on the rows that are later held out through test_idx, so the vocabulary
# and IDF weights have seen the evaluation rows.
X_train_descp = tfidf_desc.fit_transform(train_data['item_description'])
X_train_name = name_vectorizer.fit_transform(train_data['name'])

In [None]:
X_train_name.shape

Avoid using get_dummies because of its inefficient memory usage; we use LabelBinarizer instead.  
`X_brand_cat = pd.get_dummies(train_data[['brand_name', 'cat_0', 'cat_1', 'cat_2']]).values`

In [None]:
# One LabelBinarizer per categorical column. sparse_output=True keeps every
# encoded block sparse so it can be stacked with the text matrices without
# materialising a dense one-hot array.

# Brand (infrequent brands were collapsed into 'Others' earlier).
lb_brand_name = LabelBinarizer(sparse_output=True)
X_train_brand = lb_brand_name.fit_transform(train_data['brand_name'])

# Shipping flag.
lb_shipping = LabelBinarizer(sparse_output=True)
X_train_shipping = lb_shipping.fit_transform(train_data['shipping'])

# First category level.
lb_cat_0 = LabelBinarizer(sparse_output=True)
X_train_cat_0 = lb_cat_0.fit_transform(train_data['cat_0'])

# Second category level.
lb_cat_1 = LabelBinarizer(sparse_output=True)
X_train_cat_1 = lb_cat_1.fit_transform(train_data['cat_1'])

# Third category level (infrequent values were collapsed into 'Others').
lb_cat_2 = LabelBinarizer(sparse_output=True)
X_train_cat_2 = lb_cat_2.fit_transform(train_data['cat_2'])

In [None]:
scaler = MinMaxScaler()
X_train_item_condition_id = scaler.fit_transform(train_data['item_condition_id'].values.reshape(-1,1))

In [None]:
from scipy.sparse import hstack

X_train = hstack((X_train_name, X_train_descp, X_train_brand, X_train_item_condition_id, X_train_shipping, X_train_cat_0, X_train_cat_1, X_train_cat_2)).tocsr()

In [None]:
X_train[train_idx]

In [None]:
log_prices

In [None]:
# Slice the stacked CSR feature matrix and the log-price target with the
# positional indices produced by the stratified split.
# NOTE: from here on train_data is no longer the pandas DataFrame but a sparse
# row subset of X_train; DataFrame-style cells above will not work if they are
# re-run after this cell without restarting from the top.
test_data = X_train[test_idx]
log_test_prices = log_prices.iloc[test_idx]
train_data = X_train[train_idx]
log_train_prices = log_prices.iloc[train_idx]

In [None]:
X_train

# Another preprocessing pipeline approach

I tried using a preprocessing pipeline, but converting the dataframes to numpy arrays at this scale (due to the abundance of expanded features) failed because of excessive memory usage, so we resorted to SciPy sparse CSR matrices instead.

In [None]:
# from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
# from sklearn.impute import SimpleImputer
# from sklearn.pipeline import Pipeline, FeatureUnion
# from sklearn.compose import ColumnTransformer


In [None]:
# def categorical_imputer(X, nlp=False):
#     if nlp:
#         return X.fillna({'category_name':'Unknown/Unknown/Unknown','name':'Unknown','item_description':'Unknown', 'brand_name':'Unknown'})
#     else:
#         return X.fillna({'category_name':'Unknown/Unknown/Unknown', 'brand_name':'Unknown'})

In [None]:
# def split_cat_name(X):
#     split_cat = X.fillna({'category_name':'Unknown/Unknown/Unknown'})['category_name'].str.split('/',n=2, expand=True)
#     split_cat.rename({0: 'cat_0', 1: 'cat_1', 2: 'cat_2'}, axis=1, inplace=True)
#     return pd.concat([X.drop('category_name', axis=1),split_cat], axis=1)

In [None]:
# def bin_encoder(X):
#     ls = list(X.columns)
#     return pd.get_dummies(X[ls])

In [None]:
# def cat_cols(nlp=False):
#     if nlp:
#         return ['name', 'category_name', 'brand_name', 'item_description']
#     else:
#         return ['category_name', 'brand_name']

# numeric_cols = list(train_data.select_dtypes(include=np.number).columns)
# cat_columns =  cat_cols(nlp=False)
    

In [None]:
# def select_numeric(X):
#     return X[numeric_cols]
# def select_categorical(X):
#     return X[cat_columns]

In [None]:
# num_pipeline = Pipeline([
#         ('selector',FunctionTransformer(select_numeric, validate=False)),
#         ('imputer', SimpleImputer(strategy="most_frequent")),
#         ('min_max_scaler', MinMaxScaler()),
#     ])

In [None]:
# cat_pipeline = Pipeline([
#         ('selector',FunctionTransformer(select_categorical, validate=False)),
#         ('imputer',  FunctionTransformer(categorical_imputer, validate=False,
#                                          kw_args={"nlp": False})),
#         ('splitter', FunctionTransformer(split_cat_name, validate=False)),
#         ('cat_encoder', FunctionTransformer(bin_encoder, validate=False)),
#     ])

In [None]:
# from sklearn.model_selection import StratifiedShuffleSplit

# split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
# for train_index, test_index in split.split(train_data, train_data['category_name'].str.split('/',n=2, expand=True)[1]):
#     test_data = train_data.loc[test_index]
#     log_test_prices = log_prices[test_index]
#     train_data = train_data.loc[train_index]
#     log_train_prices = log_prices[train_index]
    

In [None]:
# train_data = train_data[cat_cols(nlp=False)+numeric_cols]

In [None]:
# full_pipeline = FeatureUnion(transformer_list=[
#         ("num_pipeline", num_pipeline),
#         ("cat_pipeline", cat_pipeline),
#     ])

In [None]:

# full_pipeline = ColumnTransformer([
#         ("num", num_pipeline, numeric_cols),
#         ("cat", cat_pipeline, cat_cols(nlp=False)),
#         ])

In [None]:
# X = full_pipeline.fit_transform(train_data)

In [None]:
# X.shape

In [None]:
import gc
gc.collect()

# Modeling

In the modelling approach we will try multiple algorithms and perform hyperparameter tuning for tree based regression methods to select the best boosting parameters.

In [None]:
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import time

## LinearRegression

The `LinearRegression` took too long to run and eventually didn't converge (more than an hour).

In [None]:
# lin_reg = LinearRegression()
# lin_reg.fit(train_data, log_train_prices)
# lin_reg.score(train_data, log_train_prices)

In [None]:
# mean_absolute_error(lin_reg.predict(train_data),log_train_prices)

In [None]:
# mean_absolute_error(lin_reg.predict(test_data),log_test_prices)

## Lasso

It took too much time to fit although I set `max_iter` to 10, so I used Ridge instead.

In [None]:
# lasso_lin_reg = Lasso(alpha=0.01 , max_iter=10)
# lasso_lin_reg.fit(train_data, log_train_prices)

In [None]:
# mean_absolute_error(lasso_lin_reg.predict(train_data),log_train_prices)

In [None]:
# mean_absolute_error(lasso_lin_reg.predict(test_data),log_test_prices)

## SGDRegressor

In [None]:
t = time.time()
sgd_lin_reg = SGDRegressor(penalty='l2',  alpha=1, max_iter=400, early_stopping=False, learning_rate='invscaling', eta0=0.01, verbose=False)
sgd_lin_reg.fit(train_data, log_train_prices)
elapsed = time.time() - t
print("elapsed training time is "+ str(elapsed))

In [None]:
print("Training mean absolute error is "+ str(mean_absolute_error(sgd_lin_reg.predict(train_data),log_train_prices)))
print("Testing mean absolute error is "+ str(mean_absolute_error(sgd_lin_reg.predict(test_data),log_test_prices)))

## Ridge

In [None]:
t = time.time()
ridge_lin_reg = Ridge(alpha=4, max_iter= 500)
ridge_lin_reg.fit(train_data, log_train_prices)
elapsed = time.time() - t
print("elapsed training time is "+ str(elapsed))

In [None]:
print("Training mean absolute error is "+ str(mean_absolute_error(ridge_lin_reg.predict(train_data),log_train_prices)))
print("Testing mean absolute error is "+ str(mean_absolute_error(ridge_lin_reg.predict(test_data),log_test_prices)))

## Linear SVR

In [None]:
t = time.time()
lin_SVR = LinearSVR(max_iter=500, C=0.1)
lin_SVR.fit(train_data, log_train_prices)
elapsed = time.time() - t
print("elapsed training time is "+ str(elapsed))

In [None]:
print("Training mean absolute error is "+ str(mean_absolute_error(lin_SVR.predict(train_data),log_train_prices)))
print("Testing mean absolute error is "+ str(mean_absolute_error(lin_SVR.predict(test_data),log_test_prices)))

## SVR

In [None]:
t = time.time()
kernel_SVR = SVR(kernel='rbf', verbose=True, max_iter= 1000)
kernel_SVR.fit(train_data, log_train_prices)
elapsed = time.time() - t
print("elapsed training time is "+ str(elapsed))

In [None]:
print("Training mean absolute error is "+ str(mean_absolute_error(kernel_SVR.predict(train_data),log_train_prices)))
print("Testing mean absolute error is "+ str(mean_absolute_error(kernel_SVR.predict(test_data),log_test_prices)))

## RandomForest

In [None]:
t = time.time()
# random_state makes the bootstrap samples / feature draws repeatable between
# runs; n_jobs=-1 grows the 30 trees on all available cores, which shortens the
# wall-clock time of this cell without changing the model definition.
forest_reg = RandomForestRegressor(verbose=True, max_depth=15, n_estimators=30, random_state=42, n_jobs=-1)
forest_reg.fit(train_data, log_train_prices)
elapsed = time.time() - t
print("elapsed training time is "+ str(elapsed))

In [None]:
print("Training mean absolute error is "+ str(mean_absolute_error(forest_reg.predict(train_data),log_train_prices)))
print("Testing mean absolute error is "+ str(mean_absolute_error(forest_reg.predict(test_data),log_test_prices)))

## LGBMRegressor

In [None]:
from lightgbm import LGBMRegressor

In [None]:
#lgbm_model = LGBMRegressor(n_estimators=200, learning_rate=0.5, num_leaves=125)
t = time.time()
# Verbosity is configured on the estimator only. The `verbose` keyword of
# fit() was removed in LightGBM 4.0 and raises a TypeError there; without an
# eval_set it had nothing to report anyway.
lgbm_model = LGBMRegressor(verbose=0, max_depth=15, n_estimators=30)
lgbm_model.fit(train_data, log_train_prices)
elapsed = time.time() - t
print("elapsed training time is "+ str(elapsed))

In [None]:
print("Training mean absolute error is "+ str(mean_absolute_error(lgbm_model.predict(train_data),log_train_prices)))
print("Testing mean absolute error is "+ str(mean_absolute_error(lgbm_model.predict(test_data),log_test_prices)))

# Hyperparameter tuning

The excessive time needed to train a simple random forest with only 30 estimators at a max depth of 15 led us to exclude it from the randomized search, since LightGBM achieved similar evaluation metric values in much less time.  
Consequently, we will use LightGBM for the randomized search of optimal hyperparameters and use it for detecting feature importance.

In [None]:
from sklearn.model_selection import PredefinedSplit

In [None]:
# test_fold vector for PredefinedSplit: -1 marks rows that always stay in the
# training set, 0 marks rows belonging to the single validation fold.
# Built with one vectorised assignment; the previous `x in train_idx` test ran
# a linear scan of the index array for every row (quadratic in the row count).
split_index = np.zeros(X_train.shape[0], dtype=int)
split_index[train_idx] = -1

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space: number of boosting rounds and maximum tree depth.
param_distribs = {
        'n_estimators': randint(low=5, high=200),
        'max_depth': randint(low=10, high=100),
    }

lgbm_srch = LGBMRegressor()
# Single predefined train/validation fold taken from the stratified split.
pds = PredefinedSplit(test_fold = split_index)

# random_state fixes the 20 sampled parameter combinations so the search (and
# the reported best parameters) can be reproduced.
# NOTE: with the default refit=True, best_estimator_ is retrained on every row
# passed to fit(), i.e. on the validation rows as well.
rnd_search = RandomizedSearchCV(lgbm_srch, param_distributions=param_distribs,
                                n_iter=20, cv=pds, scoring='neg_mean_squared_error',
                                random_state=42)

rnd_search.fit(X_train, log_prices)

In [None]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# RandomizedSearchCV refits best_estimator_ on everything given to fit(), which
# here was all of X_train (held-out rows included). Scoring that model on
# test_data would therefore be optimistic. Re-fit an unfitted copy with the
# winning hyper-parameters on the training rows only before evaluating.
from sklearn.base import clone

best_lgbm_holdout = clone(rnd_search.best_estimator_)
best_lgbm_holdout.fit(train_data, log_train_prices)
print("Testing mean absolute error for best estimator is "+ str(mean_absolute_error(log_test_prices, best_lgbm_holdout.predict(test_data))))

In [None]:
feature_importances = rnd_search.best_estimator_.feature_importances_
feature_importances

In [None]:
list_of_features_shapes = [X_train_name, X_train_descp, X_train_brand, X_train_item_condition_id, X_train_shipping, X_train_cat_0, X_train_cat_1, X_train_cat_2]
list_of_features = ["name","item_description","brand_name","item_condition_id","shipping","cat_0","cat_1","cat_2"]

In [None]:
# Map each source feature to the INCLUSIVE (first_column, last_column) range it
# occupies in X_train. The blocks were hstack-ed in the order of
# list_of_features, so a running offset gives every block's position.
# The previous version gave the first block one column too many and shifted
# every later block by +1, so boundary columns were credited to the wrong
# feature and the last range ended past the final column.
cat_range = {}
column_offset = 0
for feature_name, feature_block in zip(list_of_features, list_of_features_shapes):
    block_width = feature_block.shape[1]
    cat_range[feature_name] = (column_offset, column_offset + block_width - 1)
    column_offset += block_width

# The blocks must tile the stacked matrix exactly.
assert column_offset == X_train.shape[1], "feature blocks do not add up to X_train's column count"


In [None]:
# For the 40 columns with the highest importance (in ascending order of
# importance), record which source feature's inclusive column range contains
# the column.
top_feature_columns = feature_importances.argsort()[-40:]
feature_imp_cat = [
    feature_name
    for feature_idx in top_feature_columns
    for feature_name, (first_column, last_column) in cat_range.items()
    if first_column <= feature_idx <= last_column
]

In [None]:
feature_imp_cat

It's clear that the categories are the most dominant when it comes to their importance in defining the price of the product.