In [None]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
# Load the training data from a tab-separated file.
train_df = pd.read_csv('../input/train.tsv', delimiter='\t', na_values={np.nan})

In [None]:
# Create a validation set
# random_state=42 keeps the split reproducible between runs.
# NOTE(review): train_df is rebound to the training part of the split, so
# re-running this cell without reloading the data splits the already
# reduced frame a second time.
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(train_df, random_state=42)

In [None]:
# Load the test data from a tab-separated file (same options as the training data).
test_df = pd.read_csv('../input/test.tsv', delimiter='\t', na_values={np.nan})

## Pre-processing

In [None]:
def replace_nan(df, columns):
    """ Return a copy of `df` in which NaN in the given columns becomes ''.

    The input frame is left untouched; columns not listed in `columns`
    are copied over unchanged.
    """
    result = df.copy()
    without_nan = result[columns].replace(np.nan, '')
    result[columns] = without_nan
    return result

In [None]:
def lower_case(df, columns):
    """ Return a copy of `df` with the given columns converted to lower case.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or list of str
        Name(s) of the columns to convert. Every value in these columns must
        provide a ``lower()`` method, so missing values have to be replaced
        by strings beforehand.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the selected columns lower-cased.
    """
    if isinstance(columns, str):
        columns = [columns]
    df_ = df.copy()
    # Series.map per column replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1; Series.map works on old and new pandas versions alike.
    for column in columns:
        df_[column] = df_[column].map(lambda x: x.lower())
    return df_

In [None]:
import unicodedata
def strip_accents(df, columns):
    """ Return a copy of `df` with accents removed from the given columns.

    Each value is decomposed with Unicode NFD normalization and every
    character of category 'Mn' (non-spacing mark) is dropped, so that for
    example 'é' becomes 'e'.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or list of str
        Name(s) of the string columns to clean.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the selected columns stripped of accents.
    """
    def without_marks(text):
        decomposed = unicodedata.normalize('NFD', text)
        return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

    if isinstance(columns, str):
        columns = [columns]
    df_ = df.copy()
    # Series.map per column replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1.
    for column in columns:
        df_[column] = df_[column].map(without_marks)
    return df_

In [None]:
def break_up_categories(df, num_sub_categories):
    """ Split 'category_name' on '/' into the columns cat_1 .. cat_<num_sub_categories>.

    A row whose category path has fewer levels than requested gets '' for
    the missing levels; levels beyond `num_sub_categories` are ignored.
    Returns a copy of `df` with the new columns added.
    """
    result = df.copy()
    split_paths = df['category_name'].apply(lambda path: path.split('/'))
    for level in range(num_sub_categories):
        result['cat_' + str(level + 1)] = [
            parts[level] if level < len(parts) else '' for parts in split_paths
        ]
    return result

In [None]:
import string
def remove_punctuation(df, columns):
    """ Return a copy of `df` with punctuation removed from the given columns.

    Every ASCII punctuation character except '-' is deleted. Note that '&'
    IS removed: it is part of the translation table below.
    NOTE(review): an earlier docstring said '&' was kept as well; the code
    never did that. Confirm which behaviour is intended before changing it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or list of str
        Name(s) of the string columns to clean.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with punctuation stripped from the selected columns.
    """
    translator = str.maketrans('', '', '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~')
    if isinstance(columns, str):
        columns = [columns]
    df_ = df.copy()
    # Series.map per column replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1.
    for column in columns:
        df_[column] = df_[column].map(lambda x: x.translate(translator))
    return df_

In [None]:
import spacy
# Shared spaCy pipeline used by lemmatize() below.
# NOTE(review): 'en' is a shortcut model name; newer spaCy releases may require
# the full package name (e.g. 'en_core_web_sm') -- confirm against the installed version.
nlp = spacy.load('en')
def lemmatize(df, column):
    """ Replace each value of `column` by the space-joined lemmas of its tokens.

    Every distinct value is run through the module-level `nlp` pipeline once,
    and the result is joined back onto a copy of `df`. Because the original
    column is dropped and the lemmatized one is added by the merge, `column`
    ends up as the last column of the returned frame.
    """
    lemma_column = column + '_new'

    def to_lemmas(text):
        return ' '.join(token.lemma_ for token in nlp(text))

    # Lemmatize the unique values only; rows get their result through the merge.
    lookup = pd.DataFrame(df[column].unique(), columns=[column])
    lookup[lemma_column] = lookup[column].apply(to_lemmas)
    merged = pd.merge(df.copy(), lookup, how='inner', on=column)
    return merged.drop(column, axis=1).rename(columns={lemma_column: column})

In [None]:
def keep_top_n_values(df, n, column):
    """ Keep only the `n` most frequent values of `column`; blank out the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    n : int
        Number of most frequent values to keep. Ties are resolved in favour
        of the value that appears first in the column.
    column : str
        Name of the column to filter.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` where every value of `column` outside the top `n` is
        replaced by ''.
    """
    # most_common() sorts by count, descending, keeping first-seen order for ties.
    ranked = Counter(df[column]).most_common()
    # A set gives constant-time membership tests in the per-row check below.
    top_n_values = {value for value, _ in ranked[:n]}
    df_ = df.copy()
    df_[column] = [value if value in top_n_values else '' for value in df_[column]]
    return df_

In [None]:
def preprocessing(df):
    """ Prepare a raw frame for feature encoding.

    Steps, in order:
      1. replace NaN by '' in 'category_name' and 'brand_name';
      2. lower-case those columns and strip accents from them;
      3. split 'category_name' into the sub-category columns cat_1..cat_3;
      4. remove punctuation from the sub-category columns;
      5. lemmatize cat_2 and cat_3;
      6. keep only the 100 most frequent cat_3 values (others become '').

    The top-100 selection in step 6 is computed on the frame passed in, so
    calling this on different frames can keep different cat_3 values.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least 'category_name' and 'brand_name' columns.
        It is not modified; every helper works on a copy.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the added cat_1..cat_3 columns.
    """
    # Need to do some cleaning on the columns 'category_name' and 'brand_name'
    columns = ['category_name', 'brand_name']
    # replace_nan already returns a copy, so no extra df.copy() is needed here.
    df_ = replace_nan(df, columns)
    df_ = lower_case(df_, columns)
    df_ = strip_accents(df_, columns)
    # Track 3 sub-categories
    num_sub_categories = 3
    df_ = break_up_categories(df_, num_sub_categories)
    cat_columns = ['cat_' + str(i) for i in range(1, num_sub_categories+1)]
    df_ = remove_punctuation(df_, cat_columns)
    # Perform lemmatization on the level-2 and level-3 sub-categories
    df_ = lemmatize(df_, 'cat_2')
    df_ = lemmatize(df_, 'cat_3')
    # Keep just the top 100 level-3 sub-categories
    df_ = keep_top_n_values(df_, 100, 'cat_3')
    return df_

In [None]:
# Run the same cleaning pipeline on each split.
# NOTE(review): each frame is processed independently, so the "top 100"
# cat_3 values kept by preprocessing() are computed per frame and may differ
# between train, validation and test.
train_df = preprocessing(train_df)
valid_df = preprocessing(valid_df)
test_df = preprocessing(test_df)

In [None]:
# Drop the columns that are not used as features: 'name', 'train_id',
# 'category_name' and 'item_description'. The test frame has no 'train_id',
# and its 'test_id' is kept because the submission file needs it.
unused_train_columns = ['name', 'train_id', 'category_name', 'item_description']
for labelled_frame in (train_df, valid_df):
    labelled_frame.drop(unused_train_columns, axis=1, inplace=True)
test_df.drop(['name', 'category_name', 'item_description'], axis=1, inplace=True)

In [None]:
# Preview the first rows of the preprocessed training frame.
train_df.head()

In [None]:
# Preview the first rows of the preprocessed test frame.
test_df.head()

## Feature encoding
For the feature `brand_name`, we encode each brand by the median, mode, mean, max, and min price observed for that brand in the training data, using a custom transformer; each encoding is added as a new column. If a brand in the test data or in future data was not seen during training, we use the corresponding encoded value for brandless items instead.

For the sub-categories, we apply one-hot encoding. We remove any new sub-categories in the test data.

In [None]:
class ValueEncoder():
    """ Feature value to aggregate target value encoder.

    For every column in `columns`, `fit` computes one aggregate of the target
    (median, mean, mode, max or min) per distinct value of that column in the
    training frame. `transform` then adds a column '<column>_<kind>' holding
    the aggregate that belongs to each row's value.

    Values that were not seen during `fit` are encoded with the aggregate of
    the '' (empty string) value when the training data contains one;
    otherwise the aggregate over the whole training target is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame containing `columns` and `target`.
    columns : list of str
        Names of the columns to encode.
    target : str
        Name of the target column to aggregate.
    """
    
    def __init__(self, df, columns, target):
        self.df_ = df
        self.columns_ = columns
        self.target_ = target
        self.mappings = dict()
        # Per-column fallback for unseen values, filled by fit() only when
        # the training data has no '' entry to fall back on.
        self.fallbacks_ = dict()
    
    @staticmethod
    def _most_frequent(values):
        """ Return the most frequent entry of a Series."""
        return values.value_counts().index[0]
    
    def _overall_aggregate(self, agg_name):
        """ Aggregate the whole training target; NaN if no mode exists."""
        target = self.df_[self.target_]
        if agg_name == 'mode':
            counts = target.value_counts()
            return counts.index[0] if len(counts) else float('nan')
        return getattr(target, agg_name)()
    
    def __mapper(self, column, key):
        """ Look up the encoded value for `key`, falling back for unseen keys."""
        mapping = self.mappings[column]
        try:
            return mapping[key]
        except KeyError:
            pass
        if '' in mapping:
            return mapping['']
        # No '' entry in the training data: use the overall aggregate rather
        # than failing with a KeyError on '' (KeyError if fit() stored none).
        return self.fallbacks_[column]
    
    def fit(self, kind):
        """ Learn the value -> aggregate mapping for every configured column.

        Parameters
        ----------
        kind : str
            One of 'median', 'mean', 'mode', 'max' or 'min'. Any other value
            is treated as 'median', but is still used verbatim in the name
            of the column added by `transform`.
        """
        self.kind_ = kind
        known_kinds = ('median', 'mean', 'mode', 'max', 'min')
        agg_name = kind if kind in known_kinds else 'median'
        for column in self.columns_:
            grouped = self.df_[[column, self.target_]].groupby(column)
            if agg_name == 'mode':
                aggregated = grouped.agg(self._most_frequent)
            else:
                aggregated = getattr(grouped, agg_name)()
            mapping = aggregated.to_dict()[self.target_]
            self.mappings[column] = mapping
            if '' not in mapping:
                self.fallbacks_[column] = self._overall_aggregate(agg_name)
    
    def transform(self, df):
        """ Return a copy of `df` with one '<column>_<kind>' column per encoded column.

        `fit` must have been called before; the input frame is not modified.
        """
        tmp_df = df.copy() 
        for column in self.columns_:
            tmp_df[column + '_' + self.kind_] = [self.__mapper(column, x) for x in tmp_df[column]]
        return tmp_df

In [None]:
# Encode brand_name by the median training price of each brand.
median_brand_encoder = ValueEncoder(df=train_df, columns=['brand_name'], target='price')
median_brand_encoder.fit(kind='median')

In [None]:
# Encode brand_name by the mean training price of each brand.
mean_brand_encoder = ValueEncoder(df=train_df, columns=['brand_name'], target='price')
mean_brand_encoder.fit(kind='mean')

In [None]:
# Encode brand_name by the most frequent training price of each brand.
mode_brand_encoder = ValueEncoder(df=train_df, columns=['brand_name'], target='price')
mode_brand_encoder.fit(kind='mode')

In [None]:
# Encode brand_name by the highest training price of each brand.
max_brand_encoder = ValueEncoder(df=train_df, columns=['brand_name'], target='price')
max_brand_encoder.fit(kind='max')

In [None]:
# Encode brand_name by the lowest training price of each brand.
min_brand_encoder = ValueEncoder(df=train_df, columns=['brand_name'], target='price')
min_brand_encoder.fit(kind='min')

In [None]:
# Add the five brand price encodings to the training frame, one encoder at a time.
for brand_encoder in (median_brand_encoder, mean_brand_encoder, mode_brand_encoder,
                      max_brand_encoder, min_brand_encoder):
    train_df = brand_encoder.transform(train_df)

In [None]:
# Add the five brand price encodings (learned on the training frame) to the validation frame.
for brand_encoder in (median_brand_encoder, mean_brand_encoder, mode_brand_encoder,
                      max_brand_encoder, min_brand_encoder):
    valid_df = brand_encoder.transform(valid_df)

In [None]:
# Add the five brand price encodings (learned on the training frame) to the test frame.
for brand_encoder in (median_brand_encoder, mean_brand_encoder, mode_brand_encoder,
                      max_brand_encoder, min_brand_encoder):
    test_df = brand_encoder.transform(test_df)

In [None]:
# Inspect 10 randomly sampled training rows, now including the brand price encodings.
train_df.sample(10)

In [None]:
# Preview the test frame after the brand price encodings were added.
test_df.head()

In [None]:
class OneHotEncoders:
    """ Use pandas get_dummies to perform one-hot encoding.

    The encoder is bound to a training frame. `fit_transform` encodes that
    frame and remembers the resulting columns; `transform` encodes another
    frame so that its dummy columns match the training ones in both content
    and order.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame.
    columns : list of str
        Names of the categorical columns to one-hot encode.
    """
    
    def __init__(self, df, columns):
        self.df_ = df
        self.columns_ = columns
    
    def fit_transform(self):
        """ One-hot encode the training frame and remember its columns."""
        self.df_transformed = pd.get_dummies(self.df_, columns=self.columns_)
        return self.df_transformed
    
    def transform(self, df, ignore=None):
        """ One-hot encode `df` with the dummy columns learned from the training frame.

        The returned frame contains, in this order:
          1. the columns of `df` that are not encoded, in their original order;
          2. every dummy column of the training frame, in training order
             (filled with 0 where `df` does not contain the category);
          3. dummy columns unseen during training, only if listed in `ignore`.

        Dummy columns for categories unseen during training are dropped
        unless they are listed in `ignore`. Non-encoded training columns that
        `df` lacks (for example the target) are NOT added. Keeping the dummy
        columns in training order matters for consumers that match features
        by position rather than by name.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to encode; it is not modified.
        ignore : list of str, optional
            Names of unseen dummy columns that should be kept anyway.

        Returns
        -------
        pandas.DataFrame
            The encoded frame. `fit_transform` must have been called before.
        """
        if ignore is None:
            ignore = []
        encoded = pd.get_dummies(df, columns=self.columns_)
        passthrough = [c for c in encoded.columns if c in df.columns]
        # Dummy columns of the training frame are the ones get_dummies created,
        # i.e. those that did not exist in the raw training frame.
        fitted_dummies = [c for c in self.df_transformed.columns
                          if c not in self.df_.columns and c not in df.columns]
        known = set(fitted_dummies)
        kept_unseen = [c for c in encoded.columns
                       if c not in df.columns and c not in known and c in ignore]
        ordered = passthrough + fitted_dummies + kept_unseen
        # reindex drops unseen dummies and adds the missing ones as all-zero columns.
        return encoded.reindex(columns=ordered, fill_value=0)

In [None]:
# One-hot encoder for the three sub-category columns, bound to the training frame.
ohe = OneHotEncoders(df=train_df, columns=['cat_1', 'cat_2', 'cat_3'])

In [None]:
# Encode the training frame first (this fixes the set of dummy columns), then
# encode validation and test against it.
# NOTE(review): ignore=['test_id'] looks redundant -- transform() never drops
# columns that already exist in the frame it is given.
train_df_trans = ohe.fit_transform()
valid_df_trans = ohe.transform(valid_df)
test_df_trans = ohe.transform(test_df, ignore=['test_id'])

In [None]:
# Preview the one-hot encoded training frame.
train_df_trans.head()

In [None]:
# Features: everything except the raw brand name and the target.
X_train = train_df_trans.drop(['brand_name', 'price'], axis=1)
# Select exactly the training feature columns, in training order. A plain
# drop() would keep whatever column order the encoder produced for the
# validation frame, and the model is fed the columns by position.
X_valid = valid_df_trans.reindex(columns=X_train.columns, fill_value=0)
# Targets.
y_train = train_df_trans['price']
y_valid = valid_df_trans['price']

## Model training

In [None]:
import lightgbm as lgb

In [None]:
# Wrap the feature matrices and targets as LightGBM datasets; the validation
# set is created with the training set as its reference.
train_data = lgb.Dataset(X_train, label=y_train)
eval_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

In [None]:
# Hyper-parameters handed to lgb.train() in the next cell.
# Note that 'metric' is given as a one-element set.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2_root'},
    'max_bin': 1000,
    'max_depth': 20,
    'num_leaves': 80,
    'learning_rate': 0.05,
    'feature_fraction': 1,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 2
}

In [None]:
# Train for at most 500 boosting rounds and stop once the validation metric
# has not improved for 30 rounds. Early stopping is requested through the
# lgb.early_stopping callback because the `early_stopping_rounds` keyword of
# lgb.train() was removed in LightGBM 4.0; the callback form works with
# older releases as well.
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[eval_data],
    callbacks=[lgb.early_stopping(stopping_rounds=30)],
)

In [None]:
# List the feature names from most to least important according to the trained model.
importance_order = np.argsort(gbm.feature_importance())[::-1]
for feature_name in X_train.columns[importance_order]:
    print(feature_name)

In [None]:
# Select exactly the training feature columns, in training order. This leaves
# out 'test_id' and 'brand_name' as before, and also any other column the
# model was not trained on (e.g. a placeholder 'price' column added while
# aligning the test frame with the training frame).
X_test = test_df_trans.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Predict a price for every test row and store it on the test frame.
test_df['price'] = gbm.predict(X_test)

In [None]:
# Prices cannot be negative: replace negative predictions by 0.
predicted_price = test_df['price']
test_df['price'] = predicted_price.apply(lambda p: p if not p < 0 else 0)

In [None]:
# Write the submission file: one row per test item with its id and predicted price.
test_df[['test_id', 'price']].to_csv('basic_model_submission.csv', index=False)