In [1]:
import math
from nltk.corpus import stopwords
from keras import backend as K
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, Dropout, Dense, concatenate, GRU, Embedding, Flatten, Activation
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from datetime import datetime
start_real = datetime.now()


# from keras.layers import Bidirectional
# set seed
np.random.seed(123)

# Defining RMSLE Error Functions #
This will be used to check model accuracy after preductions are made on the train/test datasets

In [2]:
def rmsle(Y, Y_pred):
    assert Y.shape == Y_pred.shape
    return np.sqrt(np.mean(np.square(Y_pred - Y)))

# Loading Datasets #
1. Loading Data
2. Cleaning Datasets
3. Splitting Data into Test/Train

In [3]:
train_df = pd.read_table('Dataset/train.tsv')
test_df = pd.read_table('Dataset/test.tsv')
print(train_df.shape, test_df.shape)

(1482535, 8) (693359, 7)


### Pre-processing & Cleaning Datasets

Removing low prices, anything below 3. Postinggs below 3 are likely to be an error. Removing them helps the models.

In [4]:
# remove low prices
train_df = train_df.drop(train_df[(train_df.price < 3.0)].index)
train_df.shape

(1481661, 8)

Removing Stop-words from the dataset

In [5]:
stop = stopwords.words('english')
train_df["item_description"].fillna(value='No description yet', inplace=True)
train_df['item_description'] = train_df['item_description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
train_df["name"].fillna(value="missing", inplace=True)
train_df['name'] = train_df['name'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

test_df["item_description"].fillna(value='No description yet', inplace=True)
test_df['item_description'] = test_df['item_description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
test_df["name"].fillna(value="missing", inplace=True)
test_df['name'] = test_df['name'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard great condition works like came ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top hint lace key hole back! The pale...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New tags. Leather horses. Retail [rm] each. St...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete certificate authenticity


The length of the description, that is the raw number of words used, does have some correlation with price. The RNN might find this out on it's own, but since a max depth is used to save computations, it does not always know. Description length clearly helps the model, name length maybe not so much. Does not hurt the models so leaving name length in.

In [6]:
# get name and description lengths
def wordCount(text):
    try:
        if text == 'No description yet':
            return 0
        else:
            text = text.lower()
            words = [w for w in text.split(" ")]
            return len(words)
    except:
        return 0

train_df['desc_len'] = train_df['item_description'].apply(lambda x: wordCount(x))
test_df['desc_len'] = test_df['item_description'].apply(lambda x: wordCount(x))
train_df['name_len'] = train_df['name'].apply(lambda x: wordCount(x))
test_df['name_len'] = test_df['name'].apply(lambda x: wordCount(x))
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,desc_len,name_len
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,0,7
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard great condition works like came ...,21,4
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top hint lace key hole back! The pale...,16,2
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New tags. Leather horses. Retail [rm] each. St...,22,3
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete certificate authenticity,3,4


Here we split the category_name into 3 parts. The RNN models can get more information this way.

In [7]:
# split category name into 3 parts
def split_cat(text):
    try:
        return text.split("/")
    except:
        return ("No Label", "No Label", "No Label")


train_df['subcat_0'], train_df['subcat_1'], train_df['subcat_2'] = \
    zip(*train_df['category_name'].apply(lambda x: split_cat(x)))
test_df['subcat_0'], test_df['subcat_1'], test_df['subcat_2'] = \
    zip(*test_df['category_name'].apply(lambda x: split_cat(x)))

The brand name data is sparse, missing over 600,000 values. This gets *some* of those values back by checking their names. An *exact* name match against all_brand names will help finding some of the missing values. We can be pretty confident in these. At the other extreme, we can search for *any* matches throughout all words in name. This finds over 200,000 but a lot of these are incorrect. Can land somewhere in the middle by either keeping cases or trimming out some of the 5000 brand names.

For example, PINK is a brand by victoria secret. If we remove case, then almost all *pink* items are labeled as PINK brand. The other issue is that some of the "brand names" are not brands but really categories like "Boots" or "Keys". 

Currently, checking every word in name of a case-sensitive match does best. This gets around 137,000 finds while avoiding the problems with brands like PINK.

In [8]:
# attempt to find missing brand names
# train_df['name'] = train_df.name.str.lower()
# train_df['brand_name'] = train_df.brand_name.str.lower()
# test_df['name'] = test_df.name.str.lower()
# test_df['brand_name'] = test_df.brand_name.str.lower()
full_set = pd.concat([train_df, test_df])
all_brands = set(full_set['brand_name'].values)
train_df.brand_name.fillna(value="missing", inplace=True)
test_df.brand_name.fillna(value="missing", inplace=True)

# get to finding!
premissing = len(train_df.loc[train_df['brand_name'] == 'missing'])


def brandfinder(line):
    brand = line[0]
    name = line[1]
    namesplit = name.split(' ')
    if brand == 'missing':
        for x in namesplit:
            if x in all_brands:
                return name
    if name in all_brands:
        return name
    return brand


train_df['brand_name'] = train_df[['brand_name', 'name']].apply(brandfinder, axis=1)
test_df['brand_name'] = test_df[['brand_name', 'name']].apply(brandfinder, axis=1)
found = premissing-len(train_df.loc[train_df['brand_name'] == 'missing'])
print(found)

137314


In [9]:
# Scale target variable to log.
train_df["target"] = np.log1p(train_df.price)

# Split training examples into train/dev examples.
train, test = train_test_split(train_df, random_state=123, train_size=0.8)

# Calculate number of train/dev/test examples.
n_trains = train.shape[0]
n_devs = test.shape[0]
n_tests = test_df.shape[0]
print("Training on", n_trains, "examples")
print("Validating on", n_devs, "examples")
print("Testing on", n_tests, "examples")

Training on 1185328 examples
Validating on 296333 examples
Testing on 693359 examples


# RNN Model fitting:
1. Preprocessing data
2. Define RNN model
3. Fitting RNN model on training examples
4. Evaluating RNN model on dev examples
5. Make prediction for test data using RNN model

In [10]:
# Concatenate train - dev - test data for easy to handle
full_df = pd.concat([train, test])

In [11]:
full_df

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,desc_len,name_len,subcat_0,subcat_1,subcat_2,target
1388738,1388738,Bath Body Works aromatherapy lotion.,1,Beauty/Skin Care/Body,Bath & Body Works,11.0,0,Bath Body Works aromatherapy body lotion. BRAN...,18,5,Beauty,Skin Care,Body,2.484907
1077160,1077160,Step Up 2&3,1,Electronics/Media/Blu-Ray,Step Up 2&3,16.0,1,Step Up 2 The Streets Step Up 3 Both new,10,3,Electronics,Media,Blu-Ray,2.833213
1270135,1270135,Lululemon Tank,3,Sports & Outdoors/Exercise/Athletic Training,Lululemon Tank,10.0,1,"Teal lulu top, back splits I cut long tag neck",10,2,Sports & Outdoors,Exercise,Athletic Training,2.397895
834631,834631,ON HOLD FOR JLOVE,2,Vintage & Collectibles/Clothing/Shorts,Maurices,40.0,1,"Jean shorts Maurice's, Jean shorts hollister, ...",20,4,Vintage & Collectibles,Clothing,Shorts,3.713572
762041,762041,Nike Shox,2,Women/Shoes/Athletic,Nike,50.0,1,"Women's 9, practically brand new worn like 2 t...",14,2,Women,Shoes,Athletic,3.931826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
869112,869112,"God, Goals, Grind Graphic Tee",2,Women/Tops & Blouses/T-Shirts,missing,9.0,0,"God, Goals, Grind Graphic Tee size medium. Wor...",16,5,Women,Tops & Blouses,T-Shirts,2.302585
889051,889051,"Benefit ""fakeup"" concealer stick",3,Beauty/Makeup/Face,Benefit,8.0,0,Hydrating concealer color 01 light. Used sanit...,7,4,Beauty,Makeup,Face,2.197225
812733,812733,**Bundle HoneyLove**,1,"Women/Athletic Apparel/Pants, Tights, Leggings",missing,60.0,1,BNWT Lularoe TC Leggings Bundle Mint purple Fl...,37,2,Women,Athletic Apparel,"Pants, Tights, Leggings",4.110874
1210108,1210108,Res!! M pink style lace choker bralette,1,Women/Other/Other,Res!! M pink style lace choker bralette,14.0,1,Brand new Small-XL Black Wine red,6,7,Women,Other,Other,2.708050


### Fill missing data
Note that replacing 'No description yet' with "missing" helps the model a bit by treating it the same as the NA values

In [12]:
# Filling missing values
def fill_missing_values(df):
    df["category_name"].fillna(value="missing", inplace=True)
    df["brand_name"].fillna(value="missing", inplace=True)
    df["item_description"].fillna(value="missing", inplace=True)
    df["item_description"].replace('No description yet', "missing", inplace=True)
    return df


print("Filling missing data...")
full_df = fill_missing_values(full_df)
print(full_df.loc[:,"category_name"])

Filling missing data...
1388738                             Beauty/Skin Care/Body
1077160                         Electronics/Media/Blu-Ray
1270135      Sports & Outdoors/Exercise/Athletic Training
834631             Vintage & Collectibles/Clothing/Shorts
762041                               Women/Shoes/Athletic
                                ...                      
869112                      Women/Tops & Blouses/T-Shirts
889051                                 Beauty/Makeup/Face
812733     Women/Athletic Apparel/Pants, Tights, Leggings
1210108                                 Women/Other/Other
670308     Home/Kitchen & Dining/Coffee & Tea Accessories
Name: category_name, Length: 1481661, dtype: object


### Encoding Categorical Data

In [13]:
print("Processing categorical data...")
le = LabelEncoder()
# full_df.category = full_df.category_name
le.fit(full_df.category_name)
full_df['category'] = le.transform(full_df.category_name)

le.fit(full_df.brand_name)
full_df.brand_name = le.transform(full_df.brand_name)

le.fit(full_df.subcat_0)
full_df.subcat_0 = le.transform(full_df.subcat_0)

le.fit(full_df.subcat_1)
full_df.subcat_1 = le.transform(full_df.subcat_1)

le.fit(full_df.subcat_2)
full_df.subcat_2 = le.transform(full_df.subcat_2)

# del le

Processing categorical data...


### Pre-Processing Text Data

Transforming Text Data into Tokens & then Sequences of numbers would help in fitting the RNN model

In [14]:
print("Transforming text data to sequences...")
raw_text = np.hstack([full_df["item_description"].str.lower(), full_df["name"].str.lower(), full_df["category_name"].str.lower()])

print("Fitting tokenizer...")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)

print("Transforming text to sequences...")
full_df['seq_item_description'] = tok_raw.texts_to_sequences(full_df["item_description"].str.lower())
full_df['seq_name'] = tok_raw.texts_to_sequences(full_df["name"].str.lower())
# full_df['seq_category'] = tok_raw.texts_to_sequences(full_df.category_name.str.lower())

del tok_raw

Transforming text data to sequences...
Fitting tokenizer...
Transforming text to sequences...


In [15]:
display(raw_text)
display(full_df.head())

array(['bath body works aromatherapy body lotion. brand new full sized. sealed 6.5 fl oz. glass bottle brand new!!!',
       'step up 2 the streets step up 3 both new',
       'teal lulu top, back splits i cut long tag neck', ...,
       'women/athletic apparel/pants, tights, leggings',
       'women/other/other',
       'home/kitchen & dining/coffee & tea accessories'], dtype=object)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,desc_len,name_len,subcat_0,subcat_1,subcat_2,target,category,seq_item_description,seq_name
1388738,1388738,Bath Body Works aromatherapy lotion.,1,Beauty/Skin Care/Body,15100,11.0,0,Bath Body Works aromatherapy body lotion. BRAN...,18,5,0,90,99,2.484907,35,"[229, 100, 221, 4458, 100, 619, 5, 2, 122, 105...","[229, 100, 221, 4458, 619]"
1077160,1077160,Step Up 2&3,1,Electronics/Media/Blu-Ray,104785,16.0,1,Step Up 2 The Streets Step Up 3 Both new,10,3,1,65,95,2.833213,95,"[1864, 222, 10, 38, 13226, 1864, 222, 23, 396, 2]","[1864, 222, 10, 23]"
1270135,1270135,Lululemon Tank,3,Sports & Outdoors/Exercise/Athletic Training,68376,10.0,1,"Teal lulu top, back splits I cut long tag neck",10,2,8,37,33,2.397895,890,"[690, 2568, 46, 101, 11505, 6, 303, 108, 202, ...","[315, 120]"
834631,834631,ON HOLD FOR JLOVE,2,Vintage & Collectibles/Clothing/Shorts,71248,40.0,1,"Jean shorts Maurice's, Jean shorts hollister, ...",20,4,9,27,695,3.713572,1016,"[614, 85, 3592, 614, 85, 549, 57, 431, 85, 311...","[153, 227, 91, 47751]"
762041,762041,Nike Shox,2,Women/Shoes/Athletic,82094,50.0,1,"Women's 9, practically brand new worn like 2 t...",14,2,10,88,31,3.931826,1203,"[35, 136, 2409, 5, 2, 19, 42, 10, 130, 81, 7, ...","[77, 5423]"


In [21]:
# Define the Keras TensorBoard callback.logdir="logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
%load_ext tensorboard
from datetime import datetime
from tensorflow import keras

# Define the Keras TensorBoard callback.
logdir="logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


### Define constants to use when define RNN model
Note the comments next to the first few lines indicate the longest entry in that column. Just for reference.

In [17]:
MAX_NAME_SEQ = 10  # 17
MAX_ITEM_DESC_SEQ = 75  # 269
MAX_CATEGORY_SEQ = 8  # 8
MAX_TEXT = np.max([
    np.max(full_df["seq_name"].max()),
    np.max(full_df["seq_item_description"].max()),
    #     np.max(full_df.seq_category.max()),
]) + 100
MAX_CATEGORY = np.max(full_df["category"].max()) + 1
MAX_BRAND = np.max(full_df["brand_name"].max()) + 1
MAX_CONDITION = np.max(full_df["item_condition_id"].max()) + 1
MAX_DESC_LEN = np.max(full_df["desc_len"].max()) + 1
MAX_NAME_LEN = np.max(full_df["name_len"].max()) + 1
MAX_SUBCAT_0 = np.max(full_df["subcat_0"].max()) + 1
MAX_SUBCAT_1 = np.max(full_df["subcat_1"].max()) + 1
MAX_SUBCAT_2 = np.max(full_df["subcat_2"].max()) + 1

### Getting Data for RNN Model

In [18]:
# Converting the datasets from a Pandas dataframe to a dictionary to train the RNN models...

def get_rnn_data(dataset):
    X = {
        'name': pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ),
        'item_desc': pad_sequences(dataset.seq_item_description, maxlen=MAX_ITEM_DESC_SEQ),
        'brand_name': np.array(dataset.brand_name),
        'category': np.array(dataset.category),
        #         'category_name': pad_sequences(dataset.seq_category, maxlen=MAX_CATEGORY_SEQ),
        'item_condition': np.array(dataset.item_condition_id),
        'num_vars': np.array(dataset[["shipping"]]),
        'desc_len': np.array(dataset[["desc_len"]]),
        'name_len': np.array(dataset[["name_len"]]),
        'subcat_0': np.array(dataset.subcat_0),
        'subcat_1': np.array(dataset.subcat_1),
        'subcat_2': np.array(dataset.subcat_2),
    }
    return X


train = full_df[:n_trains]
test = full_df[n_trains:n_trains+n_devs]
# test = full_df[n_trains+n_devs:]

X_train = get_rnn_data(train)
Y_train = train["target"].values.reshape(-1, 1)

X_test = get_rnn_data(test)
Y_test = test["target"].values.reshape(-1, 1)

# X_test = get_rnn_data(test)

### Defining the RNN Model

In [19]:
# set seed again in case testing models adjustments by looping next 2 blocks
np.random.seed(123)


def new_rnn_model(lr=0.001, decay=0.0):
    # Inputs
    name = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand_name = Input(shape=[1], name="brand_name")
#     category = Input(shape=[1], name="category")
#     category_name = Input(shape=[X_train["category_name"].shape[1]], name="category_name")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    desc_len = Input(shape=[1], name="desc_len")
    name_len = Input(shape=[1], name="name_len")
    subcat_0 = Input(shape=[1], name="subcat_0")
    subcat_1 = Input(shape=[1], name="subcat_1")
    subcat_2 = Input(shape=[1], name="subcat_2")

    # Embeddings layers (adjust outputs to help model)
    emb_name = Embedding(MAX_TEXT, 20)(name)
    emb_item_desc = Embedding(MAX_TEXT, 60)(item_desc)
    emb_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
#     emb_category_name = Embedding(MAX_TEXT, 20)(category_name)
#     emb_category = Embedding(MAX_CATEGORY, 10)(category)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    emb_desc_len = Embedding(MAX_DESC_LEN, 5)(desc_len)
    emb_name_len = Embedding(MAX_NAME_LEN, 5)(name_len)
    emb_subcat_0 = Embedding(MAX_SUBCAT_0, 10)(subcat_0)
    emb_subcat_1 = Embedding(MAX_SUBCAT_1, 10)(subcat_1)
    emb_subcat_2 = Embedding(MAX_SUBCAT_2, 10)(subcat_2)

    # rnn layers (GRUs are faster than LSTMs and speed is important here)
    rnn_layer1 = GRU(16)(emb_item_desc)
    rnn_layer2 = GRU(8)(emb_name)
#     rnn_layer3 = GRU(8) (emb_category_name)

    # main layers
    main_l = concatenate([
        Flatten()(emb_brand_name)  
        # , Flatten() (emb_category)
        , Flatten()(emb_item_condition)
        , Flatten()(emb_desc_len)
        , Flatten()(emb_name_len)
        , Flatten()(emb_subcat_0)
        , Flatten()(emb_subcat_1)
        , Flatten()(emb_subcat_2)
        , rnn_layer1
        , rnn_layer2  
        # , rnn_layer3
        , num_vars
    ])
    # (incressing the nodes or adding layers does not effect the time quite as much as the rnn layers)
    main_l = Dropout(0.1)(Dense(512, kernel_initializer='normal', activation='relu')(main_l))
    main_l = Dropout(0.1)(Dense(256, kernel_initializer='normal', activation='relu')(main_l))
    main_l = Dropout(0.1)(Dense(128, kernel_initializer='normal', activation='relu')(main_l))
    main_l = Dropout(0.1)(Dense(64, kernel_initializer='normal', activation='relu')(main_l))

    # the output layer.
    output = Dense(1, activation="linear")(main_l)

    model = Model([name, item_desc, brand_name, item_condition, num_vars, desc_len, name_len, subcat_0, subcat_1, subcat_2], output)

    optimizer = Adam(lr=lr, decay=decay)
    # (mean squared error loss function works as well as custom functions)
    model.compile(loss='mse', optimizer=optimizer)

    return model

model = new_rnn_model()
model.summary()
del model

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 brand_name (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 item_condition (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 desc_len (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 name_len (InputLayer)          [(None, 1)]          0           []                               
                                                                                              

  super().__init__(name, **kwargs)


### Fitting RNN model to train data

In [22]:
# Set hyper parameters for the model.
BATCH_SIZE = 512 * 3
epochs = 2

# Calculate learning rate decay.


def exp_decay(init, fin, steps): return (init/fin)**(1/(steps-1)) - 1


steps = int(len(X_train['name']) / BATCH_SIZE) * epochs
lr_init, lr_fin = 0.005, 0.001
lr_decay = exp_decay(lr_init, lr_fin, steps)

# Create model and fit it with training dataset.
rnn_model = new_rnn_model(lr=lr_init, decay=lr_decay)
rnn_model.fit(X_train, Y_train, epochs=epochs, batch_size=BATCH_SIZE, validation_data=(X_test, Y_test), verbose=1, callbacks=[tensorboard_callback],)

#callbacks=[tensorboard_callback])

Epoch 1/2


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/2


<keras.callbacks.History at 0x198b0a1e2d0>

### Evaluating RNN Model on Train & Test Data

In [None]:
# Evaluating Accuracy on Test Dataset
print("Evaluating the model on validation data...")
Y_test_preds_rnn = rnn_model.predict(X_test, batch_size=BATCH_SIZE)
print(" RMSLE error:", rmsle(Y_test, Y_test_preds_rnn))

Evaluating the model on validation data...
 RMSLE error: 0.4477298881579948


In [None]:
# Evaluating Accuracy on Training Dataset
print("Evaluating the model on validation data...")
Y_train_preds_rnn = rnn_model.predict(X_train, batch_size=BATCH_SIZE)
print(" RMSLE error:", rmsle(Y_train, Y_train_preds_rnn))

Evaluating the model on validation data...
 RMSLE error: 0.3799419746089792
