In [1]:
import os
import sys
import re
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
import sklearn

In [2]:
tf.__version__

'1.12.0'

## Data Preprocessing

In [3]:
FILEPATH = "../data/train_old.csv" # train data path

In [4]:
# load data
data = pd.read_csv(FILEPATH)

In [5]:
print("Data Shape:", data.shape)
print("Columns:", data.columns)

Data Shape: (384048, 4)
Columns: Index(['title', 'brand', 'model', 'model_len'], dtype='object')


In [6]:
data.head(5)

Unnamed: 0,title,brand,model,model_len
0,Sunrise 8 inch Ultra Thin With 15 inch Brass A...,Sunrise,8 inch Ultra Thin With 15 inch,7
1,Polytuf Overhead 4 Inches Chrome Plated Shower...,Polytuf,Overhead 4 Inches Chrome Plated,5
2,Kitsch SS Square 600mm Long 24 Shower Arm For ...,Kitsch,SS Square Long 24 Shower Arm For,7
3,Sunrise 6 Inch Ultra Thin With 18inch Showerar...,Sunrise,6 Inch Ultra Thin With 18inch Showerarm,7
4,ZakTag WATER BABY Dancing bubbles 18 inch Groo...,ZakTag,WATER BABY Dancing bubbles 18 inch,6


In [7]:
# check for null or missing values
data.isnull().sum()

title           0
brand           3
model        8241
model_len       0
dtype: int64

In [8]:
# discard the rows that have no brand (row-wise drop, "any" semantics are the dropna defaults)
data = data.dropna(subset=["brand"])

In [9]:
# select rows where model name is present in title
# data = data[data.present_in_title == True]

In [10]:
data.shape

(384045, 4)

In [11]:
# keep rows where model name is present in description
# data = data[data.present_in_description == True]

In [12]:
# title length statistics, measured in whitespace-separated tokens:
# prints mean, median and maximum
title_lens = data["title"].map(lambda title: len(title.split()))
print(np.mean(title_lens), np.median(title_lens), max(title_lens))

8.365397284172428 8.0 77


In [13]:
# desc_lens = data.description.apply(lambda x: len(x.split()))
# print(np.mean(desc_lens), np.median(desc_lens), max(desc_lens))

In [14]:
# data["model_len"] = data.model.apply(lambda x: len(x.split()))
# data.head(5)

In [15]:
# keep only the rows whose model name is at most 5 tokens long
short_model_mask = data["model_len"] <= 5
data = data.loc[short_model_mask]
data.shape

(309239, 4)

In [16]:
data.head(5)

Unnamed: 0,title,brand,model,model_len
1,Polytuf Overhead 4 Inches Chrome Plated Shower...,Polytuf,Overhead 4 Inches Chrome Plated,5
5,Bael Wellness BAELTTO30ML,Bael Wellness,BAELTTO30ML,1
6,Shubhpuja Ganesh Laxmi Showpiece 10 cm,Shubhpuja,Ganesh Laxmi,2
7,Vamaa SGPS300MM Linux 26 ARM A9 Dual Core 0 M...,Vamaa,SGPS300MM,1
8,eGlobal PAINTZOOM02 SMSPZGEP76 Airless Sprayer,eGlobal,PAINTZOOM02,1


In [17]:
# Build a token -> occurrence-count table over all titles.
# Everything except ASCII letters, digits and spaces is stripped from a title
# before it is split on whitespace.
word_frequency_mapping = {}
for raw_title in data["title"]:
    cleaned_title = re.sub("[^a-zA-Z0-9 ]", "", raw_title)
    for token in cleaned_title.split():
        # first sighting starts at 1, later sightings increment the count
        word_frequency_mapping[token] = word_frequency_mapping.get(token, 0) + 1

In [18]:
print("Number of unique words = ", len(word_frequency_mapping))

Number of unique words =  151709


In [19]:
# order the (word, count) pairs from most to least frequent;
# the sort is stable, so words with equal counts keep their first-seen order
sorted_word_frequency_mapping = sorted(
    word_frequency_mapping.items(),
    key=lambda pair: -pair[1],
)
len(sorted_word_frequency_mapping)

151709

In [20]:
# Vocabulary for the word embedding.
# Ids 0 and 1 are reserved for padding and unknown words; real words receive
# ids from 2 upward, in descending-frequency order.
word2idx = {"<PAD>":0, "<UNK>": 1}
for word_id, (word, _count) in enumerate(sorted_word_frequency_mapping, start=2):
    word2idx[word] = word_id
# check the length of the vocab
len(word2idx)

151711

In [21]:
# inverse lookup: vocabulary id -> word
idx2word = {word_id: word for word, word_id in word2idx.items()}
len(idx2word)

151711

In [22]:
# alphabet known to the character-level encoder: ASCII letters, digits and
# five punctuation marks, stored as a list of single-character strings
chars = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,_!.")
len(chars)

67

In [23]:
# character -> id table; 0 is reserved for padding and 1 for unknown characters,
# the known characters take ids 2, 3, ... in the order they appear in `chars`
char2idx = {"<UNK>": 1, "<PAD>": 0}
char2idx.update((ch, char_id) for char_id, ch in enumerate(chars, start=2))
# len(char2idx)

In [24]:
# inverse lookup: character id -> character
idx2char = {char_id: ch for ch, char_id in char2idx.items()}
len(idx2char)

69

In [25]:
data.head(3)

Unnamed: 0,title,brand,model,model_len
1,Polytuf Overhead 4 Inches Chrome Plated Shower...,Polytuf,Overhead 4 Inches Chrome Plated,5
5,Bael Wellness BAELTTO30ML,Bael Wellness,BAELTTO30ML,1
6,Shubhpuja Ganesh Laxmi Showpiece 10 cm,Shubhpuja,Ganesh Laxmi,2


In [26]:
# Encode every title as a fixed (max_len x max_char_len) grid of character ids.
#   - only the first `max_len` tokens of a title and the first `max_char_len`
#     characters of a token are kept;
#   - grid positions past the end of the title / token are filled with <PAD>;
#   - a character that is present but missing from char2idx is encoded as <UNK>.
# The previous version used a bare `except:` that turned such characters into
# <PAD>, so <UNK> was never emitted and unknown characters were
# indistinguishable from padding. NOTE: this changes the encoded inputs, so a
# model trained on the old encoding should be retrained.
x_char = list()
max_len = 16
max_char_len = 8
char_pad_id = char2idx["<PAD>"]
char_unk_id = char2idx["<UNK>"]
for title in data["title"]:
    # raw string: `\|` is an invalid escape sequence in a normal string literal
    text = re.sub(r"[^a-zA-Z0-9,-;.!?:’’’/\|_@#$%ˆ&*˜‘+-=' ]", "", title)
    tokens = text.split()
    total_token = list()
    for k in range(max_len):
        # titles shorter than max_len contribute all-padding rows
        word = tokens[k] if k < len(tokens) else ""
        word_seq = [char2idx.get(ch, char_unk_id) for ch in word[:max_char_len]]
        # right-pad short words up to max_char_len
        word_seq.extend([char_pad_id] * (max_char_len - len(word_seq)))
        total_token.append(word_seq)
    x_char.append(total_token)

In [27]:
# Encode every title as a list of vocabulary ids, one id per title token.
#
# Tokens are produced with the same cleaning pattern as the character codes and
# the labels, so positions line up across the three encodings. The vocabulary,
# however, was built from titles with everything except letters, digits and
# spaces removed. Each token is therefore reduced to letters/digits before the
# lookup; looking up the raw token would send every token that contains
# punctuation to <UNK>. Tokens that reduce to nothing, or that are not in the
# vocabulary, map to <UNK>.
x_word = list()
word_unk_id = word2idx["<UNK>"]
for title in data["title"]:
    # raw string: `\|` is an invalid escape sequence in a normal string literal
    text = re.sub(r"[^a-zA-Z0-9,-;.!?:’’’/\|_@#$%ˆ&*˜‘+-=' ]", "", title)
    total_tokens = [
        word2idx.get(re.sub("[^a-zA-Z0-9]", "", tok), word_unk_id)
        for tok in text.split()
    ]
    x_word.append(total_tokens)

In [28]:
# tag -> class id used for the per-token labels
tag2idx = dict(
    [
        ("O", 0),    # others
        ("B-M", 1),  # begin of model_name
        ("I-M", 2),  # intermediate model_name
    ]
)

In [29]:
# Build the per-token tag sequence for every title.
#   O   - the title token does not occur in the model name
#   B-M - the title token equals the first token of the model name
#   I-M - the title token occurs later in the model name
# Matching is by token membership, not by contiguous span, so a title token is
# tagged wherever it appears in the title.
#
# A missing model is NaN; str(NaN) is "nan", which previously made any literal
# "nan" token in a title a B-M match. Missing models are now treated as an
# empty model name, so such rows get all-O tags.
y = list()
# raw string: `\|` is an invalid escape sequence in a normal string literal
token_cleaner = re.compile(r"[^a-zA-Z0-9,-;.!?:’’’/\|_@#$%ˆ&*˜‘+-=' ]")
for model_name, title in zip(data["model"], data["title"]):
    model_name = "" if pd.isna(model_name) else str(model_name)
    model_name_tokens = token_cleaner.sub("", model_name).split()
    title_tokens = token_cleaner.sub("", str(title)).split()

    # first position of each model token (same answer as list.index, one pass)
    first_position = {}
    for position, model_token in enumerate(model_name_tokens):
        first_position.setdefault(model_token, position)

    tags = list()
    for t in title_tokens:
        position = first_position.get(t)
        if position is None:
            tags.append(tag2idx["O"])
        elif position == 0:
            tags.append(tag2idx["B-M"])
        else:
            tags.append(tag2idx["I-M"])
    y.append(tags)

In [30]:
# Pad / truncate every tag sequence to 16 positions (both at the end), then
# one-hot encode over the 3 tag classes. Padding uses the default value 0,
# which is also the id of the "O" tag.
padded_labels = tf.keras.preprocessing.sequence.pad_sequences(
    y,
    maxlen=16,
    padding="post",
    truncating="post",
)
categorical_labels = tf.keras.utils.to_categorical(padded_labels, num_classes=3)

In [31]:
# Pad / truncate every word-id sequence to 16 positions (both at the end);
# padding uses the default value 0, the id of "<PAD>".
padded_word = tf.keras.preprocessing.sequence.pad_sequences(
    x_word,
    maxlen=16,
    padding="post",
    truncating="post",
)

In [32]:
len(x_char), len(padded_word), len(padded_labels)

(309239, 309239, 309239)

In [33]:
import sklearn

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# Split word ids, character ids and labels in ONE call so the three stay
# row-aligned by construction. The previous version called train_test_split
# twice and relied on the shared random_state to keep the rows aligned, and it
# split (then discarded) the label array a second time.
# 15% of the rows are held out; random_state pins the shuffle.
(
    x_word_tr, x_word_te,
    x_char_tr, x_char_te,
    y_tr, y_te,
) = train_test_split(
    padded_word, x_char, categorical_labels,
    test_size=0.15, random_state=69,
)

In [36]:
# Materialise every split as an ndarray with an explicit shape:
#   word ids        -> (rows, 16)
#   character ids   -> (rows, 16, 8)
#   one-hot labels  -> (rows, 16, 3)
# The reshape fails loudly if a split does not have the expected size.
x_word_tr, x_word_te = (
    np.array(split).reshape((len(split), 16))
    for split in (x_word_tr, x_word_te)
)
x_char_tr, x_char_te = (
    np.array(split).reshape((len(split), 16, 8))
    for split in (x_char_tr, x_char_te)
)
y_tr, y_te = (
    np.array(split).reshape(len(split), 16, 3)
    for split in (y_tr, y_te)
)

In [37]:
len(x_word_tr), len(x_word_te), len(y_tr), len(y_te), len(x_char_tr), len(x_char_te)

(262853, 46386, 262853, 46386, 262853, 46386)

In [38]:
x_word_tr.shape, x_word_te.shape

((262853, 16), (46386, 16))

In [39]:
x_char_tr.shape, x_char_te.shape

((262853, 16, 8), (46386, 16, 8))

In [40]:
y_tr.shape, y_te.shape

((262853, 16, 3), (46386, 16, 3))

## Model

In [41]:
# Two-input sequence tagger: word embeddings are concatenated with a
# character-level BiLSTM encoding of each word, fed through a BiLSTM, and a
# softmax over the 3 tags is emitted for each of the 16 title positions.
layers = tf.keras.layers

# --- word branch: 16 vocabulary ids -> 16 x 64 embeddings
# NOTE(review): input_dim is the table size + 2 although word2idx / char2idx
# already contain <PAD> and <UNK>; looks redundant, but changing it alters the
# weight shapes - confirm before touching.
word_in = tf.keras.Input(shape=(16, ))
word_embedding = layers.Embedding(input_dim=len(word2idx)+2, output_dim=64, input_length=16)
emb_word = word_embedding(word_in)

# --- character branch: 16 x 8 character ids -> 16 x 8 x 32 embeddings
char_in = tf.keras.Input(shape=(16, 8, ))
char_embedding = layers.Embedding(input_dim=len(char2idx)+2, output_dim=32, input_length=8)
emb_char = layers.TimeDistributed(char_embedding)(char_in)

# BiLSTM over the characters of each word; only the final state is kept, so
# every word position gets one fixed-size character-based encoding
char_lstm = layers.LSTM(units=64, return_sequences=False, recurrent_dropout=0.4)
char_enc = layers.TimeDistributed(layers.Bidirectional(char_lstm))(emb_char)

# --- main block: join both views of each word, run a BiLSTM over the title
merged = layers.concatenate([emb_word, char_enc])
tagger_lstm = layers.LSTM(units=64, return_sequences=True)
main_lstm = layers.Bidirectional(tagger_lstm)(merged)

# per-position softmax over the 3 tag classes
tag_classifier = layers.Dense(units=3, activation="softmax")
out = layers.TimeDistributed(tag_classifier)(main_lstm)

# assemble the model: inputs are [word ids, character ids]
model = tf.keras.Model([word_in, char_in], out)

In [42]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [43]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 16, 8)        0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 16)           0                                            
__________________________________________________________________________________________________
time_distributed (TimeDistribut (None, 16, 8, 32)    2272        input_2[0][0]                    
__________________________________________________________________________________________________
embedding (Embedding)           (None, 16, 64)       9709632     input_1[0][0]                    
__________________________________________________________________________________________________
time_distr

In [None]:
# training callbacks
# checkpoint: written only when the monitored metric improves (save_best_only).
# NOTE(review): the file name says "weights" but save_weights_only is False -
# confirm which of the two is intended.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath='model_char_level.weights.best.hdf5',
    verbose = 1,
    save_best_only=True,
    save_weights_only=False,
)
# stop training once the monitored metric has not improved for 3 epochs
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=3)
cbk = [checkpoint_cb, early_stopping_cb]

In [None]:
# model.fit([x_word_tr, x_char_tr], y_tr, batch_size=32, epochs=10, verbose=1, validation_data=([x_word_te, x_char_te], y_te), callbacks=cbk)

In [None]:
# # save word - index mapping to a pickle file
# output = open("data/word2idx.pkl", "wb")
# pickle.dump(word2idx, output)
# output.close()

# output = open("data/idx2word.pkl", "wb")
# pickle.dump(idx2word, output)
# output.close()

In [None]:
# # save char - index mapping to a pickle file
# output = open("data/char2idx.pkl", "wb")
# pickle.dump(char2idx, output)
# output.close()

# output = open("data/idx2char.pkl", "wb")
# pickle.dump(idx2char, output)
# output.close()