# Problem Statement

Named Entity Recognition model tries to identify "Brand" names from e-comm product titles. The dataset is taken from flipkart website. It consists of about 4Lakh+ de-duplicated samples spanning multiple product categories.

In [2]:
# load packages
import os
import re
import sys
import pickle
import string
import sklearn
import numpy as np
import pandas as pd
import tensorflow as tf
tf.__version__

'1.10.0'

# Data Preprocessing

In [3]:
# load data
FILEPATH = "/Users/nityansuman/__data__/flipkart_dataset/product_identification_train_set.csv"
data = pd.read_csv(FILEPATH)
data.shape

(772629, 9)

In [10]:
data.head(3)

Unnamed: 0,category_path,title,description,brand,model,model_no,category,present_in_title,present_in_description
0,[Home Improvement >> Hardware >> Bathroom & Ki...,Hindware Vara Spa Shower Head,Buy Hindware Vara Spa Shower Head for Rs.1567 ...,Hindware,Vara Spa,F160050,hardware,True,True
1,[Home Improvement >> Hardware >> Bathroom & Ki...,Sunrise 8 inch Ultra Thin With 15 inch Brass A...,Buy Sunrise 8 inch Ultra Thin With 15 inch Bra...,Sunrise,8 inch Ultra Thin With 15 inch Brass Arm,8inslimwith15inbras,hardware,True,True
2,[Home Improvement >> Hardware >> Bathroom & Ki...,Polytuf Overhead- 4 Inches Chrome Plated Showe...,Buy Polytuf Overhead- 4 Inches Chrome Plated S...,Polytuf,Overhead- 4 Inches Chrome Plated,1067(a),hardware,True,True


In [11]:
data = data.drop_duplicates(subset=["title", "brand"], keep=False)
data.shape

(404024, 9)

In [12]:
# check for null or missing values
data.isnull().sum()

category_path                0
title                        0
description                  0
brand                        3
model                        0
model_no                    76
category                  4626
present_in_title             0
present_in_description       0
dtype: int64

In [13]:
# drop rows with null values based on brands
data = data.dropna(axis=0, subset=["brand"], how="any")
data.shape

(404021, 9)

In [14]:
data.describe()

Unnamed: 0,category_path,title,description,brand,model,model_no,category,present_in_title,present_in_description
count,404021,404021,404021,404021,404021,403945,399395,404021,404021
unique,127114,403879,403880,22096,305259,245074,356,2,2
top,[Jewellery >> Earrings],Ciba Vision Freshlook Color Blends Monthly Con...,Not Found,Not Found,Bike Handle Grip,Not Found,earrings,True,True
freq,35585,5,63,10311,2068,129437,38030,384445,403645


In [15]:
word_frequency_mapping = dict()

for i, row in data.iterrows():
    # iterate over all samples
    raw_string = row["title"]
    
    text = re.sub("[^a-zA-Z ]", "", raw_string) # access only characters
    tokens = text.split() # tokenize string into words
    
    for tok in tokens:
        # access each token
        if tok not in word_frequency_mapping:
            # if not in vocab, add new word
            word_frequency_mapping[tok] = 1
        else:
            # if word present in vocab, update frequency
            word_frequency_mapping[tok] += 1

In [18]:
# reverse the dict based on the values
sorted_word_frequency_mapping = sorted(word_frequency_mapping.items(), key=lambda x: x[1], reverse=True)

In [19]:
print("Number of Unique Words = ", len(word_frequency_mapping), len(sorted_word_frequency_mapping))

Number of Unique Words =  95416 95416


In [20]:
# create a vocabulary for word embedding

# add word for unknown words and for padding
word2idx = {"<PAD>":0, "<UNK>": 1}
for i in range(len(sorted_word_frequency_mapping)):
    word2idx[sorted_word_frequency_mapping[i][0]] = i+2

In [22]:
# get a mapping from index to word
idx2word = {val:key for key, val in word2idx.items()}

In [23]:
# check for the lenght of the vocab
print("Number of Unique Tokens in Word Vocab = ", len(idx2word), len(word2idx))

Number of Unique Tokens in Word Vocab =  95418 95418


In [26]:
# get unique character set for character embedding
chars = string.ascii_letters
len(chars)

52

In [27]:
# create a character vocabulary
char2idx = {"<UNK>": 1, "<PAD>": 0}
for i in range(len(chars)):
    char2idx[chars[i]] = i+2

In [28]:
# get a mapping from index to word
idx2char = {val:key for key, val in char2idx.items()}

In [29]:
print("Number of Unique Tokens in Character Vocab = ", len(char2idx), len(idx2char))

Number of Unique Tokens in Character Vocab =  54 54


## Create Character Embedding

In [30]:
x_char = list()
max_len = 16 # number of max words
max_char_len = 8 # number of max characters in each word

for i, row in data.iterrows():
    # access each sample
    text = row["title"]
    
    text = re.sub("[^a-zA-Z0-9 ]", " ", text) # remove all non english characters
    tokens = text.split() # tokenize string
    
    total_token = list()
    # create character level code for each word
    for k in range(max_len):
        word_seq = list()
        for j in range(max_char_len):
            try:
                word_seq.append(char2idx[tokens[k][j]])
            except:
                word_seq.append(char2idx["<PAD>"])
        total_token.append(word_seq)
    x_char.append(total_token)

In [32]:
len(x_char), len(x_char[0]), len(x_char[0][0])

(404021, 16, 8)

## Create Word Embedding

In [33]:
x_word = list()

for i, row in data.iterrows():
    # access each sample
    text = row["title"]
    
    text = re.sub("[^a-zA-Z0-9 ]", " ", text) # remove all non english characters
    tokens = text.split() # tokenize string
    
    total_tokens=  list()
    # create word level code
    for tok in tokens:
        if tok in word2idx.keys():
            total_tokens.append(word2idx[tok])
        else:
            total_tokens.append(word2idx["<UNK>"])
    x_word.append(total_tokens)

In [34]:
len(x_word), len(x_word[0])

(404021, 12)

## Encode Brand Names

In [35]:
# mapping for brand names
tag2idx = {
    "O": 0, # others
    "B-M": 1, # begin of brand name 
    "I-M": 2 # intermediate brand name
}

In [36]:
y = list()

for i, row in data.iterrows():
    # access each sample
    name, title = row["brand"], row["title"] # access brand name and product title
    
    name = re.sub("[^a-zA-Z0-9 ]", " ", str(name)) # remove all non english chars from brand name
    name_tokens = name.split() # tokenize brand name

    title = re.sub("[^a-zA-Z0-9 ]", " ", str(title)) # remove all non english chars from title
    title_tokens = title.split() # tokenize title string

    tags = list()
    for t in title_tokens:
        # access each toke
        if t not in name_tokens:
            # tag other than brand name
            tags.append(tag2idx["O"])
        elif name_tokens.index(t) == 0:
            # tag begining brand name
            tags.append(tag2idx["B-M"])
        else:
            # tag intermediate brand name
            tags.append(tag2idx["I-M"])
    y.append(tags)

In [37]:
len(y), len(y[0])

(404021, 12)

## Prepare Encoded Data

In [39]:
# pad labels
padded_labels = tf.keras.preprocessing.sequence.pad_sequences(y, maxlen=max_len, padding="post", truncating="post")
# convert to categorical
categorical_labels = tf.keras.utils.to_categorical(padded_labels, len(tag2idx.keys()))

In [40]:
# pad word embedding
padded_word = tf.keras.preprocessing.sequence.pad_sequences(x_word, maxlen=max_len, padding="post", truncating="post")

## Split Train and Dev Set

In [41]:
from sklearn.model_selection import train_test_split

# split word embedding and target labels
x_word_tr, x_word_te, y_tr, y_te = train_test_split(padded_word, categorical_labels, test_size=0.15, random_state=69)
# split character embedding
x_char_tr, x_char_te, _, _ = train_test_split(x_char, categorical_labels, test_size=0.15, random_state=69)

## Convert To Numpy

In [42]:
# convert train set
x_word_tr = np.array(x_word_tr).reshape((len(x_word_tr), 16))
x_char_tr = np.array(x_char_tr).reshape((len(x_char_tr), 16, 8))
y_tr = np.array(y_tr).reshape(len(y_tr), 16, 3)

# convert dev set
x_word_te = np.array(x_word_te).reshape((len(x_word_te), 16))
x_char_te = np.array(x_char_te).reshape((len(x_char_te), 16, 8))
y_te = np.array(y_te).reshape(len(y_te), 16, 3)

In [43]:
x_word_tr.shape, x_word_te.shape

((343417, 16), (60604, 16))

In [44]:
x_char_tr.shape, x_char_te.shape

((343417, 16, 8), (60604, 16, 8))

In [45]:
y_tr.shape, y_te.shape

((343417, 16, 3), (60604, 16, 3))

# NER Model

In [46]:
# input word tensor
word_in = tf.keras.Input(shape=(max_len, ))
# pass tensor to embedding layer
emb_word = tf.keras.layers.Embedding(input_dim=len(word2idx)+2, output_dim=64, input_length=16)(word_in)

# input char tensor
char_in = tf.keras.Input(shape=(max_len, max_char_len, ))
# pass tensor to embedding layer
emb_char = tf.keras.layers.TimeDistributed(tf.keras.layers.Embedding(input_dim=len(char2idx)+2, output_dim=32, input_length=8))(char_in)

# BiLSTM to learn character embeddings
char_enc = tf.keras.layers.TimeDistributed(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64, return_sequences=False, recurrent_dropout=0.4)))(emb_char)

# merge word and character embeddings
merged = tf.keras.layers.concatenate([emb_word, char_enc])

# BiLSTM for ner
main_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64, return_sequences=True))(merged)

# classify for each word
out = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=3, activation="softmax"))(main_lstm)

# set model together
model = tf.keras.Model([word_in, char_in], out)

In [47]:
# compile model
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

In [48]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 16, 8)        0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 16)           0                                            
__________________________________________________________________________________________________
time_distributed (TimeDistribut (None, 16, 8, 32)    1792        input_2[0][0]                    
__________________________________________________________________________________________________
embedding (Embedding)           (None, 16, 64)       6106880     input_1[0][0]                    
__________________________________________________________________________________________________
time_distr

In [50]:
# model config
cbk = [
    tf.keras.callbacks.ModelCheckpoint(filepath='model_char_level.weights.best.hdf5', verbose = True, save_best_only=True, save_weights_only=False),
    tf.keras.callbacks.EarlyStopping(patience=3)
]

In [51]:
model.fit(
    [x_word_tr, x_char_tr], y_tr,
    batch_size=32,
    epochs=10,
    verbose=True,
    validation_data=([x_word_te, x_char_te], y_te)
)#, callbacks=cbk)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 343417 samples, validate on 60604 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1adafaab00>