In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D, GRU
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer

from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline


Using TensorFlow backend.


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
## Loading Data - labelled data:

data_1 = pd.ExcelFile('RNN_Tranzact_Rohan_1_Regrouped_with_supplier.xlsx')
tranzact_data_1 = data_1.parse(0)

tranzact_data_1.tail()

Unnamed: 0,supplier_item_id,to_company_name,supplier_item_id.1,supplier_item_id.2,product,supplier,Done,Atul Sugg.,sub_type,material,process,grade,spec,brand
13552,86390,CESARE BONETTI INTERNATIONAL PVT. LTD.,86390,"YOKE SLEEVE;8"" # 150;GTV; B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13553,89723,CESARE BONETTI INTERNATIONAL PVT. LTD.,89723,"YOKE SLEEVE 1½"" #2700 BLY;B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13554,89995,CESARE BONETTI INTERNATIONAL PVT. LTD.,89995,"YOKE SLEEVE;4"" #150 GTV; B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13555,117819,WAAREE INDUSTRIES PVT. LTD.,117819,"YOKE SLEEVE;8"" # 150;GTV; A439 D2",cicasting,Waaree Industries Pvt.Ltd.,278.0,,,,,,,
13556,169292,CESARE BONETTI INTERNATIONAL PVT. LTD.,169292,"YOKE SLEEVE; 1"";#1500;CBD;A582 T416",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,,,


In [4]:
## Picking up required info from labelled data:

col = ['supplier_item_id.2', 'product']
tranzact_data_2 = tranzact_data_1[col]
tranzact_data_2.columns = ['item', 'product']

test_data = pd.DataFrame()

test_data = tranzact_data_2[0:5]

test_data.head()

Unnamed: 0,item,product
0,NEW CYLINDER 404 GAS,GAS
1,"ACCUMULATOR - MODEL A-AS 5126, (3/4"" CONNECTIO...",COOLING
2,"ACCUMULATOR; FLOKOOL; 3/4""; 400 PSI, FKSA596",COOLING
3,ACCUMULATOR 1.5/8 CONNECTION,COOLING
4,ACCUMULATOR 7/8‘’,COOLING


In [27]:
## make all in lower case

test_data['item'] = test_data['item'].map(lambda x: x if type(x)!=str else x.lower()) 
test_data['product'] = test_data['product'].map(lambda x: x if type(x)!=str else x.lower())

## Removing some punctuations:

punctuation = ['!', '$', '%', '&', '(', ')', '*', '+', ',', '.', ':', ';', '<', '=', '>', '?', '@', \
               '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n'] 

for i in punctuation:
    
    test_data['item']= test_data['item'].str.replace(i," ")
    
test_data['item'] = test_data['item'].str.replace(r'\s+',' ')

print(len(test_data))
test_data.tail()

5


Unnamed: 0,item,product
0,new cylinder 404 gas,gas
1,"accumulator - model a-as 5126 3/4"" connection ...",cooling
2,"accumulator flokool 3/4"" 400 psi fksa596",cooling
3,accumulator 1 5/8 connection,cooling
4,accumulator 7/8‘’,cooling


In [21]:
from gensim.models import Word2Vec

from tqdm import tqdm

temp_corpus = test_data['item'].map(lambda x: x.split('.'))

corpus = []
for i in tqdm(range(len(temp_corpus))):
    for line in temp_corpus[i]:
        words = [x for x in line.split()]
        corpus.append(words)

100%|██████████| 5/5 [00:00<00:00, 10496.26it/s]


In [23]:
print(type(corpus))
print(corpus)

<class 'list'>
[['new', 'cylinder', '404', 'gas'], ['accumulator', '-', 'model', 'a-as', '5126', '3/4"', 'connection', '-', 'emerson', 'make'], ['accumulator', 'flokool', '3/4"', '400', 'psi', 'fksa596'], ['accumulator', '1', '5/8', 'connection'], ['accumulator', '7/8‘’']]


In [24]:
num_of_items = len(corpus)

num_of_words = 0
for line in corpus:
    num_of_words += len(line)
    
max_length = 0
for i in range(len(corpus)):
    max_length  = max(max_length, len(corpus[i]), len(corpus[i+1]))
    if i == len(corpus)-2:
        break
        
print('Num of sentences - %s'%(num_of_items))
print('Num of words - %s'%(num_of_words))
print('Max no. of words in a sentence - %s'%(max_length))

Num of sentences - 5
Num of words - 26
Max no. of words in a sentence - 10


In [25]:
## items to word2vec :

emb_dim = 10

model_1 = Word2Vec(corpus, size = emb_dim, window=3, \
               min_count=1, negative = 15, iter = 10, workers = 10, sg=1)

print(model_1)

Word2Vec(vocab=20, size=10, alpha=0.025)


In [31]:
## keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', \
##                                   lower=True, split=' ', char_level=False, oov_token=None, document_count=0)


tokenizer_obj_test = Tokenizer(filters='!$%&()*+,.:;<=>?@[\\]^_`{|}~\t\n', split=' ' )
tokenizer_obj_test.fit_on_texts(test_data['item'])

## pad sequences :

max_length_test = max([len(s.split()) for s in test_data['item']])

## define vocabulary size :

wd_index = tokenizer_obj_test.word_index

vocab_size_test = len(wd_index)

test_item_tokens = tokenizer_obj_test.texts_to_sequences(test_data['item'])

test_item_pad = pad_sequences(test_item_tokens, maxlen = max_length_test, padding = 'post')


print(max_length_test)
print('==============')
print(vocab_size_test)
print('==============')
print(wd_index)
print('==============')
print(test_item_tokens)
print('===============')
print(test_item_pad)


10
20
{'3/4"': 3, 'accumulator': 1, 'a-as': 10, 'connection': 4, 'fksa596': 17, '5126': 11, '5/8': 19, '-': 2, 'make': 13, 'flokool': 14, 'cylinder': 6, 'new': 5, '1': 18, '7/8‘’': 20, '404': 7, '400': 15, 'gas': 8, 'model': 9, 'emerson': 12, 'psi': 16}
[[5, 6, 7, 8], [1, 2, 9, 10, 11, 3, 4, 2, 12, 13], [1, 14, 3, 15, 16, 17], [1, 18, 19, 4], [1, 20]]
[[ 5  6  7  8  0  0  0  0  0  0]
 [ 1  2  9 10 11  3  4  2 12 13]
 [ 1 14  3 15 16 17  0  0  0  0]
 [ 1 18 19  4  0  0  0  0  0  0]
 [ 1 20  0  0  0  0  0  0  0  0]]


In [18]:
model_1.wv.vocab

{'-': <gensim.models.keyedvectors.Vocab at 0x7fef18ad6748>,
 '1': <gensim.models.keyedvectors.Vocab at 0x7fef18ad6860>,
 '3/4"': <gensim.models.keyedvectors.Vocab at 0x7fef18ad65c0>,
 '400': <gensim.models.keyedvectors.Vocab at 0x7fef18ad6908>,
 '404': <gensim.models.keyedvectors.Vocab at 0x7fef18ad6668>,
 '5/8': <gensim.models.keyedvectors.Vocab at 0x7fef18ad6710>,
 '5126': <gensim.models.keyedvectors.Vocab at 0x7fef18ad66d8>,
 '7/8‘’': <gensim.models.keyedvectors.Vocab at 0x7fef18ad6898>,
 'a-as': <gensim.models.keyedvectors.Vocab at 0x7fef18ad6630>,
 'accumulator': <gensim.models.keyedvectors.Vocab at 0x7fef18ad65f8>,
 'connection': <gensim.models.keyedvectors.Vocab at 0x7fef18ad68d0>,
 'cylinder': <gensim.models.keyedvectors.Vocab at 0x7fef18ad67f0>,
 'emerson': <gensim.models.keyedvectors.Vocab at 0x7fef18ad69b0>,
 'fksa596': <gensim.models.keyedvectors.Vocab at 0x7fef18ad66a0>,
 'flokool': <gensim.models.keyedvectors.Vocab at 0x7fef18ad67b8>,
 'gas': <gensim.models.keyedvectors.V

In [32]:
test_matrix_data = test_data[2:5]
test_matrix_data.head()

Unnamed: 0,item,product
2,"accumulator flokool 3/4"" 400 psi fksa596",cooling
3,accumulator 1 5/8 connection,cooling
4,accumulator 7/8‘’,cooling


In [33]:
## Tikenize test_matrix_data :

tokenizer_obj_matrix = Tokenizer(filters='!$%&()*+,.:;<=>?@[\\]^_`{|}~\t\n', split=' ' )
tokenizer_obj_matrix.fit_on_texts(test_matrix_data['item'])

## pad sequences :

max_length_matrix = max([len(s.split()) for s in test_matrix_data['item']])

## define vocabulary size :

wd_index_matrix = tokenizer_obj_matrix.word_index

vocab_size_test_matrix = len(wd_index_matrix)

test_matrix_tokens = tokenizer_obj_matrix.texts_to_sequences(test_matrix_data['item'])

test_matrix_pad = pad_sequences(test_matrix_tokens, maxlen = max_length_matrix, padding = 'post')

print(max_length_matrix)
print('==============')
print(vocab_size_test_matrix)
print('==============')
print(wd_index_matrix)
print('==============')
print(test_matrix_tokens)
print('===============')
print(test_matrix_pad)



6
10
{'3/4"': 3, 'accumulator': 1, '1': 7, 'flokool': 2, '7/8‘’': 10, 'connection': 9, '400': 4, 'fksa596': 6, '5/8': 8, 'psi': 5}
[[1, 2, 3, 4, 5, 6], [1, 7, 8, 9], [1, 10]]
[[ 1  2  3  4  5  6]
 [ 1  7  8  9  0  0]
 [ 1 10  0  0  0  0]]


In [40]:
## prepare embeddings

## Preparing embedding matrix

emb_dim = 10

embedding_matrix = np.zeros((vocab_size_test_matrix+1, emb_dim))

    
for word, i in wd_index_matrix.items():
    
    
    emb_vector = model_1.wv.word_vec(word)
    
    if emb_vector is not None:
        embedding_matrix[i] = emb_vector
        
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))


Null word embeddings: 1


In [41]:
print(embedding_matrix)

[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-1.07272435e-02 -1.24397278e-02 -3.13976780e-02 -2.76597347e-02
  -9.97568597e-04  3.80993411e-02  1.42966863e-02 -2.89153308e-03
  -3.89787480e-02 -9.11578070e-03]
 [-4.84701768e-02 -3.10889184e-02  4.57988568e-02  1.83870029e-02
   8.12934339e-03 -8.56283959e-03 -2.91244537e-02  4.36556377e-02
   3.17112729e-02 -4.00307029e-02]
 [ 2.07959265e-02  4.85222340e-02  1.73733681e-02  1.10867564e-02
   3.90771292e-02  2.54254118e-02 -2.08431352e-02  2.61874031e-02
   2.41404437e-02 -4.22534011e-02]
 [ 3.00608128e-02 -4.88103405e-02 -1.38026839e-02 -4.41556908e-02
  -2.77705565e-02 -7.23003689e-03 -2.00753361e-02 -2.15586051e-02
   1.09302634e-02 -3.47028184e-03]
 [ 4.54635173e-02  5.92378573e-03  3.82669382e-02  7.37847295e-05
   2.32619494e-02 -4.12590802e-02  5.54583268e-03  3.87741462e-03
  -1.53143452e-02  6.80060266e-03

In [78]:
wd_index = tokenizer_obj_test.word_index
print(len(wd_index))
print(wd_index)


23
{'as': 11, 'new': 5, '3': 2, '5126': 12, 'connection': 4, 'cylinder': 6, 'fksa596': 18, 'gas': 8, '8‘’': 23, 'a': 10, 'accumulator': 1, '4': 3, 'make': 14, '5': 20, '400': 16, '7': 22, 'psi': 17, '8': 21, 'model': 9, '404': 7, 'flokool': 15, 'emerson': 13, '1': 19}


In [103]:
wd_index_1 = tokenizer_obj_aa.word_index
print(len(wd_index_1))
print(wd_index_1)
aa_item_tokens = tokenizer_obj_aa.texts_to_sequences(test_data['item'])
print('============================')
print(aa_item_tokens)

23
{'as': 3, 'new': 4, 'model': 5, '5126': 7, 'connection': 12, 'cylinder': 13, 'fksa596': 14, 'gas': 16, '8‘’': 10, 'a': 2, 'accumulator': 19, '4': 18, 'make': 22, '5': 20, '400': 1, '7': 9, 'psi': 6, '8': 21, '3': 17, '404': 11, 'flokool': 15, 'emerson': 8, '1': 23}
[[4, 13, 11, 16], [19, 5, 2, 3, 7, 17, 18, 12, 8, 22], [19, 15, 17, 18, 1, 6, 14], [19, 23, 20, 21, 12], [19, 9, 10]]


In [30]:
word_index = tokenizer_obj_test.word_index
print('Found %s unique tokens' % len(word_index))

Found 23 unique tokens


In [67]:
print(word_index)



In [61]:
wrd = 'prmium'
print(model_1.wv.word_vec(wrd))


[ 0.24179202  0.06549174 -0.38604274  0.2220079  -0.17099561 -0.00210161
 -0.19163914  0.02628648  0.1819302  -0.13675494  0.03219317 -0.00996133
 -0.05369972  0.2527915   0.2741835   0.12086015 -0.04894331 -0.16053675
 -0.00583438  0.00325824  0.06969016 -0.11515658  0.0805554   0.09401441
  0.2272013   0.03878359  0.27700585 -0.00347151  0.03068962  0.12152074
  0.20699036  0.09057083  0.06172016  0.08677412  0.16805473  0.0134774
 -0.09955977 -0.23010123  0.11060134  0.06406747  0.2174236  -0.03278314
  0.10400584  0.06704241 -0.21097016 -0.12631032 -0.05094669 -0.09503785
 -0.10903387 -0.0955196   0.22453734 -0.03931703 -0.17506832 -0.19794413
  0.3891973   0.35113436 -0.5200029   0.1512953  -0.04274882 -0.15493187
  0.07763566 -0.08534835  0.02209762  0.04802343 -0.04624603  0.12014635
 -0.162002   -0.21071538 -0.04798426  0.07724632 -0.19445488 -0.2439696
 -0.23706     0.02334258 -0.12904176 -0.14800826  0.02665363  0.13792053
  0.11965785  0.28002065]


In [78]:
j = 0

In [79]:
## prepare embeddings

## Preparing embedding matrix

nb_words = min(vocab_size, vocab_size_1)-1

embedding_matrix = np.zeros((nb_words, emb_dim))

    
for word, i in word_index.items():
        
        
    if word in model_1.wv.vocab:
            
        embedding_matrix[j] = model_1.wv.word_vec(word)
        j = j+1
    
    else:
                
        print(word)
        
        
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))


iot
le18b22rd
uc18h7tft
18g16sk
ver1
txmr
1ohm
m66
935uh
le18b28pid
1755uh
le15k17cl
52uh
05w
54w
gaurds
le18h23ap
16uh
le18j27dg2
jsw
guards
le17h8ap
le18k1m
18b3sk
le17l2ap
le18j19ap
jsav150
le18h16dg2
le18l12dg2
89x74
atomfan4
48x38
le17f5wc
elettro
tft
Null word embeddings: 35


In [76]:
print(type(embedding_matrix))
print(embedding_matrix.shape)
print(embedding_matrix[0])

<class 'numpy.ndarray'>
(7948, 80)
[ 0.24179202  0.06549174 -0.38604274  0.2220079  -0.17099561 -0.00210161
 -0.19163914  0.02628648  0.1819302  -0.13675494  0.03219317 -0.00996133
 -0.05369972  0.25279149  0.27418351  0.12086015 -0.04894331 -0.16053675
 -0.00583438  0.00325824  0.06969016 -0.11515658  0.0805554   0.09401441
  0.2272013   0.03878359  0.27700585 -0.00347151  0.03068962  0.12152074
  0.20699036  0.09057083  0.06172016  0.08677412  0.16805473  0.0134774
 -0.09955977 -0.23010123  0.11060134  0.06406747  0.2174236  -0.03278314
  0.10400584  0.06704241 -0.21097016 -0.12631032 -0.05094669 -0.09503785
 -0.10903387 -0.0955196   0.22453734 -0.03931703 -0.17506832 -0.19794413
  0.38919729  0.35113436 -0.5200029   0.1512953  -0.04274882 -0.15493187
  0.07763566 -0.08534835  0.02209762  0.04802343 -0.04624603  0.12014635
 -0.162002   -0.21071538 -0.04798426  0.07724632 -0.19445488 -0.2439696
 -0.23706     0.02334258 -0.12904176 -0.14800826  0.02665363  0.13792053
  0.11965785  0.28

In [None]:
## Total no. of unique words = vocab_size = 14707
## each word has got 150 parameters as per Embedding Dimension defined.
## Hence total Param # = 14707 * 150 = 2206050


In [35]:
from keras.models import Sequential

model_2 = Sequential()
model_2.add(Embedding(vocab_size_1, emb_dim, weights=[embedding_matrix],\
                    input_length=max_length, trainable=False))
model_2.add(SpatialDropout1D(0.2))
#model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model_2.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model_2.add(Dense(256, activation='relu'))
model_2.add(Dense(71, activation='softmax'))
model_2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

model_2.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 121, 100)          794900    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 121, 100)          0         
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_1 (Dense)              (None, 256)               8448      
_________________________________________________________________
dense_2 (Dense)              (None, 71)                18247     
Total params: 834,363
Trainable params: 39,463
Non-trainable params: 794,900
_________________________________________________________________


In [39]:
## Creating X and Y variables for training. X = Items or item 'feature'. Y = 'product' or product category

X = []
for i in range(len(tranzact_data_labelled_final)):
    
    X.append(tranzact_data_labelled_final.iloc[i]['item'])
#Q = np.array(tranzact_data_3["product"])
Y = tranzact_data_labelled_final['product']
print(X[1])

print(Y[1])


accumulator model a as 5126 3 4 connection emerson make
cooling


In [40]:
#train test split in 80% / 20% ratio

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state = 5)


In [41]:
## X_train and X_test word embedding :

X_train_tokens = tokenizer_obj_1.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj_1.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen = max_length, padding = 'post')
X_test_pad = pad_sequences(X_test_tokens, maxlen = max_length, padding = 'post')

In [44]:
print(X_train_pad.shape)
print(X_test_pad.shape)
print(max_length)
print(vocab_size_1)

(9442, 121)
(2361, 121)
121
7949


In [42]:
from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()

Y_train_coded = encoder.fit_transform(Y_train)
Y_test_coded = encoder.fit_transform(Y_test)

print(Y_train_coded.shape)
print(Y_test_coded.shape)


(9442, 71)
(2361, 69)


In [43]:
model_2.fit(X_train_pad,Y_train_coded,batch_size=128,epochs=25,
          validation_split=0.2)

Train on 7553 samples, validate on 1889 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f1e54ea3f28>