In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D, GRU
from keras.layers import Conv1D, MaxPooling1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.layers import Bidirectional
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline


Using TensorFlow backend.


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
## Loading Data - labelled data:

data_1 = pd.ExcelFile('R2_RNN_Tranzact_Rohan_1_Regrouped_with_supplier.xlsx')
tranzact_data_1 = data_1.parse(0)

tranzact_data_1.tail()

Unnamed: 0,supplier_item_id,to_company_name,supplier_item_id.1,supplier_item_id.2,product,supplier,Done,Atul Sugg.,sub_type,material,process,grade,spec,brand
13552,86390,CESARE BONETTI INTERNATIONAL PVT. LTD.,86390,"YOKE SLEEVE;8"" # 150;GTV; B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13553,89723,CESARE BONETTI INTERNATIONAL PVT. LTD.,89723,"YOKE SLEEVE 1½"" #2700 BLY;B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13554,89995,CESARE BONETTI INTERNATIONAL PVT. LTD.,89995,"YOKE SLEEVE;4"" #150 GTV; B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13555,117819,WAAREE INDUSTRIES PVT. LTD.,117819,"YOKE SLEEVE;8"" # 150;GTV; A439 D2",cicasting,Waaree Industries Pvt.Ltd.,278.0,,,,,,,
13556,169292,CESARE BONETTI INTERNATIONAL PVT. LTD.,169292,"YOKE SLEEVE; 1"";#1500;CBD;A582 T416",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,,,


In [4]:
## Loading Data - un-labelled data:

data_2 = pd.ExcelFile('14Mar_Gorky.xlsx')
tranzact_data_new_1 = data_2.parse(0)

tranzact_data_new_1.head()

Unnamed: 0,buyer_item_id,buyer_itemid,buyer_item_name,from_company_id,from_company_name,to_company_id,to_company_name
0,195191,DRF/E/047617,EMERSON Liquid Line Filter/Drier EK-165S (0476...,328,Polfrost Air Con Pvt Ltd.,1189,H J International
1,46564,11338,"Hydraulic Fittings - cramping Nipple - 3/8"" x ...",6423,Nandan GSE Pvt Ltd,6619,ASIATIC HYDRAULICS
2,372105,20119,Hydraulic Fitting - Hose Clip Elbow 1/4 x 5/16,6423,Nandan GSE Pvt Ltd,6619,ASIATIC HYDRAULICS
3,372104,20118,"Hydraulic Hose Pipe - Hose Pipe 3/4"" x 500mm",6423,Nandan GSE Pvt Ltd,6619,ASIATIC HYDRAULICS
4,372101,20117,"Hydraulic Hose Pipe - Hose Pipe 3/4"" x 3 mtr 6...",6423,Nandan GSE Pvt Ltd,6619,ASIATIC HYDRAULICS


In [5]:
## Picking up required info from labelled data:

col = ['to_company_name', 'supplier_item_id.2', 'product', 'supplier']
tranzact_data_2 = tranzact_data_1[col]
tranzact_data_2.columns = ['OEM', 'item', 'product', 'supplier']

# Creating separate dataframe for Cesare Bonetti International Pvt. Ltd. since these are unlabelled :

tranzact_data_cesare_international = tranzact_data_2.loc[tranzact_data_2['OEM'] \
                                                               == 'CESARE BONETTI INTERNATIONAL PVT. LTD.']

tranzact_data_2_revised = tranzact_data_2.drop(tranzact_data_cesare_international.index)

tranzact_data_cesare_international = tranzact_data_cesare_international.reset_index(drop=True)
tranzact_data_2_revised = tranzact_data_2_revised.reset_index(drop=True)

## Picking up required info from un-labelled data:

col = ['from_company_name', 'buyer_item_name', 'to_company_name']
tranzact_data_new_2 = tranzact_data_new_1[col]
tranzact_data_new_2.columns = ['OEM', 'item', 'supplier']

## Merging two data frames :

tranzact_data_combined = pd.concat([tranzact_data_2_revised, tranzact_data_new_2], axis=0, ignore_index=True)



## Removing duplicate (item + supplier) combinations :

tranzact_data_combined['find_duplicate'] = tranzact_data_combined['item'].map(str) + \
                                            tranzact_data_combined['supplier'].map(str)

tranzact_data_combined.drop_duplicates(subset='find_duplicate', keep = 'first', inplace = True)
tranzact_data_combined = tranzact_data_combined.reset_index(drop=True)
        
print(len(tranzact_data_combined))
tranzact_data_combined.head()

29063


Unnamed: 0,OEM,item,product,supplier,find_duplicate
0,Polfrost Air Con Pvt Ltd.,NEW CYLINDER 404 GAS,GAS,Stallion Enterprises,NEW CYLINDER 404 GASStallion Enterprises
1,Ecofrost Technologies Private Limited,"ACCUMULATOR - MODEL A-AS 5126, (3/4"" CONNECTIO...",COOLING,Neelam Enterprises,"ACCUMULATOR - MODEL A-AS 5126, (3/4"" CONNECTIO..."
2,Ecofrost Technologies Private Limited,"ACCUMULATOR; FLOKOOL; 3/4""; 400 PSI, FKSA596",COOLING,T J CONTROLS,"ACCUMULATOR; FLOKOOL; 3/4""; 400 PSI, FKSA596T ..."
3,Polfrost Air Con Pvt Ltd.,ACCUMULATOR 1.5/8 CONNECTION,COOLING,H J International,ACCUMULATOR 1.5/8 CONNECTIONH J International
4,Polfrost Air Con Pvt Ltd.,ACCUMULATOR 7/8‘’,COOLING,H J International,ACCUMULATOR 7/8‘’H J International


In [6]:
## Cleaning Data :

pd.options.mode.chained_assignment = None

## make all in lower case :

tranzact_data_combined['OEM'] = tranzact_data_combined['OEM'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_combined['item'] = tranzact_data_combined['item'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_combined['product'] = tranzact_data_combined['product'].map(lambda x: x if type(x)!=str else x.lower())

## Join all hyphaned words :

tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\b\-\b','')  

## Removing underscores '_':

punctuation = ['_']

for i in punctuation:
    
    tranzact_data_combined['item']= tranzact_data_combined['item'].str.replace(i," ")
    
## Remove integers & all other punctuations:

tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\b\d+\b','') ## remove integers

tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\W',' ')   ## remove puntuations

## Removing some punctuations:

# punctuation = ['!', '$', '%', '&', '(', ')', '*', '+', ',', '.', ':', ';', '<', '=', '-', '>', '?', '@', \
#                '[', '\\', ']', '"', '^', '_', '`', '{', '|', '}', '~', '\t', '\n'] 
 
# for i in punctuation:
    
#     tranzact_data_combined['item']= tranzact_data_combined['item'].str.replace(i," ")
    
    
tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\s+',' ')  # making single gaps

## 

## remove rows with any cell value = none

# tranzact_data_labelled_final = tranzact_data_labelled.dropna()
# tranzact_data_un_labelled_final = tranzact_data_un_labelled.dropna()

# tranzact_data_labelled_final = tranzact_data_labelled_final.reset_index(drop=True)
# tranzact_data_un_labelled_final = tranzact_data_un_labelled_final.reset_index(drop=True)

print(len(tranzact_data_combined))


29063


In [9]:
l_ist = ['mm', 'ft']

tranzact_data_combined['item']= tranzact_data_combined['item'].apply(lambda x:' '.join([i for i in x.split()
                                                                                if i not in l_ist]).lower())


#tratranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\:alnum:\+mm\b',' ')   ## remove all 'mm' dimensions 
 
tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\b\m\d+\b',' ') ## remove 'm8' etc
tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\b\d+\mm\b',' ')
#tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\b\d+\ft\b',' ')
tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\s+',' ')  # making single gaps
#tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\b\d+\m\b',' ')
#tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\b\d+\ft\b',' ') 
tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\b\w\b','') ## remove stand alone single letters
tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\s+',' ')  # making single gaps
tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\s+',' ')  # making single gaps
tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\s+',' ')  # making single gaps
tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\s+',' ')  # making single gaps

tranzact_data_combined['item']= tranzact_data_combined['item'].apply(lambda x:' '.join([i for i in x.split()
                                                                                if i not in l_ist]).lower())

## Removing duplicate (item + supplier) combinations :

tranzact_data_combined['find_duplicate'] = tranzact_data_combined['item'].map(str) + \
                                            tranzact_data_combined['supplier'].map(str)

tranzact_data_combined.drop_duplicates(subset='find_duplicate', keep = 'first', inplace = True)
tranzact_data_combined = tranzact_data_combined.reset_index(drop=True)


print(len(tranzact_data_combined))

24247


In [10]:
from gensim.models import Word2Vec

from tqdm import tqdm

temp_corpus = tranzact_data_combined['item'].map(lambda x: x.split('.'))

corpus = []
for i in tqdm(range(len(temp_corpus))):
    for line in temp_corpus[i]:
        words = [x for x in line.split()]
        corpus.append(words)

100%|██████████| 24247/24247 [00:00<00:00, 53385.90it/s]


In [11]:
num_of_items = len(corpus)

num_of_words = 0
for line in corpus:
    num_of_words += len(line)
    
max_length = 0
for i in range(len(corpus)):
    max_length  = max(max_length, len(corpus[i]), len(corpus[i+1]))
    if i == len(corpus)-2:
        break
        
print('Num of items - %s'%(num_of_items))
print('Num of words in all items - %s'%(num_of_words))
print('Max no. of words in longest item-description - %s'%(max_length))

Num of items - 24247
Num of words in all items - 125499
Max no. of words in longest item-description - 86


In [12]:
## Splitting the combined data into labelled data and unlabelled data :

# tranzact_data_unlabelled = tranzact_data_combined.loc[tranzact_data_combined['product'] == Null]

tranzact_data_labelled = tranzact_data_combined.loc[pd.notnull(tranzact_data_combined['product'])]

tranzact_data_un_labelled = tranzact_data_combined.drop(tranzact_data_labelled.index)

tranzact_data_labelled = tranzact_data_labelled.reset_index(drop=True)
tranzact_data_un_labelled = tranzact_data_un_labelled.reset_index(drop=True)

print(len(tranzact_data_labelled))
print(len(tranzact_data_un_labelled))
tranzact_data_un_labelled.tail()

10058
14189


Unnamed: 0,OEM,item,product,supplier,find_duplicate
14184,cesare bonetti india pvt. ltd.,ms wire brush diameter,,DAMODAR ENTERPRISES,ms wire brush diameterDAMODAR ENTERPRISES
14185,r-tech products & packaging pvt ltd,polybag 9x13 inch pp,,GLORY PACK INDUSTRIES,polybag 9x13 inch ppGLORY PACK INDUSTRIES
14186,r-tech products & packaging pvt ltd,polybag 10x15 inch pp,,GLORY PACK INDUSTRIES,polybag 10x15 inch ppGLORY PACK INDUSTRIES
14187,cesare bonetti india pvt. ltd.,handle nut htb a194 gr8,,Shah Brothers,handle nut htb a194 gr8Shah Brothers
14188,r-tech products & packaging pvt ltd,pp monobox st5,,Shree Arun Packaging Co. Pvt. Ltd.,pp monobox st5Shree Arun Packaging Co. Pvt. Ltd.


In [46]:
from collections import Counter
Counter(tranzact_data_labelled["product"])

Counter({'acid': 9,
         'actuator': 61,
         'adhesive': 12,
         'alloysteelbar': 63,
         'alloysteelcasting': 99,
         'alloysteelforging': 62,
         'aluminiumcasting': 3,
         'bearing': 128,
         'belt': 28,
         'bushbearing': 93,
         'carbonsteelbar': 125,
         'cibar': 5,
         'cicasting': 179,
         'cooling': 132,
         'cscasting': 500,
         'csforging': 104,
         'diaphragm': 28,
         'electrical': 1353,
         'electronic': 255,
         'fasteners': 1737,
         'gas': 21,
         'gasket': 241,
         'gearbox': 30,
         'glass': 22,
         'hydraulic': 620,
         'ic': 78,
         'inductor': 19,
         'lab-apparatus': 33,
         'machinery': 10,
         'manifold': 47,
         'measuringinstrumentelectrical': 6,
         'measuringinstrumentmechanical': 65,
         'mechanicalconnector': 31,
         'metalsheet': 358,
         'misc': 358,
         'motor': 38,
         'msbar

In [13]:
## digitising 'product' and creating a dictionary for labelled data :

tranzact_data_labelled['product_id'] = tranzact_data_labelled['product'].factorize()[0]

product_id_df = tranzact_data_labelled[['product', 'product_id']].drop_duplicates().sort_values('product_id')
product_to_id = dict(product_id_df.values)
id_to_product = dict(product_id_df[['product_id', 'product']].values)

num_labells = max(tranzact_data_labelled['product_id']) + 1  ## required for keras last layer dimension

## removing puncuations from items of both labelled and un-labelled data :

# pd.options.mode.chained_assignment = None

# tranzact_data_labelled_final['item'] = tranzact_data_labelled_final['item'].str.replace(r'\W',' ')
# tranzact_data_un_labelled_final['item'] = tranzact_data_un_labelled_final['item'].str.replace(r'\W',' ')

## remove gaps between words to single gap

# tranzact_data_labelled_final['item'] = tranzact_data_labelled_final['item'].str.replace(r'\s+',' ')
# tranzact_data_un_labelled_final['item'] = tranzact_data_un_labelled_final['item'].str.replace(r'\s+',' ')

print('Num of product categories - %s'%(max(tranzact_data_labelled['product_id'])))
print(len(tranzact_data_labelled))
print(len(tranzact_data_un_labelled))
tranzact_data_un_labelled.tail()

Num of product categories - 65
10058
14189


Unnamed: 0,OEM,item,product,supplier,find_duplicate
14184,cesare bonetti india pvt. ltd.,ms wire brush diameter,,DAMODAR ENTERPRISES,ms wire brush diameterDAMODAR ENTERPRISES
14185,r-tech products & packaging pvt ltd,polybag 9x13 inch pp,,GLORY PACK INDUSTRIES,polybag 9x13 inch ppGLORY PACK INDUSTRIES
14186,r-tech products & packaging pvt ltd,polybag 10x15 inch pp,,GLORY PACK INDUSTRIES,polybag 10x15 inch ppGLORY PACK INDUSTRIES
14187,cesare bonetti india pvt. ltd.,handle nut htb a194 gr8,,Shah Brothers,handle nut htb a194 gr8Shah Brothers
14188,r-tech products & packaging pvt ltd,pp monobox st5,,Shree Arun Packaging Co. Pvt. Ltd.,pp monobox st5Shree Arun Packaging Co. Pvt. Ltd.


In [14]:
## tokenizer_obj_comb = Tokenizer(filters='!$%&()*+,.:;<=>?@[\\]^_`{|}~\t\n', split=' ' )
## tokenizer_obj_lab = Tokenizer(filters='!$%&()*+,.:;<=>?@[\\]^_`{|}~\t\n', split=' ' )
## tokenizer_obj_un_lab = Tokenizer(filters='!$%&()*+,.:;<=>?@[\\]^_`{|}~\t\n', split=' ' )

tokenizer_obj_comb = Tokenizer()
tokenizer_obj_lab = Tokenizer()
tokenizer_obj_un_lab = Tokenizer()

tokenizer_obj_comb.fit_on_texts(tranzact_data_combined['item'])
tokenizer_obj_lab.fit_on_texts(tranzact_data_labelled['item'])
tokenizer_obj_un_lab.fit_on_texts(tranzact_data_un_labelled['item'])

## pad sequences :

max_length_comb = max([len(s.split()) for s in tranzact_data_combined['item']])
max_length_lab = max([len(s.split()) for s in tranzact_data_labelled['item']])
max_length_un_lab = max([len(s.split()) for s in tranzact_data_un_labelled['item']])

# min_length = min([len(s.split()) for s in total_item_labelled_unlabelled['item']])

## define vocabulary size :

tokenizer_obj_lab_index = tokenizer_obj_lab.word_index
tokenizer_obj_comb_index = tokenizer_obj_comb.word_index

vocab_size_comb = len(tokenizer_obj_comb.word_index) + 1
vocab_size_lab = len(tokenizer_obj_lab.word_index) + 1
vocab_size_un_lab = len(tokenizer_obj_un_lab.word_index) + 1

comb_item_tokens = tokenizer_obj_comb.texts_to_sequences(tranzact_data_combined['item'])
comb_item_tokens_padded = pad_sequences(comb_item_tokens, maxlen = max_length_comb, padding = 'post')

# total_item_tokens = tokenizer_obj.texts_to_sequences(total_item_labelled_unlabelled['item'])
# total_item_tokens_1 = tokenizer_obj_1.texts_to_sequences(tranzact_data_labelled_final['item'])

# total_item_pad = pad_sequences(total_item_tokens, maxlen = max_length, padding = 'post')
# total_item_pad_1 = pad_sequences(total_item_tokens_1, maxlen = max_length, padding = 'post')

print(max_length_comb)
print(max_length_lab)
print(max_length_un_lab)
print('==============')
print(vocab_size_comb)
print(vocab_size_lab)
print(vocab_size_un_lab)
print('=================')
print(comb_item_tokens_padded.shape)
print(comb_item_tokens_padded[0])

86
32
86
14272
7340
10527
(24247, 86)
[493 175 403   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0]


In [15]:
temp_corpus_labelled = tranzact_data_labelled['item'].map(lambda x: x.split('.'))

corpus_labelled = []
for i in tqdm(range(len(temp_corpus_labelled))):
    for line in temp_corpus_labelled[i]:
        words_labelled = [x for x in line.split()]
        corpus_labelled.append(words_labelled)
        

100%|██████████| 10058/10058 [00:00<00:00, 47895.61it/s]


In [16]:
temp_corpus_un_labelled = tranzact_data_un_labelled['item'].map(lambda x: x.split('.'))

corpus_un_labelled = []
for i in tqdm(range(len(temp_corpus_un_labelled))):
    for line in temp_corpus_un_labelled[i]:
        words_un_labelled = [x for x in line.split()]
        corpus_un_labelled.append(words_un_labelled)

100%|██████████| 14189/14189 [00:00<00:00, 46557.37it/s]


In [88]:
print(corpus_labelled)



In [17]:
num_of_items_labelled = len(corpus_labelled)

num_of_words_labelled = 0
for line in corpus_labelled:
    num_of_words_labelled += len(line)
    
max_length_labelled = 0
for i in range(len(corpus_labelled)):
    max_length_labelled  = max(max_length_labelled, len(corpus_labelled[i]), len(corpus_labelled[i+1]))
    if i == len(corpus_labelled)-2:
        break
        
        
num_of_items_un_labelled = len(corpus_un_labelled)

num_of_words_un_labelled = 0
for line in corpus_un_labelled:
    num_of_words_un_labelled += len(line)
    
max_length_un_labelled = 0
for i in range(len(corpus_un_labelled)):
    max_length_un_labelled  = max(max_length_un_labelled, len(corpus_un_labelled[i]), len(corpus_un_labelled[i+1]))
    if i == len(corpus_un_labelled)-2:
        break


print('Num of labelled sentences - %s'%(num_of_items_labelled))
print('Num of labelled words - %s'%(num_of_words_labelled))
print('Max no. of labelled words in a sentence - %s'%(max_length_labelled))
print('Num of un-labelled sentences - %s'%(num_of_items_un_labelled))
print('Num of un-labelled words - %s'%(num_of_words_un_labelled))
print('Max no. of un-labelled words in a sentence - %s'%(max_length_un_labelled))

Num of labelled sentences - 10058
Num of labelled words - 51347
Max no. of labelled words in a sentence - 32
Num of un-labelled sentences - 14189
Num of un-labelled words - 74152
Max no. of un-labelled words in a sentence - 86


In [188]:
## for only embedding layer output
from keras.models import Sequential

emb_dim = 300

model_emb = Sequential()
model_emb.add(Embedding(vocab_size_comb, emb_dim , mask_zero = True, input_length= max_length_comb))

# the model will take as input an integer matrix of size (batch, input_length).
# the largest integer (i.e. word index) in the input should be
# no larger than 999 (vocabulary size).
# now model.output_shape == (None, 10, 64), where None is the batch dimension.
##  input_array = np.random.randint(1000, size=(32, 10))

input_array = comb_item_tokens_padded  ## Shape = 24247 X 86

model_emb.compile('adam', 'categorical_crossentropy')
emb_output_array = model_emb.predict(input_array)
assert emb_output_array.shape == (num_of_items, max_length_comb, emb_dim)
print(emb_output_array.shape)

(24247, 86, 300)


In [43]:
# tokenizer_obj_comb_index = tokenizer_obj_comb.word_index

print(tokenizer_obj_comb_index)



In [189]:
## 09-04-2019

emb_mat = model_emb.get_weights()[0]

print(emb_mat.shape)

words_embeddings = {w:emb_mat[idx] for w, idx in tokenizer_obj_comb_index.items()}
#print(words_embeddings['polyethylene'])

(14272, 300)


In [211]:
## Prepare embedding matrix :

embedding_matrix = np.zeros((vocab_size_lab, emb_dim))

for word, i in tokenizer_obj_lab_index.items():
    
    emb_vector = words_embeddings[word]
    
#    emb_vector = model_1.wv.word_vec(word)
    
    if emb_vector is not None:
        embedding_matrix[i] = emb_vector
        
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))


print(embedding_matrix.shape)

Null word embeddings: 1
(7340, 300)


In [212]:
## The model

model_3 = Sequential()
model_3.add(Embedding(vocab_size_lab, emb_dim, weights=[embedding_matrix],\
                    input_length=max_length_lab, trainable=True))
#model_3.add(SpatialDropout1D(0.2))
#model_3.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
#model_3.add(MaxPooling1D(pool_size=2))

#model_3.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
#model_3.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model_3.add(Bidirectional(GRU(units=64, dropout=0.2, recurrent_dropout=0.2)))
#model_3.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model_3.add(Dense(256, activation='relu'))
model_3.add(Dense(num_labells, activation='softmax'))
model_3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

model_3.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_27 (Embedding)     (None, 32, 300)           2202000   
_________________________________________________________________
bidirectional_17 (Bidirectio (None, 128)               140160    
_________________________________________________________________
dense_28 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_29 (Dense)             (None, 66)                16962     
Total params: 2,392,146
Trainable params: 2,392,146
Non-trainable params: 0
_________________________________________________________________


In [213]:
## Creating X and Y variables for training. X = Items or item 'feature'. Y = 'product' or product category

X = []
for i in range(len(tranzact_data_labelled)):
    
    X.append(tranzact_data_labelled.iloc[i]['item'])
#Q = np.array(tranzact_data_3["product"])
Y = tranzact_data_labelled['product']
print(X[1])

print(Y[1])

##############################################################################################################

Un_labelled = []
for i in range(len(tranzact_data_un_labelled)):
    
    Un_labelled.append(tranzact_data_un_labelled.iloc[i]['item'])


accumulator model aas connection emerson make
cooling


In [58]:
Input_Item_Name = ['QUAD BAND GSM MODEM IC SIM800C LCC SIMCOM QUAD BAND GSM IC 32M S2-10688-Z1L0X']


In [214]:
X_tokens = tokenizer_obj_lab.texts_to_sequences(X)
#X_test_tokens = tokenizer_obj_lab.texts_to_sequences(X_test)

X_pad = pad_sequences(X_tokens, maxlen = max_length_labelled, padding = 'post')
#X_test_pad = pad_sequences(X_test_tokens, maxlen = max_length_labelled, padding = 'post')

print(X_tokens[0])


Un_label_tokens = tokenizer_obj_un_lab.texts_to_sequences(Un_labelled)

Un_label_pad = pad_sequences(Un_label_tokens, maxlen = max_length_labelled, padding = 'post')

[1115, 145, 573]


In [94]:
print(type(X_tokens))
print(X_tokens)

<class 'list'>
[[1454, 237, 2814, 792], [1106, 2, 267, 4147, 4148, 128, 1107, 2, 793, 66], [1106, 2222, 128, 484, 485, 2815], [1106, 10, 540, 1107], [1106, 4149], [1106, 4150, 4151], [1106, 2222, 522, 484, 485, 2815], [1106, 1880], [1106, 4152], [1106, 4153], [4154, 1881], [4155, 1881], [4156, 4157, 1881], [4158, 244, 145, 1881], [2816, 2817, 1882, 237, 332], [1882, 237, 4159], [1882, 792], [4160, 1108], [895, 4161, 1108], [4162, 1108, 60, 2818], [2819, 4163, 4164, 2820, 1108], [4165, 1108], [4166, 1108, 4167, 675, 4168], [4169, 1108], [4170, 2820, 1108], [507, 2, 4171, 1108, 4172], [1312, 675, 4173, 847, 4174, 4175], [486, 268, 4176, 269, 950, 302, 386], [486, 268, 4177], [486, 268, 4178], [486, 268, 4179, 269, 950, 302, 386], [1883, 66, 595, 1027, 268], [1883, 66, 486, 1624, 297, 1027, 268, 4180, 485, 487, 267, 244, 596, 66, 2821, 52, 2822, 4181, 4182, 1625, 487, 267, 2823, 90, 1884, 76, 151, 152, 1626], [486, 268, 4183], [268, 457, 704, 60, 312, 2223], [486, 268, 4184, 86, 597], [26

In [215]:
from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()

Y_coded = encoder.fit_transform(Y)

print(Y_coded.shape)



(10058, 66)


In [216]:
#train test split in 80% / 20% ratio

from sklearn.model_selection import train_test_split
X_train_pad, X_test_pad, Y_train_coded, Y_test_coded = train_test_split(X_pad, Y_coded, test_size=0.20, random_state = 5)


In [None]:
# total_item_tokens = tokenizer_obj.texts_to_sequences(total_item_labelled_unlabelled['item'])
# total_item_tokens_1 = tokenizer_obj_1.texts_to_sequences(tranzact_data_labelled_final['item'])

# total_item_pad = pad_sequences(total_item_tokens, maxlen = max_length, padding = 'post')
# total_item_pad_1 = pad_sequences(total_item_tokens_1, maxlen = max_length, padding = 'post')


In [48]:
## X_train and X_test word embedding :

#X_train_tokens = tokenizer_obj_lab.texts_to_sequences(X_train)
#X_test_tokens = tokenizer_obj_lab.texts_to_sequences(X_test)

#X_train_pad = pad_sequences(X_train_tokens, maxlen = max_length_labelled, padding = 'post')
#X_test_pad = pad_sequences(X_test_tokens, maxlen = max_length_labelled, padding = 'post')

In [62]:
print(X_train_pad.shape)
print(X_test_pad.shape)
print(max_length_lab)
print(vocab_size_lab)

print(Y_train_coded.shape)

(8046, 32)
(2012, 32)
32
7340
(8046, 66)


In [217]:
len(X_test_pad)
X_test_tokens = []

for i in range(len(X_test_pad)):
    X_test_tokens.append((X_test_pad[i][X_test_pad[i] != 0]).tolist())
    
un_lab_tokens = []
for j in range(len(Un_label_pad)):
    un_lab_tokens.append((Un_label_pad[j][Un_label_pad[j] != 0]).tolist())

In [122]:
print(type(X_test_tokens))
print(len(X_test_tokens))
print(len(X_test_pad))
print(X_test_tokens)

<class 'list'>
2361
2361
[[6873, 162, 1575, 1428], [206, 5, 50, 7746, 840], [597, 46, 82, 6475, 6476, 3525, 7, 6477], [6924, 489, 10, 1666, 243], [440, 54], [578, 303, 6965], [702, 25, 1507, 3889, 510], [369, 748, 2138, 1197, 2139, 2, 7232], [53, 896, 855, 30, 72], [43, 4, 653, 2880, 87], [130, 2, 59, 37, 50, 2, 362], [346, 38, 51, 351, 16, 560, 145, 23], [5945, 32, 500, 5946, 1513, 1547], [2603, 1087, 487, 1206, 8272, 7, 441, 2, 8273], [89, 4, 1464, 414], [3265, 5658, 269, 1740, 1739, 2, 16, 3266, 3267, 3268, 46, 3269, 2354, 86, 329, 229, 2024, 66], [812, 1244, 20, 745, 81, 288, 52, 132, 6945], [76, 588, 1781, 68, 6105, 34, 147], [674, 73], [207, 50, 7756, 2696, 12, 215], [574, 22, 1401, 997, 1750, 408, 150], [588, 75, 6256, 71, 63, 113], [8, 19, 33, 51, 3, 70, 1, 219, 71], [129, 11, 2, 90, 2, 10, 747, 3, 1, 16, 186], [6, 5, 1973, 1974, 1975, 13, 17, 3, 65], [730, 1096, 334, 351, 221, 332, 761, 151, 145, 847, 7827, 1296], [123, 9, 2318, 178, 187, 980, 569, 51], [1380, 121, 3208, 5384,

In [218]:

text = tokenizer_obj_lab.sequences_to_texts(X_test_tokens)

X_test_text = pd.DataFrame()
X_test_text = pd.DataFrame(text)
X_test_text.columns = ['items']
X_test_text.head()

text_un_lab = tokenizer_obj_un_lab.sequences_to_texts(un_lab_tokens)

un_lab_text = pd.DataFrame()
un_lab_text = pd.DataFrame(text_un_lab)
un_lab_text.columns = ['items']
un_lab_text.head()

Unnamed: 0,items
0,emerson liquid line filter drier ek165s odf so...
1,hydraulic fittings cramping nipple
2,hydraulic fitting hose clip elbow
3,hydraulic hose pipe hose pipe
4,hydraulic hose pipe hose pipe mtr 6000psi


In [158]:
print(Y_test_coded[0])
print(Y_test_coded[0].shape)
print(Y_test_coded.shape)
print(([Y_test_coded][0]).shape)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
(67,)
(2361, 67)
(2361, 67)


In [241]:
encoder.inverse_transform(Y_test_coded)

array(['tool', 'metalsheet', 'valve', ..., 'spring', 'pipe', 'pipe'],
      dtype='<U29')

In [88]:

ppp = [encoder.inverse_transform([Y_test_coded][0])][0][0]
print(ppp)

tool


In [219]:
model_3.fit(X_train_pad,Y_train_coded,batch_size=64,epochs=10,
          validation_split=0.2)

Train on 6436 samples, validate on 1610 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff9cc82d278>

In [220]:
#test_sequences = tok.texts_to_sequences(X_test)
#test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
accr = model_3.evaluate(X_test_pad,Y_test_coded)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.622
  Accuracy: 0.887
