In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D, GRU
from keras.layers import Conv1D, MaxPooling1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.layers import Bidirectional
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline


Using TensorFlow backend.


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
## Loading Data - labelled data:

data_1 = pd.ExcelFile('R1_RNN_Tranzact_Rohan_1_Regrouped_with_supplier.xlsx')
tranzact_data_1 = data_1.parse(0)

tranzact_data_1.tail()

Unnamed: 0,supplier_item_id,to_company_name,supplier_item_id.1,supplier_item_id.2,product,supplier,Done,Atul Sugg.,sub_type,material,process,grade,spec,brand
13552,86390,CESARE BONETTI INTERNATIONAL PVT. LTD.,86390,"YOKE SLEEVE;8"" # 150;GTV; B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13553,89723,CESARE BONETTI INTERNATIONAL PVT. LTD.,89723,"YOKE SLEEVE 1½"" #2700 BLY;B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13554,89995,CESARE BONETTI INTERNATIONAL PVT. LTD.,89995,"YOKE SLEEVE;4"" #150 GTV; B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13555,117819,WAAREE INDUSTRIES PVT. LTD.,117819,"YOKE SLEEVE;8"" # 150;GTV; A439 D2",cicasting,Waaree Industries Pvt.Ltd.,278.0,,,,,,,
13556,169292,CESARE BONETTI INTERNATIONAL PVT. LTD.,169292,"YOKE SLEEVE; 1"";#1500;CBD;A582 T416",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,,,


In [4]:
## Loading Data - un-labelled data:

data_2 = pd.ExcelFile('14Mar_Gorky.xlsx')
tranzact_data_new_1 = data_2.parse(0)

tranzact_data_new_1.tail()

Unnamed: 0,buyer_item_id,buyer_itemid,buyer_item_name,from_company_id,from_company_name,to_company_id,to_company_name
17763,289052,PLYBAG_10X15,POLYBAG_10X15 INCH_PP,13872,R-Tech Products & Packaging Pvt Ltd,16730,GLORY PACK INDUSTRIES
17764,289639,HANDLENUTM12X5MMXX,"HANDLE NUT 1"" #600,HTB,A194 Gr8",842,CESARE BONETTI INDIA PVT. LTD.,7434,Shah Brothers
17765,48512,RDBR50 (SS431),ROUND BAR OD50 ASTM A276 TYPE 431,114,Entech Controls,921,ALPESH METALS
17766,57696,PGX618101601202200,"PTV 6"" #300 RF BVE WCB HW IBR",7506,CESARE BONETTI INTERNATIONAL PVT. LTD.,842,CESARE BONETTI INDIA PVT. LTD.
17767,274954,PPBOX_ST5,PP_MONOBOX_ST5,13872,R-Tech Products & Packaging Pvt Ltd,15527,Shree Arun Packaging Co. Pvt. Ltd.


In [5]:
## Picking up required info from labelled data:

col = ['to_company_name', 'supplier_item_id.2', 'product', 'supplier']
tranzact_data_2 = tranzact_data_1[col]
tranzact_data_2.columns = ['OEM', 'item', 'product', 'supplier']

# Creating separate dataframe for Cesare Bonetti International Pvt. Ltd. since these are unlabelled :

tranzact_data_cesare_international = tranzact_data_2.loc[tranzact_data_2['OEM'] \
                                                               == 'CESARE BONETTI INTERNATIONAL PVT. LTD.']

tranzact_data_2_revised = tranzact_data_2.drop(tranzact_data_cesare_international.index)

tranzact_data_cesare_international = tranzact_data_cesare_international.reset_index(drop=True)
tranzact_data_2_revised = tranzact_data_2_revised.reset_index(drop=True)

## Picking up required info from un-labelled data:

col = ['from_company_name', 'buyer_item_name', 'to_company_name']
tranzact_data_new_2 = tranzact_data_new_1[col]
tranzact_data_new_2.columns = ['OEM', 'item', 'supplier']

## Merging two data frames :

tranzact_data_combined = pd.concat([tranzact_data_2_revised, tranzact_data_new_2], axis=0, ignore_index=True)



## Removing duplicate (item + supplier) combinations :

tranzact_data_combined['find_duplicate'] = tranzact_data_combined['item'].map(str) + \
                                            tranzact_data_combined['supplier'].map(str)

tranzact_data_combined.drop_duplicates(subset='find_duplicate', keep = 'first', inplace = True)
tranzact_data_combined = tranzact_data_combined.reset_index(drop=True)
        
print(len(tranzact_data_combined))

tranzact_data_combined.tail()

29063


Unnamed: 0,OEM,item,product,supplier,find_duplicate
29058,R-Tech Products & Packaging Pvt Ltd,POLYBAG_10X15 INCH_PP,,GLORY PACK INDUSTRIES,POLYBAG_10X15 INCH_PPGLORY PACK INDUSTRIES
29059,CESARE BONETTI INDIA PVT. LTD.,"HANDLE NUT 1"" #600,HTB,A194 Gr8",,Shah Brothers,"HANDLE NUT 1"" #600,HTB,A194 Gr8Shah Brothers"
29060,Entech Controls,ROUND BAR OD50 ASTM A276 TYPE 431,,ALPESH METALS,ROUND BAR OD50 ASTM A276 TYPE 431ALPESH METALS
29061,CESARE BONETTI INTERNATIONAL PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBR",,CESARE BONETTI INDIA PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBRCESARE BONETTI IN..."
29062,R-Tech Products & Packaging Pvt Ltd,PP_MONOBOX_ST5,,Shree Arun Packaging Co. Pvt. Ltd.,PP_MONOBOX_ST5Shree Arun Packaging Co. Pvt. Ltd.


In [6]:
## make all in lower case :

tranzact_data_combined['OEM'] = tranzact_data_combined['OEM'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_combined['item'] = tranzact_data_combined['item'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_combined['product'] = tranzact_data_combined['product'].map(lambda x: x if type(x)!=str else x.lower())

## Removing some punctuations:

punctuation = ['!', '$', '%', '&', '(', ')', '*', '+', ',', '.', ':', ';', '<', '=', '>', '?', '@', \
               '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n'] 
 

for i in punctuation:
    
    tranzact_data_combined['item']= tranzact_data_combined['item'].str.replace(i," ")
    
tranzact_data_combined['item'] = tranzact_data_combined['item'].str.replace(r'\s+',' ')

## 

## remove rows with any cell value = none

# tranzact_data_labelled_final = tranzact_data_labelled.dropna()
# tranzact_data_un_labelled_final = tranzact_data_un_labelled.dropna()

# tranzact_data_labelled_final = tranzact_data_labelled_final.reset_index(drop=True)
# tranzact_data_un_labelled_final = tranzact_data_un_labelled_final.reset_index(drop=True)

print(len(tranzact_data_combined))
tranzact_data_combined.tail()

29063


Unnamed: 0,OEM,item,product,supplier,find_duplicate
29058,r-tech products & packaging pvt ltd,polybag 10x15 inch pp,,GLORY PACK INDUSTRIES,POLYBAG_10X15 INCH_PPGLORY PACK INDUSTRIES
29059,cesare bonetti india pvt. ltd.,"handle nut 1"" #600 htb a194 gr8",,Shah Brothers,"HANDLE NUT 1"" #600,HTB,A194 Gr8Shah Brothers"
29060,entech controls,round bar od50 astm a276 type 431,,ALPESH METALS,ROUND BAR OD50 ASTM A276 TYPE 431ALPESH METALS
29061,cesare bonetti international pvt. ltd.,"ptv 6"" #300 rf bve wcb hw ibr",,CESARE BONETTI INDIA PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBRCESARE BONETTI IN..."
29062,r-tech products & packaging pvt ltd,pp monobox st5,,Shree Arun Packaging Co. Pvt. Ltd.,PP_MONOBOX_ST5Shree Arun Packaging Co. Pvt. Ltd.


In [7]:
from gensim.models import Word2Vec

from tqdm import tqdm

temp_corpus = tranzact_data_combined['item'].map(lambda x: x.split('.'))

corpus = []
for i in tqdm(range(len(temp_corpus))):
    for line in temp_corpus[i]:
        words = [x for x in line.split()]
        corpus.append(words)

100%|██████████| 29063/29063 [00:00<00:00, 51436.15it/s]


In [8]:
num_of_items = len(corpus)

num_of_words = 0
for line in corpus:
    num_of_words += len(line)
    
max_length = 0
for i in range(len(corpus)):
    max_length  = max(max_length, len(corpus[i]), len(corpus[i+1]))
    if i == len(corpus)-2:
        break
        
print('Num of items - %s'%(num_of_items))
print('Num of words in all items - %s'%(num_of_words))
print('Max no. of words in longest item-description - %s'%(max_length))

Num of items - 29063
Num of words in all items - 206797
Max no. of words in longest item-description - 103


In [9]:
## items to word2vec :

emb_dim = 80

model_1 = Word2Vec(corpus, size = emb_dim, window=5, \
               min_count=1, negative = 15, iter = 10, workers = 10, sg=1)

print(model_1)

Word2Vec(vocab=18013, size=80, alpha=0.025)


In [10]:
## Splitting the combined data into labelled data and unlabelled data :

# tranzact_data_unlabelled = tranzact_data_combined.loc[tranzact_data_combined['product'] == Null]

tranzact_data_labelled = tranzact_data_combined.loc[pd.notnull(tranzact_data_combined['product'])]

tranzact_data_un_labelled = tranzact_data_combined.drop(tranzact_data_labelled.index)

tranzact_data_labelled = tranzact_data_labelled.reset_index(drop=True)
tranzact_data_un_labelled = tranzact_data_un_labelled.reset_index(drop=True)

print(len(tranzact_data_labelled))
print(len(tranzact_data_un_labelled))
tranzact_data_un_labelled.tail()

11803
17260


Unnamed: 0,OEM,item,product,supplier,find_duplicate
17255,r-tech products & packaging pvt ltd,polybag 10x15 inch pp,,GLORY PACK INDUSTRIES,POLYBAG_10X15 INCH_PPGLORY PACK INDUSTRIES
17256,cesare bonetti india pvt. ltd.,"handle nut 1"" #600 htb a194 gr8",,Shah Brothers,"HANDLE NUT 1"" #600,HTB,A194 Gr8Shah Brothers"
17257,entech controls,round bar od50 astm a276 type 431,,ALPESH METALS,ROUND BAR OD50 ASTM A276 TYPE 431ALPESH METALS
17258,cesare bonetti international pvt. ltd.,"ptv 6"" #300 rf bve wcb hw ibr",,CESARE BONETTI INDIA PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBRCESARE BONETTI IN..."
17259,r-tech products & packaging pvt ltd,pp monobox st5,,Shree Arun Packaging Co. Pvt. Ltd.,PP_MONOBOX_ST5Shree Arun Packaging Co. Pvt. Ltd.


In [14]:
from collections import Counter
Counter(tranzact_data_labelled["product"])

Counter({'acid': 9,
         'actuator': 61,
         'adhesive': 12,
         'alloysteelbar': 63,
         'alloysteelcasting': 99,
         'alloysteelforging': 62,
         'aluminiumcasting': 3,
         'bearing': 128,
         'belt': 28,
         'bushbearing': 93,
         'carbonsteelbar': 125,
         'cibar': 5,
         'cicasting': 179,
         'cooling': 130,
         'cscasting': 500,
         'csforging': 104,
         'diaphragm': 28,
         'electrical': 1362,
         'electronic': 255,
         'fasteners': 1737,
         'gas': 21,
         'gasket': 241,
         'gearbox': 30,
         'glass': 22,
         'ic': 78,
         'inductor': 19,
         'lab-apparatus': 33,
         'machinery': 10,
         'manifold': 47,
         'measuringinstrumentelectrical': 6,
         'measuringinstrumentmechanical': 65,
         'mechanicalconnector': 289,
         'metalsheet': 358,
         'misc': 350,
         'motor': 38,
         'msbar': 20,
         'nameplate

In [156]:
## digitising 'product' and creating a dictionary for labelled data :

tranzact_data_labelled['product_id'] = tranzact_data_labelled['product'].factorize()[0]

product_id_df = tranzact_data_labelled[['product', 'product_id']].drop_duplicates().sort_values('product_id')
product_to_id = dict(product_id_df.values)
id_to_product = dict(product_id_df[['product_id', 'product']].values)

num_labells = max(tranzact_data_labelled['product_id']) + 1  ## required for keras last layer dimension

## removing puncuations from items of both labelled and un-labelled data :

# pd.options.mode.chained_assignment = None

# tranzact_data_labelled_final['item'] = tranzact_data_labelled_final['item'].str.replace(r'\W',' ')
# tranzact_data_un_labelled_final['item'] = tranzact_data_un_labelled_final['item'].str.replace(r'\W',' ')

## remove gaps between words to single gap

# tranzact_data_labelled_final['item'] = tranzact_data_labelled_final['item'].str.replace(r'\s+',' ')
# tranzact_data_un_labelled_final['item'] = tranzact_data_un_labelled_final['item'].str.replace(r'\s+',' ')

print('Num of product categories - %s'%(max(tranzact_data_labelled['product_id'])))
print(len(tranzact_data_labelled))
print(len(tranzact_data_un_labelled))
tranzact_data_un_labelled.tail()

Num of product categories - 66
11803
17260


Unnamed: 0,OEM,item,product,supplier,find_duplicate
17255,r-tech products & packaging pvt ltd,polybag 10x15 inch pp,,GLORY PACK INDUSTRIES,POLYBAG_10X15 INCH_PPGLORY PACK INDUSTRIES
17256,cesare bonetti india pvt. ltd.,"handle nut 1"" #600 htb a194 gr8",,Shah Brothers,"HANDLE NUT 1"" #600,HTB,A194 Gr8Shah Brothers"
17257,entech controls,round bar od50 astm a276 type 431,,ALPESH METALS,ROUND BAR OD50 ASTM A276 TYPE 431ALPESH METALS
17258,cesare bonetti international pvt. ltd.,"ptv 6"" #300 rf bve wcb hw ibr",,CESARE BONETTI INDIA PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBRCESARE BONETTI IN..."
17259,r-tech products & packaging pvt ltd,pp monobox st5,,Shree Arun Packaging Co. Pvt. Ltd.,PP_MONOBOX_ST5Shree Arun Packaging Co. Pvt. Ltd.


In [12]:
tokenizer_obj_comb = Tokenizer(filters='!$%&()*+,.:;<=>?@[\\]^_`{|}~\t\n', split=' ' )
tokenizer_obj_lab = Tokenizer(filters='!$%&()*+,.:;<=>?@[\\]^_`{|}~\t\n', split=' ' )
tokenizer_obj_un_lab = Tokenizer(filters='!$%&()*+,.:;<=>?@[\\]^_`{|}~\t\n', split=' ' )

tokenizer_obj_comb.fit_on_texts(tranzact_data_combined['item'])
tokenizer_obj_lab.fit_on_texts(tranzact_data_labelled['item'])
tokenizer_obj_un_lab.fit_on_texts(tranzact_data_un_labelled['item'])

## pad sequences :

max_length_comb = max([len(s.split()) for s in tranzact_data_combined['item']])
max_length_lab = max([len(s.split()) for s in tranzact_data_labelled['item']])
max_length_un_lab = max([len(s.split()) for s in tranzact_data_un_labelled['item']])

# min_length = min([len(s.split()) for s in total_item_labelled_unlabelled['item']])

## define vocabulary size :

tokenizer_obj_lab_index = tokenizer_obj_lab.word_index

vocab_size_comb = len(tokenizer_obj_comb.word_index) + 1
vocab_size_lab = len(tokenizer_obj_lab.word_index) + 1
vocab_size_un_lab = len(tokenizer_obj_un_lab.word_index) + 1

# total_item_tokens = tokenizer_obj.texts_to_sequences(total_item_labelled_unlabelled['item'])
# total_item_tokens_1 = tokenizer_obj_1.texts_to_sequences(tranzact_data_labelled_final['item'])

# total_item_pad = pad_sequences(total_item_tokens, maxlen = max_length, padding = 'post')
# total_item_pad_1 = pad_sequences(total_item_tokens_1, maxlen = max_length, padding = 'post')

print(max_length_comb)
print(max_length_lab)
print(max_length_un_lab)
print('==============')
print(vocab_size_comb)
print(vocab_size_lab)
print(vocab_size_un_lab)

103
39
103
18014
9509
13067


In [13]:
temp_corpus_labelled = tranzact_data_labelled['item'].map(lambda x: x.split('.'))

corpus_labelled = []
for i in tqdm(range(len(temp_corpus_labelled))):
    for line in temp_corpus_labelled[i]:
        words_labelled = [x for x in line.split()]
        corpus_labelled.append(words_labelled)

100%|██████████| 11803/11803 [00:00<00:00, 42793.29it/s]


In [14]:
temp_corpus_un_labelled = tranzact_data_un_labelled['item'].map(lambda x: x.split('.'))

corpus_un_labelled = []
for i in tqdm(range(len(temp_corpus_un_labelled))):
    for line in temp_corpus_un_labelled[i]:
        words_un_labelled = [x for x in line.split()]
        corpus_un_labelled.append(words_un_labelled)

100%|██████████| 17260/17260 [00:00<00:00, 51280.89it/s]


In [16]:
print(corpus_labelled[0])

['new', 'cylinder', '404', 'gas']


In [17]:
num_of_items_labelled = len(corpus_labelled)

num_of_words_labelled = 0
for line in corpus_labelled:
    num_of_words_labelled += len(line)
    
max_length_labelled = 0
for i in range(len(corpus_labelled)):
    max_length_labelled  = max(max_length_labelled, len(corpus_labelled[i]), len(corpus_labelled[i+1]))
    if i == len(corpus_labelled)-2:
        break
        
        
num_of_items_un_labelled = len(corpus_un_labelled)

num_of_words_un_labelled = 0
for line in corpus_un_labelled:
    num_of_words_un_labelled += len(line)
    
max_length_un_labelled = 0
for i in range(len(corpus_un_labelled)):
    max_length_un_labelled  = max(max_length_un_labelled, len(corpus_un_labelled[i]), len(corpus_un_labelled[i+1]))
    if i == len(corpus_un_labelled)-2:
        break


print('Num of labelled sentences - %s'%(num_of_items_labelled))
print('Num of labelled words - %s'%(num_of_words_labelled))
print('Max no. of labelled words in a sentence - %s'%(max_length_labelled))
print('Num of un-labelled sentences - %s'%(num_of_items_un_labelled))
print('Num of un-labelled words - %s'%(num_of_words_un_labelled))
print('Max no. of un-labelled words in a sentence - %s'%(max_length_un_labelled))

Num of labelled sentences - 11803
Num of labelled words - 86127
Max no. of labelled words in a sentence - 39
Num of un-labelled sentences - 17260
Num of un-labelled words - 120670
Max no. of un-labelled words in a sentence - 103


In [162]:
## prepare embeddings

## Preparing embedding matrix

## nb_words = min(vocab_size, vocab_size_1)-1

embedding_matrix = np.zeros((vocab_size_lab, emb_dim))

for word, i in tokenizer_obj_lab_index.items():
    
    emb_vector = model_1.wv.word_vec(word)
    
    if emb_vector is not None:
        embedding_matrix[i] = emb_vector
        
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))


Null word embeddings: 1


In [20]:
print(type(embedding_matrix))
print(embedding_matrix.shape)
print(embedding_matrix[0])
print('====================================')
print(embedding_matrix[9508])

<class 'numpy.ndarray'>
(9509, 80)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
[-0.1197671  -0.19976822 -0.1226549   0.01244516  0.09291881  0.07944667
  0.26135004 -0.08526938 -0.16277462 -0.1487934  -0.25571224 -0.07030543
 -0.06002564  0.04638237 -0.35307917  0.09544477  0.21462013 -0.14599212
  0.27784052 -0.23630147  0.3845998  -0.34082943 -0.07849368  0.50536484
 -0.21459618 -0.25117096 -0.16330597  0.20352568 -0.0408821   0.27964932
 -0.06286994 -0.11914232 -0.32240015 -0.31961137 -0.55712885  0.06404164
  0.02561327 -0.11268542  0.00494923  0.02816018  0.27702028 -0.13785642
 -0.15731809  0.15615804 -0.12265145 -0.10803672  0.03062834 -0.0264197
  0.04093572  0.01095584  0.32362533  0.04655594  0.3825244  -0.42067805
 -0.00656419  0.06181125  0.13144606  0.31073031  0.29134333  0.

In [None]:
## Total no. of unique words = vocab_size = 14707
## each word has got 150 parameters as per Embedding Dimension defined.
## Hence total Param # = 14707 * 150 = 2206050


In [163]:
from keras.models import Sequential

model_2 = Sequential()
model_2.add(Embedding(vocab_size_lab, emb_dim, weights=[embedding_matrix],\
                    input_length=max_length_un_labelled, trainable=True))
model_2.add(SpatialDropout1D(0.2))
#model_2.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
#model_2.add(MaxPooling1D(pool_size=2))

model_2.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
#model_2.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model_2.add(Dense(256, activation='tanh'))
model_2.add(Dense(num_labells, activation='softmax'))
model_2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

model_2.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 103, 80)           760720    
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 103, 80)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               74240     
_________________________________________________________________
dense_5 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_6 (Dense)              (None, 67)                17219     
Total params: 885,203
Trainable params: 885,203
Non-trainable params: 0
_________________________________________________________________


In [164]:
## Creating X and Y variables for training. X = Items or item 'feature'. Y = 'product' or product category

X = []
for i in range(len(tranzact_data_labelled)):
    
    X.append(tranzact_data_labelled.iloc[i]['item'])
#Q = np.array(tranzact_data_3["product"])
Y = tranzact_data_labelled['product']
print(X[1])

print(Y[1])

##############################################################################################################

Un_labelled = []
for i in range(len(tranzact_data_un_labelled)):
    
    Un_labelled.append(tranzact_data_un_labelled.iloc[i]['item'])


accumulator - model a-as 5126 3/4" connection - emerson make
cooling


In [171]:
X_tokens = tokenizer_obj_lab.texts_to_sequences(X)
#X_test_tokens = tokenizer_obj_lab.texts_to_sequences(X_test)

X_pad = pad_sequences(X_tokens, maxlen = max_length_un_labelled, padding = 'post')
#X_test_pad = pad_sequences(X_test_tokens, maxlen = max_length_labelled, padding = 'post')

print(X_tokens[0])


Un_label_tokens = tokenizer_obj_un_lab.texts_to_sequences(Un_labelled)

Un_label_pad = pad_sequences(Un_label_tokens, maxlen = max_length_un_labelled, padding = 'post')

[1454, 237, 2814, 792]


In [94]:
print(type(X_tokens))
print(X_tokens)

<class 'list'>
[[1454, 237, 2814, 792], [1106, 2, 267, 4147, 4148, 128, 1107, 2, 793, 66], [1106, 2222, 128, 484, 485, 2815], [1106, 10, 540, 1107], [1106, 4149], [1106, 4150, 4151], [1106, 2222, 522, 484, 485, 2815], [1106, 1880], [1106, 4152], [1106, 4153], [4154, 1881], [4155, 1881], [4156, 4157, 1881], [4158, 244, 145, 1881], [2816, 2817, 1882, 237, 332], [1882, 237, 4159], [1882, 792], [4160, 1108], [895, 4161, 1108], [4162, 1108, 60, 2818], [2819, 4163, 4164, 2820, 1108], [4165, 1108], [4166, 1108, 4167, 675, 4168], [4169, 1108], [4170, 2820, 1108], [507, 2, 4171, 1108, 4172], [1312, 675, 4173, 847, 4174, 4175], [486, 268, 4176, 269, 950, 302, 386], [486, 268, 4177], [486, 268, 4178], [486, 268, 4179, 269, 950, 302, 386], [1883, 66, 595, 1027, 268], [1883, 66, 486, 1624, 297, 1027, 268, 4180, 485, 487, 267, 244, 596, 66, 2821, 52, 2822, 4181, 4182, 1625, 487, 267, 2823, 90, 1884, 76, 151, 152, 1626], [486, 268, 4183], [268, 457, 704, 60, 312, 2223], [486, 268, 4184, 86, 597], [26

In [172]:
from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()

Y_coded = encoder.fit_transform(Y)

print(Y_coded.shape)



(11803, 67)


In [173]:
#train test split in 80% / 20% ratio

from sklearn.model_selection import train_test_split
X_train_pad, X_test_pad, Y_train_coded, Y_test_coded = train_test_split(X_pad, Y_coded, test_size=0.20, random_state = 5)


In [None]:
# total_item_tokens = tokenizer_obj.texts_to_sequences(total_item_labelled_unlabelled['item'])
# total_item_tokens_1 = tokenizer_obj_1.texts_to_sequences(tranzact_data_labelled_final['item'])

# total_item_pad = pad_sequences(total_item_tokens, maxlen = max_length, padding = 'post')
# total_item_pad_1 = pad_sequences(total_item_tokens_1, maxlen = max_length, padding = 'post')


In [48]:
## X_train and X_test word embedding :

#X_train_tokens = tokenizer_obj_lab.texts_to_sequences(X_train)
#X_test_tokens = tokenizer_obj_lab.texts_to_sequences(X_test)

#X_train_pad = pad_sequences(X_train_tokens, maxlen = max_length_labelled, padding = 'post')
#X_test_pad = pad_sequences(X_test_tokens, maxlen = max_length_labelled, padding = 'post')

In [29]:
print(X_train_pad.shape)
print(X_test_pad.shape)
print(max_length_lab)
print(vocab_size_lab)

print(Y_train_coded.shape)

(9442, 39)
(2361, 39)
39
9509
(9442, 67)


In [174]:
len(X_test_pad)
X_test_tokens = []

for i in range(len(X_test_pad)):
    X_test_tokens.append((X_test_pad[i][X_test_pad[i] != 0]).tolist())

In [122]:
print(type(X_test_tokens))
print(len(X_test_tokens))
print(len(X_test_pad))
print(X_test_tokens)

<class 'list'>
2361
2361
[[6873, 162, 1575, 1428], [206, 5, 50, 7746, 840], [597, 46, 82, 6475, 6476, 3525, 7, 6477], [6924, 489, 10, 1666, 243], [440, 54], [578, 303, 6965], [702, 25, 1507, 3889, 510], [369, 748, 2138, 1197, 2139, 2, 7232], [53, 896, 855, 30, 72], [43, 4, 653, 2880, 87], [130, 2, 59, 37, 50, 2, 362], [346, 38, 51, 351, 16, 560, 145, 23], [5945, 32, 500, 5946, 1513, 1547], [2603, 1087, 487, 1206, 8272, 7, 441, 2, 8273], [89, 4, 1464, 414], [3265, 5658, 269, 1740, 1739, 2, 16, 3266, 3267, 3268, 46, 3269, 2354, 86, 329, 229, 2024, 66], [812, 1244, 20, 745, 81, 288, 52, 132, 6945], [76, 588, 1781, 68, 6105, 34, 147], [674, 73], [207, 50, 7756, 2696, 12, 215], [574, 22, 1401, 997, 1750, 408, 150], [588, 75, 6256, 71, 63, 113], [8, 19, 33, 51, 3, 70, 1, 219, 71], [129, 11, 2, 90, 2, 10, 747, 3, 1, 16, 186], [6, 5, 1973, 1974, 1975, 13, 17, 3, 65], [730, 1096, 334, 351, 221, 332, 761, 151, 145, 847, 7827, 1296], [123, 9, 2318, 178, 187, 980, 569, 51], [1380, 121, 3208, 5384,

In [175]:

text = tokenizer_obj_lab.sequences_to_texts(X_test_tokens)

X_test_text = pd.DataFrame()
X_test_text = pd.DataFrame(text)
X_test_text.columns = ['items']
X_test_text.head()

Unnamed: 0,items
0,xomt130406-pd insert pc5300 korloy
1,carbon steel plate 10tx1500wx6000l is2062-e-250
2,handwheel c i d-400 a-30 s-35 5 d2-90
3,lashing belt 1 ton 25mm
4,solenoid valve


In [158]:
print(Y_test_coded[0])
print(Y_test_coded[0].shape)
print(Y_test_coded.shape)
print(([Y_test_coded][0]).shape)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
(67,)
(2361, 67)
(2361, 67)


In [135]:
encoder.inverse_transform(Y_test_coded)

array(['tool', 'metalsheet', 'valve', ..., 'spring', 'pipe', 'pipe'],
      dtype='<U29')

In [88]:

ppp = [encoder.inverse_transform([Y_test_coded][0])][0][0]
print(ppp)

tool


In [176]:
model_2.fit(X_train_pad,Y_train_coded,batch_size=64,epochs=20,
          validation_split=0.2)

Train on 7553 samples, validate on 1889 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f67d5b603c8>

In [177]:
#test_sequences = tok.texts_to_sequences(X_test)
#test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
accr = model_2.evaluate(X_test_pad,Y_test_coded)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.333
  Accuracy: 0.924


In [45]:
bb = encoder.inverse_transform(Y_test_coded)
print(aa)

['tool' 'metalsheet' 'valve' ... 'spring' 'pipe' 'pipe']


In [178]:
df_or_label = pd.DataFrame()
df_pred_label = pd.DataFrame()

df_or_label = pd.DataFrame(encoder.inverse_transform(Y_test_coded))
df_or_label.columns = ['product_or']

df_pred_label = pd.DataFrame(encoder.inverse_transform(model_2.predict(X_test_pad)))
df_pred_label.columns = ['product_pred_1']

print(len(df_or_label))
print(len(df_pred_label))

# df_label_compare = df_or_label.merge(df_pred_label)

df_label_compare = pd.concat([X_test_text, df_or_label, df_pred_label], axis=1)

2361
2361


In [179]:
df_label_compare.head(20)

Unnamed: 0,items,product_or,product_pred_1
0,xomt130406-pd insert pc5300 korloy,tool,tool
1,carbon steel plate 10tx1500wx6000l is2062-e-250,metalsheet,metalsheet
2,handwheel c i d-400 a-30 s-35 5 d2-90,valve,valve
3,lashing belt 1 ton 25mm,belt,belt
4,solenoid valve,valve,valve
5,frb 13 5/200,bearing,bearing
6,"segmental ring 26"" #600tcv aisi420",sealing,sealing
7,plastic sheets hm hdpe polythyelene - 30x48,nonmetalsheet,nonmetalsheet
8,stud m18 l85 a193 b7,fasteners,fasteners
9,hex bar a/f 41 a105n,carbonsteelbar,carbonsteelbar


In [108]:
#pred_iction = model_2.predict(X_test_pad[0])
print(X_test_pad[0].shape)
print(X_test_pad.shape)
aaa = X_test_pad[0].reshape(39,1)
print(aaa.shape)

(39,)
(2361, 39)
(39, 1)


In [130]:
print([X_test_pad][0])

[[6873  162 1575 ...    0    0    0]
 [ 206    5   50 ...    0    0    0]
 [ 597   46   82 ...    0    0    0]
 ...
 [   6   92  159 ...    0    0    0]
 [  20    9  136 ...    0    0    0]
 [  68 1409  699 ...    0    0    0]]


In [125]:
print(X_test_pad[1])

[ 206    5   50 7746  840    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


In [134]:
pr = [model_2.predict([X_test_pad][0])][0][0]
aaa = [model_2.predict([X_test_pad][0])][0][1]
print(aaa)

[2.43695065e-13 5.37912215e-11 6.06054513e-14 1.58411409e-11
 3.62898242e-13 3.07999654e-12 1.80659944e-13 7.98073155e-14
 2.48591037e-10 8.95355734e-10 5.61677198e-11 2.49392304e-13
 5.67071000e-12 9.73222963e-16 9.79940296e-11 9.85307036e-09
 7.43214934e-11 6.00585448e-09 2.84825691e-10 1.18157439e-09
 3.32950654e-13 8.60682375e-14 6.53146735e-16 4.27782375e-12
 8.61340166e-12 1.55950621e-12 5.95001002e-12 2.48350971e-12
 7.43745939e-13 3.18562612e-15 1.34125052e-13 5.68245483e-13
 9.99999404e-01 3.40822481e-08 4.58213155e-12 1.57077355e-11
 2.80499108e-07 2.46402183e-13 6.62316957e-09 3.17122935e-14
 5.72866199e-09 1.58481908e-10 2.00847380e-16 3.10599688e-07
 1.40421647e-10 7.88168975e-11 2.69482922e-14 1.05637173e-10
 1.26338556e-14 2.54071585e-13 4.12701096e-13 3.03979135e-08
 3.66830812e-17 4.13453669e-16 5.29142456e-15 1.92841840e-10
 2.86646087e-13 6.45367426e-16 1.17023458e-09 5.59293237e-11
 5.30485248e-12 7.18299443e-13 2.55449429e-11 2.64648465e-14
 1.26343805e-11 7.793420

In [91]:
print(type(X_test_pad[0]))
print(type([X_test_pad][0]))
print('=====================')
print((X_test_pad[0]).shape)
print(([X_test_pad][0]).shape)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(39,)
(2361, 39)


In [135]:
print(pr.shape)
print(pr)
ppr = pr.reshape(1,67)
paaa = aaa.reshape(1,67)

(67,)
[6.4785499e-06 6.2430248e-11 5.0905498e-08 1.4635412e-12 2.6252878e-12
 1.0080431e-11 5.6519306e-10 1.4289529e-06 6.9042760e-10 1.7831733e-10
 2.1181404e-10 1.2298527e-14 1.6803699e-12 1.5196718e-09 9.9246371e-09
 5.9822971e-12 8.0213539e-08 4.2344413e-06 7.4888703e-08 2.1124679e-06
 1.0061122e-12 2.1798749e-09 1.2109756e-13 5.4485174e-14 1.6755372e-12
 1.8148221e-11 7.0184125e-09 4.6026533e-10 6.5030742e-10 2.3398725e-10
 1.1460697e-09 3.3852853e-12 4.0140020e-09 1.0219968e-05 3.0548208e-06
 1.3218934e-13 4.5816320e-10 5.3340515e-10 5.3676023e-16 5.6870773e-13
 8.1808907e-11 5.9328343e-08 1.4899115e-06 5.6519839e-14 5.8173253e-09
 3.7106645e-07 4.4050662e-12 8.2427898e-11 8.2608983e-07 2.7882590e-11
 1.1674641e-09 2.3090813e-14 1.1537603e-11 1.0172010e-09 1.7543336e-13
 3.6769111e-14 3.6106526e-10 6.2531993e-12 5.3707188e-11 7.2918711e-13
 3.3740302e-12 9.9996948e-01 2.1328434e-11 5.9378920e-09 7.5311508e-09
 1.7534629e-14 8.5223384e-11]


In [136]:
#ppp = [encoder.inverse_transform([Y_test_coded][0])][0][0]

ppp = encoder.inverse_transform([ppr][0])[0]
aaap = encoder.inverse_transform([paaa][0])[0]

In [137]:
print(ppp)
print(aaap)

tool
metalsheet


In [None]:
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))


In [180]:
text = ['forged plate 200X170X30 a387 gr 11']

In [183]:
text_tokens = tokenizer_obj_lab.texts_to_sequences(text)
#X_test_tokens = tokenizer_obj_lab.texts_to_sequences(X_test)

text_pad = pad_sequences(text_tokens, maxlen = max_length_un_labelled, padding = 'post')
#X_test_pad = pad_sequences(X_test_tokens, maxlen = max_length_labelled, padding = 'post')


In [184]:
b = [model_2.predict([text_pad][0])][0][0]

In [185]:
bb = b.reshape(1,67)

In [186]:
pred = encoder.inverse_transform([bb][0])[0]

In [187]:
print(pred)

ssforging


In [188]:
pred_un_label = pd.DataFrame(encoder.inverse_transform(model_2.predict(Un_label_pad)))
pred_un_label.columns = ['product_pred_1']
tranzact_data_un_labelled_categorised = pd.concat([tranzact_data_un_labelled, pred_un_label], axis=1)

InvalidArgumentError: indices[12,3] = 9512 is not in [0, 9509)
	 [[Node: embedding_3/embedding_lookup = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@spatial_dropout1d_3/cond/Switch_1"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_3/embeddings/read, embedding_3/Cast, embedding_3/embedding_lookup/axis)]]