In [47]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D, GRU
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer

from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
## Loading Data - labelled data:

data_1 = pd.ExcelFile('RNN_Tranzact_Rohan_1_Regrouped_with_supplier.xlsx')
tranzact_data_1 = data_1.parse(0)

tranzact_data_1.tail()

Unnamed: 0,supplier_item_id,to_company_name,supplier_item_id.1,supplier_item_id.2,product,supplier,Done,Atul Sugg.,sub_type,material,process,grade,spec,brand
13552,86390,CESARE BONETTI INTERNATIONAL PVT. LTD.,86390,"YOKE SLEEVE;8"" # 150;GTV; B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13553,89723,CESARE BONETTI INTERNATIONAL PVT. LTD.,89723,"YOKE SLEEVE 1½"" #2700 BLY;B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13554,89995,CESARE BONETTI INTERNATIONAL PVT. LTD.,89995,"YOKE SLEEVE;4"" #150 GTV; B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13555,117819,WAAREE INDUSTRIES PVT. LTD.,117819,"YOKE SLEEVE;8"" # 150;GTV; A439 D2",cicasting,Waaree Industries Pvt.Ltd.,278.0,,,,,,,
13556,169292,CESARE BONETTI INTERNATIONAL PVT. LTD.,169292,"YOKE SLEEVE; 1"";#1500;CBD;A582 T416",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,,,


In [4]:
## Loading Data - un-labelled data:

data_2 = pd.ExcelFile('14Mar_Gorky.xlsx')
tranzact_data_new_1 = data_2.parse(0)

tranzact_data_new_1.tail()

Unnamed: 0,buyer_item_id,buyer_itemid,buyer_item_name,from_company_id,from_company_name,to_company_id,to_company_name
17763,289052,PLYBAG_10X15,POLYBAG_10X15 INCH_PP,13872,R-Tech Products & Packaging Pvt Ltd,16730,GLORY PACK INDUSTRIES
17764,289639,HANDLENUTM12X5MMXX,"HANDLE NUT 1"" #600,HTB,A194 Gr8",842,CESARE BONETTI INDIA PVT. LTD.,7434,Shah Brothers
17765,48512,RDBR50 (SS431),ROUND BAR OD50 ASTM A276 TYPE 431,114,Entech Controls,921,ALPESH METALS
17766,57696,PGX618101601202200,"PTV 6"" #300 RF BVE WCB HW IBR",7506,CESARE BONETTI INTERNATIONAL PVT. LTD.,842,CESARE BONETTI INDIA PVT. LTD.
17767,274954,PPBOX_ST5,PP_MONOBOX_ST5,13872,R-Tech Products & Packaging Pvt Ltd,15527,Shree Arun Packaging Co. Pvt. Ltd.


In [5]:
## Picking up required info from labelled data:

col = ['to_company_name', 'supplier_item_id.2', 'product', 'supplier']
tranzact_data_2 = tranzact_data_1[col]
tranzact_data_2.columns = ['OEM', 'item', 'product', 'supplier']

## Removing duplicate (item + supplier) combinations :

tranzact_data_2['find_duplicate'] = tranzact_data_2['item'].map(str)+tranzact_data_2['supplier'].map(str)
tranzact_data_2.drop_duplicates(subset='find_duplicate', keep = 'first', inplace = True)
tranzact_data_2 = tranzact_data_2.reset_index(drop=True)

# Creating separate dataframe for Cesare Bonetti International Pvt. Ltd. since these are unlabelled :

tranzact_data_2_cesare_international = tranzact_data_2.loc[tranzact_data_2['OEM'] \
                                                               == 'CESARE BONETTI INTERNATIONAL PVT. LTD.']

tranzact_data_labelled = tranzact_data_2.drop(tranzact_data_2_cesare_international.index)

tranzact_data_2_cesare_international = tranzact_data_2_cesare_international.reset_index(drop=True)
tranzact_data_labelled = tranzact_data_labelled.reset_index(drop=True)

print(len(tranzact_data_2_cesare_international))
        
print(len(tranzact_data_labelled))

tranzact_data_labelled.tail()

1754
11803


Unnamed: 0,OEM,item,product,supplier,find_duplicate
11798,CESARE BONETTI INDIA PVT. LTD.,"YOKE CASTING 12"" #2500 GTV, SA216 WCC, QAP",CS CASTING,AMI Alloys,"YOKE CASTING 12"" #2500 GTV, SA216 WCC, QAPAMI ..."
11799,CESARE BONETTI INDIA PVT. LTD.,"YOKE CAST, 3"" #2500 GTV, SA216 WCC, QAP",CS CASTING,Tulip Casting Pvt. Ltd.,"YOKE CAST, 3"" #2500 GTV, SA216 WCC, QAPTulip C..."
11800,CESARE BONETTI INDIA PVT. LTD.,"YOKE CASTING 12"" #2500 GTV, SA217 C12A, QAP",ALLOY STEEL CASTING,AMI Alloys,"YOKE CASTING 12"" #2500 GTV, SA217 C12A, QAPAMI..."
11801,CESARE BONETTI INDIA PVT. LTD.,"YOKE CAST 10"" #150 TCV; A217WC6",ALLOY STEEL CASTING,RAJ ENGINEERS,"YOKE CAST 10"" #150 TCV; A217WC6RAJ ENGINEERS"
11802,WAAREE INDUSTRIES PVT. LTD.,"YOKE SLEEVE;8"" # 150;GTV; A439 D2",cicasting,Waaree Industries Pvt.Ltd.,"YOKE SLEEVE;8"" # 150;GTV; A439 D2Waaree Indus..."


In [6]:
## Picking up required info from un-labelled data:

col = ['from_company_name', 'buyer_item_name', 'to_company_name']
tranzact_data_new_2 = tranzact_data_new_1[col]
tranzact_data_new_2.columns = ['OEM', 'item', 'supplier']

## Removing duplicate (item + supplier) combinations :

tranzact_data_new_2['find_duplicate'] = tranzact_data_new_2['item'].map(str)+tranzact_data_new_2['supplier'].map(str)
tranzact_data_new_2.drop_duplicates(subset='find_duplicate', keep = 'first', inplace = True)
tranzact_data_un_labelled = tranzact_data_new_2.reset_index(drop=True)

print(len(tranzact_data_un_labelled))
tranzact_data_un_labelled.tail()

17405


Unnamed: 0,OEM,item,supplier,find_duplicate
17400,R-Tech Products & Packaging Pvt Ltd,POLYBAG_10X15 INCH_PP,GLORY PACK INDUSTRIES,POLYBAG_10X15 INCH_PPGLORY PACK INDUSTRIES
17401,CESARE BONETTI INDIA PVT. LTD.,"HANDLE NUT 1"" #600,HTB,A194 Gr8",Shah Brothers,"HANDLE NUT 1"" #600,HTB,A194 Gr8Shah Brothers"
17402,Entech Controls,ROUND BAR OD50 ASTM A276 TYPE 431,ALPESH METALS,ROUND BAR OD50 ASTM A276 TYPE 431ALPESH METALS
17403,CESARE BONETTI INTERNATIONAL PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBR",CESARE BONETTI INDIA PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBRCESARE BONETTI IN..."
17404,R-Tech Products & Packaging Pvt Ltd,PP_MONOBOX_ST5,Shree Arun Packaging Co. Pvt. Ltd.,PP_MONOBOX_ST5Shree Arun Packaging Co. Pvt. Ltd.


In [7]:
## make all in lower case :

tranzact_data_labelled['OEM'] = tranzact_data_labelled['OEM'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_labelled['item'] = tranzact_data_labelled['item'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_labelled['product'] = tranzact_data_labelled['product'].map(lambda x: x if type(x)!=str else x.lower())

tranzact_data_un_labelled['OEM'] = tranzact_data_un_labelled['OEM'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_un_labelled['item'] = tranzact_data_un_labelled['item'].map(lambda x: x if type(x)!=str else x.lower()) 

## remove rows with any cell value = none

tranzact_data_labelled_final = tranzact_data_labelled.dropna()
tranzact_data_un_labelled_final = tranzact_data_un_labelled.dropna()

tranzact_data_labelled_final = tranzact_data_labelled_final.reset_index(drop=True)
tranzact_data_un_labelled_final = tranzact_data_un_labelled_final.reset_index(drop=True)

print(len(tranzact_data_labelled_final))
print(len(tranzact_data_un_labelled_final))

11803
17405


In [56]:
## digitising 'product' and creating a dictionary for labelled data :

tranzact_data_labelled_final['product_id'] = tranzact_data_labelled_final['product'].factorize()[0]

product_id_df = tranzact_data_labelled_final[['product', 'product_id']].drop_duplicates().sort_values('product_id')
product_to_id = dict(product_id_df.values)
id_to_product = dict(product_id_df[['product_id', 'product']].values)

## removing puncuations from items of both labelled and un-labelled data :

pd.options.mode.chained_assignment = None

tranzact_data_labelled_final['item'] = tranzact_data_labelled_final['item'].str.replace(r'\W',' ')
tranzact_data_un_labelled_final['item'] = tranzact_data_un_labelled_final['item'].str.replace(r'\W',' ')

## remove gaps between words to single gap

tranzact_data_labelled_final['item'] = tranzact_data_labelled_final['item'].str.replace(r'\s+',' ')
tranzact_data_un_labelled_final['item'] = tranzact_data_un_labelled_final['item'].str.replace(r'\s+',' ')

print(len(tranzact_data_labelled_final))
print(len(tranzact_data_un_labelled_final))
tranzact_data_un_labelled_final.tail()

11803
17405


Unnamed: 0,OEM,item,supplier,find_duplicate
17400,r-tech products & packaging pvt ltd,polybag_10x15 inch_pp,GLORY PACK INDUSTRIES,POLYBAG_10X15 INCH_PPGLORY PACK INDUSTRIES
17401,cesare bonetti india pvt. ltd.,handle nut 1 600 htb a194 gr8,Shah Brothers,"HANDLE NUT 1"" #600,HTB,A194 Gr8Shah Brothers"
17402,entech controls,round bar od50 astm a276 type 431,ALPESH METALS,ROUND BAR OD50 ASTM A276 TYPE 431ALPESH METALS
17403,cesare bonetti international pvt. ltd.,ptv 6 300 rf bve wcb hw ibr,CESARE BONETTI INDIA PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBRCESARE BONETTI IN..."
17404,r-tech products & packaging pvt ltd,pp_monobox_st5,Shree Arun Packaging Co. Pvt. Ltd.,PP_MONOBOX_ST5Shree Arun Packaging Co. Pvt. Ltd.


In [None]:
## DO NOT USE THIS STEP
# get stop words from nltk
stopWords = stopwords.words('english')

# pre processing data
def cleanData(sentence):
    processedList = ""
    
    # convert to lowercase, ignore all special characters - keep only alpha-numericals and spaces (not removing full-stop here)
    sentence = re.sub(r'[^A-Za-z0-9\s.]',r'',str(sentence).lower())
    sentence = re.sub(r'\n',r' ',sentence)
    
    # remove stop words
    sentence = " ".join([word for word in sentence.split() if word not in stopWords])
    
    return sentence


In [None]:
## DO NOT USE THIS STEP
# clean data of one example:
cleanData(data['Description'][2]) ## cleaning of dataframe 'data' column 'Description' index no. '2'

# clean data for whole dataframe
data['Description'] = data['Description'].map(lambda x: cleanData(x))

In [57]:
total_item_labelled_unlabelled = pd.DataFrame(pd.concat([tranzact_data_labelled_final['item'], \
                                            tranzact_data_un_labelled_final['item']]))
total_item_labelled_unlabelled.head()

Unnamed: 0,item
0,new cylinder 404 gas
1,accumulator model a as 5126 3 4 connection eme...
2,accumulator flokool 3 4 400 psi fksa596
3,accumulator 1 5 8 connection
4,accumulator 7 8


In [58]:
print(len(total_item_labelled_unlabelled))
total_item_labelled_unlabelled.tail()

29208


Unnamed: 0,item
17400,polybag_10x15 inch_pp
17401,handle nut 1 600 htb a194 gr8
17402,round bar od50 astm a276 type 431
17403,ptv 6 300 rf bve wcb hw ibr
17404,pp_monobox_st5


In [59]:
total_item_labelled_unlabelled = total_item_labelled_unlabelled.reset_index(drop=True)
total_item_labelled_unlabelled.tail()

Unnamed: 0,item
29203,polybag_10x15 inch_pp
29204,handle nut 1 600 htb a194 gr8
29205,round bar od50 astm a276 type 431
29206,ptv 6 300 rf bve wcb hw ibr
29207,pp_monobox_st5


In [11]:
from gensim.models import Word2Vec
from nltk import word_tokenize

In [27]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/scar3crow/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [60]:
total_item_labelled_unlabelled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29208 entries, 0 to 29207
Data columns (total 1 columns):
item    29208 non-null object
dtypes: object(1)
memory usage: 228.3+ KB


In [61]:
tmp_corpus = total_item_labelled_unlabelled['item'].map(lambda x: x.split('.'))

In [62]:
print(tmp_corpus)

0                                   [new cylinder 404 gas]
1        [accumulator model a as 5126 3 4 connection em...
2                [accumulator flokool 3 4 400 psi fksa596]
3                           [accumulator 1 5 8 connection]
4                                       [accumulator 7 8 ]
5                                      [accumulator 13 8 ]
6                [accumulator flokool 5 8 400 psi fksa596]
7                                      [accumulator 1 1 8]
8                                      [accumulator 2 1 8]
9                                    [accumulator 5216 7s]
10                                        [buthyl acetate]
11                                         [ethyl acetate]
12                                 [npac n proyl acetate ]
13                                      [pma p m acetate ]
14         [flashback arrester acetylene cylinder mounted]
15                                [acetylene cylinder 4kg]
16                                         [acetylene ga

In [53]:
from tqdm import tqdm

In [63]:
print(len(tmp_corpus))
print(tmp_corpus[29207])

29208
['pp_monobox_st5']


In [109]:
corpus = []
for i in tqdm(range(len(tmp_corpus))):
    for line in tmp_corpus[i]:
        words = [x for x in line.split()]
        corpus.append(words)

100%|██████████| 29208/29208 [00:00<00:00, 49972.23it/s]


In [110]:
print(corpus)



In [111]:
num_of_items = len(corpus)
num_of_words = 0
for line in corpus:
    num_of_words += len(line)

print('Num of sentences - %s'%(num_of_items))
print('Num of words - %s'%(num_of_words))

Num of sentences - 29208
Num of words - 216958


In [112]:
emb_dim = 150

model_1 = Word2Vec(corpus, size = emb_dim, window=5, \
               min_count=1, negative = 15, iter = 10, workers = 10, sg=1)

In [113]:
print(corpus[0])
print(corpus[29207])
print(len(corpus[0]))
print(len(corpus))

['new', 'cylinder', '404', 'gas']
['pp_monobox_st5']
4
29208


In [114]:
print(model_1)
aa = model_1.wv[corpus[0]]
bb = model_1.wv[corpus[29207]]
print(aa.shape)
print(bb.shape)
print(type(aa))

max_length = 0

for i in range(len(corpus)):
    
    if len(corpus[i]) > max_length:
        max_length = len(corpus[i])
        
    if i == len(corpus) - 1:
        break
    
print(max_length)    

Word2Vec(vocab=14883, size=150, alpha=0.025)
(4, 150)
(1, 150)
<class 'numpy.ndarray'>
121


In [98]:
print(aa)

[[ 2.01582864e-01  3.29994783e-02 -8.27533230e-02 -2.60685593e-01
  -1.77499413e-01  3.12090725e-01  5.59300631e-02  1.57511726e-01
   1.12139076e-01  6.82047829e-02  6.18606508e-02  1.37901574e-01
   7.27616474e-02 -1.83106914e-01 -5.17768443e-01  1.43430337e-01
  -1.76809609e-01 -7.67772421e-02 -2.10455596e-01  7.38934353e-02
   2.94795871e-01 -7.42435828e-02 -1.64191857e-01  1.33895576e-01
   7.86684826e-02 -3.76487412e-02 -2.75520921e-01 -2.62564123e-01
   1.94094647e-02  3.71732444e-01 -3.04807216e-01  4.60452810e-02
   3.73008698e-01 -2.77121425e-01  2.87072003e-01  3.20924461e-01
  -2.79400814e-02  3.80122244e-01 -4.94771183e-01 -3.07645619e-01
  -1.92513674e-01  1.03633970e-01  2.22773343e-01 -2.22777501e-01
   6.84149802e-01  9.27897394e-01  3.94415250e-03  5.27806953e-02
   1.76104587e-02 -7.69935668e-01  9.74345803e-02 -3.10029924e-01
  -5.97708404e-01  1.69088960e-01  2.36867264e-01 -4.83311474e-01
   2.51714975e-01 -2.28264123e-01  1.49616569e-01  1.90688953e-01
  -3.62183

In [115]:
len(model_1.wv.vocab)

14883

In [101]:
tokenizer_obj = Tokenizer()
tokenizer_obj_1 = Tokenizer()

tokenizer_obj.fit_on_texts(total_item_labelled_unlabelled['item'])
tokenizer_obj_1.fit_on_texts(tranzact_data_labelled_final['item'])

## pad sequences :

max_length = max([len(s.split()) for s in total_item_labelled_unlabelled['item']])
min_length = min([len(s.split()) for s in total_item_labelled_unlabelled['item']])

## define vocabulary size :

vocab_size = len(tokenizer_obj.word_index) + 1
vocab_size_1 = len(tokenizer_obj_1.word_index) + 1

total_item_tokens = tokenizer_obj.texts_to_sequences(total_item_labelled_unlabelled['item'])
total_item_tokens_1 = tokenizer_obj_1.texts_to_sequences(tranzact_data_labelled_final['item'])

total_item_pad = pad_sequences(total_item_tokens, maxlen = max_length, padding = 'post')
total_item_pad_1 = pad_sequences(total_item_tokens_1, maxlen = max_length, padding = 'post')

In [102]:
print(type(total_item_pad))

print(total_item_pad.shape)
print(total_item_pad_1.shape)

print(len(total_item_tokens))

print(max_length)
print(min_length)

print(vocab_size)
print(vocab_size_1)

<class 'numpy.ndarray'>
(29208, 121)
(11803, 121)
29208
121
1
14707
7949


In [103]:
print(total_item_pad[29207])

[ 1102  3545 14706     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0]


In [126]:
from keras.models import Sequential

Embedding_Dim = 150

## build model :

model = Sequential()
model.add(Embedding(vocab_size, Embedding_Dim, input_length=max_length))

model.compile(optimizer='adam', loss='categorical_crossentropy')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 121, 150)          2206050   
Total params: 2,206,050
Trainable params: 2,206,050
Non-trainable params: 0
_________________________________________________________________


In [None]:
aaa = np.zeros((29208, 121, 150))

In [128]:
model.fit(total_item_pad, aaa, epochs=1)
          

Epoch 1/1


<keras.callbacks.History at 0x7f68a8a6dc50>

In [None]:
bbb = aaa.reshape(530125200,1)
print(max(bbb))

In [123]:
output_matrix = model.predict(total_item_pad)
output_matrix_1 = model.predict(total_item_pad_1)

In [124]:
print(len(output_matrix))
print(output_matrix.shape)
print(output_matrix_1.shape)
print('==================')
print(output_matrix[0])
print('==================')
print(output_matrix_1[0])

29208
(29208, 121, 150)
(11803, 121, 150)
[[-0.00062995  0.01594123 -0.04419523 ... -0.00966673  0.00883427
   0.03486185]
 [-0.01585352  0.00786423 -0.01931726 ... -0.02444116 -0.03071383
   0.04167284]
 [-0.02429928  0.01728732  0.04718467 ... -0.03828233 -0.02419143
   0.01099249]
 ...
 [ 0.00860514 -0.01156187  0.00366433 ... -0.04500468  0.01373686
   0.0366299 ]
 [ 0.00860514 -0.01156187  0.00366433 ... -0.04500468  0.01373686
   0.0366299 ]
 [ 0.00860514 -0.01156187  0.00366433 ... -0.04500468  0.01373686
   0.0366299 ]]
[[ 0.02512541  0.03257792  0.00572807 ...  0.01204318 -0.04263965
   0.03596118]
 [ 0.04674358 -0.03334226  0.00341275 ... -0.02666605 -0.02241912
   0.04187414]
 [ 0.00167211  0.00179468  0.01721409 ...  0.01829297 -0.04038652
   0.0307025 ]
 ...
 [ 0.00860514 -0.01156187  0.00366433 ... -0.04500468  0.01373686
   0.0366299 ]
 [ 0.00860514 -0.01156187  0.00366433 ... -0.04500468  0.01373686
   0.0366299 ]
 [ 0.00860514 -0.01156187  0.00366433 ... -0.04500468  0

In [23]:
aa = output_matrix[0:11802]
print(aa.shape)

(11802, 104, 150)


In [None]:
## Total no. of unique words = vocab_size = 14707
## each word has got 150 parameters as per Embedding Dimension defined.
## Hence total Param # = 14707 * 150 = 2206050


In [129]:
model_2 = Sequential()
model_2.add(Embedding(vocab_size, Embedding_Dim, weights=aaa,\
                    input_length=max_length, trainable=False))
model_2.add(SpatialDropout1D(0.2))
#model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model_2.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model_2.add(Dense(256, activation='relu'))
model_2.add(Dense(71, activation='softmax'))
model_2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

model_2.summary()


ValueError: You called `set_weights(weights)` on layer "embedding_6" with a  weight list of length 29208, but the layer was expecting 1 weights. Provided weights: [[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0....