In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D, GRU
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer

from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline


Using TensorFlow backend.


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [33]:
## Loading Data - labelled data:

data_1 = pd.ExcelFile('RNN_Tranzact_Rohan_1_Regrouped_with_supplier.xlsx')
tranzact_data_1 = data_1.parse(0)

tranzact_data_1.tail()

Unnamed: 0,supplier_item_id,to_company_name,supplier_item_id.1,supplier_item_id.2,product,supplier,Done,Atul Sugg.,sub_type,material,process,grade,spec,brand
13552,86390,CESARE BONETTI INTERNATIONAL PVT. LTD.,86390,"YOKE SLEEVE;8"" # 150;GTV; B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13553,89723,CESARE BONETTI INTERNATIONAL PVT. LTD.,89723,"YOKE SLEEVE 1½"" #2700 BLY;B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13554,89995,CESARE BONETTI INTERNATIONAL PVT. LTD.,89995,"YOKE SLEEVE;4"" #150 GTV; B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13555,117819,WAAREE INDUSTRIES PVT. LTD.,117819,"YOKE SLEEVE;8"" # 150;GTV; A439 D2",cicasting,Waaree Industries Pvt.Ltd.,278.0,,,,,,,
13556,169292,CESARE BONETTI INTERNATIONAL PVT. LTD.,169292,"YOKE SLEEVE; 1"";#1500;CBD;A582 T416",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,,,


In [34]:
## Loading Data - un-labelled data:

data_2 = pd.ExcelFile('14Mar_Gorky.xlsx')
tranzact_data_new_1 = data_2.parse(0)

tranzact_data_new_1.tail()

Unnamed: 0,buyer_item_id,buyer_itemid,buyer_item_name,from_company_id,from_company_name,to_company_id,to_company_name
17763,289052,PLYBAG_10X15,POLYBAG_10X15 INCH_PP,13872,R-Tech Products & Packaging Pvt Ltd,16730,GLORY PACK INDUSTRIES
17764,289639,HANDLENUTM12X5MMXX,"HANDLE NUT 1"" #600,HTB,A194 Gr8",842,CESARE BONETTI INDIA PVT. LTD.,7434,Shah Brothers
17765,48512,RDBR50 (SS431),ROUND BAR OD50 ASTM A276 TYPE 431,114,Entech Controls,921,ALPESH METALS
17766,57696,PGX618101601202200,"PTV 6"" #300 RF BVE WCB HW IBR",7506,CESARE BONETTI INTERNATIONAL PVT. LTD.,842,CESARE BONETTI INDIA PVT. LTD.
17767,274954,PPBOX_ST5,PP_MONOBOX_ST5,13872,R-Tech Products & Packaging Pvt Ltd,15527,Shree Arun Packaging Co. Pvt. Ltd.


In [35]:
## Picking up required info from labelled data:

col = ['to_company_name', 'supplier_item_id.2', 'product', 'supplier']
tranzact_data_2 = tranzact_data_1[col]
tranzact_data_2.columns = ['OEM', 'item', 'product', 'supplier']

## Removing duplicate (item + supplier) combinations :

tranzact_data_2['find_duplicate'] = tranzact_data_2['item'].map(str)+tranzact_data_2['supplier'].map(str)
tranzact_data_2.drop_duplicates(subset='find_duplicate', keep = 'first', inplace = True)
tranzact_data_2 = tranzact_data_2.reset_index(drop=True)

# Creating separate dataframe for Cesare Bonetti International Pvt. Ltd. since these are unlabelled :

tranzact_data_2_cesare_international = tranzact_data_2.loc[tranzact_data_2['OEM'] \
                                                               == 'CESARE BONETTI INTERNATIONAL PVT. LTD.']

tranzact_data_labelled = tranzact_data_2.drop(tranzact_data_2_cesare_international.index)

tranzact_data_2_cesare_international = tranzact_data_2_cesare_international.reset_index(drop=True)
tranzact_data_labelled = tranzact_data_labelled.reset_index(drop=True)

print(len(tranzact_data_2_cesare_international))
        
print(len(tranzact_data_labelled))

tranzact_data_labelled.tail()

1754
11803


Unnamed: 0,OEM,item,product,supplier,find_duplicate
11798,CESARE BONETTI INDIA PVT. LTD.,"YOKE CASTING 12"" #2500 GTV, SA216 WCC, QAP",CS CASTING,AMI Alloys,"YOKE CASTING 12"" #2500 GTV, SA216 WCC, QAPAMI ..."
11799,CESARE BONETTI INDIA PVT. LTD.,"YOKE CAST, 3"" #2500 GTV, SA216 WCC, QAP",CS CASTING,Tulip Casting Pvt. Ltd.,"YOKE CAST, 3"" #2500 GTV, SA216 WCC, QAPTulip C..."
11800,CESARE BONETTI INDIA PVT. LTD.,"YOKE CASTING 12"" #2500 GTV, SA217 C12A, QAP",ALLOY STEEL CASTING,AMI Alloys,"YOKE CASTING 12"" #2500 GTV, SA217 C12A, QAPAMI..."
11801,CESARE BONETTI INDIA PVT. LTD.,"YOKE CAST 10"" #150 TCV; A217WC6",ALLOY STEEL CASTING,RAJ ENGINEERS,"YOKE CAST 10"" #150 TCV; A217WC6RAJ ENGINEERS"
11802,WAAREE INDUSTRIES PVT. LTD.,"YOKE SLEEVE;8"" # 150;GTV; A439 D2",cicasting,Waaree Industries Pvt.Ltd.,"YOKE SLEEVE;8"" # 150;GTV; A439 D2Waaree Indus..."


In [36]:
## Picking up required info from un-labelled data:

col = ['from_company_name', 'buyer_item_name', 'to_company_name']
tranzact_data_new_2 = tranzact_data_new_1[col]
tranzact_data_new_2.columns = ['OEM', 'item', 'supplier']

## Removing duplicate (item + supplier) combinations :

tranzact_data_new_2['find_duplicate'] = tranzact_data_new_2['item'].map(str)+tranzact_data_new_2['supplier'].map(str)
tranzact_data_new_2.drop_duplicates(subset='find_duplicate', keep = 'first', inplace = True)
tranzact_data_un_labelled = tranzact_data_new_2.reset_index(drop=True)

print(len(tranzact_data_un_labelled))
tranzact_data_un_labelled.tail()

17405


Unnamed: 0,OEM,item,supplier,find_duplicate
17400,R-Tech Products & Packaging Pvt Ltd,POLYBAG_10X15 INCH_PP,GLORY PACK INDUSTRIES,POLYBAG_10X15 INCH_PPGLORY PACK INDUSTRIES
17401,CESARE BONETTI INDIA PVT. LTD.,"HANDLE NUT 1"" #600,HTB,A194 Gr8",Shah Brothers,"HANDLE NUT 1"" #600,HTB,A194 Gr8Shah Brothers"
17402,Entech Controls,ROUND BAR OD50 ASTM A276 TYPE 431,ALPESH METALS,ROUND BAR OD50 ASTM A276 TYPE 431ALPESH METALS
17403,CESARE BONETTI INTERNATIONAL PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBR",CESARE BONETTI INDIA PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBRCESARE BONETTI IN..."
17404,R-Tech Products & Packaging Pvt Ltd,PP_MONOBOX_ST5,Shree Arun Packaging Co. Pvt. Ltd.,PP_MONOBOX_ST5Shree Arun Packaging Co. Pvt. Ltd.


In [37]:
## make all in lower case :

tranzact_data_labelled['OEM'] = tranzact_data_labelled['OEM'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_labelled['item'] = tranzact_data_labelled['item'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_labelled['product'] = tranzact_data_labelled['product'].map(lambda x: x if type(x)!=str else x.lower())

tranzact_data_un_labelled['OEM'] = tranzact_data_un_labelled['OEM'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_un_labelled['item'] = tranzact_data_un_labelled['item'].map(lambda x: x if type(x)!=str else x.lower()) 

## remove rows with any cell value = none

tranzact_data_labelled_final = tranzact_data_labelled.dropna()
tranzact_data_un_labelled_final = tranzact_data_un_labelled.dropna()

tranzact_data_labelled_final = tranzact_data_labelled_final.reset_index(drop=True)
tranzact_data_un_labelled_final = tranzact_data_un_labelled_final.reset_index(drop=True)

print(len(tranzact_data_labelled_final))
print(len(tranzact_data_un_labelled_final))

11803
17405


In [50]:
## digitising 'product' and creating a dictionary for labelled data :

tranzact_data_labelled_final['product_id'] = tranzact_data_labelled_final['product'].factorize()[0]

product_id_df = tranzact_data_labelled_final[['product', 'product_id']].drop_duplicates().sort_values('product_id')
product_to_id = dict(product_id_df.values)
id_to_product = dict(product_id_df[['product_id', 'product']].values)

## removing puncuations from items of both labelled and un-labelled data :

pd.options.mode.chained_assignment = None

tranzact_data_labelled_final['item'] = tranzact_data_labelled_final['item'].str.replace(r'\W',' ')
tranzact_data_un_labelled_final['item'] = tranzact_data_un_labelled_final['item'].str.replace(r'\W',' ')

## remove gaps between words to single gap

tranzact_data_labelled_final['item'] = tranzact_data_labelled_final['item'].str.replace(r'\s+',' ')
tranzact_data_un_labelled_final['item'] = tranzact_data_un_labelled_final['item'].str.replace(r'\s+',' ')

print(len(tranzact_data_labelled_final))
print(len(tranzact_data_un_labelled_final))
tranzact_data_un_labelled_final.tail()

11803
17405


Unnamed: 0,OEM,item,supplier,find_duplicate
17400,r-tech products & packaging pvt ltd,polybag_10x15 inch_pp,GLORY PACK INDUSTRIES,POLYBAG_10X15 INCH_PPGLORY PACK INDUSTRIES
17401,cesare bonetti india pvt. ltd.,handle nut 1 600 htb a194 gr8,Shah Brothers,"HANDLE NUT 1"" #600,HTB,A194 Gr8Shah Brothers"
17402,entech controls,round bar od50 astm a276 type 431,ALPESH METALS,ROUND BAR OD50 ASTM A276 TYPE 431ALPESH METALS
17403,cesare bonetti international pvt. ltd.,ptv 6 300 rf bve wcb hw ibr,CESARE BONETTI INDIA PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBRCESARE BONETTI IN..."
17404,r-tech products & packaging pvt ltd,pp_monobox_st5,Shree Arun Packaging Co. Pvt. Ltd.,PP_MONOBOX_ST5Shree Arun Packaging Co. Pvt. Ltd.


In [51]:
total_item_labelled_unlabelled = pd.concat([tranzact_data_labelled_final['item'], \
                                            tranzact_data_un_labelled_final['item']])
total_item_labelled_unlabelled.head()

0                                 new cylinder 404 gas
1    accumulator model a as 5126 3 4 connection eme...
2              accumulator flokool 3 4 400 psi fksa596
3                         accumulator 1 5 8 connection
4                                     accumulator 7 8 
Name: item, dtype: object

In [52]:
print(len(total_item_labelled_unlabelled))
total_item_labelled_unlabelled.tail()

29208


17400                polybag_10x15 inch_pp
17401        handle nut 1 600 htb a194 gr8
17402    round bar od50 astm a276 type 431
17403          ptv 6 300 rf bve wcb hw ibr
17404                       pp_monobox_st5
Name: item, dtype: object

In [53]:
total_item_labelled_unlabelled = total_item_labelled_unlabelled.reset_index(drop=True)
total_item_labelled_unlabelled.tail()

29203                polybag_10x15 inch_pp
29204        handle nut 1 600 htb a194 gr8
29205    round bar od50 astm a276 type 431
29206          ptv 6 300 rf bve wcb hw ibr
29207                       pp_monobox_st5
Name: item, dtype: object

In [11]:
from gensim.models import Word2Vec
from nltk import word_tokenize

In [12]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/scar3crow/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [134]:
#tilu_list = total_item_labelled_unlabelled.tolist()
tilu_list = [word_tokenize(s) for s in total_item_labelled_unlabelled[0].split()]

print(type(tilu_list))
print(len(tilu_list[0]))
print(tilu_list)
print([total_item_labelled_unlabelled[0]])
print(type(total_item_labelled_unlabelled))
print(type(total_item_labelled_unlabelled[0]))
print(type([total_item_labelled_unlabelled]))
print(type([total_item_labelled_unlabelled[0]]))

<class 'list'>
1
[['new'], ['cylinder'], ['404'], ['gas']]
['new cylinder 404 gas']
<class 'pandas.core.series.Series'>
<class 'str'>
<class 'list'>
<class 'list'>


In [89]:
emb_dim = 150

model = Word2Vec([total_item_labelled_unlabelled[0:]], size = emb_dim, window=10, \
               min_count=0, negative = 15, iter = 10, workers = 10, sg=1)

In [90]:
word_vectors = model.wv
print(type(word_vectors))
print(type(model))
print(model)

<class 'gensim.models.keyedvectors.Word2VecKeyedVectors'>
<class 'gensim.models.word2vec.Word2Vec'>
Word2Vec(vocab=15, size=150, alpha=0.025)


In [46]:
print(word_vectors)

<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7f5e22162ba8>


In [54]:
tokenizer_obj = Tokenizer()
tokenizer_obj_1 = Tokenizer()

tokenizer_obj.fit_on_texts(total_item_labelled_unlabelled)
tokenizer_obj_1.fit_on_texts(tranzact_data_labelled_final['item'])

## pad sequences :

max_length = max([len(s.split()) for s in total_item_labelled_unlabelled])
min_length = min([len(s.split()) for s in total_item_labelled_unlabelled])

## define vocabulary size :

vocab_size = len(tokenizer_obj.word_index) + 1
vocab_size_1 = len(tokenizer_obj_1.word_index) + 1

total_item_tokens = tokenizer_obj.texts_to_sequences(total_item_labelled_unlabelled)
total_item_tokens_1 = tokenizer_obj_1.texts_to_sequences(tranzact_data_labelled_final['item'])

total_item_pad = pad_sequences(total_item_tokens, maxlen = max_length, padding = 'post')
total_item_pad_1 = pad_sequences(total_item_tokens_1, maxlen = max_length, padding = 'post')

Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x7fb6331ba1d0>>
Traceback (most recent call last):
  File "/home/scar3crow/Dropbox/WorkStation-Subrata/python/venv1/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1398, in __del__
    self._session._session, self._handle, status)
  File "/home/scar3crow/Dropbox/WorkStation-Subrata/python/venv1/lib/python3.5/site-packages/tensorflow/python/framework/errors_impl.py", line 519, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: No such callable handle: 93892200179424


In [58]:
print(total_item_pad.shape)
print(total_item_pad_1.shape)

print(len(total_item_tokens))

print(max_length)
print(min_length)

print(vocab_size)
print(vocab_size_1)

(29208, 121)
(11803, 121)
29208
121
1
14707
7949


In [67]:
print(total_item_pad[29207])

[ 1102  3545 14706     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0]


In [59]:
from keras.models import Sequential

Embedding_Dim = 150

## build model :

model = Sequential()
model.add(Embedding(vocab_size, Embedding_Dim, input_length=max_length))

model.compile(optimizer='adam', loss='categorical_crossentropy')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 121, 150)          2206050   
Total params: 2,206,050
Trainable params: 2,206,050
Non-trainable params: 0
_________________________________________________________________


In [57]:
model.fit(total_item_pad)
          

IndexError: list index out of range

In [44]:
output_matrix = model.predict(total_item_pad)
output_matrix_1 = model.predict(total_item_pad_1)

In [47]:
print(len(output_matrix))
print(output_matrix.shape)
print(output_matrix_1.shape)
print('==================')
print(output_matrix[0])
print('==================')
print(output_matrix_1[0])

29208
(29208, 104, 150)
(11803, 104, 150)
[[ 0.03827683 -0.04964966  0.00533981 ... -0.00938513 -0.01848874
  -0.02171257]
 [ 0.01085696 -0.01500381  0.01200251 ...  0.04773391 -0.03894383
  -0.04329807]
 [-0.02027301 -0.03686506  0.00674832 ... -0.02870193  0.04623935
   0.00291699]
 ...
 [ 0.01659502 -0.00168975 -0.04770852 ...  0.01144811  0.013912
   0.00347376]
 [ 0.01659502 -0.00168975 -0.04770852 ...  0.01144811  0.013912
   0.00347376]
 [ 0.01659502 -0.00168975 -0.04770852 ...  0.01144811  0.013912
   0.00347376]]
[[ 0.0159557   0.01924546 -0.02351296 ... -0.00765111  0.00220709
  -0.04442221]
 [ 0.02946129 -0.03468596 -0.04777217 ...  0.00052925  0.01708217
   0.0480486 ]
 [-0.0243927  -0.0021177   0.04142804 ...  0.03606439 -0.03773411
   0.01185968]
 ...
 [ 0.01659502 -0.00168975 -0.04770852 ...  0.01144811  0.013912
   0.00347376]
 [ 0.01659502 -0.00168975 -0.04770852 ...  0.01144811  0.013912
   0.00347376]
 [ 0.01659502 -0.00168975 -0.04770852 ...  0.01144811  0.013912
  

In [23]:
aa = output_matrix[0:11802]
print(aa.shape)

(11802, 104, 150)


In [None]:
## Total no. of unique words = vocab_size = 14707
## each word has got 150 parameters as per Embedding Dimension defined.
## Hence total Param # = 14707 * 150 = 2206050


In [30]:
model = Sequential()
model.add(Embedding(vocab_size, Embedding_Dim, weights=[output_matrix],\
                    input_length=max_length, trainable=False))
model.add(SpatialDropout1D(0.2))
#model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(256, activation='relu'))
model.add(Dense(71, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

model.summary()


ValueError: Layer weight shape (14904, 150) not compatible with provided weight shape (29208, 104, 150)

In [77]:
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
             ['this', 'is', 'the', 'second', 'sentence'],
             ['yet', 'another', 'sentence'],
             ['one', 'more', 'sentence'],
             ['and', 'the', 'final', 'sentence']]


In [78]:
print(type(sentences))

<class 'list'>


In [82]:
print(len(sentences))

5


In [87]:
print(type(total_item_labelled_unlabelled[0]))
print(type([total_item_labelled_unlabelled[0]]))

<class 'str'>
<class 'list'>
