In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D, GRU
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer

from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline


Using TensorFlow backend.


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
## Loading Data - labelled data:

data_1 = pd.ExcelFile('RNN_Tranzact_Rohan_1_Regrouped_with_supplier.xlsx')
tranzact_data_1 = data_1.parse(0)

tranzact_data_1.tail()

Unnamed: 0,supplier_item_id,to_company_name,supplier_item_id.1,supplier_item_id.2,product,supplier,Done,Atul Sugg.,sub_type,material,process,grade,spec,brand
13552,86390,CESARE BONETTI INTERNATIONAL PVT. LTD.,86390,"YOKE SLEEVE;8"" # 150;GTV; B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13553,89723,CESARE BONETTI INTERNATIONAL PVT. LTD.,89723,"YOKE SLEEVE 1½"" #2700 BLY;B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13554,89995,CESARE BONETTI INTERNATIONAL PVT. LTD.,89995,"YOKE SLEEVE;4"" #150 GTV; B150 C62300",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,C62300,,
13555,117819,WAAREE INDUSTRIES PVT. LTD.,117819,"YOKE SLEEVE;8"" # 150;GTV; A439 D2",cicasting,Waaree Industries Pvt.Ltd.,278.0,,,,,,,
13556,169292,CESARE BONETTI INTERNATIONAL PVT. LTD.,169292,"YOKE SLEEVE; 1"";#1500;CBD;A582 T416",YOKE SLEEVE,CESARE BONETTI INDIA PVT. LTD.,278.0,,,,,,,


In [4]:
## Loading Data - un-labelled data:

data_2 = pd.ExcelFile('14Mar_Gorky.xlsx')
tranzact_data_new_1 = data_2.parse(0)

tranzact_data_new_1.tail()

Unnamed: 0,buyer_item_id,buyer_itemid,buyer_item_name,from_company_id,from_company_name,to_company_id,to_company_name
17763,289052,PLYBAG_10X15,POLYBAG_10X15 INCH_PP,13872,R-Tech Products & Packaging Pvt Ltd,16730,GLORY PACK INDUSTRIES
17764,289639,HANDLENUTM12X5MMXX,"HANDLE NUT 1"" #600,HTB,A194 Gr8",842,CESARE BONETTI INDIA PVT. LTD.,7434,Shah Brothers
17765,48512,RDBR50 (SS431),ROUND BAR OD50 ASTM A276 TYPE 431,114,Entech Controls,921,ALPESH METALS
17766,57696,PGX618101601202200,"PTV 6"" #300 RF BVE WCB HW IBR",7506,CESARE BONETTI INTERNATIONAL PVT. LTD.,842,CESARE BONETTI INDIA PVT. LTD.
17767,274954,PPBOX_ST5,PP_MONOBOX_ST5,13872,R-Tech Products & Packaging Pvt Ltd,15527,Shree Arun Packaging Co. Pvt. Ltd.


In [5]:
## Picking up required info from labelled data:

col = ['to_company_name', 'supplier_item_id.2', 'product', 'supplier']
tranzact_data_2 = tranzact_data_1[col]
tranzact_data_2.columns = ['OEM', 'item', 'product', 'supplier']

## Removing duplicate (item + supplier) combinations :

tranzact_data_2['find_duplicate'] = tranzact_data_2['item'].map(str)+tranzact_data_2['supplier'].map(str)
tranzact_data_2.drop_duplicates(subset='find_duplicate', keep = 'first', inplace = True)
tranzact_data_2 = tranzact_data_2.reset_index(drop=True)

# Creating separate dataframe for Cesare Bonetti International Pvt. Ltd. since these are unlabelled :

tranzact_data_2_cesare_international = tranzact_data_2.loc[tranzact_data_2['OEM'] \
                                                               == 'CESARE BONETTI INTERNATIONAL PVT. LTD.']

tranzact_data_labelled = tranzact_data_2.drop(tranzact_data_2_cesare_international.index)

tranzact_data_2_cesare_international = tranzact_data_2_cesare_international.reset_index(drop=True)
tranzact_data_labelled = tranzact_data_labelled.reset_index(drop=True)

print(len(tranzact_data_2_cesare_international))
        
print(len(tranzact_data_labelled))

tranzact_data_labelled.tail()

1754
11803


Unnamed: 0,OEM,item,product,supplier,find_duplicate
11798,CESARE BONETTI INDIA PVT. LTD.,"YOKE CASTING 12"" #2500 GTV, SA216 WCC, QAP",CS CASTING,AMI Alloys,"YOKE CASTING 12"" #2500 GTV, SA216 WCC, QAPAMI ..."
11799,CESARE BONETTI INDIA PVT. LTD.,"YOKE CAST, 3"" #2500 GTV, SA216 WCC, QAP",CS CASTING,Tulip Casting Pvt. Ltd.,"YOKE CAST, 3"" #2500 GTV, SA216 WCC, QAPTulip C..."
11800,CESARE BONETTI INDIA PVT. LTD.,"YOKE CASTING 12"" #2500 GTV, SA217 C12A, QAP",ALLOY STEEL CASTING,AMI Alloys,"YOKE CASTING 12"" #2500 GTV, SA217 C12A, QAPAMI..."
11801,CESARE BONETTI INDIA PVT. LTD.,"YOKE CAST 10"" #150 TCV; A217WC6",ALLOY STEEL CASTING,RAJ ENGINEERS,"YOKE CAST 10"" #150 TCV; A217WC6RAJ ENGINEERS"
11802,WAAREE INDUSTRIES PVT. LTD.,"YOKE SLEEVE;8"" # 150;GTV; A439 D2",cicasting,Waaree Industries Pvt.Ltd.,"YOKE SLEEVE;8"" # 150;GTV; A439 D2Waaree Indus..."


In [6]:
## Picking up required info from un-labelled data:

col = ['from_company_name', 'buyer_item_name', 'to_company_name']
tranzact_data_new_2 = tranzact_data_new_1[col]
tranzact_data_new_2.columns = ['OEM', 'item', 'supplier']

## Removing duplicate (item + supplier) combinations :

tranzact_data_new_2['find_duplicate'] = tranzact_data_new_2['item'].map(str)+tranzact_data_new_2['supplier'].map(str)
tranzact_data_new_2.drop_duplicates(subset='find_duplicate', keep = 'first', inplace = True)
tranzact_data_un_labelled = tranzact_data_new_2.reset_index(drop=True)

print(len(tranzact_data_un_labelled))
tranzact_data_un_labelled.tail()

17405


Unnamed: 0,OEM,item,supplier,find_duplicate
17400,R-Tech Products & Packaging Pvt Ltd,POLYBAG_10X15 INCH_PP,GLORY PACK INDUSTRIES,POLYBAG_10X15 INCH_PPGLORY PACK INDUSTRIES
17401,CESARE BONETTI INDIA PVT. LTD.,"HANDLE NUT 1"" #600,HTB,A194 Gr8",Shah Brothers,"HANDLE NUT 1"" #600,HTB,A194 Gr8Shah Brothers"
17402,Entech Controls,ROUND BAR OD50 ASTM A276 TYPE 431,ALPESH METALS,ROUND BAR OD50 ASTM A276 TYPE 431ALPESH METALS
17403,CESARE BONETTI INTERNATIONAL PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBR",CESARE BONETTI INDIA PVT. LTD.,"PTV 6"" #300 RF BVE WCB HW IBRCESARE BONETTI IN..."
17404,R-Tech Products & Packaging Pvt Ltd,PP_MONOBOX_ST5,Shree Arun Packaging Co. Pvt. Ltd.,PP_MONOBOX_ST5Shree Arun Packaging Co. Pvt. Ltd.


In [7]:
## make all in lower case :

tranzact_data_labelled['OEM'] = tranzact_data_labelled['OEM'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_labelled['item'] = tranzact_data_labelled['item'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_labelled['product'] = tranzact_data_labelled['product'].map(lambda x: x if type(x)!=str else x.lower())

tranzact_data_un_labelled['OEM'] = tranzact_data_un_labelled['OEM'].map(lambda x: x if type(x)!=str else x.lower()) 
tranzact_data_un_labelled['item'] = tranzact_data_un_labelled['item'].map(lambda x: x if type(x)!=str else x.lower()) 

## remove rows with any cell value = none

tranzact_data_labelled_final = tranzact_data_labelled.dropna()
tranzact_data_un_labelled_final = tranzact_data_un_labelled.dropna()

tranzact_data_labelled_final = tranzact_data_labelled_final.reset_index(drop=True)
tranzact_data_un_labelled_final = tranzact_data_un_labelled_final.reset_index(drop=True)

print(len(tranzact_data_labelled_final))
print(len(tranzact_data_un_labelled_final))

11803
17405


In [36]:
## digitising 'product' and creating a dictionary for labelled data :

tranzact_data_labelled_final['product_id'] = tranzact_data_labelled_final['product'].factorize()[0]

product_id_df = tranzact_data_labelled_final[['product', 'product_id']].drop_duplicates().sort_values('product_id')
product_to_id = dict(product_id_df.values)
id_to_product = dict(product_id_df[['product_id', 'product']].values)

## removing puncuations from items of both labelled and un-labelled data :

pd.options.mode.chained_assignment = None

tranzact_data_labelled_final['item'] = tranzact_data_labelled_final['item'].str.replace(r'\W',' ')
tranzact_data_un_labelled_final['item'] = tranzact_data_un_labelled_final['item'].str.replace(r'\W',' ')

## remove gaps between words to single gap

tranzact_data_labelled_final['item'] = tranzact_data_labelled_final['item'].str.replace(r'\s+',' ')
tranzact_data_un_labelled_final['item'] = tranzact_data_un_labelled_final['item'].str.replace(r'\s+',' ')

print(len(tranzact_data_labelled_final))
print(len(tranzact_data_un_labelled_final))
tranzact_data_labelled_final.tail()

11803
17405


Unnamed: 0,OEM,item,product,supplier,find_duplicate,product_id
11798,cesare bonetti india pvt. ltd.,yoke casting 12 2500 gtv sa216 wcc qap,cs casting,AMI Alloys,"YOKE CASTING 12"" #2500 GTV, SA216 WCC, QAPAMI ...",18
11799,cesare bonetti india pvt. ltd.,yoke cast 3 2500 gtv sa216 wcc qap,cs casting,Tulip Casting Pvt. Ltd.,"YOKE CAST, 3"" #2500 GTV, SA216 WCC, QAPTulip C...",18
11800,cesare bonetti india pvt. ltd.,yoke casting 12 2500 gtv sa217 c12a qap,alloy steel casting,AMI Alloys,"YOKE CASTING 12"" #2500 GTV, SA217 C12A, QAPAMI...",39
11801,cesare bonetti india pvt. ltd.,yoke cast 10 150 tcv a217wc6,alloy steel casting,RAJ ENGINEERS,"YOKE CAST 10"" #150 TCV; A217WC6RAJ ENGINEERS",39
11802,waaree industries pvt. ltd.,yoke sleeve 8 150 gtv a439 d2,cicasting,Waaree Industries Pvt.Ltd.,"YOKE SLEEVE;8"" # 150;GTV; A439 D2Waaree Indus...",17


In [None]:
## DO NOT USE THIS STEP
# get stop words from nltk
stopWords = stopwords.words('english')

# pre processing data
def cleanData(sentence):
    processedList = ""
    
    # convert to lowercase, ignore all special characters - keep only alpha-numericals and spaces (not removing full-stop here)
    sentence = re.sub(r'[^A-Za-z0-9\s.]',r'',str(sentence).lower())
    sentence = re.sub(r'\n',r' ',sentence)
    
    # remove stop words
    sentence = " ".join([word for word in sentence.split() if word not in stopWords])
    
    return sentence


In [None]:
## DO NOT USE THIS STEP
# clean data of one example:
cleanData(data['Description'][2]) ## cleaning of dataframe 'data' column 'Description' index no. '2'

# clean data for whole dataframe
data['Description'] = data['Description'].map(lambda x: cleanData(x))

In [9]:
total_item_labelled_unlabelled = pd.DataFrame(pd.concat([tranzact_data_labelled_final['item'], \
                                            tranzact_data_un_labelled_final['item']]))
total_item_labelled_unlabelled.head()

Unnamed: 0,item
0,new cylinder 404 gas
1,accumulator model a as 5126 3 4 connection eme...
2,accumulator flokool 3 4 400 psi fksa596
3,accumulator 1 5 8 connection
4,accumulator 7 8


In [10]:
print(len(total_item_labelled_unlabelled))
total_item_labelled_unlabelled.tail()

29208


Unnamed: 0,item
17400,polybag_10x15 inch_pp
17401,handle nut 1 600 htb a194 gr8
17402,round bar od50 astm a276 type 431
17403,ptv 6 300 rf bve wcb hw ibr
17404,pp_monobox_st5


In [11]:
total_item_labelled_unlabelled = total_item_labelled_unlabelled.reset_index(drop=True)
total_item_labelled_unlabelled.tail()

Unnamed: 0,item
29203,polybag_10x15 inch_pp
29204,handle nut 1 600 htb a194 gr8
29205,round bar od50 astm a276 type 431
29206,ptv 6 300 rf bve wcb hw ibr
29207,pp_monobox_st5


In [12]:
from gensim.models import Word2Vec
from nltk import word_tokenize

In [27]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/scar3crow/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
total_item_labelled_unlabelled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29208 entries, 0 to 29207
Data columns (total 1 columns):
item    29208 non-null object
dtypes: object(1)
memory usage: 228.3+ KB


In [14]:
tmp_corpus = total_item_labelled_unlabelled['item'].map(lambda x: x.split('.'))

In [15]:
print(type(tmp_corpus))
print(tmp_corpus)

<class 'pandas.core.series.Series'>
0                                   [new cylinder 404 gas]
1        [accumulator model a as 5126 3 4 connection em...
2                [accumulator flokool 3 4 400 psi fksa596]
3                           [accumulator 1 5 8 connection]
4                                       [accumulator 7 8 ]
5                                      [accumulator 13 8 ]
6                [accumulator flokool 5 8 400 psi fksa596]
7                                      [accumulator 1 1 8]
8                                      [accumulator 2 1 8]
9                                    [accumulator 5216 7s]
10                                        [buthyl acetate]
11                                         [ethyl acetate]
12                                 [npac n proyl acetate ]
13                                      [pma p m acetate ]
14         [flashback arrester acetylene cylinder mounted]
15                                [acetylene cylinder 4kg]
16                  

In [16]:
from tqdm import tqdm

In [17]:
print(len(tmp_corpus))
print(tmp_corpus[29207])

29208
['pp_monobox_st5']


In [18]:
corpus = []
for i in tqdm(range(len(tmp_corpus))):
    for line in tmp_corpus[i]:
        words = [x for x in line.split()]
        corpus.append(words)

100%|██████████| 29208/29208 [00:00<00:00, 48421.05it/s]


In [20]:
print(corpus)



In [21]:
num_of_items = len(corpus)
num_of_words = 0
for line in corpus:
    num_of_words += len(line)

print('Num of sentences - %s'%(num_of_items))
print('Num of words - %s'%(num_of_words))

Num of sentences - 29208
Num of words - 216958


In [45]:
emb_dim = 80

model_1 = Word2Vec(corpus, size = emb_dim, window=5, \
               min_count=1, negative = 15, iter = 10, workers = 10, sg=1)

In [23]:
print(corpus[0])
print(corpus[29207])
print(len(corpus[0]))
print(len(corpus))

['new', 'cylinder', '404', 'gas']
['pp_monobox_st5']
4
29208


In [24]:
print(model_1)
aa = model_1.wv[corpus[0]]
bb = model_1.wv[corpus[29207]]
print(aa.shape)
print(bb.shape)
print(type(aa))

max_length = 0

for i in range(len(corpus)):
    
    if len(corpus[i]) > max_length:
        max_length = len(corpus[i])
        
    if i == len(corpus) - 1:
        break
    
print(max_length)    

Word2Vec(vocab=14883, size=100, alpha=0.025)
(4, 100)
(1, 100)
<class 'numpy.ndarray'>
121


In [25]:
print(aa)

[[-1.36920251e-02 -1.91915229e-01 -1.46016665e-02  8.07005912e-02
  -2.70050615e-01  2.19435766e-01 -2.86316901e-01  1.24589562e-01
  -4.53182250e-01  2.38419920e-01  1.24956265e-01 -1.15979061e-01
  -2.85552591e-01  2.81532854e-01  1.73605710e-01  1.70120880e-01
  -2.32791245e-01 -5.49034365e-02  2.78966241e-02  4.94771451e-01
   1.81356911e-02 -3.02705187e-02 -3.29154506e-02  1.99016929e-01
   7.21514583e-01  5.14852516e-02  4.89855379e-01 -2.78054625e-01
   1.47114679e-01  3.84537131e-01 -2.23729596e-01 -8.75706151e-02
  -7.86551178e-01  2.26516008e-01 -1.41225439e-02 -5.28826594e-01
  -9.71559510e-02 -1.17123842e-01 -6.27429485e-02 -4.56747226e-02
   2.05717206e-01  1.60853341e-02  2.05927581e-01  5.37982546e-02
   1.93050250e-01 -1.55765221e-01  3.55313987e-01  1.02828987e-01
  -1.17567621e-01 -2.51858503e-01  3.60882789e-01  3.47413085e-02
  -1.21743113e-01 -3.76049042e-01  6.43394351e-01  6.78088129e-01
  -5.20658016e-01  4.47445270e-03 -4.07726228e-01 -5.32257020e-01
   1.38198

In [26]:
tokenizer_obj = Tokenizer()
tokenizer_obj_1 = Tokenizer()

tokenizer_obj.fit_on_texts(total_item_labelled_unlabelled['item'])
tokenizer_obj_1.fit_on_texts(tranzact_data_labelled_final['item'])

## pad sequences :

max_length = max([len(s.split()) for s in total_item_labelled_unlabelled['item']])
min_length = min([len(s.split()) for s in total_item_labelled_unlabelled['item']])

## define vocabulary size :

vocab_size = len(tokenizer_obj.word_index) + 1
vocab_size_1 = len(tokenizer_obj_1.word_index) + 1

total_item_tokens = tokenizer_obj.texts_to_sequences(total_item_labelled_unlabelled['item'])
total_item_tokens_1 = tokenizer_obj_1.texts_to_sequences(tranzact_data_labelled_final['item'])

total_item_pad = pad_sequences(total_item_tokens, maxlen = max_length, padding = 'post')
total_item_pad_1 = pad_sequences(total_item_tokens_1, maxlen = max_length, padding = 'post')

In [68]:
wd_index = tokenizer_obj.word_index
print(wd_index)



In [63]:
word_index = tokenizer_obj_1.word_index
print('Found %s unique tokens' % len(word_index))

Found 7948 unique tokens


In [67]:
print(word_index)



In [61]:
wrd = 'prmium'
print(model_1.wv.word_vec(wrd))


[ 0.24179202  0.06549174 -0.38604274  0.2220079  -0.17099561 -0.00210161
 -0.19163914  0.02628648  0.1819302  -0.13675494  0.03219317 -0.00996133
 -0.05369972  0.2527915   0.2741835   0.12086015 -0.04894331 -0.16053675
 -0.00583438  0.00325824  0.06969016 -0.11515658  0.0805554   0.09401441
  0.2272013   0.03878359  0.27700585 -0.00347151  0.03068962  0.12152074
  0.20699036  0.09057083  0.06172016  0.08677412  0.16805473  0.0134774
 -0.09955977 -0.23010123  0.11060134  0.06406747  0.2174236  -0.03278314
  0.10400584  0.06704241 -0.21097016 -0.12631032 -0.05094669 -0.09503785
 -0.10903387 -0.0955196   0.22453734 -0.03931703 -0.17506832 -0.19794413
  0.3891973   0.35113436 -0.5200029   0.1512953  -0.04274882 -0.15493187
  0.07763566 -0.08534835  0.02209762  0.04802343 -0.04624603  0.12014635
 -0.162002   -0.21071538 -0.04798426  0.07724632 -0.19445488 -0.2439696
 -0.23706     0.02334258 -0.12904176 -0.14800826  0.02665363  0.13792053
  0.11965785  0.28002065]


In [78]:
j = 0

In [79]:
## prepare embeddings

## Preparing embedding matrix

nb_words = min(vocab_size, vocab_size_1)-1

embedding_matrix = np.zeros((nb_words, emb_dim))

    
for word, i in word_index.items():
        
        
    if word in model_1.wv.vocab:
            
        embedding_matrix[j] = model_1.wv.word_vec(word)
        j = j+1
    
    else:
                
        print(word)
        
        
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))


iot
le18b22rd
uc18h7tft
18g16sk
ver1
txmr
1ohm
m66
935uh
le18b28pid
1755uh
le15k17cl
52uh
05w
54w
gaurds
le18h23ap
16uh
le18j27dg2
jsw
guards
le17h8ap
le18k1m
18b3sk
le17l2ap
le18j19ap
jsav150
le18h16dg2
le18l12dg2
89x74
atomfan4
48x38
le17f5wc
elettro
tft
Null word embeddings: 35


In [76]:
print(type(embedding_matrix))
print(embedding_matrix.shape)
print(embedding_matrix[0])

<class 'numpy.ndarray'>
(7948, 80)
[ 0.24179202  0.06549174 -0.38604274  0.2220079  -0.17099561 -0.00210161
 -0.19163914  0.02628648  0.1819302  -0.13675494  0.03219317 -0.00996133
 -0.05369972  0.25279149  0.27418351  0.12086015 -0.04894331 -0.16053675
 -0.00583438  0.00325824  0.06969016 -0.11515658  0.0805554   0.09401441
  0.2272013   0.03878359  0.27700585 -0.00347151  0.03068962  0.12152074
  0.20699036  0.09057083  0.06172016  0.08677412  0.16805473  0.0134774
 -0.09955977 -0.23010123  0.11060134  0.06406747  0.2174236  -0.03278314
  0.10400584  0.06704241 -0.21097016 -0.12631032 -0.05094669 -0.09503785
 -0.10903387 -0.0955196   0.22453734 -0.03931703 -0.17506832 -0.19794413
  0.38919729  0.35113436 -0.5200029   0.1512953  -0.04274882 -0.15493187
  0.07763566 -0.08534835  0.02209762  0.04802343 -0.04624603  0.12014635
 -0.162002   -0.21071538 -0.04798426  0.07724632 -0.19445488 -0.2439696
 -0.23706     0.02334258 -0.12904176 -0.14800826  0.02665363  0.13792053
  0.11965785  0.28

In [None]:
## Total no. of unique words = vocab_size = 14707
## each word has got 150 parameters as per Embedding Dimension defined.
## Hence total Param # = 14707 * 150 = 2206050


In [35]:
from keras.models import Sequential

model_2 = Sequential()
model_2.add(Embedding(vocab_size_1, emb_dim, weights=[embedding_matrix],\
                    input_length=max_length, trainable=False))
model_2.add(SpatialDropout1D(0.2))
#model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model_2.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model_2.add(Dense(256, activation='relu'))
model_2.add(Dense(71, activation='softmax'))
model_2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

model_2.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 121, 100)          794900    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 121, 100)          0         
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_1 (Dense)              (None, 256)               8448      
_________________________________________________________________
dense_2 (Dense)              (None, 71)                18247     
Total params: 834,363
Trainable params: 39,463
Non-trainable params: 794,900
_________________________________________________________________


In [39]:
## Creating X and Y variables for training. X = Items or item 'feature'. Y = 'product' or product category

X = []
for i in range(len(tranzact_data_labelled_final)):
    
    X.append(tranzact_data_labelled_final.iloc[i]['item'])
#Q = np.array(tranzact_data_3["product"])
Y = tranzact_data_labelled_final['product']
print(X[1])

print(Y[1])


accumulator model a as 5126 3 4 connection emerson make
cooling


In [40]:
#train test split in 80% / 20% ratio

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state = 5)


In [41]:
## X_train and X_test word embedding :

X_train_tokens = tokenizer_obj_1.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj_1.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen = max_length, padding = 'post')
X_test_pad = pad_sequences(X_test_tokens, maxlen = max_length, padding = 'post')

In [44]:
print(X_train_pad.shape)
print(X_test_pad.shape)
print(max_length)
print(vocab_size_1)

(9442, 121)
(2361, 121)
121
7949


In [42]:
from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()

Y_train_coded = encoder.fit_transform(Y_train)
Y_test_coded = encoder.fit_transform(Y_test)

print(Y_train_coded.shape)
print(Y_test_coded.shape)


(9442, 71)
(2361, 69)


In [43]:
model_2.fit(X_train_pad,Y_train_coded,batch_size=128,epochs=25,
          validation_split=0.2)

Train on 7553 samples, validate on 1889 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f1e54ea3f28>