# LSTM-based next-word prediction with unique brands

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

import sys
import heapq
import seaborn as sns
import matplotlib
from numpy.core.multiarray import dtype
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout
from keras.layers import TimeDistributed
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.layers.core import Dense, Activation, Dropout, RepeatVector
from tensorflow import keras
from keras.utils.vis_utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import load_model

import re
import pickle

import os
from pylab import rcParams
matplotlib.use('agg')
np.random.seed(42)

In [2]:
!pip install openpyxl

In [3]:
# Load the per-item sales table; later cells read its 'Item_Name' and 'Count' columns.
sale_data_df=pd.read_csv("../input/sale-count-data/word_cnt.csv")
# Preview the first rows of the table.
sale_data_df.head()

In [4]:
# Raw item names, exactly as they appear in the sales table.
brands = list(sale_data_df['Item_Name'])

# Normalise each name (lower-case, no line breaks, no BOM characters) and
# merge all of them into a single space-separated corpus string.
cleaned_names = []
for raw_name in brands:
    lowered = str(raw_name).lower()
    for unwanted in ('\n', '\r', '\ufeff'):
        lowered = lowered.replace(unwanted, '')
    cleaned_names.append(lowered)

all_brands_str = ' '.join(cleaned_names)
print(all_brands_str[:100])

In [5]:
# Fit the tokenizer on the whole corpus and store it in a pickle file so that
# subsequent runs can reuse it without repeating the fitting step.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([all_brands_str])
# The context manager guarantees the file is flushed and closed after writing.
with open('all_brands_tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Integer id of every word in the corpus, in order of appearance.
sequence_data = tokenizer.texts_to_sequences([all_brands_str])[0]
# +1 because Keras Tokenizer word ids start at 1 (0 is reserved).
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

In [6]:
# Slide a window of width two over the token stream to obtain
# (current word id, next word id) pairs.
sequences = np.array(
    [sequence_data[start:start + 2] for start in range(len(sequence_data) - 1)]
)

# First element of each pair is the model input, second is the target.
X = np.array([pair[0] for pair in sequences])
y = np.array([pair[1] for pair in sequences])

# One-hot encode the targets over the full vocabulary.
y = to_categorical(y, num_classes=vocab_size)
print("X length=", len(X))
print("y length=", len(y))

In [7]:
# Next-word classifier: a 10-dimensional embedding of a single input token,
# two stacked 100-unit LSTM layers, a 100-unit ReLU layer and a softmax over
# the vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=1),
    # return_sequences=True so the second LSTM receives a sequence as input.
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(100, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])
model.summary()

In [8]:
# Save the model to disk whenever the monitored training loss improves.
checkpoint = ModelCheckpoint(
    "lstm_next_word_model.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

# Scale the learning rate by 0.2 after 3 epochs without improvement in the
# training loss, with a floor of 1e-4.
reduce = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=3,
    min_lr=0.0001,
    verbose=1,
)

# Write training logs for TensorBoard into the 'logs' directory.
tensorboard_visualization = TensorBoard(log_dir='logs')

In [9]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)
# Train on the (word, next word) pairs with checkpointing, learning-rate
# reduction and TensorBoard logging attached.
model.fit(
    X,
    y,
    batch_size=64,
    epochs=500,
    callbacks=[checkpoint, reduce, tensorboard_visualization],
)

In [11]:
# Restore the checkpointed model and the fitted tokenizer from disk.
model = load_model('lstm_next_word_model.h5')
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by this notebook or another trusted source.
with open('all_brands_tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

def predict_next_word(model, tokenizer, all_brands):
    """Predict the word most likely to follow the given text.

    Args:
        model: Object whose ``predict`` method accepts a 2-D array of token
            ids and returns class scores; the scores are reduced with
            ``np.argmax``.
        tokenizer: Object providing ``texts_to_sequences`` and a
            ``word_index`` mapping of word -> integer id.
        all_brands: Input text whose continuation should be predicted.

    Returns:
        The predicted word. An empty string is returned when the input
        contains no token known to the tokenizer, or when the winning class
        id does not correspond to any word.
    """
    token_ids = tokenizer.texts_to_sequences([all_brands])[0]

    predicted_word = ""
    # An input made only of unknown words yields no tokens; feeding an empty
    # batch to the model is meaningless, so skip prediction in that case.
    if token_ids:
        # NOTE(review): the model built in this notebook uses input_length=1,
        # while multi-word input produces a longer sequence here - confirm
        # that passing the whole sequence is intended.
        scores = model.predict(np.array([token_ids]))
        best_id = np.argmax(scores)
        predicted_word = next(
            (word for word, index in tokenizer.word_index.items() if index == best_id),
            "",
        )

    print(predicted_word)
    return predicted_word


In [12]:
def get_band_name_from_search_words(search_words, brand_names=None):
    """Return every brand name that contains ``search_words``, ignoring case.

    The query is matched as literal text rather than as a regular
    expression, so characters such as ``+``, ``(`` or ``.`` in the query are
    treated as ordinary characters.

    Args:
        search_words: Text to look for inside each brand name.
        brand_names: Optional iterable of candidate names. When omitted, the
            module-level ``brands`` list is searched.

    Returns:
        A list of the matching names, in their original order and spelling
        (duplicates are preserved). Entries that are not strings, such as
        missing values, are skipped.
    """
    candidates = brands if brand_names is None else brand_names
    needle = search_words.upper()
    return [
        name
        for name in candidates
        if isinstance(name, str) and needle in name.upper()
    ]

In [13]:
# Quick check: list every brand name containing "Harshita" (the match ignores case).
get_band_name_from_search_words("Harshita")

In [None]:
# Look up the full row for one specific item. The sales table is loaded as
# `sale_data_df`; no variable named `df` is defined in this notebook.
# Related lookup, count only:
#   sale_data_df[sale_data_df['Item_Name']=='Harshita Creation Satin Navy Dresses']['Count']
sale_data_df[sale_data_df['Item_Name']=='Harshita Creation Satin Navy Dresses'].values[0]


In [26]:
# Ask for a word, predict the word that follows it, then list every item
# whose name contains "<word> <predicted word>" together with its sale count.
data = input("Enter your word: ")
next_word = predict_next_word(model, tokenizer, data)

# strip() removes the dangling space left behind when no word was predicted,
# which would otherwise prevent names ending in the typed word from matching.
search_words = f"{data} {next_word}".strip()
print(search_words)
brand_name = get_band_name_from_search_words(search_words)

# dict.fromkeys de-duplicates while keeping first-seen order, so the table is
# built in the same order on every run (a set has no guaranteed order).
brand_list = list(dict.fromkeys(brand_name))
brand_list_count = [
    sale_data_df.loc[sale_data_df['Item_Name'] == brand, 'Count'].values[0]
    for brand in brand_list
]

# Most-sold items first.
df_out = pd.DataFrame({"Items": brand_list, "Counts": brand_list_count})
df_out = df_out.sort_values('Counts', ascending=False)
print(df_out)