<a href="https://colab.research.google.com/github/radwaahmed20112000/QA-Chatbot/blob/main/SuperAgent_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Preprocessing**

## Data Preparation

### Imports


In [None]:
import numpy as np
import pandas as pd 
import os
import io
import gzip
from google.colab import drive
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive so the '/content/drive/...' paths used below resolve.
# force_remount=True asks Colab to mount again even when a mount is already present.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Sanity check that the runtime exposes a GPU.
# NOTE(review): _get_available_gpus is underscore-prefixed (private) in the Keras backend;
# it may not exist in every Keras version.
from keras import backend as K
K._get_available_gpus()

['/device:GPU:0']

### Global Variables

In [None]:
# Folder on the mounted Drive that holds the raw datasets.
drive_root_path = '/content/drive/My Drive/Colab Notebooks/chatbot project/Chatbot/'
# Fraction of each dataset held out of training; data_split halves it into dev and test.
test_dev_ratio = 0.2
# One independent empty frame per split. A chained `a = b = c = DataFrame()` would bind
# all three names to the same object, so an in-place change to one would show up in all.
chitchat_train_set, chitchat_dev_set, chitchat_test_set = (
    pd.DataFrame(columns=['question', 'answer']) for _ in range(3))
# Per-category Amazon splits, filled by load_split_amazon_dataset (one entry per category).
categories_train_set, categories_dev_set, categories_test_set = [], [], []

### Dataset parsing


In [None]:
def parse(path):
  """Yield one record per line of a gzip-compressed file of Python literals.

  Each line is expected to hold a single Python literal (the callers use dict
  records). Lines are parsed with ast.literal_eval rather than eval, so file
  contents are never executed as code, and the file handle is closed when the
  generator finishes or is discarded.

  Args:
    path: Path to the .gz file.

  Yields:
    The Python object described by each line.

  Raises:
    ValueError / SyntaxError: if a line is not a valid Python literal.
  """
  import ast  # local import: keeps this cell independent of the notebook's import cell

  with gzip.open(path, 'rb') as lines:
    for line in lines:
      # literal_eval only parses str input, so the raw bytes are decoded first.
      yield ast.literal_eval(line.decode('utf-8'))

In [None]:
def getDF(path):
  """Load every record produced by parse(path) into a DataFrame.

  Records are numbered 0, 1, 2, ... in file order and that number becomes the
  row label; the keys of each record become the columns.
  """
  numbered_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')

## Clean Data

### Imports


In [None]:
import string
import re
import nltk
from nltk.tokenize import word_tokenize

### Punctuation Removal

In [None]:
def remove_punctuation(text):
  """Return `text` with each character of string.punctuation replaced by one space."""
  blank_out = {ord(mark): ' ' for mark in string.punctuation}
  return text.translate(blank_out)

### Decontraction (Expanding Contractions)


In [None]:
def decontracted(phrase):
    """Expand common English contractions in `phrase`.

    The substitutions run in the order listed; the two whole-word cases
    ("won't", "can't") must come before the generic "n't" rule.
    """
    rewrites = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rewrites:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

### Clean

In [None]:
def clean_question(x):
  """Lower-case `x`, expand contractions, then blank out punctuation."""
  return remove_punctuation(decontracted(x.lower()))


def clean_answer(x):
  """Clean `x` like a question and wrap it in the START_ / _END decoder markers.

  The markers are added after punctuation removal, so their underscores survive.
  """
  return 'START_ ' + remove_punctuation(decontracted(x.lower())) + ' _END'

## Data load and Split

### Split

In [None]:
def data_split(dataset):
  """Shuffle `dataset` and return (train, dev, test) frames.

  `test_dev_ratio` of the rows are held out of training and that held-out part
  is divided evenly between test and dev. All random steps use fixed seeds.
  Note: a later cell of this notebook defines another `data_split` with a
  different contract; whichever cell ran last is the one in effect.
  """
  shuffled = dataset.sample(frac=1, random_state=1).reset_index(drop=True)

  train_part, held_out = train_test_split(shuffled,
                                          test_size=test_dev_ratio,
                                          random_state=0)
  test_part, dev_part = train_test_split(held_out,
                                         test_size=0.5,
                                         random_state=0)

  return train_part, dev_part, test_part

### Load

In [None]:
def load_new_dataset(max_question_words=8, max_rows=20000):
  """Load the general QA files, clean them and return (train, dev, test).

  Reads the three `qa_*.txt` files and `WikiQA.tsv` from
  `drive_root_path + 'dataset/'`, keeps only the question/answer columns,
  drops rows whose question has more than `max_question_words`
  whitespace-separated words, cleans both columns and splits the first
  `max_rows` rows with data_split.

  Fix: the length filter used to be applied to the accumulated frame instead
  of the file being read, so the last file (WikiQA) was never filtered. It now
  filters each file's own rows. This changes the resulting dataset, so any
  vocabulary size hard-coded from an earlier run (e.g. the QAModel entry in the
  constants cell) needs to be regenerated.

  Args:
    max_question_words: Longest question (in words) that is kept.
    max_rows: Number of leading rows of the combined frame that are split.

  Returns:
    Whatever data_split returns for the combined frame.
  """
  wikiqa_columns = ['QuestionID', 'question', 'DocumentID', 'DocumentTitle',
                    'SentenceID', 'answer', 'Label']
  article_columns = ['ArticleTitle', 'question', 'answer', 'diff_q', 'diff_a',
                     'ArticleFile']

  files = ['qa_1.txt',
           'qa_2.txt',
           'qa_3.txt',
           'WikiQA.tsv']

  frames = []
  for file in files:

    data = pd.read_csv(drive_root_path + 'dataset/' + file, sep="\t",
                       encoding='unicode_escape')

    data.columns = wikiqa_columns if file == 'WikiQA.tsv' else article_columns

    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the frame returned by read_csv.
    data = data[['question', 'answer']].copy()

    data['question'] = data['question'].values.astype(str)
    data['answer'] = data['answer'].values.astype(str)

    # Length filter on the raw (not yet cleaned) question of THIS file.
    is_short = data['question'].apply(lambda q: len(q.split()) <= max_question_words)
    data = data[is_short].copy()

    data['question'] = data['question'].apply(clean_question)
    data['answer']   = data['answer'].apply(clean_answer)

    frames.append(data)

  # One concat at the end instead of growing a frame inside the loop.
  df = pd.concat(frames)

  return data_split(df.head(max_rows))

In [None]:
def load_split_amazon_dataset():
  """Load, clean and split each Amazon QA category into the categories_* lists.

  For every category file: keep rows whose answer has at most 50 words, clean
  the question/answer text and append the (train, dev, test) frames from
  data_split to categories_train_set / categories_dev_set / categories_test_set.
  Entry 0 is Clothing, 1 is Health, 2 is Sports (the order of `genres`).

  The three lists are emptied first, so running this cell again rebuilds the
  splits instead of appending a second copy behind the first.
  """
  genres = ['qa_Clothing_Shoes_and_Jewelry.json.gz',
            'qa_Health_and_Personal_Care.json.gz',
            'qa_Sports_and_Outdoors.json.gz']

  # In-place clear keeps the module-level list objects and makes the call idempotent.
  for split_list in (categories_train_set, categories_dev_set, categories_test_set):
    split_list.clear()

  for genre in genres:

    df = getDF(drive_root_path + genre)

    # .copy() so the column assignments below do not write into a filtered slice.
    df = df[df['answer'].apply(lambda x: len(x.split()) <= 50)].copy()

    df['question'] = df['question'].apply(clean_question)
    df['answer']   = df['answer'].apply(clean_answer)

    train, dev, test = data_split(df[['question', 'answer']])

    categories_train_set.append(train)
    categories_dev_set.append(dev)
    categories_test_set.append(test)

In [None]:
def load_split_chitchat_dataset():
  """Read the five chit-chat TSV files, clean them and store the three splits.

  Results are written to the module-level chitchat_train_set,
  chitchat_dev_set and chitchat_test_set.
  """
  global chitchat_train_set, chitchat_dev_set, chitchat_test_set

  files = ["English_Professional.tsv", "English_Friendly.tsv",
           "English_Witty.tsv", "English_Caring.tsv", "English_Enthusiastic.tsv"]

  # The empty seed frame fixes the expected column set up front.
  seed = pd.DataFrame(columns = ["Question", "Answer", "Source", "Metadata"])
  loaded = [pd.read_csv(drive_root_path + 'chitchat/' + file, sep='\t')
            for file in files]
  data = pd.concat([seed] + loaded)

  data = data[["Question", "Answer"]].rename(
      columns = {'Question':'question', 'Answer':'answer'})

  data['question'] = data['question'].apply(clean_question)
  data['answer']   = data['answer'].apply(clean_answer)

  chitchat_train_set, chitchat_dev_set, chitchat_test_set = data_split(data)

### Shuffle Chitchat Dataset

In [None]:
def shuffle_dataset():
  """Reshuffle the three chit-chat split globals with a fixed seed and fresh 0..n-1 index."""
  global chitchat_train_set, chitchat_dev_set, chitchat_test_set

  def reshuffled(frame):
    return frame.sample(frac=1, random_state=1).reset_index(drop=True)

  chitchat_train_set = reshuffled(chitchat_train_set)
  chitchat_dev_set   = reshuffled(chitchat_dev_set)
  chitchat_test_set  = reshuffled(chitchat_test_set)

## Generate Data after processing


In [None]:
# Order matters: the chit-chat globals must exist before shuffle_dataset reshuffles them;
# the Amazon loader then fills the per-category split lists.
load_split_chitchat_dataset()
shuffle_dataset()
load_split_amazon_dataset()

In [None]:
# General QA corpus (qa_*.txt + WikiQA); load_new_dataset returns the three splits.
train, dev, test = load_new_dataset()

In [None]:
# Inputs are the cleaned questions, targets are the START_/_END-wrapped answers.
x_train, x_dev, x_test = train['question'], dev['question'], test['question']
y_train, y_dev, y_test = train['answer'], dev['answer'], test['answer']

Chit Chat Model


In [None]:
# Question (x) and answer (y) columns of each chit-chat split.
chitchat_x_train = chitchat_train_set['question']
chitchat_y_train = chitchat_train_set['answer']
chitchat_x_dev = chitchat_dev_set['question']
chitchat_y_dev = chitchat_dev_set['answer']
chitchat_x_test = chitchat_test_set['question']
chitchat_y_test = chitchat_test_set['answer']

Clothing Model

In [None]:
# Entry 0 of the category lists is the Clothing split (first file loaded).
clothing_train, clothing_dev, clothing_test = (
    categories_train_set[0], categories_dev_set[0], categories_test_set[0])
clothing_x_train, clothing_y_train = clothing_train['question'], clothing_train['answer']
clothing_x_dev, clothing_y_dev = clothing_dev['question'], clothing_dev['answer']
clothing_x_test, clothing_y_test = clothing_test['question'], clothing_test['answer']

Health Model


In [None]:
# Entry 1 of the category lists is the Health split (second file loaded).
health_train, health_dev, health_test = (
    categories_train_set[1], categories_dev_set[1], categories_test_set[1])
health_x_train, health_y_train = health_train['question'], health_train['answer']
health_x_dev, health_y_dev = health_dev['question'], health_dev['answer']
health_x_test, health_y_test = health_test['question'], health_test['answer']

Sports Model

In [None]:
# Entry 2 of the category lists is the Sports split (third file loaded).
sport_train, sport_dev, sport_test = (
    categories_train_set[2], categories_dev_set[2], categories_test_set[2])
sport_x_train, sport_y_train = sport_train['question'], sport_train['answer']
sport_x_dev, sport_y_dev = sport_dev['question'], sport_dev['answer']
sport_x_test, sport_y_test = sport_test['question'], sport_test['answer']

# **Words Vectorization & Embedding**

### Imports

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.utils import to_categorical
from tensorflow.io import write_file, read_file
import gc
import os

### Constants

In [None]:
# Root folder for the vectorized data, vectorizer pickles and checkpoints.
SAVE_DIR = '/content/drive/My Drive/Colab Notebooks/chatbot project/SuperAgentModel/'
# Which model the cells below load data for.
current_model = 'ClothingModel'

# Vocabulary size per model; unknown names fall back to 1.
# NOTE(review): the key 'SportsModel2' differs from the 'SportsModel' folder name used
# by the save_model call and by chatbot() — confirm which one is intended.
VOCAB_SIZE = {
    'ClothingModel': 11929,
    'ChitChatModel': 2820,
    'HealthModel': 36059,
    'SportsModel2': 54202,
    'QAModel': 26817,
}.get(current_model, 1)

# Rows per saved training batch file.
BATCH_SIZE = 64

### Vectorization

In [None]:
def create_vectorize_layer(x_train, y_train):
  """Fit a TextVectorization layer on the training questions and answers.

  The layer is created with standardize=None, so it applies no text
  normalization of its own.

  Returns:
    (layer, vocab_size) where vocab_size is layer.vocabulary_size().
  """
  corpus = pd.concat([x_train, y_train])
  layer = tf.keras.layers.TextVectorization(standardize=None)
  layer.adapt(corpus)
  vocab_size = layer.vocabulary_size()
  gc.collect()
  return layer, vocab_size

In [None]:
def get_seq2se2_data(x, y, vectorize_layer):
  """Vectorize questions/answers into encoder input, decoder input and decoder target.

  The decoder target is the decoder input shifted left by one position, with a
  column of zeros appended so both keep the same width.
  """
  enc_input_data = vectorize_layer(x)
  dec_input_data = vectorize_layer(y)

  shifted = dec_input_data[:, 1:]
  trailing_zeros = tf.zeros((shifted.shape[0], 1), dtype=tf.int64)
  dec_output_data = tf.concat([shifted, trailing_zeros], 1)

  gc.collect()
  return enc_input_data, dec_input_data, dec_output_data

### Save Processed Data

In [None]:
def save_as_batches(foldername, np_array, batch_size):
  """Write `np_array` to SAVE_DIR/foldername/ as 0.npy, 1.npy, ... of `batch_size` rows each.

  The last file holds the remaining rows when the row count is not a multiple
  of `batch_size`. The target folder is created if it does not exist.
  Files left over from an earlier, longer run are NOT removed; DataGenerator
  counts every file in the folder, so clear the folder before re-saving a
  smaller dataset.

  Args:
    foldername: Folder (relative to SAVE_DIR) to write into.
    np_array: 2-D array-like supporting `.shape` and `[a:b, :]` slicing.
    batch_size: Rows per file.
  """
  target_dir = SAVE_DIR + foldername + '/'
  os.makedirs(target_dir, exist_ok=True)

  total_rows = np_array.shape[0]
  # Stepping by batch_size covers full batches and the trailing partial one alike.
  for batch_index, start in enumerate(range(0, total_rows, batch_size)):
    np.save(target_dir + str(batch_index) + '.npy',
            np_array[start:start + batch_size, :])

In [None]:
def save(foldername, np_array):
  """Write `np_array` as the single file SAVE_DIR/foldername/0.npy.

  The folder is created when missing, so a fresh Drive layout does not make
  np.save fail.
  """
  target_dir = SAVE_DIR + foldername
  os.makedirs(target_dir, exist_ok=True)
  np.save(target_dir + '/0.npy', np_array)

In [None]:
import pickle
def save_vectorizer(vectorizer, foldername=None):
  """Pickle the vectorizer's config and weights to <SAVE_DIR>/<folder>/vectorizer.pkl.

  Args:
    vectorizer: Object exposing get_config() and get_weights().
    foldername: Model folder under SAVE_DIR. Defaults to `current_model`,
      which is what the function always used before this parameter existed.

  The folder is created when missing and the file is closed deterministically.
  """
  target_dir = SAVE_DIR + (current_model if foldername is None else foldername) + '/'
  os.makedirs(target_dir, exist_ok=True)
  payload = {'config': vectorizer.get_config(),
             'weights': vectorizer.get_weights()}
  with open(target_dir + 'vectorizer.pkl', "wb") as handle:
    pickle.dump(payload, handle)

In [None]:
  # Ad-hoc cell: fits a vectorizer on the Health training split at notebook level.
  # save_model() below builds its own vectorizer, so it does not depend on these variables.
  vectorize_layer, vocab_size = create_vectorize_layer(health_x_train, health_y_train)


In [None]:
def save_model(foldername, x_train, y_train, x_dev, y_dev, x_test, y_test):
  """Vectorize one model's splits and persist them (plus the vectorizer) under SAVE_DIR/foldername.

  Training arrays are written as BATCH_SIZE-row files (see save_as_batches);
  dev ("val_*") and test arrays are written as single files (see save).
  Prints the folder name and the fitted vocabulary size.

  Fix: the vectorizer pickle is now written into `foldername`. It used to go to
  the folder named by the global `current_model`, so saving several models in
  a row overwrote one shared vectorizer.pkl.
  """
  vectorize_layer, vocab_size = create_vectorize_layer(x_train, y_train)
  print(foldername)
  print(vocab_size)

  # Same payload layout that load_vectorizer expects: {'config': ..., 'weights': ...}.
  model_dir = SAVE_DIR + foldername + '/'
  os.makedirs(model_dir, exist_ok=True)
  with open(model_dir + 'vectorizer.pkl', 'wb') as handle:
    pickle.dump({'config': vectorize_layer.get_config(),
                 'weights': vectorize_layer.get_weights()}, handle)

  enc_input_data, dec_input_data, dec_output_data = get_seq2se2_data(
      x_train, y_train, vectorize_layer)

  val_enc_input_data, val_dec_input_data, val_dec_output_data = get_seq2se2_data(
      x_dev, y_dev, vectorize_layer)

  test_enc_input_data, test_dec_input_data, test_dec_output_data = get_seq2se2_data(
      x_test, y_test, vectorize_layer)

  save_as_batches(foldername + '/dec_output_data', dec_output_data, BATCH_SIZE)
  save_as_batches(foldername + '/dec_input_data', dec_input_data, BATCH_SIZE)
  save_as_batches(foldername + '/enc_input_data', enc_input_data, BATCH_SIZE)

  save(foldername + '/val_dec_output_data', val_dec_output_data)
  save(foldername + '/val_dec_input_data', val_dec_input_data)
  save(foldername + '/val_enc_input_data', val_enc_input_data)

  save(foldername + '/test_dec_output_data', test_dec_output_data)
  save(foldername + '/test_dec_input_data', test_dec_input_data)
  save(foldername + '/test_enc_input_data', test_enc_input_data)

In [None]:
# Vectorize and persist the data of the four domain models.
# NOTE(review): the folder here is 'SportsModel' while the constants cell keys its vocabulary
# size on 'SportsModel2' — confirm which name the training cells should use.
save_model('ChitChatModel', chitchat_x_train, chitchat_y_train, chitchat_x_dev, 
           chitchat_y_dev, chitchat_x_test, chitchat_y_test)
save_model('ClothingModel', clothing_x_train, clothing_y_train, clothing_x_dev, 
           clothing_y_dev, clothing_x_test, clothing_y_test)
save_model('HealthModel', health_x_train, health_y_train, health_x_dev, 
           health_y_dev, health_x_test, health_y_test)
save_model('SportsModel', sport_x_train, sport_y_train, sport_x_dev, 
           sport_y_dev, sport_x_test, sport_y_test)

HealthModel
36059


In [None]:
# General QA model; x_*/y_* here are the splits returned by load_new_dataset().
save_model('QAModel', x_train, y_train, x_dev, y_dev, x_test, y_test)

QAModel
26817


### Load Processed Data

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that serves pre-saved batch files `<index>.npy` from three folders.

    Each item is ([encoder_input, decoder_input], decoder_output), every part
    read from the file with the same index in its own folder under SAVE_DIR.
    `batch_size` is stored but not used when reading; the batch size is
    whatever was written to disk.
    """

    def __init__(self, input_folder1, input_folder2, output_folder, batch_size=64):
        self.input_folder1, self.input_folder2 = input_folder1, input_folder2
        self.output_folder = output_folder
        self.batch_size = batch_size

    def __len__(self):
        # Assumes the output folder contains nothing but batch files and that
        # all three folders hold the same number of them.
        batch_files = os.listdir(SAVE_DIR + self.output_folder)
        return len(batch_files)

    def __getitem__(self, index):
        filename = '/' + str(index) + '.npy'
        enc_input = np.load(SAVE_DIR + self.input_folder1 + filename)
        dec_input = np.load(SAVE_DIR + self.input_folder2 + filename)
        dec_output = np.load(SAVE_DIR + self.output_folder + filename)
        return [enc_input, dec_input], dec_output

In [None]:
def load(foldername):
  """Return the array stored in SAVE_DIR/foldername/0.npy (the file written by save())."""
  return np.load(SAVE_DIR + foldername + '/0.npy')

In [None]:
# Validation arrays of the currently selected model (one file per array).
val_dec_output_data, val_dec_input_data, val_enc_input_data = (
    load(current_model + suffix)
    for suffix in ('/val_dec_output_data', '/val_dec_input_data', '/val_enc_input_data'))

In [None]:
# Test arrays of the currently selected model (one file per array).
test_dec_output_data, test_dec_input_data, test_enc_input_data = (
    load(current_model + suffix)
    for suffix in ('/test_dec_output_data', '/test_dec_input_data', '/test_enc_input_data'))

In [None]:
# Quick look at the validation target dimensions (presumably examples x padded answer length).
val_dec_output_data.shape

(2204, 187)

# **Training Model**

### Imports

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, Embedding, Dropout
from tensorflow.keras.layers import GRU, LSTM, Bidirectional, Concatenate
from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.losses import CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers.schedules import ExponentialDecay

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

### Custom Evaluation Functions

In [None]:
def bleu_score(y_true, y_pred):
  """Return NLTK's sentence-level BLEU for `y_pred` against `y_true`, with method1 smoothing.

  NOTE(review): `y_true` is passed straight through as sentence_bleu's first
  argument, which NLTK treats as a list of reference token lists — confirm
  callers pass it in that shape rather than a single token list.
  """
  smoothing = SmoothingFunction().method1
  return sentence_bleu(y_true, y_pred, smoothing_function=smoothing)

### Constants

In [None]:
# Width of the trainable embedding vectors used by the Embedding layers below.
EMBEDDING_SIZE = 200
# Number of units in the encoder and decoder LSTM layers.
UNITS = 100

## Embedding Layer

### GloVe Layer

In [None]:
# Optional alternative: embedding layers backed by the GloVe vectors.
# NOTE(review): `glove_model` is only created in the "Convert Glove Embedding" cell further
# down, so this cell fails on a top-to-bottom run; the "Normal Embedding Layer" cell also
# rebinds both names afterwards. get_keras_embedding may not exist in newer gensim — confirm.
enc_embedding_layer = glove_model.get_keras_embedding()
dec_embedding_layer = glove_model.get_keras_embedding()

#### Downloading Glove Embedding

In [None]:
# Download and unpack the GloVe 42B 300-dimensional vectors (about 1.7 GB zipped per the log below).
# NOTE(review): EMBEDDING_SIZE is 200 while these vectors are 300-d — confirm which embedding is used.
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip
!apt install unzip
!unzip "glove.42B.300d.zip"

--2022-09-10 12:25:11--  http://nlp.stanford.edu/data/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.42B.300d.zip [following]
--2022-09-10 12:25:11--  https://nlp.stanford.edu/data/glove.42B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip [following]
--2022-09-10 12:25:11--  https://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1877800501 (1.7G) [application/zip]


In [None]:
# Confirm the unpacked GloVe text file is present in the working directory.
import os
os.listdir()

['.config', 'glove.42B.300d.txt', 'drive', 'glove.42B.300d.zip', 'sample_data']

#### Convert Glove Embedding to Word2Vec Embedding

In [None]:
# Rewrite the GloVe text file in word2vec text format so gensim's KeyedVectors loader can read it.
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="glove.42B.300d.txt", word2vec_output_file="gensim_glove_vectors.txt")

# Load the converted vectors; `glove_model` is what the "GloVe Layer" cell above refers to.
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)

### Normal Embedding Layer

In [None]:
# Trainable embeddings learned from scratch, one for the encoder and one for the decoder.
# mask_zero=True marks token id 0 as padding so downstream layers can skip it.
# Running this cell rebinds the names assigned in the "GloVe Layer" cell.
enc_embedding_layer = Embedding(VOCAB_SIZE, EMBEDDING_SIZE, mask_zero=True)
dec_embedding_layer = Embedding(VOCAB_SIZE, EMBEDDING_SIZE, mask_zero=True)

## LSTM Model

### Encoder

In [None]:
# Encoder: variable-length sequence of token ids -> embedding -> LSTM.
enc_input = Input(shape=(None,))

enc_embedding = enc_embedding_layer(enc_input)
# return_state=True exposes the final hidden and cell states; the sequence output is not used.
enc_outputs, state_h, state_c = LSTM(UNITS, return_state=True,
                                     kernel_regularizer='l2')(enc_embedding)
# These two states seed the decoder LSTM.
enc_states = [state_h, state_c]

###  Decoder

In [None]:
# Decoder: answer token ids -> embedding -> LSTM started from the encoder states.
dec_input = Input(shape=(None,))

dec_embedding = dec_embedding_layer(dec_input)
# return_sequences=True yields one output per time step; the decoder's own states are discarded here.
dec_outputs, _, _  = LSTM(UNITS, return_state=True, 
                          return_sequences=True,
                          kernel_regularizer='l2')(dec_embedding, initial_state=enc_states)
dropout = Dropout(0.5)(dec_outputs) 
# Per-step probability distribution over the vocabulary.
output = Dense(VOCAB_SIZE, activation='softmax')(dropout)

## Training Model

In [None]:
# Training model: [encoder tokens, decoder tokens] -> per-step softmax over the vocabulary.
# NOTE(review): the name 'health_model' is hard-coded while current_model selects the data — confirm.
model = Model([enc_input, dec_input], output, name='health_model')
opt = Adam(learning_rate=0.01)
# Sparse loss: the saved decoder targets are integer token ids, not one-hot vectors.
model.compile(optimizer=opt, loss=SparseCategoricalCrossentropy(), metrics=['accuracy'])
model.summary()

Model: "health_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 200)    2385800     ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, None, 200)    2385800     ['input_4[0][0]']                
                                                                                       

In [None]:
# Rebind `model` to the checkpoint stored at the same path the ModelCheckpoint callback
# in the training cell writes to, replacing the freshly built model above.
path = SAVE_DIR + 'checkpoints/health_model'
model = load_model(path)
model.summary()

Model: "health_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 200)    2385800     ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, None, 200)    2385800     ['input_4[0][0]']                
                                                                                       

In [None]:
# Stream the pre-saved training batches of the selected model from Drive.
data_generator = DataGenerator(current_model + '/enc_input_data', 
                               current_model + '/dec_input_data', 
                               current_model + '/dec_output_data')

# Save the model to the checkpoint path during training.
# NOTE(review): the checkpoint path is hard-coded to health_model regardless of current_model.
save_callback = ModelCheckpoint(SAVE_DIR + 'checkpoints/health_model')

# Validation uses the single-file val_* arrays loaded earlier.
history = model.fit(x=data_generator,
                    epochs=30, 
                    validation_data=([val_enc_input_data, val_dec_input_data],
                                                          val_dec_output_data), 
                    callbacks=[save_callback])

In [None]:
# Displays the History object returned by model.fit.
history

# **Testing Model**

In [None]:
# [loss, accuracy] on the validation split (metric order follows model.compile).
model.evaluate([val_enc_input_data, val_dec_input_data], val_dec_output_data, batch_size=BATCH_SIZE)



[0.6237284541130066, 0.30688631534576416]

In [None]:
# [loss, accuracy] on the held-out test split.
model.evaluate([test_enc_input_data, test_dec_input_data], test_dec_output_data, batch_size=BATCH_SIZE)



[0.5883707404136658, 0.313035249710083]

# **Inference Model**

### Load Vectorization Layer


In [None]:
import pickle
def load_vectorizer(folder_name):
  """Rebuild a TextVectorization layer from <SAVE_DIR>/<folder_name>/vectorizer.pkl.

  The pickle must hold {'config': ..., 'weights': ...} as written by the
  saving code in this notebook.

  Args:
    folder_name: Model folder under SAVE_DIR, with or without a trailing '/'.

  Returns:
    The layer created from the stored config with the stored weights applied.
  """
  path = os.path.join(SAVE_DIR, folder_name, 'vectorizer.pkl')
  # SECURITY: unpickling can execute arbitrary code; only load files this notebook produced.
  with open(path, "rb") as handle:
    from_disk = pickle.load(handle)
  vectorizer = TextVectorization.from_config(from_disk['config'])
  vectorizer.set_weights(from_disk['weights'])
  return vectorizer

### LSTM 

In [None]:
def get_encoder_decoder_model_lstm(model):
  """Split a trained seq2seq `model` into separate encoder and decoder inference models.

  Layers are looked up by position: index 3 is used as the decoder embedding,
  4 as the encoder LSTM, 5 as the decoder LSTM and the last layer as the output
  Dense layer (assumes the layer order of the training model built in this
  notebook — TODO confirm for models loaded from other checkpoints).

  Returns:
    (enc_model, dec_model): enc_model maps encoder tokens to the LSTM's
    [state_h, state_c]; dec_model maps [decoder tokens, state_h, state_c] to
    [token probabilities, new state_h, new state_c].
  """
  enc_input, dec_input = model.input[0], model.input[1]
  dec_embedding_layer = model.get_layer(index=3)
  enc_layer = model.get_layer(index=4)
  dec_layer = model.get_layer(index=5)
  dec_dense_layer = model.get_layer(index=-1)

  # State width is read off the decoder LSTM's sequence output.
  units = dec_layer.output[0].shape[-1]

  # Encoder: keep only the two final states.
  _, enc_state_h, enc_state_c = enc_layer.output
  enc_model = Model(enc_input, [enc_state_h, enc_state_c],
                    name='encoder_inference_model')

  # Decoder: same layers, but the initial state is fed in explicitly.
  state_inputs = [Input(shape=(units,), name='decoder_input_state_h'),
                  Input(shape=(units,), name='decoder_input_state_c')]
  dec_outputs, dec_state_output_h, dec_state_output_c = dec_layer(
      dec_embedding_layer.output, initial_state=state_inputs)
  output = dec_dense_layer(dec_outputs)
  dec_model = Model([dec_input] + state_inputs,
                    [output, dec_state_output_h, dec_state_output_c],
                    name='decoder_inference_model')

  return enc_model, dec_model

## Main Loop

In [None]:
# Markers that clean_answer wraps around every training answer.
START_TOKEN = 'START_'
END_TOKEN = '_END'
# Upper bound on the length of a generated answer, used by the decoding loop in answer().
MAX_ANSWER_LEN = 200

LSTM

In [None]:
def answer(question, enc_model, dec_model, vectorizer):
  """Greedy-decode an answer to `question` and return it as 'answer : <words>'.

  Changes relative to the earlier version:
    * The question is passed through clean_question before vectorizing, the
      same cleaning the training questions received. The vectorizer is built
      with standardize=None, so without it "products?" or capitalised words
      would not match the vocabulary.
    * Decoding is capped at MAX_ANSWER_LEN steps. The old length check counted
      whitespace-separated words, so repeatedly predicting the empty padding
      token never advanced the count and the loop could run forever.
    * The vocabulary list is fetched once instead of on every step.

  Args:
    question: Raw user question.
    enc_model, dec_model: Inference models from get_encoder_decoder_model_lstm.
    vectorizer: Fitted vectorizer of the same model.

  Returns:
    The string 'answer : ' followed by ' <word>' for every generated token
    before END_TOKEN.
  """
  vectorized_question = np.reshape(vectorizer(clean_question(question)).numpy(), (1, -1))
  print(f'question: {question}')
  state_h, state_c = enc_model.predict(vectorized_question)

  vocabulary = vectorizer.get_vocabulary()
  target_seq = np.reshape(vectorizer(START_TOKEN).numpy(), (1, -1))
  decoded_translation = 'answer : '

  for _ in range(MAX_ANSWER_LEN):
    dec_outputs, state_h, state_c = dec_model.predict([target_seq, state_h, state_c])

    # Greedy choice: most probable token at the last time step.
    sampled_word_index = np.argmax(dec_outputs[0, -1, :])
    sampled_word = vocabulary[sampled_word_index]
    if sampled_word == END_TOKEN:
      break
    decoded_translation += f' {sampled_word}'

    # Feed the chosen token back in as the next decoder input.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = sampled_word_index

  return decoded_translation

In [None]:
# answer() takes the inference models and the vectorizer as arguments, so build them first.
# `model` is the health checkpoint loaded above; its vectorizer lives in the HealthModel folder.
enc_model, dec_model = get_encoder_decoder_model_lstm(model)
vectorizer = load_vectorizer('HealthModel/')
print(answer('skin care products?', enc_model, dec_model, vectorizer))

question: skin care products?
answer :  a lot  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a


# **Reranking Model**

## Data Preparation

### Global Variables

In [None]:
# NOTE: this cell rebinds drive_root_path and test_dev_ratio from the first section;
# re-running earlier cells afterwards will pick up these values.
drive_root_path = '/content/drive/My Drive/Colab Notebooks/chatbot project/Chatbot/'
# Fraction of each source held out of training (halved into dev and test by data_split).
test_dev_ratio = 0.1
# Not referenced by the loaders visible in this section.
gener_examples = 20000
# One independent empty frame per split; a chained assignment would make the three
# names aliases of a single DataFrame object.
train_set, dev_set, test_set = (
    pd.DataFrame(columns=['question', 'label']) for _ in range(3))

## Data load and Split

### Split

In [None]:
def data_split(dataset):
  """Split `dataset` and append the parts to the global train_set / dev_set / test_set.

  `test_dev_ratio` of the rows are held out and divided evenly between test
  and dev. Returns nothing. Each call appends, so calling it twice with the
  same data duplicates rows. This definition replaces the earlier
  `data_split` of this notebook, which returned the three frames instead.
  """
  global train_set, dev_set, test_set

  train_part, held_out = train_test_split(dataset,
                                          test_size=test_dev_ratio,
                                          random_state=0)
  test_part, dev_part = train_test_split(held_out,
                                         test_size=0.5,
                                         random_state=0)

  train_set = pd.concat([train_set, train_part])
  dev_set = pd.concat([dev_set, dev_part])
  test_set = pd.concat([test_set, test_part])

### Load

In [None]:
def load_split_amazon_dataset():
  """Label the questions of each Amazon category (0, 1, 2 in file order) and split them.

  Only the 'question' column is kept; the splits are accumulated by
  data_split and each labelled frame is printed.
  """
  genres = ['qa_Clothing_Shoes_and_Jewelry.json.gz',
            'qa_Health_and_Personal_Care.json.gz',
            'qa_Sports_and_Outdoors.json.gz']

  for label, genre in enumerate(genres):
    labelled = getDF(drive_root_path + genre)[['question']].assign(label=label)

    data_split(labelled)
    print(labelled)

In [None]:
def load_split_chitchat_dataset():
  """Read the chit-chat questions, label them 3 and hand them to data_split.

  Prints the number of rows and the labelled frame.
  """
  files = ["English_Professional.tsv", "English_Friendly.tsv", "English_Witty.tsv", "English_Caring.tsv",   "English_Enthusiastic.tsv"]

  seed = pd.DataFrame(columns = ["Question", "Answer", "Source", "Metadata"])
  loaded = [pd.read_csv(drive_root_path + 'chitchat/' + file, sep='\t')
            for file in files]
  combined = pd.concat([seed] + loaded)

  data = combined[["Question"]].copy()
  print('length of dataset = ', len(data))
  data = data.rename(columns = {'Question':'question'}).assign(label=3)
  data_split(data)
  print(data)

### Shuffle

In [None]:
def shuffle_dataset():
  """Reshuffle the global train/dev/test frames with a fixed seed and fresh 0..n-1 index."""
  global train_set, dev_set, test_set

  def reshuffled(frame):
    return frame.sample(frac=1, random_state=1).reset_index(drop=True)

  train_set = reshuffled(train_set)
  dev_set   = reshuffled(dev_set)
  test_set  = reshuffled(test_set)

## Generate Data after processing


In [None]:
# Build the routing dataset: chit-chat questions (label 3) plus the three Amazon
# categories (labels 0-2), then shuffle the accumulated splits once at the end.
# These names now refer to the definitions in this section, which replaced the earlier ones.
load_split_chitchat_dataset()
load_split_amazon_dataset()
shuffle_dataset()

length of dataset =  48965
                   question  label
0          Do you get hurt?      3
1      Do you have fingers?      3
2       Do you ever breathe      3
3         Do you masticate?      3
4         Can you throw up?      3
...                     ...    ...
9788    I'm tired from work      3
9789    I'm totally drained      3
9790  I'm totally exhausted      3
9791                  Zzzzz      3
9792          I'm so sleepy      3

[48965 rows x 2 columns]
                                                question  label
0      You bought level one . Will you buy the next l...      0
1                    Will you buy the next level edition      0
2                                  Did you learn Chinese      0
3      Do you see words written as well as hearing th...      0
4                           does it work with windows 8?      0
...                                                  ...    ...
22063  So the watch is waterproof, but is this leathe...      0
22064  What is 

In [None]:
# Classifier inputs are the raw questions, targets are the category labels.
# These rebind the x_*/y_* names used for the QA model earlier in the notebook.
x_train, x_dev, x_test = train_set['question'], dev_set['question'], test_set['question']
y_train, y_dev, y_test = train_set['label'], dev_set['label'], test_set['label']

## Model


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [None]:
# Load one trained seq2seq checkpoint per domain for chatbot() to pick from.
# NOTE(review): `dir` shadows the Python builtin of the same name.
dir = '/content/drive/My Drive/Colab Notebooks/chatbot project/'
clothing_model = load_model(dir + 'SuperAgentModel2/checkpoints/cmodel_3')
health_model = load_model(dir + 'SuperAgentModel/checkpoints/health_model')
sports_model = load_model(dir + 'SuperAgentModel2/checkpoints/smodel_7')
# The spelling `chichat_model` is the name chatbot() looks up; rename both together if at all.
chichat_model = load_model(dir + 'SuperAgentModel/checkpoints/chitchat_lstm_150_300_0.01') 

In [None]:
def _get_question_router():
  """Return the fitted TF-IDF + Naive Bayes question classifier, fitting it on first use.

  The fitted pipeline is cached on this function object, so later calls reuse
  it. Re-running the defining cell resets the cache, which is how to refit
  after x_train / y_train change.
  """
  fitted = getattr(_get_question_router, 'fitted', None)
  if fitted is None:
    fitted = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', MultinomialNB()),]).fit(x_train, y_train.tolist())
    _get_question_router.fitted = fitted
  return fitted


def chatbot(question):
  """Route `question` to a domain model and print that model's answer.

  The classifier predicts a label (0 Clothing, 1 Health, 2 Sports,
  3 Chitchat); the matching vectorizer and seq2seq model are then used by
  answer(). The classifier is fitted once and reused instead of being
  retrained on every call.

  Raises:
    ValueError: if the classifier returns a label outside 0-3.
  """
  predicted = _get_question_router().predict([question])[0]

  if predicted == 0:
    print("Clothing Model :")
    vectorizer = load_vectorizer("ClothingModel/")
    model = clothing_model
  elif predicted == 1:
    print("Health Model :")
    vectorizer = load_vectorizer("HealthModel/")
    model = health_model
  elif predicted == 2:
    print("Sports Model :")
    vectorizer = load_vectorizer("SportsModel/")
    model = sports_model
  elif predicted == 3:
    print("Chitchat Model :")
    vectorizer = load_vectorizer("ChitChatModel/")
    model = chichat_model
  else:
    # Without this branch an unknown label surfaced later as an unbound-variable error.
    raise ValueError(f'Unexpected category label: {predicted!r}')

  enc_model, dec_model = get_encoder_decoder_model_lstm(model)
  print(answer(question, enc_model, dec_model, vectorizer))

In [None]:
# Small-talk probe; the recorded run below routed it to the Sports model.
chatbot("how are you")

Sports Model :
question: how are you
answer :  i would not say it is a good question


In [None]:
# Clothing-style question; the recorded run below routed it to the Sports model.
chatbot("what is the size of this jacket")

Sports Model :
question: what is the size of this jacket
answer :  i am not sure but i am not sure i would say it is a good question


In [None]:
# Skincare question; the recorded run below routed it to the Health model.
chatbot("what are the best steps for a daily skincare routine")

Health Model :
question: what are the best steps for a daily skincare routine
answer :  the this it START_ this off


In [None]:
# Food question; the recorded run below routed it to the Sports model.
chatbot("is potato good for kids")

Sports Model :
question: is potato good for kids




answer :  yes


In [None]:
# Greeting; the recorded run below routed it to the Chitchat model.
chatbot("nice to meet you")

Chitchat Model :
question: nice to meet you
answer :  nice to meet you too


In [None]:
# Outdoor question; the recorded run below routed it to the Sports model.
chatbot("what is the best place for picnic")

Sports Model :
question: what is the best place for picnic
answer :  i am not sure but i am not sure i would say it is a good question


In [None]:
# Small-talk probe; the recorded run below routed it to the Chitchat model.
chatbot("are you happy now")

Chitchat Model :
question: are you happy now
answer :  i am a bot so kind of like a robot but without all the moving parts


In [None]:
# Small-talk request; the recorded run below routed it to the Chitchat model.
chatbot("tell me a joke")

Chitchat Model :
question: tell me a joke
answer :  i do not really know any jokes
