# **Chatbot**

## Setting up on Kaggle

In [1]:
# Colab library to upload files to notebook
from google.colab import files

# Install Kaggle library
!pip install -q kaggle

In [2]:
# Upload kaggle API key file via drive

# read kaggle.json from drive
import json
with open('/content/drive/My Drive/kaggle.json') as json_file:
    itemData = json.load(json_file)

# writing it to root directory
with open('/content/kaggle.json', 'w') as json_file:
    json.dump(itemData, json_file)

In [3]:
# Upload kaggle API key file via PC

# uploaded = files.upload()

In [4]:
!ls

drive  kaggle.json  sample_data


In [5]:
!mkdir /root/.kaggle

In [6]:
!cp /content/kaggle.json ~/.kaggle/kaggle.json

## Downloading and Extracting Data

In [7]:
# Downloading data for chatbot
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d kausr25/chatterbotenglish

Downloading chatterbotenglish.zip to /content
  0% 0.00/23.2k [00:00<?, ?B/s]
100% 23.2k/23.2k [00:00<00:00, 8.73MB/s]


In [8]:
# function to unzip file
import zipfile

def unzip_folder(ZIP_PATH, UNZIP_PATH):
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(UNZIP_PATH)

## YAML file to Python Dictionary

In [9]:
# function to data in class seperated dictionaries
import os
import yaml

def load_classwise_data(FOLDER_PATH):
    data_dir = {}
    for f in os.listdir(FOLDER_PATH):
        with open(os.path.join(FOLDER_PATH,f), 'r') as stream:
            try:
                temp = yaml.safe_load(stream)
                data_dir[temp['categories'][0]] = temp['conversations']
            except yaml.YAMLError as exc:
                print(exc)
    
    return data_dir

In [10]:
# unzipping /content/chatterbotenglish.zip
unzip_folder('/content/chatterbotenglish.zip', '/content/chatbot-data/data')

In [11]:
data_dir = load_classwise_data('/content/chatbot-data/data')

In [12]:
print([k for k in data_dir.keys()])

['politics', 'gossip', 'science', 'literature', 'food', 'history', 'movies', 'sports', 'health', 'emotion', 'profile', 'trivia', 'money', 'humor', 'greetings', 'AI', 'computers', 'psychology']


In [13]:
# visualising examples of converstation from each class
import random

def visualise_data(data: dict, tokens=3):
    for key in data.keys():
        print(key.upper() + " :")
        for i in range(tokens):
            print(random.choice(data[key]))
        print('\n')

In [14]:
visualise_data(data_dir, tokens=1)

POLITICS :
['what is impeached', "when a person's honor or reputation has been challenged or discredited."]


GOSSIP :
['gossips', 'I think the NIC on the local firewall is going bad.  It drops a lot of packets and makes it hard to get anything done.  I hear it resets the link a lot.']


SCIENCE :
['what are the laws of thermodynamics', "i'm not a physicist, but i think this has something to do with heat, entropy, and conservation of energy, right?"]


LITERATURE :
['why do you like longfellow', 'He is favorite poet.  Truly one of a kind.']


FOOD :
['i like wine, do you?', 'if i could drink i probably would']


HISTORY :
['do you know about the american civil war', 'I am very interested in the war between the states.']


MOVIES :
['what is spiderman', 'a comic book story made into a movie.']


SPORTS :
['I PLAY SOCCER', 'You have to run very fast to be any good at running']


HEALTH :
['How is your health?', "I'm not feeling well", 'why?', 'I have a fever', 'Did you take medicine?', '

## Normalizing and Tokenizing Sentences

In [15]:
# function to normalize string
'''
Involves making tokens more seperable and distinguishable,
for example:
    "What are y'all gonn' do tonight?I hope it's fun or I better leave now!"
        > 'What are y all gonn do tonight ? I hope it s fun or I better leave now !'
'''
import re

def normalize_string(string):
    s = re.sub(r"([.!?])", r" \1 ", string)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

In [16]:
normalize_string("What are y'all gonn' do tonight?I hope it's fun or I better leave now!")

'What are y all gonn do tonight ? I hope it s fun or I better leave now !'

In [17]:
# function to normalize data directory

def normalize_dir(data):
    count = 0
    invalid_count = 0
    dir = dict()

    for __class__ in data.keys():
        for __list__ in data[__class__]:
            temp_list = list()
            for string in __list__:
                count += 1
                try:
                    # here does the normalization occur
                    temp_list.append(normalize_string(string))
                except:
                    invalid_count += 1

            if dir.get(__class__, 0) == 0:
                dir[__class__] = [temp_list]
            
            else:
                dir[__class__].append(temp_list)
    
    print(f'{invalid_count}/{count} were in invalid formats')
    return dir

In [18]:
# normalize data directory

normalized_data_dir = normalize_dir(data_dir)

2/1332 were in invalid formats


In [19]:
visualise_data(normalized_data_dir, 1)

POLITICS :
['what is communism', 'a sociopolitical movement advocating the common ownership of the means of production and the resolution of class conflict by bringing about a classless society .']


GOSSIP :
['tell me gossip', 'Someone said they re going to stop allowing machines in speedrun competitions . Search me why they d have to drop every tool assisted speedrun out there .']


SCIENCE :
['tell me about venus', 'venus is the second planet from the sun .']


LITERATURE :
['what is mind children', 'It is a book about robots by Hans Moravec .']


FOOD :
['do you eat', 'I use electricity to function if that counts .']


HISTORY :
['explain history', 'history has two broad interpretations depending on whether you accept the role of individuals as important or not .']


MOVIES :
['you sound like yoda', 'my grammatical patterns are sufficient for me to understand you .']


SPORTS :
['ARE YOU A FOOTBALL', 'I am not really into football .']


HEALTH :
['How is your health ?', 'I m not fe

In [20]:
# function to tokenize a normalized string

def tokenize(str):
    return str.split(" ")

In [21]:
# function to tokenize data directory

def tokenize_dir(data):
    dir = dict()

    for __class__ in data.keys():
        for __list__ in data[__class__]:
            temp_list = list()
            for string in __list__:
                
                # here we tokenize
                temp_list.append(tokenize(string))
                # tokenization and normalization can even be combined
                # I just wanted to keep things more elaborate

            if dir.get(__class__, 0) == 0:
                dir[__class__] = [temp_list]
            
            else:
                dir[__class__].append(temp_list)
    
    return dir

In [22]:
# tokenizing the normalized data directory

tokenized_data_dir = tokenize_dir(normalized_data_dir)

In [23]:
# building the conversation dataset from the directory

# function  to build the Q & A dataset

def dataset_generator(dir):
    dataset = []
    for __class__ in dir.keys():

        for __list__ in dir[__class__]:

            for i in range(1, len(__list__)):
                # i = 0 has the question

                # __list__[i] can be a list (in case of dir being tokenized)
                # else __list__[i] can be a string (in case of dir being normalized or raw)

                dataset.append([__list__[0], __list__[i]]) # Q & A

    return dataset

In [24]:
# generating Q & A dataset using normalized data directory

normalized_dataset = dataset_generator(normalized_data_dir)
tokenized_dataset = dataset_generator(tokenized_data_dir)

print(f"Number of Q & A's we have:  {len(tokenized_dataset)}")

Number of Q & A's we have:  764


In [25]:
print("Samples of Q & A's\n")
print(random.choice(normalized_dataset))
print(random.choice(tokenized_dataset))

Samples of Q & A's

['Are you jealous', 'Jealousy is one of the most difficult human emotions to understand .']
[['How', 'angry'], ['Anger', 'is', 'not', 'an', 'emotion', 'I', 'can', 'experience', '.']]


## Using WordEmbedder Script

In [26]:
# Here I add my script to simplify the process of creating embeddings, data augmentation and lot more

!git clone https://github.com/shreyanshchordia/WordEmbedder.git

Cloning into 'WordEmbedder'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects:   2% (1/34)[Kremote: Counting objects:   5% (2/34)[Kremote: Counting objects:   8% (3/34)[Kremote: Counting objects:  11% (4/34)[Kremote: Counting objects:  14% (5/34)[Kremote: Counting objects:  17% (6/34)[Kremote: Counting objects:  20% (7/34)[Kremote: Counting objects:  23% (8/34)[Kremote: Counting objects:  26% (9/34)[Kremote: Counting objects:  29% (10/34)[Kremote: Counting objects:  32% (11/34)[Kremote: Counting objects:  35% (12/34)[Kremote: Counting objects:  38% (13/34)[Kremote: Counting objects:  41% (14/34)[Kremote: Counting objects:  44% (15/34)[Kremote: Counting objects:  47% (16/34)[Kremote: Counting objects:  50% (17/34)[Kremote: Counting objects:  52% (18/34)[Kremote: Counting objects:  55% (19/34)[Kremote: Counting objects:  58% (20/34)[Kremote: Counting objects:  61% (21/34)[Kremote: Counting objects:  64% (22/34)[Kremote: Coun

In [27]:
# downloading dependencies
# -- gluonnlp -- mxnet -- numpy
!pip install -r /content/WordEmbedder/requirements.txt

Collecting gluonnlp
[?25l  Downloading https://files.pythonhosted.org/packages/c6/27/07b57d22496ed6c98b247e578712122402487f5c265ec70a747900f97060/gluonnlp-0.9.1.tar.gz (252kB)
[K     |████████████████████████████████| 256kB 3.4MB/s 
[?25hCollecting mxnet
[?25l  Downloading https://files.pythonhosted.org/packages/81/f5/d79b5b40735086ff1100c680703e0f3efc830fa455e268e9e96f3c857e93/mxnet-1.6.0-py2.py3-none-any.whl (68.7MB)
[K     |████████████████████████████████| 68.7MB 59kB/s 
Collecting graphviz<0.9.0,>=0.8.1
  Downloading https://files.pythonhosted.org/packages/53/39/4ab213673844e0c004bed8a0781a0721a3f6bb23eb8854ee75c236428892/graphviz-0.8.4-py2.py3-none-any.whl
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.9.1-cp36-cp36m-linux_x86_64.whl size=470047 sha256=637da9ac685ff6c91addb863c0015f8940ef7a34e47973e7ef06d34713eb8503
  Stored in directory: /root/.cache/pip/wheels/

In [28]:
# Importing utilities from WordEmbedder.py

from WordEmbedder.WordEmbedder import Embedder

In [29]:
emb = Embedder(dimensions=200)

Embedding file glove.6B.200d.npz is not found. Downloading from Gluon Repository. This may take some time.
Downloading /root/.mxnet/embedding/glove/glove.6B.200d.npz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/embeddings/glove/glove.6B.200d.npz...


In [30]:
# Snippet to explain use of my script

embedding = emb.get_embedder()

print(embedding['hello'])


[ 0.26609    0.21821   -0.10996   -0.48408   -0.11181   -0.09882
 -0.45315    0.44198   -0.034614   0.10541   -0.29537   -0.10881
  0.20916    0.52484   -0.17985   -0.31187   -0.25724    0.65267
  0.217      0.86503    0.47239   -0.078582   0.31035   -0.12155
 -0.12502   -0.40418    0.53803   -0.57842   -0.63668   -0.13502
 -0.040484   0.41378   -0.63201   -0.38847   -0.43767   -0.19706
  0.2878     0.36039   -0.032893  -0.20361   -0.34918    0.95923
 -0.51221   -0.19035    0.1567     0.17704    0.55302    0.27636
 -0.13707    0.91361    0.25948   -0.30107    0.48343   -0.046869
 -0.2796    -0.040385  -0.45773    0.2768    -0.14468    0.036539
  0.36018   -0.54939    0.19359   -0.38263   -0.29661   -0.18938
  0.095681   0.46646    0.3366     0.78351    0.49517   -0.82418
  0.34402   -0.50038   -0.71074   -0.25711   -0.36619    0.61746
 -0.31281   -0.042413   0.37915   -0.62383    0.27208    0.32852
 -0.23045   -0.12469    0.29898   -0.22525   -0.27045   -0.4447
 -0.15889    0.20325   

In [31]:
emb.most_similar_to('king')

['prince', 'queen', 'kingdom', 'monarch', 'ii']

## Data Augmentation

In [32]:
# Data augmentation

# technique 1: Replacing random words from a sentence with <oov> (or <unk>) tag

def unk_substitutor(tokenized_sentence, k=1):
    length = len(tokenized_sentence)
    if length <= k:
        return -1
    substitution_list = [random.randint(0,length - 1) for i in range(k)]
    augmented_sentence = [ '<oov>' if i in substitution_list else tokenized_sentence[i] for i in range(len(tokenized_sentence))]
    return augmented_sentence


# technique 2: Substituting random words with their synonyms

stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
# thanks to https://gist.github.com/sebleier/554280#gistcomment-2596130

def word_substitutor(tokenized_sentence, emb, k=2):
    to_be_ignored = stopwords + ['.', '!', ',', '?', ';', ':', '"', "'", '(', ')']
    aug_candidates = [i for i, word in enumerate(tokenized_sentence) if word.lower() not in to_be_ignored]
    augmented_sentence = tokenized_sentence.copy()

    if len(aug_candidates) <= k:
        return -1
    temp = random.choices(aug_candidates, k=k)
    embedding = emb.get_embedder()
    for i in temp:
        word = tokenized_sentence[i].lower()
        check = list(embedding[word].asnumpy())
        if check == [0] * len(check):
            augmented_sentence[i] = '<oov>'
            continue
        else:
            similar_words = emb.most_similar_to(word)
            substitute = random.choice(similar_words)
            augmented_sentence[i] = substitute
            continue
    
    return augmented_sentence


In [33]:
print(word_substitutor(['I', 'am', 'a', 'good', 'person', 'with', 'a', 'sweet', 'and', 'loving', 'heart'], emb=emb, k=2))

['I', 'am', 'a', 'sure', 'person', 'with', 'a', 'sweet', 'and', 'loving', 'brain']


In [34]:
print(unk_substitutor(['I', 'am', 'a', 'good', 'person', 'with', 'a', 'sweet', 'and', 'loving', 'heart'], k=2))

['<oov>', 'am', 'a', 'good', 'person', '<oov>', 'a', 'sweet', 'and', 'loving', 'heart']


In [35]:
# data augmentation of the tokenized_dataset

# For Q's we use <unk> substitution as well as similar word substitution
# For A's we use similar word substitution only

def data_augmentation(dataset, QuestionAug, AnswerAug, emb, q_k, a_k, min_length):
    '''
    dataset = tokenized list of Q & A

    QuestionAug = 'U' or 'S' that specifies if the augmentation to be performed 
    on Question is <unk> substitution or similar word substitution

    AnswerAug = 'U' or 'S' that specifies if the augmentation to be performed 
    on Answer is <unk> substitution or similar word substitution

    emb = Embedder class object

    q_k = percentage of words to be substituted from the question

    a_k = percentage of words to be substituted from the answer

    min_length = length of the shortest sentence on which augmentation 
    must be performed
    '''
    new_sentences = []
    for pair in dataset:
        
        question, answer = pair[0], pair[1]
        q_output, a_output = None, None

        if len(question) >= min_length:
            
            k = int(len(question) * q_k)

            if QuestionAug == 'U':
                q_output = unk_substitutor(question, k)
                
            elif QuestionAug == 'S': 
                q_output = word_substitutor(question, emb, k)
            
            if q_output == -1:
                q_output = question
        
        else:
            q_output = question


        if len(answer) >= min_length:

            k = int(len(answer) * a_k)

            if AnswerAug == 'U':
                a_output = unk_substitutor(answer, k)

            elif AnswerAug == 'S':    
                a_output = word_substitutor(answer, emb, k)
            
            if a_output == -1:
                a_output = answer
        
        else:
            a_output = answer

        new_sentences.append([q_output, a_output])

    return new_sentences

In [None]:
# a very long operation since calculating similar words is costly when it comes to computation

new_sentences = data_augmentation(random.choices(tokenized_dataset, k=300), 'U', 'S', emb, q_k=0.33, a_k=0.33, min_length=4)
new_sentences += data_augmentation(random.choices(tokenized_dataset, k=300), 'S', 'S', emb, q_k=0.33, a_k=0.33, min_length=4)

In [None]:
for i in range(5):
    print(random.choice(new_sentences))

print(f'\nTotal number of sentences that are generated is {len(new_sentences)}')

[['what', 'is', 'spiderman'], ['a', 'comic', 'memoir', 'story', 'made', 'into', 'a', 'hollywood', '.']]
[['<oov>', 'disease', 'does', 'a', 'carcinogen', 'cause'], ['cancer', '.']]
[['<oov>', 'is', 'your', 'robot', 'body'], ['Eventually', 'i', 'so', 'for', 'a', 'non-corporeal', 'existence', 'someday', '.']]
[['Who', '<oov>', 'your', 'mother'], ['A', 'human', '.']]
[['Do', 'you', '<oov>', 'you', 'could', 'eat', 'food', '<oov>'], ['Hard', 'to', 'telling', 'i', 'have', 'never', 'tried', 'anything', 'but', 'electrical']]

Total number of sentences that are generated is 600


## Saving prepared data

In [37]:
import pickle


In [None]:
# storing the augmented sentences in a file, since cannot afford such long 
# operations every time the notebook is run

with open('/content/drive/My Drive/Colab Notebooks/Chatbot/'
          'AugmentedSentences.pkl', 'wb') as f:
    pickle.dump(new_sentences, f)

In [47]:
# for loading back augmented data

with open('/content/drive/My Drive/Colab Notebooks/Chatbot/'
          'AugmentedSentences.pkl', 'rb') as f:
    new_sentences = pickle.load(f)

In [50]:
# now we have our dataset ready. We can step ahead

dataset = tokenized_dataset + new_sentences

print(f"Total Q & A's in data after augmentation: {len(dataset)}")

Total Q & A's in data after augmentation: 1364


In [62]:
# different data forms

print('# DATA DIRECTORIES:')
print(f'\nThe original data for class politics: {data_dir["politics"]}')
print(f'\nThe normalized data for class politics: {normalized_data_dir["politics"]}')
print(f'\nThe tokenized data for class politics: {tokenized_data_dir["politics"]}')
print('\n# DATASSET OF CONVERSATIONS')
print(f'\nExample of a normalized conversation: {normalized_dataset[34]}')
print(f'\nExample of a tokenized conversation: {tokenized_dataset[45]}')
print(f'\nExample of a augmented conversation: {new_sentences[34]}')
print(f'\nExample of a conversation from the final prepared dataset: {dataset[121]}')

# DATA DIRECTORIES:

The original data for class politics: [['have you read the communist', 'yes, marx had made some interesting observations.'], ['what is a government', 'ideally it is a representative of the people.'], ['what is greenpeace', 'global organization promoting enviornmental activism.'], ['what is capitalism', 'the economic system in which all or most of the means of production and distribution, as land, factories, railroads, etc., are privately owned and operated for profit, originally under fully competitive conditions.'], ['what is socialism', 'communism from people who want to keep their volvos. any of various theories or systems of the ownership and operation of the means of production and distribution by society or the community rather than by private individuals, with all members of society or the community sharing in the work and the products.'], ['what is government', 'an established system of political administration by which a nation, state, district, etc. is go

In [63]:
# Saving the prepared data to a pickle file

data = {
    'orig_dir': data_dir,
    'normalized_dir': normalized_data_dir,
    'tokenized_dir': tokenized_data_dir,
    'normalized_data': normalized_dataset,
    'tokenized_data': tokenized_dataset,
    'aug_data': new_sentences,
    'data': dataset
}

with open('/content/drive/My Drive/Colab Notebooks/Chatbot/'
          'data.pkl', 'wb') as f:
    pickle.dump(new_sentences, f)