In [1]:
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenize(sentence):
    """Split ``sentence`` into tokens.

    Delegates to ``nltk.word_tokenize`` and returns its result unchanged.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens

def stem(word):
    """Lower-case ``word`` and return its stem from the module-level ``stemmer``."""
    lowered = word.lower()
    return stemmer.stem(lowered)

def bag_of_words(tokenized_sentence, words):
    """Encode a tokenized sentence as a binary bag-of-words vector.

    Args:
        tokenized_sentence: iterable of raw (un-stemmed) tokens; each one is
            passed through ``stem`` before comparison.
        words: sequence of vocabulary entries. An entry's position in this
            sequence is its position in the returned vector. Entries are
            compared to the stemmed tokens as-is, so they are expected to be
            stemmed already.

    Returns:
        1-D ``np.float32`` array of length ``len(words)`` holding 1.0 where the
        vocabulary entry occurs among the stemmed tokens and 0.0 elsewhere.
    """
    # Stem once and keep the stems in a set so every vocabulary lookup is a
    # constant-time membership test instead of a linear scan of the sentence.
    sentence_stems = {stem(token) for token in tokenized_sentence}
    # start with an all-zero vector, one slot per vocabulary entry
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, vocab_word in enumerate(words):
        if vocab_word in sentence_stems:
            bag[idx] = 1

    return bag

# Model Training

In [5]:
import random
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Load the intent definitions. The encoding is given explicitly because JSON
# text is UTF-8, while open() would otherwise fall back to the platform's
# locale encoding and could mis-decode non-ASCII patterns.
with open('dataset/movies.json', 'r', encoding='utf-8') as f:
    movies = json.load(f)

# Walk every entry under movies['movies'] and gather:
#   all_words - every token from every pattern (duplicates kept for now)
#   tags      - the tag of every entry
#   xy        - one (tokenized pattern, tag) pair per pattern
all_words, tags, xy = [], [], []
for entry in movies['movies']:
    entry_tag = entry['tag']
    tags.append(entry_tag)
    for sentence in entry['patterns']:
        # split the pattern into tokens, then record them in both collections
        tokens = tokenize(sentence)
        all_words += tokens
        xy.append((tokens, entry_tag))

# Build the vocabulary: skip the punctuation tokens listed in ignore_words,
# stem (and lower-case) everything else, then de-duplicate and sort.
ignore_words = ['?', '.', '!']
all_words = sorted({stem(token) for token in all_words if token not in ignore_words})
# the tag list is de-duplicated and sorted the same way
tags = sorted(set(tags))

# summary of what was collected
print(len(xy), "patterns")
print(len(tags), "tags:", tags)
print(len(all_words), "unique stemmed words:", all_words)

# Assemble the training arrays from the (tokens, tag) pairs in xy.
# X_train: one bag-of-words vector per pattern sentence
X_train = np.array([bag_of_words(tokens, all_words) for tokens, _tag in xy])
# y_train: position of each pattern's tag in `tags`
# (PyTorch CrossEntropyLoss needs only class labels, not one-hot)
y_train = np.array([tags.index(tag_name) for _tokens, tag_name in xy])

# Hyper-parameters
learning_rate = 0.001
batch_size = 8
num_epochs = 1500
hidden_size = 12
# input_size is the length of one bag-of-words row; output_size is the number of tags
input_size = X_train.shape[1]
output_size = len(tags)
print(input_size, output_size)


21 patterns
5 tags: ['comedy', 'goodbye', 'greeting', 'recommendation', 'thanks']
39 unique stemmed words: ["'m", "'s", 'a', 'action', 'ani', 'anoth', 'appreci', 'bye', 'can', 'comedi', 'do', 'for', 'good', 'goodby', 'have', 'hello', 'hey', 'hi', 'i', 'in', 'it', 'later', 'like', 'look', 'lot', 'mood', 'movi', 'other', 'recommend', 'see', 'suggest', 'talk', 'thank', 'the', 'there', 'to', 'watch', 'what', 'you']
