In [1]:
import spacy

In [2]:
# Load the small English spaCy pipeline once; the resulting callable is reused by later cells.
NER = spacy.load("en_core_web_sm")

raw_text = 'I would like to book a table for ten people  five at the evening'
sent = NER(raw_text)

# List each named entity recognised in the sentence next to its label.
for entity in sent.ents:
    print(entity.text, entity.label_)

ten CARDINAL
five CARDINAL
evening TIME


In [3]:
# Iterating a parsed document yields individual tokens; show each with its coarse part-of-speech tag.
for token in sent:
    print(token, token.pos_)

I PRON
would AUX
like VERB
to PART
book VERB
a DET
table NOUN
for ADP
ten NUM
people NOUN
  SPACE
five NUM
at ADP
the DET
evening NOUN


In [4]:
# Show the noun chunks spaCy exposes for the parsed sentence, one per line.
for noun_chunk in sent.noun_chunks:
    print(noun_chunk)

I
a table
ten people
the evening


In [5]:
import json
# Read the intent definitions. The encoding is pinned to UTF-8 so that non-ASCII
# patterns/responses decode identically on every platform instead of depending
# on the locale's default encoding.
with open("intent.json", "r", encoding="utf-8") as f:
    intents = json.load(f)

In [6]:
# Fresh, independent accumulators, re-created on every run of the cell:
#   dico   - every lemma collected from tags, patterns and responses (duplicates kept)
#   corpus - one space-joined lemma string per pattern
#   data   - [lemma list, tag] pairs, one per pattern
#   tags   - the tag of each pattern, aligned with `corpus`
dico, corpus, data, tags = [], [], [], []

def tokenizer(text):
    """Lemmatise ``text`` with the notebook-level ``NER`` pipeline.

    Args:
        text: the raw string to process.

    Returns:
        A list with the lemma of every token whose lemma consists only of
        alphabetic characters; all other tokens are dropped.
    """
    doc = NER(text)
    return [tok.lemma_ for tok in doc if tok.lemma_.isalpha()]

# Walk every intent once, collecting vocabulary and one training sample per pattern.
for intent in intents['intents']:
    tag = intent['tag']
    dico.extend(tokenizer(tag))

    # Each pattern becomes one sample: its lemmas (as a list and as a joined string) plus its tag.
    for pattern in intent['patterns']:
        tags.append(tag)
        pattern_tokens = tokenizer(pattern)
        dico.extend(pattern_tokens)
        data.append([pattern_tokens, tag])
        corpus.append(' '.join(pattern_tokens))

    # Responses only contribute lemmas to `dico`; they are not training samples.
    for resp in intent['responses']:
        dico.extend(tokenizer(resp))
        


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

# Map each distinct tag to an integer class id.
# The unique tags are sorted first: iterating a bare set of strings can yield a
# different order on every interpreter start (string hash randomisation), which
# would silently change the tag -> id mapping between kernel restarts and make
# any saved model weights meaningless.
# Assumes tags are mutually comparable (e.g. all strings) - confirm against intent.json.
dico_tags = {tag: i for (i, tag) in enumerate(sorted(set(tags)))}

# Bag-of-words features: one row per pattern, one column per vocabulary term.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(corpus)

# Integer class id of every pattern, aligned row-for-row with X_train.
y_train = [dico_tags[tag] for tag in tags]

In [8]:
from torch.utils.data import Dataset, DataLoader
import numpy as np

class ChatDataset(Dataset):
    """Map-style dataset pairing dense feature rows with integer class labels.

    Args:
        features: 2-D array-like of sample vectors, or any object exposing a
            ``toarray()`` method (such as a sparse matrix). When omitted, the
            notebook-level ``X_train`` is used.
        labels: sequence of integer class ids, one per feature row. When
            omitted, the notebook-level ``y_train`` is used.

    Raises:
        ValueError: if ``features`` and ``labels`` do not have the same length.
    """

    def __init__(self, features=None, labels=None):
        if features is None:
            features = X_train
        if labels is None:
            labels = y_train

        # Densify exactly once and reuse the result for both the data and the length.
        if hasattr(features, "toarray"):
            features = features.toarray()
        self.x_data = np.asarray(features)

        # Force int64: the default integer dtype of NumPy differs between
        # platforms, while class-index targets of CrossEntropyLoss must be long.
        self.y_data = np.asarray(labels, dtype=np.int64)

        if len(self.x_data) != len(self.y_data):
            raise ValueError(
                f"features and labels differ in length: "
                f"{len(self.x_data)} != {len(self.y_data)}"
            )
        self.n_samples = len(self.x_data)

    def __getitem__(self, idx):
        """Return the ``(feature_row, label)`` pair stored at position ``idx``."""
        return self.x_data[idx], self.y_data[idx]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.n_samples
batch_size = 8

dataset = ChatDataset()

# Load in the main process (num_workers=0). Worker processes would have to
# pickle a Dataset class that only exists inside this notebook, which can fail
# or hang on platforms that start workers with "spawn"; for a small in-memory
# dataset the extra processes would not speed anything up anyway.
train_loader = DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0)

In [None]:
import torch
import torch.nn as nn

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

hidden_size = 8
# One output unit per distinct intent tag.
output_size = len(set(tags))
# Width of a feature row. Read it from the matrix shape instead of converting
# the whole matrix to a dense array just to measure its first row.
input_size = X_train.shape[1]
learning_rate = 0.001
n_epochs = 300


class NeuralNet(nn.Module):
    """Feed-forward classifier: three linear layers with ReLU after the first two.

    Args:
        input_size: number of features in each input row.
        hidden_size: width of both hidden layers.
        num_classes: number of output scores produced per row.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw (un-normalised) class scores for ``x``.

        The input is cast to float first, so integer count vectors are accepted.
        No activation is applied after the last layer.
        """
        hidden = self.relu(self.l1(x.float()))
        hidden = self.relu(self.l2(hidden))
        return self.l3(hidden)
    
# Build the network on the selected device, with a cross-entropy objective
# (it consumes the raw scores returned by the network) and an Adam optimiser.
model = NeuralNet(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=output_size,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)



In [None]:
# NOTE(review): the model is already moved to `device` when it is constructed,
# so this repeated move looks redundant (harmless, but could be dropped).
model.to(device)
def binary_accuracy(preds, y):
    """Return the fraction of rows whose highest-scoring column equals the label.

    Despite the name, nothing here is specific to two classes: the predicted
    class is simply the arg-max along axis 1.

    Args:
        preds: 2-D scores, one row per sample and one column per class.
        y: 1-D labels, one per row of ``preds``.

    Returns:
        Number of matching rows divided by the number of rows.
    """
    hits = preds.argmax(axis=1) == y
    return hits.sum() / len(hits)

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and report mean loss and accuracy.

    Each batch is moved to the notebook-level ``device`` before use, and
    accuracy is measured with ``binary_accuracy``.

    Args:
        model: the network to optimise; it is switched to training mode.
        iterator: sized iterable of batches, where ``batch[0]`` holds the
            inputs and ``batch[1]`` the integer class labels.
        optimizer: optimiser stepping ``model``'s parameters.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of batches.

    Raises:
        ZeroDivisionError: if ``iterator`` contains no batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    for batch in iterator:
        inputs = batch[0].to(device)
        # Transfer the labels once, and make sure they are long integers:
        # class-index targets must be int64 whatever dtype the dataset used.
        targets = batch[1].to(device).long()

        optimizer.zero_grad()
        # Call the module itself rather than .forward() so registered hooks run.
        predictions = model(inputs)
        loss = criterion(predictions, targets)
        acc = binary_accuracy(predictions, targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    """Measure mean loss and accuracy over ``iterator`` without updating the model.

    Batches are moved to the notebook-level ``device`` exactly as ``train``
    does, so evaluation also works when the model lives on the GPU. Accuracy is
    measured with ``binary_accuracy``.

    Args:
        model: the network to evaluate; it is switched to evaluation mode.
        iterator: sized iterable of batches, where ``batch[0]`` holds the
            inputs and ``batch[1]`` the integer class labels.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of batches.

    Raises:
        ZeroDivisionError: if ``iterator`` contains no batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            inputs = batch[0].to(device)
            # Long integers are required for class-index targets.
            targets = batch[1].to(device).long()

            predictions = model(inputs)
            loss = criterion(predictions, targets)
            acc = binary_accuracy(predictions, targets)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: earlier timestamp, in seconds.
        end_time: later timestamp, in seconds.

    Returns:
        ``(minutes, seconds)`` as integers; fractional seconds are truncated.
    """
    total_seconds = end_time - start_time
    minutes = int(total_seconds / 60)
    seconds = int(total_seconds - 60 * minutes)
    return minutes, seconds


import time


N_EPOCHS = 5

# Reserved for best-model checkpointing; nothing reads it until validation is wired in.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # Time one full training pass over the loader.
    start_time = time.time()
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # TODO: no validation loader is defined in the visible notebook. Once one
    # exists, call evaluate() here, save the state dict whenever the validation
    # loss improves on best_valid_loss, and print the validation metrics too.
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')

In [None]:
# Make sure the network is in training mode: another cell may have left it in
# evaluation mode (model.eval()) before this cell is run or re-run.
model.train()

for epoch in range(n_epochs):
    for (words, labels) in train_loader:
        words = words.to(device).float()
        # Convert the labels straight to long; going through float first adds
        # nothing and could corrupt very large integer ids.
        labels = labels.to(device).long()

        # The network already returns float scores, so no extra cast is needed.
        outputs = model(words)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # The reported value is the loss of the last batch of the epoch.
    if (epoch+1) % 50 == 0:
        print(f"epoch: {epoch+1}/{n_epochs}, loss: {loss.item():.4f}")
print(f"final loss, loss: {loss.item():.4f}")

In [None]:
model.eval()
test_sentence = 'I would like to book a table for tomorow evening'

# Vectorise the sentence exactly like the training patterns: lemmatise, join, count.
vect_sent = vectorizer.transform([' '.join(tokenizer(test_sentence))]).toarray()

# Inference only, so skip gradient tracking.
with torch.no_grad():
    output = model(torch.tensor(vect_sent).to(device).float())

# Invert the tag -> id mapping explicitly instead of relying on the order of
# the dict keys happening to match the ids. Softmax is omitted because it does
# not change which score is the largest.
index_to_tag = {index: tag for tag, index in dico_tags.items()}
index_to_tag[output.argmax(dim=1).item()]

In [None]:
# Display the distinct intent tags collected while building the training data.
set(tags)

In [None]:
# Building blocks for synthetic booking sentences.
verbs = ['want', 'would like', 'wish']
complements = ['to book a table', 'a table']
times = ['this morning', 'for lunch', 'for this evening', 'for tomorow evening']

# Every verb x time x complement combination, with the complement varying fastest.
bookings = [
    f"I {verb} {complement} {when}"
    for verb in verbs
    for when in times
    for complement in complements
]
bookings

In [None]:
from fuzzywuzzy import process

# Fuzzy-matching experiment: ask fuzzywuzzy for the candidates closest to a misspelt query.
listing = ['Hello', 'Hi', '9', 'like']
process.extract(query='Hlo', choices=listing)

In [None]:
tokens = NER("I don't want today I want not tomorrow")

# Keep the lemma of every non-stop-word, plus any token whose lemma contains "n't".
# NOTE(review): the "n't" test is made on the lemma, which the pipeline may have
# already normalised (e.g. to "not") - confirm that negations really survive.
kept_lemmas = [token.lemma_ for token in tokens if not token.is_stop or "n't" in token.lemma_]
print(' '.join(kept_lemmas))

In [None]:
# Print each token of the parsed sentence on its own line to inspect how it was split.
for token in tokens:
    print(token)