In [23]:
import json
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import re
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [24]:
with open('intents.json') as file:
    #print(file)
    data = json.load(file)

#data

In [25]:

def tokenize(text):
    print(text)
    word = re.sub(r'[^A-Za-z0-9\s]', '',text)
    #print(word)
    words = word_tokenize(word)
    #print(words)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    words = [lemmatizer.lemmatize(w.lower(),pos="v") for w in words if w not in stop_words]
    #print(words)
    return words
    
    
tokenize("'Hi', 'How are you', 'Is anyone's there?', 'Hello', 'Good day', 'Whats up'")  

'Hi', 'How are you', 'Is anyone's there?', 'Hello', 'Good day', 'Whats up'


['hi', 'how', 'be', 'anyones', 'hello', 'good', 'day', 'whats']

In [26]:
inp = []
label = []

for intent in data['intents']:
    tok = ' '.join(intent['patterns'])
    #print(intent['tag'])
    #print(intent['patterns'])
    #print(tokenize(tok))
    inp.append(tok)
    label.append(intent['tag'])
    #print(intent["responses"])
    #print(" ")

print(inp)
print(label)

['Hi How are you Is anyone there? Hello Good day Whats up', 'cya See you later Goodbye I am Leaving Have a Good day', 'how old how old is tim what is your age how old are you age?', 'what is your name what should I call you whats your name?', 'Id like to buy something whats on the menu what do you reccommend? could i get something to eat', 'when are you guys open what are your hours hours of operation']
['greeting', 'goodbye', 'age', 'name', 'shop', 'hours']


In [17]:
count_vect = CountVectorizer(tokenizer=tokenize)

X = count_vect.fit_transform(inp)

print(X.toarray())
#tfidf = TfidfTransformer()

#model = RandomForestClassifier(random_state=99)

print(count_vect.get_feature_names())
tfidf = TfidfTransformer()
tf = tfidf.fit_transform(X)

print(tf.toarray())

hi how are you is anyone there? hello good day whats up
cya see you later goodbye i am leaving have a good day
how old how old is tim what is your age how old are you age?
what is your name what should i call you whats your name?
id like to buy something whats on the menu what do you reccommend? could i get something to eat
when are you guys open what are your hours hours of operation
[[0 1 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0]
 [2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 1 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 1]
 [0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 2 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0 0 0 0 0 1 1 0 0 0 0 0]]
['age', 'anyone', 'buy', 'call', 'could', 'cya', 'day', 'eat', 'get', 'good', 'goodbye', 'guy', 'hello', 'hi', 'hours', 'id', 'later', 'leave', 'like', 'menu', 'name', 'old', 'open', 'operation', 'reccommend', 'see', 'something', 'tim', 'whats']

In [27]:

model_d = DecisionTreeClassifier(random_state=99)
model = RandomForestClassifier(random_state=99)

pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                    ('tfidf', TfidfTransformer()),
                    ('clf', model)])

parameters = {'clf__estimator__n_estimators': [100],
                'clf__estimator__criterion': ['entropy']
            }

#pipeline.fit(, label)


In [28]:
pipeline.fit(inp,label)

hi how are you is anyone there? hello good day whats up
cya see you later goodbye i am leaving have a good day
how old how old is tim what is your age how old are you age?
what is your name what should i call you whats your name?
id like to buy something whats on the menu what do you reccommend? could i get something to eat
when are you guys open what are your hours hours of operation




Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.

In [45]:
res = pipeline.predict(inp)

hi how are you is anyone there? hello good day whats up
cya see you later goodbye i am leaving have a good day
how old how old is tim what is your age how old are you age?
what is your name what should i call you whats your name?
id like to buy something whats on the menu what do you reccommend? could i get something to eat
when are you guys open what are your hours hours of operation


In [48]:
pred = (res == label).mean()

In [49]:
pred

1.0

In [59]:
pipeline.predict(["id like whats menu what eat"])

id like whats menu what eat


array(['name'], dtype='<U8')

In [60]:
from joblib import dump,load

In [61]:
dump(pipeline, 'chatbot.pkl')

['chatbot.pkl']

In [62]:
model = load('chatbot.pkl')

In [66]:
pipeline.predict(["like what's menu what to"])

like what's menu what to eat


array(['name'], dtype='<U8')