In [1]:
import json
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import re
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [2]:
with open('intents.json') as file:
    #print(file)
    data = json.load(file)

#data

In [3]:

def tokenize(text):
    print(text)
    word = re.sub(r'[^A-Za-z0-9\s]', '',text)
    #print(word)
    words = word_tokenize(word)
    #print(words)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    words = [lemmatizer.lemmatize(w.lower(),pos="v") for w in words if w not in stop_words]
    #print(words)
    return words
    
    
tokenize("'Hi', 'How are you', 'Is anyone's there?', 'Hello', 'Good day', 'Whats up'")  

'Hi', 'How are you', 'Is anyone's there?', 'Hello', 'Good day', 'Whats up'


['hi', 'how', 'be', 'anyones', 'hello', 'good', 'day', 'whats']

In [36]:
inp = []
label = []

columns = ['messages','tag'] 
df = pd.DataFrame(columns=columns)

for intent in data['intents']:
    tok = ' '.join(intent['patterns'])
    #print(intent['tag'])
    print(intent['patterns'])
    
    pat = intent['patterns']
    tg = intent['tag']
    
    df = pd.concat([df,pd.DataFrame({'messages':pat,'tag':[tg]*len(pat)})],ignore_index=True)
    #pd.concat([df,pd.Series(intent['tag'])])
    print(df.head())
    #print(tokenize(tok))
    inp.append(tok)
    label.append(intent['tag'])
    #print(intent["responses"])
    #print(" ")

print(inp)
print(label)

['Hi', 'How are you', 'Is anyone there?', 'Hello', 'Good day', 'Whats up']
           messages       tag
0                Hi  greeting
1       How are you  greeting
2  Is anyone there?  greeting
3             Hello  greeting
4          Good day  greeting
['cya', 'See you later', 'Goodbye', 'I am Leaving', 'Have a Good day']
           messages       tag
0                Hi  greeting
1       How are you  greeting
2  Is anyone there?  greeting
3             Hello  greeting
4          Good day  greeting
['how old', 'how old is tim', 'what is your age', 'how old are you', 'age?']
           messages       tag
0                Hi  greeting
1       How are you  greeting
2  Is anyone there?  greeting
3             Hello  greeting
4          Good day  greeting
['what is your name', 'what should I call you', 'whats your name?']
           messages       tag
0                Hi  greeting
1       How are you  greeting
2  Is anyone there?  greeting
3             Hello  greeting
4          Good day

In [37]:
df

Unnamed: 0,messages,tag
0,Hi,greeting
1,How are you,greeting
2,Is anyone there?,greeting
3,Hello,greeting
4,Good day,greeting
5,Whats up,greeting
6,cya,goodbye
7,See you later,goodbye
8,Goodbye,goodbye
9,I am Leaving,goodbye


In [40]:
inp = df.messages
label = df.tag

In [41]:
count_vect = CountVectorizer(tokenizer=tokenize)

X = count_vect.fit_transform(inp)

print(X.toarray())
#tfidf = TfidfTransformer()

#model = RandomForestClassifier(random_state=99)

print(count_vect.get_feature_names())
tfidf = TfidfTransformer()
tf = tfidf.fit_transform(X)

print(tf.toarray())

hi
how are you
is anyone there?
hello
good day
whats up
cya
see you later
goodbye
i am leaving
have a good day
how old
how old is tim
what is your age
how old are you
age?
what is your name
what should i call you
whats your name?
id like to buy something
whats on the menu
what do you reccommend?
could i get something to eat
when are you guys open
what are your hours
hours of operation
[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [

In [42]:

model_d = DecisionTreeClassifier(random_state=99)
model = RandomForestClassifier(random_state=99,n_jobs=1)

pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                    ('tfidf', TfidfTransformer()),
                    ('clf', model)])

parameters = {'clf__estimator__n_estimators': [100],
                'clf__estimator__criterion': ['entropy']
            }

#pipeline.fit(, label)


In [43]:
pipeline.fit(inp,label)

hi
how are you
is anyone there?
hello
good day
whats up
cya
see you later
goodbye
i am leaving
have a good day
how old
how old is tim
what is your age
how old are you
age?
what is your name
what should i call you
whats your name?
id like to buy something
whats on the menu
what do you reccommend?
could i get something to eat
when are you guys open
what are your hours
hours of operation




Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                ('clf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        

In [44]:
res = pipeline.predict(inp)

hi
how are you
is anyone there?
hello
good day
whats up
cya
see you later
goodbye
i am leaving
have a good day
how old
how old is tim
what is your age
how old are you
age?
what is your name
what should i call you
whats your name?
id like to buy something
whats on the menu
what do you reccommend?
could i get something to eat
when are you guys open
what are your hours
hours of operation


In [45]:
pred = (res == label).mean()

In [46]:
pred

0.8846153846153846

In [47]:
pipeline.predict(["id like whats menu what eat"])

id like whats menu what eat


array(['shop'], dtype=object)

In [48]:
from joblib import dump,load

In [49]:
dump(pipeline, 'chatbot.pkl')

['chatbot.pkl']

In [50]:
model = load('chatbot.pkl')

In [51]:
pipeline.predict(["like what's menu what to"])

like what's menu what to


array(['shop'], dtype=object)

In [54]:
pred = pipeline.predict(["hi hello"])

hi hello


In [55]:
pred[0]

'greeting'

In [18]:
pdf = pd.read_csv("recent_msgs.csv")


In [19]:
pdf


Unnamed: 0.1,Unnamed: 0,query,output
0,0,what is the time now,
1,1,goodbye,
2,0,need water to drink,
3,1,goodbye,
