In [1]:
import nltk 

In [2]:
import re
import os
import csv
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.classify import SklearnClassifier
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import pandas as pd

In [3]:
# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity= 'all'

# Silence all warnings for the rest of the session
import warnings
warnings.filterwarnings('ignore')
# NOTE(review): the blanket 'ignore' filter above already covers DeprecationWarning
warnings.filterwarnings(action='ignore',category=DeprecationWarning)

In [4]:
#display all the row and column of the dataframe instead of truncated version
from IPython.display import display
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

## Preprocess

In [5]:
statement = "The Big brown fox jumped over a lazy dog."
statement2 = "This is particularly important is today's world where we are swamped with unstructured natural language date on the variety of social media platform people engage in now-a-days(note - now-a-days in the decade of 2010-2020)"

In [6]:
#convert sentence to lower case 
'This' == 'this'
statement.lower()
statement2.lower()

False

'the big brown fox jumped over a lazy dog.'

"this is particularly important is today's world where we are swamped with unstructured natural language date on the variety of social media platform people engage in now-a-days(note - now-a-days in the decade of 2010-2020)"

In [7]:
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(statement)
tokens
tokens2 = tokenizer.tokenize(statement2)
tokens2
                            

['The', 'Big', 'brown', 'fox', 'jumped', 'over', 'a', 'lazy', 'dog']

['This',
 'is',
 'particularly',
 'important',
 'is',
 'today',
 's',
 'world',
 'where',
 'we',
 'are',
 'swamped',
 'with',
 'unstructured',
 'natural',
 'language',
 'date',
 'on',
 'the',
 'variety',
 'of',
 'social',
 'media',
 'platform',
 'people',
 'engage',
 'in',
 'now',
 'a',
 'days',
 'note',
 'now',
 'a',
 'days',
 'in',
 'the',
 'decade',
 'of',
 '2010',
 '2020']

## Stopword: Filter words to remove non-useful words

In [8]:
# Build the stopword set once instead of calling stopwords.words() for every token.
# The comparison is case-sensitive, so a capitalised token such as 'The' is not
# matched and stays in the result.
english_stopwords = set(stopwords.words('english'))
filtered_word = [w for w in tokens if w not in english_stopwords]
filtered_word

['The', 'Big', 'brown', 'fox', 'jumped', 'lazy', 'dog']

In [9]:
# Build the stopword set once instead of calling stopwords.words() for every token.
# The comparison is case-sensitive, so a capitalised token such as 'This' is not
# matched and stays in the result.
english_stopwords = set(stopwords.words('english'))
filtered_word = [w for w in tokens2 if w not in english_stopwords]
filtered_word

['This',
 'particularly',
 'important',
 'today',
 'world',
 'swamped',
 'unstructured',
 'natural',
 'language',
 'date',
 'variety',
 'social',
 'media',
 'platform',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade',
 '2010',
 '2020']

In [10]:
def preprocess(sentence):
    """Lower-case a sentence, tokenize it and drop English stopwords.

    Args:
        sentence: Text to process (str).

    Returns:
        list[str]: The lower-cased tokens (runs of word characters), in their
        original order, that are not in NLTK's English stopword list.
    """
    sentence = sentence.lower()
    tokens = RegexpTokenizer(r'\w+').tokenize(sentence)
    # Fetch the stopword list once per call and keep it in a set, instead of
    # calling stopwords.words() again for every single token.
    english_stopwords = set(stopwords.words('english'))
    return [w for w in tokens if w not in english_stopwords]

In [11]:
preprocessed_sentence = preprocess(statement)
preprocessed_sentence

['big', 'brown', 'fox', 'jumped', 'lazy', 'dog']

In [12]:
preprocess(statement2)

['particularly',
 'important',
 'today',
 'world',
 'swamped',
 'unstructured',
 'natural',
 'language',
 'date',
 'variety',
 'social',
 'media',
 'platform',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade',
 '2010',
 '2020']

## Part-of-speech tagging

In [13]:
tags = nltk.pos_tag(preprocessed_sentence)
tags

[('big', 'JJ'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('jumped', 'VBD'),
 ('lazy', 'JJ'),
 ('dog', 'NN')]

In [14]:
tags = nltk.pos_tag(preprocess(statement2))
tags

[('particularly', 'RB'),
 ('important', 'JJ'),
 ('today', 'NN'),
 ('world', 'NN'),
 ('swamped', 'VBD'),
 ('unstructured', 'JJ'),
 ('natural', 'JJ'),
 ('language', 'NN'),
 ('date', 'NN'),
 ('variety', 'NN'),
 ('social', 'JJ'),
 ('media', 'NNS'),
 ('platform', 'JJ'),
 ('people', 'NNS'),
 ('engage', 'VBP'),
 ('days', 'NNS'),
 ('note', 'VBP'),
 ('days', 'NNS'),
 ('decade', 'NN'),
 ('2010', 'CD'),
 ('2020', 'CD')]

In [15]:
def extract_tagged(sentences):
    """Keep only the words whose POS tag is one of the selected tags.

    Args:
        sentences: Iterable of ``(word, tag)`` pairs.

    Returns:
        list: The words, in input order, whose tag is one of
        NN, NNS, JJ, RB, PRP, VBN, VBP, VBZ or VBG.
    """
    kept_tags = ('NN', 'VBN', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ', 'NNS')
    return [token for token, pos in sentences if pos in kept_tags]

In [16]:
extract_tagged(tags)

['particularly',
 'important',
 'today',
 'world',
 'unstructured',
 'natural',
 'language',
 'date',
 'variety',
 'social',
 'media',
 'platform',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade']

In [17]:
# WordNet lemmatizer demo. Called without a part-of-speech argument, 'willing' and
# 'stemmed' come back unchanged in the recorded output.
lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize('cacti'))
print(lmtzr.lemmatize('willing'))
print(lmtzr.lemmatize('feet'))
print(lmtzr.lemmatize('stemmed'))

# TODO: also try this on the original text


cactus
willing
foot
stemmed


In [18]:
# Fetch the WordNet corpus used by WordNetLemmatizer.
# NOTE(review): the lemmatizer is already called in the previous cell, so this download
# belongs before its first use; in the recorded run it failed on a network error
# and returned False.
nltk.download("wordnet")

[nltk_data] Error loading wordnet: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

## Stem Word

In [19]:
word_for_streaming = ['sten','streaming','stemmed','stemmer','stems','feet','willing']

In [20]:
stemmer = SnowballStemmer('english')
[stemmer.stem(x) for x in word_for_streaming]

['sten', 'stream', 'stem', 'stemmer', 'stem', 'feet', 'will']

## Putting it all together

In [21]:
def extract_feature(text):
    """Turn raw text into a list of normalised feature words.

    Pipeline: ``preprocess`` -> POS tagging -> ``extract_tagged`` -> stemming
    with the module-level ``stemmer`` -> lemmatizing with the module-level
    ``lmtzr``.

    Args:
        text: Raw input text.

    Returns:
        list: One normalised word per token that survived the tag filter.
    """
    tagged = nltk.pos_tag(preprocess(text))
    features = []
    for candidate in extract_tagged(tagged):
        # The lemmatizer is applied to the stem, not to the original word.
        features.append(lmtzr.lemmatize(stemmer.stem(candidate)))
    return features

In [22]:
words = extract_feature(statement2)
print(words)

['particular', 'import', 'today', 'world', 'unstructur', 'natur', 'languag', 'date', 'varieti', 'social', 'medium', 'platform', 'peopl', 'engag', 'day', 'note', 'day', 'decad']


In [23]:
extract_feature("He hurt his right foot while he was wearing white shoes on his feet")

['hurt', 'right', 'foot', 'wear', 'white', 'shoe', 'foot']

## Implement bag of words

In [24]:
def word_feats(words):
    """Build a bag-of-words feature dict that maps each word to ``True``.

    Duplicate words collapse into a single key; keys keep first-seen order.
    """
    features = {}
    for token in words:
        features[token] = True
    return features

In [25]:
word_feats(words)

{'particular': True,
 'import': True,
 'today': True,
 'world': True,
 'unstructur': True,
 'natur': True,
 'languag': True,
 'date': True,
 'varieti': True,
 'social': True,
 'medium': True,
 'platform': True,
 'peopl': True,
 'engag': True,
 'day': True,
 'note': True,
 'decad': True}

## Parsing the whole document

In [26]:
def extract_feature_from_doc(data):
    """Extract classifier features, a flat corpus and the answers from a dataset.

    Args:
        data: Iterable of ``(text, category, answer)`` triples.

    Returns:
        tuple: ``(result, corpus, answers)`` where
            * ``result`` is a list of ``[feature_dict, category]`` pairs, one per row,
            * ``corpus`` is a flat list of every extracted feature word in row order
              (duplicates kept),
            * ``answers`` maps each category to an answer string.
    """
    result = []
    corpus = []
    # responses of the chatbot, keyed by category
    answers = {}
    for (text, category, answer) in data:
        features = extract_feature(text)
        # Grow the flat corpus directly. Flattening afterwards with
        # sum(list_of_lists, []) re-copies the accumulated list for every row.
        corpus.extend(features)
        result.append([word_feats(features), category])
        # Only one answer is kept per category: a later row overwrites an earlier one.
        answers[category] = answer
    return (result, corpus, answers)

In [27]:
extract_feature_from_doc([['this is the input text from the user','category','answer to give']])

([[{'input': True, 'user': True}, 'category']],
 ['input', 'user'],
 {'category': 'answer to give'})

In [28]:
def get_content(filename):
    """Read a pipe-delimited text file and return its well-formed rows.

    Args:
        filename: Path of a UTF-8 text file whose fields are separated by ``|``.

    Returns:
        list[list[str]]: Every row that has exactly three fields, in file order.
        Rows with any other number of fields are dropped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires for file objects passed to csv.reader.
    with open(filename, 'r', encoding='utf-8', newline='') as content_file:
        lines = csv.reader(content_file, delimiter='|')
        return [row for row in lines if len(row) == 3]


In [29]:
# Location of the pipe-delimited training file. Set the LEAVE_DATA_PATH environment
# variable to point at your own copy; the original local path remains the fallback.
filename = os.environ.get("LEAVE_DATA_PATH", "/Users/Anonymous/OneDrive/Untitled Folder/leave.txt")

In [30]:
data = get_content(filename)

In [31]:
data

[['Hello there!', 'Greeting', 'Hello! How can I assist you today?'],
 ['Hey, chatbot!',
  'Greeting',
  'Hello there! How can I assist you with your leave request?'],
 ['I need to request personal leave.',
  'Request',
  'Certainly! Please provide the start date and the reason for your personal leave request.'],
 ["I'd like to take a day off for my child's school event.",
  'Request',
  "That's important! Please specify the date and the details of your child's school event."],
 ['How many vacation days do I have left?',
  'Query',
  'I can check your remaining vacation days. Please provide your employee ID or username.'],
 ["What's the process for requesting bereavement leave?",
  'Query',
  "Requesting bereavement leave is sensitive. Please provide the details, and I'll explain the process."],
 ["I'd like to change my phone number for contact purposes.",
  'Action',
  "Of course! Let's update your phone number. Please provide the new phone number."],
 ['Can you assist me in rescheduli

In [32]:
feature_data,corpus,answer = extract_feature_from_doc(data)

In [33]:
print(feature_data[8])

[{'great': True, 'job': True}, 'Closing']


In [34]:
feature_data

[[{'hello': True}, 'Greeting'],
 [{'hey': True, 'chatbot': True}, 'Greeting'],
 [{'need': True, 'person': True, 'leav': True}, 'Request'],
 [{'day': True, 'child': True, 'school': True, 'event': True}, 'Request'],
 [{'mani': True, 'vacat': True, 'day': True}, 'Query'],
 [{'process': True, 'request': True, 'bereav': True, 'leav': True}, 'Query'],
 [{'chang': True,
   'phone': True,
   'number': True,
   'contact': True,
   'purpos': True},
  'Action'],
 [{'assist': True, 'reschedul': True, 'upcom': True, 'leav': True}, 'Action'],
 [{'great': True, 'job': True}, 'Closing'],
 [{'care': True}, 'Closing'],
 [{'hello': True, 'bot': True}, 'Greeting'],
 [{'need': True, 'request': True, 'leav': True, 'famili': True, 'emerg': True},
  'Request'],
 [{'appli': True, 'educ': True, 'leav': True}, 'Request'],
 [{'inform': True, 'compani': True, 'holiday': True, 'calendar': True},
  'Query'],
 [{'possibl': True, 'half': True, 'day': True}, 'Query'],
 [{'chang': True, 'direct': True, 'deposit': True, 

In [35]:
corpus

['hello',
 'hey',
 'chatbot',
 'need',
 'person',
 'leav',
 'day',
 'child',
 'school',
 'event',
 'mani',
 'vacat',
 'day',
 'process',
 'request',
 'bereav',
 'leav',
 'chang',
 'phone',
 'number',
 'contact',
 'purpos',
 'assist',
 'reschedul',
 'upcom',
 'leav',
 'great',
 'job',
 'care',
 'hello',
 'bot',
 'need',
 'request',
 'leav',
 'famili',
 'emerg',
 'appli',
 'educ',
 'leav',
 'inform',
 'compani',
 'holiday',
 'calendar',
 'possibl',
 'half',
 'day',
 'chang',
 'direct',
 'deposit',
 'detail',
 'cancel',
 'thank',
 'assist',
 'today',
 'goodby',
 'great',
 'day',
 'greet',
 'chatbot',
 'interest',
 'take',
 'long',
 'vacat',
 'guid',
 'need',
 'juri',
 'duti',
 'leav',
 'want',
 'take',
 'day',
 'chariti',
 'event',
 'polici',
 'sick',
 'leav',
 'document',
 'chang',
 'emerg',
 'contact',
 'person',
 'assist',
 'extend',
 'vacat',
 'leav',
 'thank',
 'help',
 'farewel',
 'care',
 'hi',
 'need',
 'request',
 'leav',
 'child',
 'graduat',
 'want',
 'take',
 'day',
 'person',

In [36]:
answer

{'Greeting': 'Hi! How can I assist you with your leave request?',
 'Request': 'Of course! Please specify the date and the nature of the family event.',
 'Query': 'We have a total of [number] paid holidays this year. Would you like to know the specific dates?',
 'Action': 'Certainly! Please provide the leave request ID or the details of the request you want to cancel.',
 'Closing': "Goodbye! Take care, and remember, I'm here whenever you need assistance. Have a wonderful day!",
 'category': 'input" as shown. Here are the responses you provided in that format:',
 'Contact Number': 'You can reach our support team at [contact number].',
 'General Information': 'For general information, you can visit our website at [website URL].'}

## Train a model using these features

In [37]:
# Fraction of the samples used for training; the rest becomes the test set
split_ratio = 0.8

In [38]:
def split_dataset(data, split_ratio, seed=None):
    """Shuffle a dataset and split it into a training and a test part.

    Args:
        data: Sequence of samples. It is copied before shuffling, so the
            caller's object keeps its original order and re-running the
            cell does not reshuffle already shuffled data.
        split_ratio: Fraction of the samples that goes into the training part.
        seed: Optional seed for a reproducible shuffle. With ``None`` the
            module-level ``random`` generator is used.

    Returns:
        tuple[list, list]: ``(training, test)``. The training part holds the
        first ``int(len(data) * split_ratio)`` shuffled samples, the test part
        the remainder.
    """
    shuffled = list(data)
    if seed is None:
        random.shuffle(shuffled)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(shuffled)
    train_split = int(len(shuffled) * split_ratio)
    return shuffled[:train_split], shuffled[train_split:]

In [39]:
training_data,test_data = split_dataset(feature_data,split_ratio)

In [40]:
training_data

[[{'chang': True, 'email': True, 'address': True, 'notif': True}, 'Action'],
 [{'chang': True,
   'phone': True,
   'number': True,
   'contact': True,
   'purpos': True},
  'Action'],
 [{'want': True, 'sabbat': True, 'leav': True}, 'Request'],
 [{'farewel': True}, 'Closing'],
 [{'help': True, 'cancel': True, 'leav': True, 'request': True}, 'Action'],
 [{'need': True, 'juri': True, 'duti': True, 'leav': True}, 'Request'],
 [{'cancel': True}, 'Action'],
 [{'hi': True, 'chatbot': True}, 'Greeting'],
 [{'help': True, 'cancel': True, 'leav': True, 'request': True}, 'Action'],
 [{'hello': True}, 'Greeting'],
 [{'general': True, 'inform': True, 'compani': True}, 'General Information'],
 [{'greet': True, 'chatbot': True}, 'Greeting'],
 [{'need': True, 'request': True, 'compassion': True, 'leav': True},
  'Request'],
 [{'submit': True, 'leav': True, 'request': True, 'portal': True}, 'Query'],
 [{'hello': True}, 'Greeting'],
 [{'explain': True,
   'process': True,
   'request': True,
   'remot'

In [41]:
# Persist both splits as .npy files in the working directory.
# NOTE(review): the splits are lists of [feature dict, label] pairs, so NumPy stores
# them as pickled object arrays - presumably why the load cell passes allow_pickle=True.
np.save('training_data',training_data)
np.save('test_data',test_data)

## Classification using Decision tree

In [42]:
# Reload the saved splits; they come back as NumPy object arrays rather than lists.
# NOTE: allow_pickle=True unpickles the file contents, which can execute arbitrary
# code - only load .npy files you created yourself.
training_data = np.load('training_data.npy', allow_pickle=True)
test_data = np.load('test_data.npy', allow_pickle=True)


In [43]:
test_data

array([[{'chang': True, 'emerg': True, 'contact': True, 'person': True},
        'Action'],
       [{'want': True, 'take': True, 'day': True, 'famili': True, 'event': True},
        'Request'],
       [{'farewel': True}, 'Closing'],
       [{'day': True, 'child': True, 'school': True, 'event': True},
        'Request'],
       [{'see': True, 'respons': True, 'format': True}, 'category'],
       [{'interest': True, 'take': True, 'long': True, 'vacat': True, 'guid': True},
        'Query'],
       [{'goodby': True, 'wish': True}, 'Closing'],
       [{'updat': True, 'emerg': True, 'evacu': True, 'locat': True},
        'Action'],
       [{'help': True, 'absenc': True, 'extens': True}, 'Action'],
       [{'greet': True}, 'Greeting'],
       [{'assist': True, 'reschedul': True, 'upcom': True, 'leav': True},
        'Action'],
       [{'goodby': True, 'great': True, 'day': True}, 'Closing'],
       [{'contact': True, 'support': True, 'team': True},
        'Contact Number'],
       [{'polici

In [44]:
def train_using_decision_tree(training_data, test_data, entropy_cutoff=0.6, support_cutoff=6):
    """Train an NLTK decision-tree classifier and report its accuracy.

    Args:
        training_data: Labeled feature sets (``[feature_dict, label]`` pairs)
            used to fit the tree.
        test_data: Labeled feature sets used only for evaluation.
        entropy_cutoff: Passed through to ``DecisionTreeClassifier.train``.
            Defaults to the value this notebook has always used.
        support_cutoff: Passed through to ``DecisionTreeClassifier.train``.
            Defaults to the value this notebook has always used.

    Returns:
        tuple: ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)``. Both accuracies are also printed.
    """
    classifier = nltk.classify.DecisionTreeClassifier.train(
        training_data,
        entropy_cutoff=entropy_cutoff,
        support_cutoff=support_cutoff,
    )
    classifier_name = type(classifier).__name__
    training_set_accuracy = nltk.classify.accuracy(classifier, training_data)
    print('Training set accuracy:',training_set_accuracy)
    test_set_accuracy = nltk.classify.accuracy(classifier, test_data)
    print('Test set accuracy:',test_set_accuracy)
    return classifier, classifier_name, test_set_accuracy, training_set_accuracy
    

In [45]:
dtclassifier,classifier_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data,test_data)

Training set accuracy: 0.9696969696969697
Test set accuracy: 0.29411764705882354


## Classification using Naive Bayes

In [46]:
def train_using_naive_bayes(training_data,test_data):
    """Train an NLTK naive Bayes classifier and report its accuracy.

    Args:
        training_data: Labeled feature sets used to fit the model.
        test_data: Labeled feature sets used only for evaluation.

    Returns:
        tuple: ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)``. Both accuracies are also printed.
    """
    model = nltk.NaiveBayesClassifier.train(training_data)

    # Score on the data the model was fitted on, then on the held-out part.
    train_acc = nltk.classify.accuracy(model, training_data)
    print('Training set accuracy:',train_acc)
    test_acc = nltk.classify.accuracy(model, test_data)
    print('Test set accuracy:',test_acc)

    return model, type(model).__name__, test_acc, train_acc
    

In [47]:
classifier,classifier_name,test_set_accuracy,training_set_accuracy = train_using_naive_bayes(training_data,test_data)
print(training_set_accuracy)
print(test_set_accuracy)
print(len(classifier.most_informative_features()))
classifier.show_most_informative_features()

Training set accuracy: 0.8939393939393939
Test set accuracy: 0.5294117647058824
0.8939393939393939
0.5294117647058824
100
Most Informative Features
                 compani = True           Genera : Query  =      7.5 : 1.0
                  inform = True           Genera : Query  =      7.5 : 1.0
                    leav = None           Closin : Reques =      6.2 : 1.0
                    need = True           Reques : Action =      5.5 : 1.0
                  assist = True           Closin : Action =      2.5 : 1.0
                    need = None            Query : Reques =      2.4 : 1.0
                     day = True            Query : Reques =      2.3 : 1.0
                    leav = True           Reques : Query  =      2.3 : 1.0
                   vacat = True            Query : Action =      2.2 : 1.0
                 request = True           Reques : Action =      1.6 : 1.0


In [48]:
extract_feature("hello")

['hello']

In [49]:
word_feats(extract_feature('hello'))

{'hello': True}

In [50]:
Input_sentence = "Hi Good evening!"
classifier.classify(word_feats(extract_feature(Input_sentence)))

'Greeting'

In [51]:
def replay(input_statement):
    """Return the stored answer for the category predicted for a user message.

    Uses the module-level ``dtclassifier`` to predict the category and the
    module-level ``answer`` mapping to look up the response.

    Args:
        input_statement: Raw user text.

    Returns:
        The answer stored for the predicted category.
    """
    # NOTE(review): this uses the decision-tree model; in the recorded run the
    # naive Bayes model (`classifier`) scored higher on the test set - confirm
    # which one is intended here.
    features = word_feats(extract_feature(input_statement))
    predicted = dtclassifier.classify(features)
    return answer[predicted]

In [60]:
replay('Hello')

'Hi! How can I assist you with your leave request?'

In [53]:
replay("Hey, chatbot!")

'Hi! How can I assist you with your leave request?'

In [54]:
replay("Is it possible to take half-day leave?")

'We have a total of [number] paid holidays this year. Would you like to know the specific dates?'

In [55]:
replay("Thank you for your assistance today!")

"Goodbye! Take care, and remember, I'm here whenever you need assistance. Have a wonderful day!"