In [0]:
!pip install autocorrect

In [0]:
import nltk
nltk.download('popular')

In [0]:
import random
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from autocorrect import Speller
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()



In [0]:
from google.colab import files
import io


uploaded = files.upload()

Saving music_intent_entities.json to music_intent_entities.json
Saving restaurant_intent_entities.json to restaurant_intent_entities.json
Saving weather_intent_entities.json to weather_intent_entities.json


In [0]:

music_file = pd.read_json(io.BytesIO(uploaded['music_intent_entities.json']))
restaurant_file = pd.read_json(io.BytesIO(uploaded['restaurant_intent_entities.json']))
weather_file = pd.read_json(io.BytesIO(uploaded['weather_intent_entities.json']))

In [0]:
data = {};
data['music'] = music_file['text'].to_numpy()
data['restaurant'] = restaurant_file['text'].to_numpy()
data['weather'] = weather_file['text'].to_numpy()


Getting the words from the data

In [0]:
all_words = []


document = [(text, category) for category in data.keys() for text in data[category]]
random.shuffle(document)

array_words = [nltk.word_tokenize(w) for (w, cat) in document];
flat_list = [word for sent in array_words for word in sent]

Removes the **stop words** like ( ‘off’, ‘is’, ‘s’, ‘am’, ‘or’) and  ***non alphabetical*** characters

In [0]:
stopWords = set(stopwords.words('english'))

def remove_stop_words(words):
  wordsFiltered = []

  for w in words:
    if w not in stopWords:
      if w.isalpha():
        wordsFiltered.append(w)

  return wordsFiltered

flat_list=remove_stop_words(flat_list)

**Lemmatization** i.e., tranforms tarnsforms different forms of words to a single one

In [0]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def lemmatization(words):
  return [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words]


filtered_list=lemmatization(flat_list)

Getting the ***frequency*** of each word and extracting top 2000

In [0]:

frequencyDistribution = nltk.FreqDist(w.lower() for w in filtered_list)
word_features = list(frequencyDistribution)[:2000]


**FEATURE** **EXTRACTION**

In [0]:

def feature_Extraction(doc):
    document_words = [word.lower() for word in nltk.word_tokenize(doc)]
    
    document_words=remove_stop_words(document_words)
    document_words=lemmatization(document_words)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

Training the model

In [0]:
test_set=nltk.classify.apply_features(feature_Extraction, document[:500])
train_set=nltk.classify.apply_features(feature_Extraction, document[500:])
classifier = nltk.NaiveBayesClassifier.train(train_set)

Testing the model *accuracy*

In [0]:
print(nltk.classify.accuracy(classifier, test_set))

0.998


In [0]:
classifier.show_most_informative_features(20)

Most Informative Features
          contains(book) = True           restau : music  =    320.1 : 1.0
         contains(table) = True           restau : weathe =    194.1 : 1.0
    contains(restaurant) = True           restau : weathe =    149.8 : 1.0
        contains(people) = True           restau : music  =     78.7 : 1.0
           contains(eat) = True           restau : weathe =     71.4 : 1.0
          contains(food) = True           restau : weathe =     65.4 : 1.0
   contains(reservation) = True           restau : weathe =     58.8 : 1.0
           contains(rat) = True           restau : music  =     50.5 : 1.0
      contains(national) = True           weathe : music  =     49.6 : 1.0
          contains(find) = True           restau : weathe =     46.6 : 1.0
          contains(park) = True           weathe : music  =     44.0 : 1.0
          contains(make) = True           restau : weathe =     43.9 : 1.0
        contains(minute) = True           weathe : music  =     43.0 : 1.0

**OUTPUTS**

In [0]:
print(classifier.classify(feature_Extraction("Is it sunnier today?")))
print(classifier.classify(feature_Extraction("book a table")))
print(classifier.classify(feature_Extraction(" I want to listen to popular telugu song ")))


weather
restaurant
music
