# Classify Intent

In [105]:
import spacy
import unicodedata
import pandas as pd
from spacy import displacy
from string import punctuation
from nltk.stem import RSLPStemmer
from nltk.corpus import stopwords
from spacy.pipeline import EntityRuler
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [106]:
# Labelled training utterances (Portuguese), grouped by the intent they
# express. Rows are emitted intent by intent, in the order written below.
df = pd.DataFrame(
    [
        (utterance, intent)
        for intent, utterances in {
            'sales': [
                'vendas do último mês',
                'quanto vendi esse mês',
                'quanto vendi no mês de agosto',
                'quais são os 10 produtos que mais vendi esse ano',
                'quais são os 10 produtos que mais vendi esse mês',
                'quais são os 10 produtos que mais vendi hoje',
                'qual produto eu mais vendi hoje',
            ],
            'products': [
                'quantos produtos eu tenho na minha base',
                'quantos produtos eu tenho na minha base de dados',
                'quantos produtos eu tenho na minha loja',
                'quantos refrigerantes eu vendo na minha loja',
                'quanto é o preço da coca-cola',
                'qual é o preço da coca-cola',
                'quantos refrigerantes tenho no estoque',
            ],
        }.items()
        for utterance in utterances
    ],
    columns=['message', 'label'],
)

In [107]:
class ChatBot:
    '''
     Business Chat Bot by Mee 🤖🔮💜
     Author: Guilherme Kodama 06/2020

     Intent classification (TF-IDF + Multinomial Naive Bayes) combined with
     a spaCy pipeline extended by rule-based DATE entity patterns.
    '''

    def __init__(self, df, column_message='message', column_label='label'):
        '''
        Build the classifier, the vectorizer and the spaCy pipeline.

        df: DataFrame with the labelled training examples (a copy is kept,
            the caller's frame is never modified).
        column_message: name of the column holding the raw text.
        column_label: name of the column holding the intent label.
        '''
        self.df = df.copy()
        self.column_message = column_message
        self.column_label = column_label
        self.clf = MultinomialNB()
        self.nlp = spacy.load('pt')
        self.stemmer = RSLPStemmer()
        self.vectorizer = TfidfVectorizer()
        self.ruler = EntityRuler(self.nlp)

        # custom patterns for entity recognition; each regex is matched
        # against one token, in sequence
        self.patterns = [
            self._date_pattern("[uú]ltimo", "m[êe]s"),
            self._date_pattern("[uú]ltimo", r"\d", "m[êe]s"),
            self._date_pattern(
                "([Jj]aneiro|[Ff]evereiro|[Mm]ar[çc]o|[Aa]bril|[Mm]aio|[Jj]unho|[Jj]ulho|[Aa]gosto|[Ss]etembro|[Oo]utubro|[Nn]ovembro|[Dd]ezembro)"
            ),
            self._date_pattern("(hoje|ontem)"),
        ]

        self.ruler.add_patterns(self.patterns)
        # NOTE: passing the component object is the spaCy v2 calling
        # convention, matching the 'pt' shortcut used in spacy.load above.
        self.nlp.add_pipe(self.ruler)

    @staticmethod
    def _date_pattern(*token_regexes):
        '''Return an EntityRuler DATE pattern with one TEXT regex per token.'''
        return {
            'label': 'DATE',
            'pattern': [{'TEXT': {'REGEX': regex}} for regex in token_regexes],
            'id': 'date',
        }

    def understand(self, message):
        '''Train the classifier, then print the intent and entities of message.'''
        print('--- INTENT ---')
        print('')
        self.train()
        prediction = self.predict(message)
        print('PREDICTION: ', prediction)
        print('')
        print('--- ENTITIES ---')
        print('')
        self.extract_entities(message, '')

    # ---------------------------------------------------------------
    # PREPROCESSING TEXT
    # ---------------------------------------------------------------

    def normalize(self, message):
        '''Strip accents: decompose (NFKD) and drop every non-ASCII character.'''
        normalized = unicodedata.normalize('NFKD', message).encode('ASCII', 'ignore').decode('utf-8')
        return normalized

    def stem(self, message):
        '''Return message with every token replaced by its RSLP stem.'''
        words = word_tokenize(message)
        words = [self.stemmer.stem(word) for word in words]
        return ' '.join(words)

    def remove_stopwords(self, message):
        '''Return message without Portuguese stopwords and punctuation tokens.'''
        blacklist = set(stopwords.words('portuguese') + list(punctuation))
        clean_words = [word for word in word_tokenize(message) if word not in blacklist]
        return ' '.join(clean_words)

    def preprocess_message(self, message):
        '''Normalize a single message and remove its stopwords.'''
        message = self.normalize(message)
        message = self.remove_stopwords(message)
        # Stemming (self.stem) is intentionally not applied here.
        return message

    def preprocess(self, df, column_in='message', column_out='message_clean'):
        '''Write the preprocessed version of df[column_in] into df[column_out] (in place).'''
        df[column_out] = df[column_in].apply(self.preprocess_message)

    # ---------------------------------------------------------------
    # VECTORIZE
    # ---------------------------------------------------------------

    def vectorize(self, df, column='message'):
        '''Fit the vectorizer on df[column] and return the resulting matrix.'''
        X = self.vectorizer.fit_transform(df[column])
        return X

    def vectorize_message(self, message):
        '''Transform one message with the already fitted vectorizer.'''
        return self.vectorizer.transform([message])

    def _feature_names(self):
        '''Return the fitted vocabulary as a list, across scikit-learn versions.'''
        # get_feature_names_out() replaces get_feature_names() in newer
        # scikit-learn releases; fall back to the old name when it is absent.
        getter = getattr(self.vectorizer, 'get_feature_names_out', None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        return list(getter())

    # ---------------------------------------------------------------
    # PREDICTION
    # ---------------------------------------------------------------

    def train(self):
        '''Preprocess the training frame, fit the vectorizer and the classifier.'''
        self.preprocess(self.df, column_in=self.column_message, column_out='message_clean')
        self.X = self.vectorize(self.df, column='message_clean')
        print('CORPUS:', self._feature_names())
        print('')
        print('CORPUS SHAPE: ', self.X.shape)
        print('')
        self.model = self.clf.fit(self.X, self.df[self.column_label])

    def predict(self, message):
        '''
        Classify message with the trained model (train() must run first).

        Returns a dict with the predicted 'label', the per-class
        'probability' and the model 'classes'.
        '''
        # Apply the same preprocessing used on the training data so that
        # accented words map onto the accent-stripped vocabulary.
        vector = self.vectorize_message(self.preprocess_message(message))
        predict_label = self.model.predict(vector)
        predict_proba = self.model.predict_proba(vector)
        return {'label': predict_label, 'probability': predict_proba, 'classes': self.model.classes_}

    # ---------------------------------------------------------------
    # NER - Named Entity Recognition
    # ---------------------------------------------------------------

    def extract_entities(self, message, intent):
        '''Print the (text, label) pairs of the entities found in message.

        intent is accepted for interface compatibility and is not used.
        '''
        doc = self.nlp(message)
        print([(ent.text, ent.label_) for ent in doc.ents])
        

In [None]:
# Instantiate the bot with the labelled training DataFrame defined above.
chatbot = ChatBot(df)

In [None]:
# Run the full pipeline (training, intent prediction, entity extraction)
# on a sample question; results are printed.
chatbot.understand('qual produto eu mais vendi hoje')