# Processing Chat Inputs and making intent Decision using Naive Bayes

## Author: Umer Qaisar - CS131014

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
messages = pd.read_csv("chatinputsCSVL - Copy.csv", names=["inputs", "intents"])
messages.head()

Unnamed: 0,inputs,intents
0,When is our bus coming,bus
1,When does the bus come to station 1?,bus
2,Is there still a city-bus today?,bus
3,When does a public transport go?,bus
4,I want a bus,bus


In [3]:
messages.groupby('intents').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,inputs
intents,Unnamed: 1_level_1,Unnamed: 2_level_1
bus,count,13
bus,unique,13
bus,top,When is the scheduled bus in the evening?
bus,freq,1
contacts,count,22
contacts,unique,22
contacts,top,Open the sent mails folder
contacts,freq,1
weather,count,16
weather,unique,16


In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [5]:
import string

In [6]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [7]:
messages['inputs'].head(5).apply(text_process)

0              [bus, coming]
1    [bus, come, station, 1]
2    [still, citybus, today]
3    [public, transport, go]
4                [want, bus]
Name: inputs, dtype: object

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Might take awhile...
bow_transformer = CountVectorizer(analyzer=text_process).fit(messages['inputs'])

# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

94


In [10]:
bow_transformer.vocabulary_

{'1': 0,
 '20': 1,
 '30': 2,
 'Answer': 3,
 'Call': 4,
 'Check': 5,
 'Create': 6,
 'Delete': 7,
 'Forward': 8,
 'Give': 9,
 'Go': 10,
 'Open': 11,
 'Remove': 12,
 'Reply': 13,
 'Search': 14,
 'Send': 15,
 'Start': 16,
 'Wednesday': 17,
 'Write': 18,
 'app': 19,
 'arrive': 20,
 'bus': 21,
 'call': 22,
 'called': 23,
 'citybus': 24,
 'cold': 25,
 'come': 26,
 'coming': 27,
 'contact': 28,
 'daily': 29,
 'day': 30,
 'done': 31,
 'e': 32,
 'email': 33,
 'evening': 34,
 'folder': 35,
 'get': 36,
 'go': 37,
 'high': 38,
 'home': 39,
 'horst': 40,
 'hot': 41,
 'hour': 42,
 'ice': 43,
 'inbox': 44,
 'johns': 45,
 'junk': 46,
 'junkmails': 47,
 'last': 48,
 'latest': 49,
 'mail': 50,
 'mailbox': 51,
 'mails': 52,
 'many': 53,
 'messages': 54,
 'mike': 55,
 'minutes': 56,
 'need': 57,
 'new': 58,
 'newest': 59,
 'nice': 60,
 'number': 61,
 'numer': 62,
 'office': 63,
 'paris': 64,
 'phone': 65,
 'probability': 66,
 'public': 67,
 'quickly': 68,
 'rain': 69,
 'rainy': 70,
 'scheduled': 71,
 'scra

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [12]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [13]:
pipeline.fit(messages["inputs"],messages["intents"])

Pipeline(steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x000002EA346CD378>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocesso...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [14]:
predictions = pipeline.predict(["How is the weather today"])

In [15]:
predictions

array(['weather '], 
      dtype='<U8')

In [16]:
predictions = pipeline.predict(["How is the it's likely to rain today"])
print(predictions)

['weather ']


In [18]:
predictions = pipeline.predict(["I am stuck in the rain, will I get a bus today at station?"])
print(predictions)

['bus']


In [19]:
pipeline

Pipeline(steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x000002EA346CD378>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocesso...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [20]:
import pickle
pickle_out = open("classifier.pickle", "wb")

In [21]:
pickle.dump(pipeline, pickle_out)

In [22]:
pickle_out.close()

In [23]:
pickle_in = open("classifier.pickle", "rb")
myclassifier = pickle.load(pickle_in)

In [24]:
clpredictions = myclassifier.predict(["I am stuck in the rain, will I get a bus today at station?"])
print(clpredictions)

['bus']
