# NLP-Based Chatbot - Intent Classifier

In [2]:
# Tokenization
from nltk.tokenize import word_tokenize
# Stopwords list
from nltk.corpus import stopwords
# Stemming
from nltk.stem import PorterStemmer
# Lemmatization
from nltk.stem import WordNetLemmatizer
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [3]:
# Labelled examples as (utterance, intent) pairs, grouped by intent.
training_data = [
    # greet
    ("hello", "greet"),
    ("hi", "greet"),
    ("hello there", "greet"),
    ("hi there", "greet"),
    ("hey", "greet"),
    ("good morning", "greet"),
    # weather
    ("what's the weather today", "weather"),
    ("what's the temperature", "weather"),
    ("how's the weather today", "weather"),
    ("is it raining", "weather"),
    ("tell me temperature", "weather"),
    # open_web
    ("open google", "open_web"),
    ("open facebook", "open_web"),
    ("open youtube", "open_web"),
    ("go to google", "open_web"),
    # exit
    ("bye", "exit"),
    ("goodbye", "exit"),
    ("exit", "exit"),
]

In [5]:
# Split the (utterance, intent) pairs into parallel feature / target lists.
sentences = [utterance for utterance, _ in training_data]  # features / X
labels = [label for _, label in training_data]             # target / y

In [7]:
def preprocess_text(documents):
    """Normalize raw sentences into space-joined, lemmatized tokens.

    Each document is lowercased, tokenized with ``word_tokenize``, stripped of
    English stopwords and punctuation tokens, lemmatized as a verb
    (``pos="v"``) and finally reduced to purely alphabetic tokens.

    Parameters
    ----------
    documents : iterable of str, or str
        Sentences to clean. A bare string is treated as a single document
        instead of being iterated character by character.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order. A document
        whose tokens are all filtered out yields ``""``.

    Notes
    -----
    Relies on the NLTK tokenizer, stopwords and WordNet resources being
    available locally (presumably fetched beforehand with ``nltk.download``;
    this function does not download them).
    """
    # Guard against the easy mistake of passing one sentence un-wrapped.
    if isinstance(documents, str):
        documents = [documents]

    # Build the lookup structures once, outside the per-document loop.
    # Sets give O(1) membership; for punctuation a set also avoids the
    # substring semantics of ``token in "<punctuation string>"``.
    english_stopwords = set(stopwords.words("english"))
    punctuations = set(string.punctuation)
    wnet = WordNetLemmatizer()

    cleaned_documents = []
    for doc in documents:
        # Step 1: lowercase, then tokenize.
        tokens = word_tokenize(doc.lower())

        # Step 2: drop stopwords and single-character punctuation tokens.
        kept_tokens = [
            word
            for word in tokens
            if word not in english_stopwords and word not in punctuations
        ]

        # Step 3: lemmatize every remaining token as a verb.
        lemmatized_words = [wnet.lemmatize(word, "v") for word in kept_tokens]

        # Step 4: keep alphabetic tokens only; this also removes fragments
        # such as "'s" and any multi-character punctuation tokens.
        final_tokens = [word for word in lemmatized_words if word.isalpha()]

        cleaned_documents.append(" ".join(final_tokens))
    return cleaned_documents

In [8]:
# Peek at the first five raw sentences before any preprocessing.
sentences[:5]

['hello', 'hi', 'hello there', 'hi there', 'hey']

In [9]:
# Clean every training sentence with preprocess_text; the same function is
# applied to the user message at prediction time.
cleaned_data = preprocess_text(sentences)

In [11]:
# cleaned_data  # uncomment to inspect the preprocessed training sentences

In [12]:
# Fit a TF-IDF vectorizer on the cleaned sentences and encode them as the
# feature matrix X. The fitted vectorizer is reused to encode new messages.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned_data)

In [13]:
# Train a logistic-regression intent classifier (default hyperparameters)
# on the TF-IDF features X against the intent labels.
model = LogisticRegression()
model.fit(X, labels)

In [17]:
# Classify a new message: it must go through the same preprocessing and the
# already-fitted vectorizer (transform, not fit_transform) as the training data.
# "linkedin" does not occur in training_data, so only "open" can contribute
# to the feature vector here.
user_msg = "open linkedin"
# preprocess_text expects a collection of documents, hence the one-item list.
processed = preprocess_text([user_msg])
user_vector = vectorizer.transform(processed)
# NOTE(review): predict always returns some label, even when no token of the
# message is in the learned vocabulary — consider a fallback for that case.
prediction = model.predict(user_vector)
print("Prediction is :",prediction)

Prediction is : ['open_web']
