In [19]:
import nltk
import random
import json
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score

In [7]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
with open('intents.json') as json_file:
    intents = json.load(json_file)

In [21]:
text_data = []
labels = []
stopwords = set(nltk.corpus.stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

for intent in intents['intents']:
    for example in intent['patterns']:
        tokens = nltk.word_tokenize(example.lower())
        filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords and token.isalpha()]
        if filtered_tokens:
            text_data.append(' '.join(filtered_tokens))
            labels.append(intent['tag'])

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_data)
y = labels

In [22]:
def find_best_model(X, y, test_size=0.2):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    models = [
        ('Logistic Regression', LogisticRegression(), {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.1, 1.0, 10.0],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 1000, 10000]
}),
        ('Multinomial Naive Bayes', MultinomialNB(), {'alpha': [0.1, 0.5, 1.0]}),
        ('Linear SVC', LinearSVC(), {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'loss': ['hinge', 'squared_hinge'],
    'C': [0.1, 1, 10],
    'max_iter': [100, 1000, 10000]
}),
        ('Decision Tree', DecisionTreeClassifier(), {
    'max_depth': [5, 10, 20, 30, 50, None],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'criterion': ['gini', 'entropy']
}),
        ('Random Forest', RandomForestClassifier(), {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
})
    ]

    for name, model, param_grid in models:
        grid = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
        grid.fit(X_train, y_train)
        y_pred = grid.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print(f'{name}: {score:.4f} (best parameters: {grid.best_params_})')
 
    best_model = max(models, key=lambda x: GridSearchCV(x[1], x[2], cv=5, n_jobs=-1).fit(X_train, y_train).score(X_test, y_test))
    print(f'\nBest model: {best_model[0]}')
    
    # fit the best model to the full training data
    best_model[1].fit(X, y)

    return best_model[1]

In [23]:
best_model = find_best_model(X, y)

Logistic Regression: 0.8101 (best parameters: {'C': 10.0, 'max_iter': 100, 'penalty': 'l2', 'solver': 'sag'})
Multinomial Naive Bayes: 0.7342 (best parameters: {'alpha': 0.1})
Linear SVC: 0.8101 (best parameters: {'C': 1, 'loss': 'squared_hinge', 'max_iter': 10000, 'penalty': 'l2'})
Decision Tree: 0.7215 (best parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10})
Random Forest: 0.7468 (best parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200})

Best model: Linear SVC


In [24]:
def chatbot_response(user_input):
    input_text = vectorizer.transform([user_input])
    predicted_intent = best_model.predict(input_text)[0]
    
    for intent in intents['intents']:
        if intent['tag'] == predicted_intent:
            response = random.choice(intent['responses'])
            break
            
    return response

In [25]:
print('Hello! I am a chatbot. How can I help you today? Type "quit" to exit.')
while True:
    user_input = input('> ')
    if user_input.lower() == 'quit':
        break
    response = chatbot_response(user_input)
    print(response)

Hello! I am a chatbot. How can I help you today? Type "quit" to exit.
> hello
Good to see you again!
> how
welcome, anything else i can assist you with?
> why
welcome, anything else i can assist you with?
> who
welcome, anything else i can assist you with?
> where
welcome, anything else i can assist you with?
> location
<a target="_blank" href="ADD YOU GOOGLE MAP LINK HERE"> here</a>
> quit


In [8]:
import pickle


with open('chatbot_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)


with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('intents.json', 'w') as f:
    json.dump(intents, f)