In [2]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
folder = "/content/drive/My Drive/Colab Notebooks"

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

In [4]:
import json
import re
import string
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

with open(f'{folder}/KB.json', 'r') as file:
    data = json.load(file)

texts = []
labels = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        texts.append(pattern)
        labels.append(intent['tag'])

cleaned_texts = []
for text in texts:
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    cleaned_texts.append(text)

vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(cleaned_texts)
y = labels

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

criterion = ['gini', 'entropy', 'log_loss']
min_samples_split = [2, 5]
min_samples_leaf = [1, 2]
min_weight_fraction_leaf = [0.0, 0.1]
max_features = ['sqrt', 'log2', None]
min_impurity_decrease = [0.0, 0.1]
warm_start = [False, True]
ccp_alpha = [0.0, 0.1]

results = []

for criterion_val, min_split, min_leaf, min_weight_fraction_leaf_val, max_feat, min_impurity_decrease_val, warm_start_val, ccp_alpha_val in itertools.product(
    criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_features, min_impurity_decrease, warm_start, ccp_alpha):

    rf = RandomForestClassifier(
        n_estimators=5,
        criterion=criterion_val,
        max_depth=None,
        min_samples_split=min_split,
        min_samples_leaf=min_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf_val,
        max_features=max_feat,
        max_leaf_nodes=None,
        min_impurity_decrease=min_impurity_decrease_val,
        bootstrap=True,
        oob_score=False,
        n_jobs=-1,
        random_state=123,
        verbose=0,
        warm_start=warm_start_val,
        class_weight='balanced',
        ccp_alpha=ccp_alpha_val,
        max_samples=None
    )

    rf.fit(x_train, y_train)

    train_pred = rf.predict(x_train)
    test_pred = rf.predict(x_test)

    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)

    results.append({
        'params': {
            'criterion': criterion_val,
            'min_samples_split': min_split,
            'min_samples_leaf': min_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf_val,
            'max_features': max_feat,
            'min_impurity_decrease': min_impurity_decrease_val,
            'warm_start': warm_start_val,
            'ccp_alpha': ccp_alpha_val
        },
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'model': rf
    })

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [7]:
x_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 36079 stored elements and shape (5004, 1923)>

In [8]:
x_test

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9003 stored elements and shape (1251, 1923)>

In [9]:
df = pd.DataFrame(results)

In [10]:
df.head(10)

Unnamed: 0,params,train_accuracy,test_accuracy,model
0,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.985012,0.893685,"(DecisionTreeClassifier(max_features='sqrt', r..."
1,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.0002,0.000799,"(DecisionTreeClassifier(ccp_alpha=0.1, max_fea..."
2,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.985012,0.893685,"(DecisionTreeClassifier(max_features='sqrt', r..."
3,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.0002,0.000799,"(DecisionTreeClassifier(ccp_alpha=0.1, max_fea..."
4,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.0002,0.000799,"(DecisionTreeClassifier(max_features='sqrt', m..."
5,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.0002,0.000799,"(DecisionTreeClassifier(ccp_alpha=0.1, max_fea..."
6,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.0002,0.000799,"(DecisionTreeClassifier(max_features='sqrt', m..."
7,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.0002,0.000799,"(DecisionTreeClassifier(ccp_alpha=0.1, max_fea..."
8,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.98781,0.892886,"(DecisionTreeClassifier(max_features='log2', r..."
9,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.0002,0.000799,"(DecisionTreeClassifier(ccp_alpha=0.1, max_fea..."


In [11]:
best = df.loc[df['test_accuracy'].idxmax()]

best_model = best['params']
best_train_acc = best['train_accuracy']
best_test_acc = best['test_accuracy']

In [12]:
sort_df = df.sort_values(by='test_accuracy', ascending=False).head(10)
sort_df

Unnamed: 0,params,train_accuracy,test_accuracy,model
16,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.980416,0.893685,(DecisionTreeClassifier(random_state=843828734...
18,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.980416,0.893685,(DecisionTreeClassifier(random_state=843828734...
0,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.985012,0.893685,"(DecisionTreeClassifier(max_features='sqrt', r..."
2,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.985012,0.893685,"(DecisionTreeClassifier(max_features='sqrt', r..."
106,"{'criterion': 'gini', 'min_samples_split': 5, ...",0.978217,0.892886,"(DecisionTreeClassifier(max_features='log2', m..."
104,"{'criterion': 'gini', 'min_samples_split': 5, ...",0.978217,0.892886,"(DecisionTreeClassifier(max_features='log2', m..."
8,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.98781,0.892886,"(DecisionTreeClassifier(max_features='log2', r..."
10,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.98781,0.892886,"(DecisionTreeClassifier(max_features='log2', r..."
114,"{'criterion': 'gini', 'min_samples_split': 5, ...",0.97482,0.892086,"(DecisionTreeClassifier(min_samples_split=5, r..."
112,"{'criterion': 'gini', 'min_samples_split': 5, ...",0.97482,0.892086,"(DecisionTreeClassifier(min_samples_split=5, r..."


In [13]:
import joblib

joblib.dump(best['model'], 'best_rf_model.pkl')

['best_rf_model.pkl']

In [14]:
print(f"Best Model Hyperparameters: {best_model}")
print(f"Best Train Accuracy: {best_train_acc}")
print(f"Best Test Accuracy: {best_test_acc}")

Best Model Hyperparameters: {'criterion': 'gini', 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': 'sqrt', 'min_impurity_decrease': 0.0, 'warm_start': False, 'ccp_alpha': 0.0}
Best Train Accuracy: 0.9850119904076738
Best Test Accuracy: 0.8936850519584333
