<a href="https://colab.research.google.com/github/rickwag/ML/blob/main/AskUbuntuIntentClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Data Acquisation

In [1]:
data_path = "/content/drive/MyDrive/Datasets/AskUbuntu Corpus.json"

In [2]:
import pandas as pd
import json

In [3]:
with open(data_path, encoding="utf-8") as file:
    data = json.load(file)

In [4]:
#sample questions
for n in range(5):
    print(data["sentences"][n]["text"])

What software can I use to view epub documents?
Which PDF Viewer would you recommend?
What IDEs are available for Ubuntu?
What's the best Mind Mapping Software?
Software to read a QR code?


In [5]:
from IPython.core.display import HTML

#sample responses
for n in range(1):
    display(HTML(data["sentences"][n]["answer"]["text"]))

In [6]:
len(data["sentences"])

162

In [7]:
df = pd.DataFrame(data["sentences"])
df.head(5)

Unnamed: 0,author,url,text,entities,intent,answer,training
0,Olivier Lalonde,http://askubuntu.com/questions/14378/what-soft...,What software can I use to view epub documents?,[],Software Recommendation,{'text': '<p>You can use calibre software for ...,False
1,NES,http://askubuntu.com/questions/18495/which-pdf...,Which PDF Viewer would you recommend?,[],Software Recommendation,{'text': '<p>Here are some well known PDF read...,False
2,RolandiXor,http://askubuntu.com/questions/48299/what-ides...,What IDEs are available for Ubuntu?,[],Software Recommendation,"{'text': '<h1><a href=""http://www.geany.org/"" ...",False
3,Mark Davidson,http://askubuntu.com/questions/622/whats-the-b...,What's the best Mind Mapping Software?,[],Software Recommendation,"{'text': '<p>I like <a href=""https://apps.ubun...",False
4,Benjamin,http://askubuntu.com/questions/22871/software-...,Software to read a QR code?,[],Software Recommendation,{'text': '<p><strong>FOR QR CODE</strong></p> ...,False


In [8]:
#classes
df["intent"].unique()

array(['Software Recommendation', 'None', 'Shutdown Computer',
       'Make Update', 'Setup Printer'], dtype=object)

## Text Preprocessing

In [9]:
#lowercasing
df["text"] = [entry.lower() for entry in df["text"]]

df.head(2)

Unnamed: 0,author,url,text,entities,intent,answer,training
0,Olivier Lalonde,http://askubuntu.com/questions/14378/what-soft...,what software can i use to view epub documents?,[],Software Recommendation,{'text': '<p>You can use calibre software for ...,False
1,NES,http://askubuntu.com/questions/18495/which-pdf...,which pdf viewer would you recommend?,[],Software Recommendation,{'text': '<p>Here are some well known PDF read...,False


In [10]:
#tokenization
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

df["text"] = [word_tokenize(entry) for entry in df["text"]]

df.head(2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,author,url,text,entities,intent,answer,training
0,Olivier Lalonde,http://askubuntu.com/questions/14378/what-soft...,"[what, software, can, i, use, to, view, epub, ...",[],Software Recommendation,{'text': '<p>You can use calibre software for ...,False
1,NES,http://askubuntu.com/questions/18495/which-pdf...,"[which, pdf, viewer, would, you, recommend, ?]",[],Software Recommendation,{'text': '<p>Here are some well known PDF read...,False


In [11]:
#stopwords removal and lemmatization
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()

for index, doc in enumerate(df["text"]):
  new_doc = []
  for token in doc:
    if token not in stopwords.words("english"):
      token = lemmatizer.lemmatize(token) 
      new_doc.append(token)

  df.loc[index, "final_text"] = str(new_doc)

df.head(5)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Unnamed: 0,author,url,text,entities,intent,answer,training,final_text
0,Olivier Lalonde,http://askubuntu.com/questions/14378/what-soft...,"[what, software, can, i, use, to, view, epub, ...",[],Software Recommendation,{'text': '<p>You can use calibre software for ...,False,"['software', 'use', 'view', 'epub', 'document'..."
1,NES,http://askubuntu.com/questions/18495/which-pdf...,"[which, pdf, viewer, would, you, recommend, ?]",[],Software Recommendation,{'text': '<p>Here are some well known PDF read...,False,"['pdf', 'viewer', 'would', 'recommend', '?']"
2,RolandiXor,http://askubuntu.com/questions/48299/what-ides...,"[what, ides, are, available, for, ubuntu, ?]",[],Software Recommendation,"{'text': '<h1><a href=""http://www.geany.org/"" ...",False,"['ides', 'available', 'ubuntu', '?']"
3,Mark Davidson,http://askubuntu.com/questions/622/whats-the-b...,"[what, 's, the, best, mind, mapping, software, ?]",[],Software Recommendation,"{'text': '<p>I like <a href=""https://apps.ubun...",False,"[""'s"", 'best', 'mind', 'mapping', 'software', ..."
4,Benjamin,http://askubuntu.com/questions/22871/software-...,"[software, to, read, a, qr, code, ?]",[],Software Recommendation,{'text': '<p><strong>FOR QR CODE</strong></p> ...,False,"['software', 'read', 'qr', 'code', '?']"


In [12]:
#train test split
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(df["final_text"], df["intent"], test_size=.2)

In [14]:
print(X_train.shape)
print(X_test.shape)

(129,)
(33,)


In [17]:
#label encoding
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(df["intent"])

y_train_encoded = encoder.transform(y_train)
y_test_encoded = encoder.transform(y_test)

## Feature Engineering

In [15]:
#vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

tfIdfVectorizer = TfidfVectorizer()

tfIdfVectorizer.fit(df["final_text"])

X_train = tfIdfVectorizer.transform(X_train)
X_test = tfIdfVectorizer.transform(X_test)

## Modelling

In [18]:
#naive bayes
from sklearn import naive_bayes

naive_model = naive_bayes.MultinomialNB()

#training
naive_model.fit(X_train, y_train)

MultinomialNB()

In [73]:
#naive bayes evaluation
y_pred_naive = naive_model.predict(X_test)

from sklearn.metrics import classification_report 

print(classification_report(y_test, y_pred_naive))

                         precision    recall  f1-score   support

            Make Update       0.56      1.00      0.72         9
                   None       0.00      0.00      0.00         1
          Setup Printer       1.00      0.14      0.25         7
      Shutdown Computer       1.00      0.83      0.91         6
Software Recommendation       0.82      0.90      0.86        10

               accuracy                           0.73        33
              macro avg       0.68      0.58      0.55        33
           weighted avg       0.80      0.73      0.67        33



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
#svm 
from sklearn import svm 

svm_model = svm.SVC(kernel="linear", gamma="auto")

#training
svm_model.fit(X_train, y_train)

SVC(gamma='auto', kernel='linear')

In [72]:
#svm evaluation
y_pred_svm = svm_model.predict(X_test)

print(classification_report(y_test, y_pred_svm))

                         precision    recall  f1-score   support

            Make Update       1.00      1.00      1.00         9
                   None       0.00      0.00      0.00         1
          Setup Printer       1.00      0.71      0.83         7
      Shutdown Computer       1.00      1.00      1.00         6
Software Recommendation       0.77      1.00      0.87        10

               accuracy                           0.91        33
              macro avg       0.75      0.74      0.74        33
           weighted avg       0.90      0.91      0.89        33



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [64]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier()

#training
decision_tree_model.fit(X_train, y_train)

DecisionTreeClassifier()

In [71]:
#decision tree evaluation
y_pred_tree = decision_tree_model.predict(X_test)

print(classification_report(y_test, y_pred_tree))

                         precision    recall  f1-score   support

            Make Update       0.78      0.78      0.78         9
                   None       0.00      0.00      0.00         1
          Setup Printer       1.00      0.57      0.73         7
      Shutdown Computer       1.00      1.00      1.00         6
Software Recommendation       0.71      1.00      0.83        10

               accuracy                           0.82        33
              macro avg       0.70      0.67      0.67        33
           weighted avg       0.82      0.82      0.80        33



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
#Multi Layer Perceptron model
from sklearn.neural_network import MLPClassifier

MLP_model = MLPClassifier()

#training
MLP_model.fit(X_train, y_train)



MLPClassifier()

In [70]:
#MLP evaluation
y_pred_MLP = MLP_model.predict(X_test) 

print(classification_report(y_test, y_pred_MLP))

                         precision    recall  f1-score   support

            Make Update       0.90      1.00      0.95         9
                   None       0.00      0.00      0.00         1
          Setup Printer       1.00      1.00      1.00         7
      Shutdown Computer       1.00      1.00      1.00         6
Software Recommendation       0.90      0.90      0.90        10

               accuracy                           0.94        33
              macro avg       0.76      0.78      0.77        33
           weighted avg       0.91      0.94      0.93        33



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
text = "how to view pdfs"
text_vec = tfIdfVectorizer.transform([text])
svm_model.predict(text_vec)

array(['Software Recommendation'], dtype=object)

## Evaluation

In [102]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

model_names = ["Naive Bayes", "Support Vector Machine", "Decision Tree", "Multi Layer Perceptron"]
predictions = [y_pred_naive, y_pred_svm, y_pred_tree, y_pred_MLP]
precisions = [precision_score(y_test, predictions[n], average="macro") for n in range(len(predictions))]
recalls = [recall_score(y_test, predictions[n], average="macro") for n in range(len(predictions))]
f1s = [f1_score(y_test, predictions[n], average="macro") for n in range(len(predictions))]
accuracies = [accuracy_score(y_test, predictions[n]) for n in range(len(predictions))]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [105]:
data = [[model_names[n], precisions[n], recalls[n], f1s[n], accuracies[n]] for n in range(len(model_names))]
print(data)

[['Naive Bayes', 0.6761363636363636, 0.5752380952380952, 0.5472467532467532, 0.7272727272727273], ['Support Vector Machine', 0.7538461538461538, 0.7428571428571429, 0.7405797101449275, 0.9090909090909091], ['Decision Tree', 0.6984126984126984, 0.6698412698412699, 0.6676767676767676, 0.8181818181818182], ['Multi Layer Perceptron', 0.76, 0.78, 0.7694736842105263, 0.9393939393939394]]


In [106]:
from tabulate import tabulate

col_names = ["model", "precision", "recall", "f1", "accuracy"]

print(tabulate(data, headers=col_names))

model                     precision    recall        f1    accuracy
----------------------  -----------  --------  --------  ----------
Naive Bayes                0.676136  0.575238  0.547247    0.727273
Support Vector Machine     0.753846  0.742857  0.74058     0.909091
Decision Tree              0.698413  0.669841  0.667677    0.818182
Multi Layer Perceptron     0.76      0.78      0.769474    0.939394
