## Download CookDial from git

In [None]:
! git clone https://github.com/YiweiJiang2015/CookDial.git

## Or download CookDial from Google Drive

In [2]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path_to_file = "/content/CookDial"
path_to_output = "/content/drive/MyDrive/CookDial"

## CookDial from MyDrive

In [4]:
import zipfile
with zipfile.ZipFile(path_to_output + ".zip","r") as zip_ref:
    zip_ref.extractall(path_to_file)

## CookDial to MyDrive

In [71]:
import shutil

shutil.make_archive(path_to_output, 'zip', path_to_file)

'/content/drive/MyDrive/CookDial.zip'

## Get dialogues number

In [5]:
import os
APP_FOLDER = '/content/CookDial/data/dialog'
totalFiles = 0
for base, _, files in os.walk(APP_FOLDER):
    print('Searching in : ',base)
    for File in files:
        totalFiles += 1

print('Total number of files',totalFiles)


Searching in :  /content/CookDial/data/dialog
Total number of files 260


## Read content of data

In [6]:
import re 

pattern = re.compile(r"\"intent\": \"([^\"]*)", re.IGNORECASE)

def parse_annotation(annotation):
    # print(annotation)
    result = re.search(pattern, annotation)
    value = result.group(1)
    value = value.replace(";", "")
    value = value.replace(" ", "#")
    return value

In [8]:
import json
import pandas as pd

utt_dict = {'label': [], 'sentence': []}

for number in range(totalFiles):
    with open(APP_FOLDER + "/" + f"{number:03d}" + ".1.json") as f:

        data = json.load(f)
        for row in data['messages']:
            if False == row["bot"]:
                parsed_ann = parse_annotation(row["annotations"])
                if "" != parsed_ann:
                    utt_dict["label"].append(parsed_ann)
                    utt_dict["sentence"].append(row["utterance"].lower())

IntentDataFrame = pd.DataFrame(utt_dict)

In [43]:
IntentDataFrame.sample(n=5)

Unnamed: 0,label,sentence
2260,10,ok good! i am ready to start now.
4431,17,great. the chicken are added back. can i eat it?
4425,19,ok. done. how long to wait from now?
2246,29,"ok nice, for how long should they cook?"
141,5,ok. i have added the baking powder to the bowl.


In [10]:
print("There are {} rows and {} columns".format(IntentDataFrame.shape[0], IntentDataFrame.shape[1]))

There are 4610 rows and 2 columns


In [11]:
# explore unique labels
print(IntentDataFrame.label.unique())

['greeting#req_start' 'req_temperature' 'thank#req_instruction'
 'confirm#req_instruction' 'req_repeat' 'confirm' 'confirm#req_repeat'
 'negate#thank' 'negate' 'req_amount' 'req_instruction'
 'confirm#req_parallel_action' 'req_amount#req_ingredient' 'thank#confirm'
 'req_use_all' 'thank' 'other' 'confirm#req_is_recipe_finished' 'req_tool'
 'confirm#req_duration' 'confirm#thank' 'affirm#req_instruction'
 'req_repeat#confirm' 'confirm#req_temperature'
 'confirm#req_is_recipe_ongoing' 'req_ingredient' 'confirm#req_amount'
 'thank#confirm#req_instruction' 'thank#req_repeat' 'req_duration'
 'thank#req_duration' 'confirm#thank#req_instruction'
 'thank#confirm#req_is_recipe_finished' 'req_repeat#thank'
 'greeting#req_title' 'req_start' 'confirm#other' 'affirm'
 'confirm#req_start' 'confirm#req_duration#req_is_recipe_finished'
 'affirm#req_amount' 'req_ingredient_list' 'thank#goodbye'
 'req_parallel_action' 'confirm#goodbye' 'affirm#req_ingredient'
 'thank#req_ingredient' 'thank#confirm#req_in

In [12]:
# explore which labels are the most and least common
IntentDataFrame.label.value_counts()

confirm#req_instruction    1222
confirm                     407
req_instruction             320
thank                       225
greeting#req_title          216
                           ... 
other#req_instruction         1
req_repeat#thank              1
confirm#req_start             1
confirm#goodbye               1
thank#req_amount              1
Name: label, Length: 91, dtype: int64

In [13]:
# drop rows with multiple labels
# df = df[df["label"].str.contains("#")==False]
# df.label.value_counts()

## Preprocessing


In [14]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/452.9 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m286.7/452.9 KB[0m [31m8.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/213.0 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface

In [15]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/5.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/5.8 MB[0m [31m12.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m3.1/5.8 MB[0m [31m48.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m5.8/5.8 MB[0m [31m64.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [16]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import datasets #Hugging Face library
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk(ModelPath):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [17]:
# replace the labels strings by label numbers 
unique_labels = IntentDataFrame.label.unique()
LabelToIndex = {}

for i in range(len(unique_labels)):
    LabelToIndex[unique_labels[i]] = i

IntentDataFrame["label"]=IntentDataFrame["label"].map(LabelToIndex)

In [18]:
train_data = IntentDataFrame.sample(frac=0.8, random_state=25)
test_data = IntentDataFrame.drop(train_data.index)

train_data = datasets.Dataset.from_pandas(train_data)
test_data = datasets.Dataset.from_pandas(test_data)

print(f"No. of training examples: {train_data.shape[0]}")
print(f"No. of testing examples: {test_data.shape[0]}")

No. of training examples: 3688
No. of testing examples: 922


In [19]:
# Import AutoTokenizer with checkpoint"distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [20]:
# Tokenization work on train_dataset
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding=True)
tokenize_train=train_data.map(preprocess_function,batched=True)
tokenize_test=test_data.map(preprocess_function,batched=True)


  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [21]:
# data_collator

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [22]:
# Build model 

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(unique_labels))

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [30]:
from datasets import load_metric
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric('accuracy')


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [33]:
# Model fine tuning training
training_args = TrainingArguments(
    output_dir="/content/results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_train,
    eval_dataset=tokenize_test,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3688
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9220
  Number of trainable parameters = 67023451


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.174354,0.822126
2,0.066100,1.39228,0.784165
3,0.061900,1.246237,0.82321
4,0.056900,1.324116,0.813449
5,0.089800,1.304054,0.819957
6,0.103200,1.212749,0.818872
7,0.093600,1.229501,0.819957
8,0.098000,1.279489,0.817787
9,0.083700,1.188151,0.821041
10,0.068900,1.274577,0.816703


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-500
Configuration saved in /content/results/checkpoint-500/config.json
Model weights saved in /content/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertFor

TrainOutput(global_step=9220, training_loss=0.06487055326490754, metrics={'train_runtime': 624.1342, 'train_samples_per_second': 118.18, 'train_steps_per_second': 14.772, 'total_flos': 869726868928704.0, 'train_loss': 0.06487055326490754, 'epoch': 20.0})

In [36]:
def SentenceClassifier(InputSentence):
    """ Take a sentence as input, return the corresponding label
    
    dependencies : tokenizer, trainer
    """
    
    def preprocess_function(examples):
        return tokenizer(examples["sentence"], truncation=True, padding=True)
    
    # here, we are keeping the input as a Dataset, which could allow us to reuse the code
    # to answer many questions at once
    InputSentenceDFData = {'sentence' : [InputSentence]}
    InputSentenceDataFrame = pd.DataFrame(data = InputSentenceDFData)
    InputSentenceDataset = datasets.Dataset.from_pandas(InputSentenceDataFrame)
    Tokenised_InputSentence = InputSentenceDataset.map(preprocess_function,batched=False)
    
    LabelScores = trainer.predict(Tokenised_InputSentence)
    BestLabel = LabelScores.predictions.argmax(1)
    
    OutputLabelName = list(LabelToIndex.keys())[list(LabelToIndex.values()).index(BestLabel[0])]
    
    return OutputLabelName

In [62]:
InputSentence = "yes please".lower()
OutputLabel = SentenceClassifier(InputSentence)
print(f'Your question was : "{InputSentence}" it was classified as : "{OutputLabel}"')

  0%|          | 0/1 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


Your question was : "yes please" it was classified as : "affirm"


In [65]:
# Save the model and tokenizer locally
!mkdir /content/CookDial/working
!mkdir /content/CookDial/working/model/
!mkdir /content/CookDial/working/tokenizer/

ModelPath = "/content/CookDial/working/model/"
TokenizerPath = "/content/CookDial/working/tokenizer/"

if os.path.isdir(ModelPath):
    model.save_pretrained(ModelPath)
    print("model ok")
if os.path.isdir(TokenizerPath):
    tokenizer.save_pretrained(TokenizerPath)
    print("tokenizer ok")

Configuration saved in /content/CookDial/working/model/config.json
Model weights saved in /content/CookDial/working/model/pytorch_model.bin
tokenizer config file saved in /content/CookDial/working/tokenizer/tokenizer_config.json
Special tokens file saved in /content/CookDial/working/tokenizer/special_tokens_map.json


model ok
tokenizer ok


In [66]:
# Load the model and tokenizer from a local path\
LocalModel = AutoModelForSequenceClassification.from_pretrained(ModelPath,num_labels=len(unique_labels))
LocalTokenizer = AutoTokenizer.from_pretrained(TokenizerPath)

loading configuration file /content/CookDial/working/model/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/CookDial/working/model/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29",
    "30": "L

In [69]:
def LocalSentenceClassifier(InputSentence):
    """ Take a sentence as input, return the corresponding label
    
    dependencies : LocalTokenizer, LocalModel
    We use tokenizer2 and trainer2 instead of tokeninzer and trainer
    to be sure that this function works with the data saved and load locally
    """
    
    trainer = Trainer(
        model=LocalModel,
        args=training_args,
        train_dataset=tokenize_train,
        #eval_dataset=tokenize_test,  Here, we work with the entire dataset as training data
        #compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    def preprocess_function(examples):
        return LocalTokenizer(examples["sentence"], truncation=True, padding=True)
    
    # here, we are keeping the input as a Dataset, which could allow us to reuse the code
    # to answer many questions at once
    InputSentenceDFData = {'sentence' : [InputSentence]}
    InputSentenceDataFrame = pd.DataFrame(data = InputSentenceDFData)
    InputSentenceDataset = datasets.Dataset.from_pandas(InputSentenceDataFrame)
    Tokenised_InputSentence = InputSentenceDataset.map(preprocess_function,batched=False)
    
    LabelScores = trainer.predict(Tokenised_InputSentence)
    BestLabel = LabelScores.predictions.argmax(1)
    
    OutputLabelName = list(LabelToIndex.keys())[list(LabelToIndex.values()).index(BestLabel[0])]
    
    return OutputLabelName

In [70]:
InputSentence = "ok next step"
OutputLabel = LocalSentenceClassifier(InputSentence)
print(f'Your question was : "{InputSentence}" it was classified as : "{OutputLabel}"')

  0%|          | 0/1 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


Your question was : "ok next step" it was classified as : "req_instruction"


In [None]:
# used train script: https://www.kaggle.com/code/philanoe/intent-classifier-training