# Initial code for our ML model

#### Installation of all important libraries

In [None]:
! pip3 install transformers
! pip3 install datasets
! pip3 install scipy sklearn
! pip3 install huggingface_hub
! pip3 install ipywidgets
! pip3 install "transformers==4.16.*"

#### HuggingFace notebook login so that their functionality will work for pushing the completed model to the internet

In [107]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#### Basic setup and initial dataset loading

We are going to be performing the glue (General Language Understanding Evaluation) task to evaluate our understanding of the language. NOTE: This will change to be inferring things about the metrics of our data.

We use huawei's TinyBERT as the pre-trained model that we will train on. It is 50mb, much smaller than the original 250mb of traditional BERT.

In [3]:
task = "cola"
model_checkpoint = "huawei-noah/TinyBERT_General_4L_312D"
batch_size = 16

In [110]:
from datasets import load_dataset, load_metric

Loading of our datasets, already parsed into train, test, validation by our direct input. NOTE: Subject to change when we move to full implimentation

In [111]:
dataset = load_dataset('json', data_files={'train':"DsetNew/tr.jsonl", 'test':"DsetNew/te.jsonl", 'validation':"DsetNew/va.jsonl" }, )
metric = load_metric('glue', 'cola')

Using custom data configuration default-0890664f5098fa31
Reusing dataset json (/Users/danielgoldelman/.cache/huggingface/datasets/json/default-0890664f5098fa31/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
100%|██████████| 3/3 [00:00<00:00, 544.79it/s]


View of our dataset. This shows that we have individual DatasetDicts containing ids, data, and our labels for that label.

In [112]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'data', 'label'],
        num_rows: 80
    })
    test: Dataset({
        features: ['id', 'data', 'label'],
        num_rows: 10
    })
    validation: Dataset({
        features: ['id', 'data', 'label'],
        num_rows: 11
    })
})

This shows that we have 3 value fields for id, data, and label.

In [113]:
dataset['train'].features

{'id': Value(dtype='int64', id=None),
 'data': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None)}

In [114]:
from datasets import ClassLabel

In [None]:
Code to convert our label field to being a ClassLabel. Thus, the model can actually run our inference.

In [115]:
for i in ['test','train','validation']:
    new_features = dataset[i].features.copy()
    new_features["label"] = ClassLabel(num_classes=2, names=['True','False'], id=None)
    dataset[i] = dataset[i].cast(new_features)

Loading cached processed dataset at /Users/danielgoldelman/.cache/huggingface/datasets/json/default-0890664f5098fa31/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-dfbb20676e6b298e.arrow
Loading cached processed dataset at /Users/danielgoldelman/.cache/huggingface/datasets/json/default-0890664f5098fa31/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-d44793c1789265b3.arrow
Loading cached processed dataset at /Users/danielgoldelman/.cache/huggingface/datasets/json/default-0890664f5098fa31/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-e9116d8af522ed68.arrow


A quick peek into how our data actually looks.

In [116]:
for i in range(10):
    print(dataset["train"][i])

{'id': 42, 'data': '{"chatId":null,"ip":"<TARGET_IP>","url":"https://writemyessaytoday.us/","fingerprint":"0e817511c8c1644800d07b37e17a5e7a","userAgent":"Mozilla/5.0 (Macintosh;', 'label': 1}
{'id': 43, 'data': '1Imp":"","_staticFo":false,"_jtags":"","_l2fper":[],"_natpt":41,"_mbr":1,"_anc":[],"_im":[],"_ccTVal":2000,"_mNVisitIdData":"<TARGET_IP>","_mNVsid":"DefVid","_ip2c":"US","_ip2sc":"GA","viewid":"1646982560","_dma":"524","_ip2allsc":"GA","_mxnf":"0","_asn":"46562', 'label': 1}
{'id': 44, 'data': 'var eti = "1646982561";var esi_ip = "<TARGET_IP>";var esi_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/100.0.4', 'label': 1}
{'id': 45, 'data': 'EDDED_PLAYER","INNERTUBE_CLIENT_VERSION":"1.20220309.01.01","INNERTUBE_CONTEXT":{"client":{"hl":"en","gl":"US","remoteHost":"<TARGET_IP>","deviceMake":"Apple","deviceModel":"","visitorData":"Cgs1QmhjSDRpTjBLTSjj6quRBg%3D%3D","userAgent":"Mozilla/5.0 (Macintosh;', 'label': 1}
{'id': 4

Code to show random elements from our dataset.

In [117]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML


def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(
        dataset
    ), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset) - 1)
        while pick in picks:
            pick = random.randint(0, len(dataset) - 1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [118]:
show_random_elements(dataset["train"])

Unnamed: 0,id,data,label
0,91,"Buyback-Program.html"">Injured Gadgets - Lcd Buyback Program</a></b>\n<br>\n<span class=""adcat"">\nServices / Other\n<br>Norcross, <TARGET_REGION>\n</span>\n</td>\n<td align=""right"" width=""100"">\n</td>\n</tr>\n<tr class=""latest"">\n<td width=""15"">\n<img src=""images/bullet.gif"" al",True
1,176,3 78 3C36.5786 3 3 36.5786 3 78C3 119.421 36.5786 153 78 153ZM57 <TARGET_LAT>C57 45.6421 53.6421 49 49.5 49C45.3579 49 42 45.6421 42 <TARGET_LAT>C42 37.3579 45.3579 34 49.5 34C53.6421 34 57 37.3579 57 <TARGET_LAT>ZM83 74C83 79.5228 78.5228 84 73 84C67.4772 84 63 79.5228 63 74C63 68.4,True
2,138,.5776-53.8624 20.48-76.3904 74.9568 91.7504 186.7776 152.3712 312.9344 158.72-2.6624-11.0592-3.8912-22.7328-3.8912-34.6112 0-<TARGET_LNG> 67.9936-151.7568 151.7568-151.7568 43.6224 0 83.1488 18.432 110.7968 47.9232 34.6112-6.7584 67.1744-19.456 96.4608-36.864-11,True
3,164,"1Imp"":"""",""_staticFo"":false,""_jtags"":"""",""_l2fper"":[],""_natpt"":41,""_mbr"":1,""_anc"":[],""_im"":[],""_ccTVal"":2000,""_mNVisitIdData"":""<TARGET_IP>"",""_mNVsid"":""DefVid"",""_ip2c"":""US"",""_ip2sc"":""NY"",""viewid"":""1647048784"",""_dma"":""501"",""_ip2allsc"":""NY"",""_mxnf"":""0"",""_asn"":""32780",False
4,69,.8325871 28.7276843 59.6359161 31.5299465 57.5965801 <TARGET_LAT> 54.3880248 35.0129818 50.0102502 35 44.2962249 35 40.3865265 <TARGET_LAT> 38.2811549 29.815594 37.0823886 27.6514148 36.5020826 25.1929666 36.6050149 22.7146793L36.6050149 5.86964437C36.768517 4.630,True
5,44,"var eti = ""1646982561"";var esi_ip = ""<TARGET_IP>"";var esi_ua = ""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/100.0.4",False
6,120,"=> 'Atlanta', 'lat' => 33.749, 'long' => -84.388, 'postal_code' => NULL, 'region_code' => 'GA', 'region_name' => '<TARGET_REGION>', ) [] {""class"":""UserCountry"",""request_id"":""129c1""}\n\n[2022-03-11 07:08:51] piwik.DEBUG: array ( 'idvisitor' => '8ac6abfac1",False
7,56,".0331C82.3077 25.0207 83.0546 25.6019 84.8818 26.0468L86.4436 26.4247C89.3522 27.1026 90.5606 28.254 90.5606 30.2974C90.5606 <TARGET_LAT> 88.4481 <TARGET_LAT> 85.088 <TARGET_LAT>C81.9443 <TARGET_LAT> 79.8217 <TARGET_LAT> 79.6846 30.4226L81.8562 30.4227Z"" fill=""white""/>\n<path d=""M95.14",True
8,151,63010713 36.50664 3.3713665 35.863085 2.30347703 34.6749016 1.65701853 33.3492173 1.3119847 32 1.29803778L32 0 49.6878409 0zM<TARGET_LNG> 0L<TARGET_LNG> 1.31556121C82.6841158 1.33369295 81.5443544 1.46931782 80.4269053 1.72034927 79.9590562 1.84975181 79.6450257 2,True
9,116,"dea\"",\""name\"":\""Gudea\"",\""url\"":\""//fonts.googleapis.com/css?family=Gudea:400,400i,700&display=swap\"",\""family\"":\""'Gudea', <TARGET_REGION>, serif\"",\""size\"":16,\""weight\"":400,\""weights\"":[400,700],\""styles\"":{\""letterSpacing\"":\""normal\"",\""textTransform\"":\""none\",True


#### Preprocessing the data

Before feeding text to our model for inference, we must first preprocess the data. We do this by using Transformer's `Tokenizer` to convert the inputs to their corresponding ids in the pretrained vocabulary and generate all other inputs that the model requires.

To do of this, we create our tokenizer with the `AutoTokenizer.from_pretrained` method, which will ensure:

- we get a tokenizer that corresponds to the model architecture we want to use,
- we download the vocabulary used when pretraining this specific checkpoint.

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Example tokenization output.

In [5]:
tokenizer("Hello, this is a sentence!")

{'input_ids': [101, 7592, 1010, 2023, 2003, 1037, 6251, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
print(f"Data: {dataset['train'][0]['data']}")

NameError: name 'dataset' is not defined

Visual of the preprocessing working on our dataset.

In [123]:
def preprocess_function(examples):
    return tokenizer(examples["data"], truncation=True)

In [124]:
preprocess_function(dataset["train"][:5])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [[101, 1063, 1000, 11834, 3593, 1000, 1024, 19701, 1010, 1000, 12997, 1000, 1024, 1000, 1026, 4539, 1035, 12997, 1028, 1000, 1010, 1000, 24471, 2140, 1000, 1024, 1000, 16770, 1024, 1013, 1013, 4339, 8029, 7971, 4710, 3406, 10259, 1012, 2149, 1013, 1000, 1010, 1000, 4344, 16550, 1000, 1024, 1000, 1014, 2063, 2620, 16576, 22203, 2487, 2278, 2620, 2278, 16048, 22932, 17914, 2692, 2094, 2692, 2581, 2497, 24434, 2063, 16576, 2050, 2629, 2063, 2581, 2050, 1000, 1010, 1000, 5310, 4270, 3372, 1000, 1024, 1000, 9587, 5831, 4571, 1013, 1019, 1012, 1014, 1006, 22228, 1025, 102], [101, 1015, 5714, 2361, 1000, 1024, 1000, 1000, 1010, 1000, 1035, 10763, 14876, 1000, 1024, 6270, 1010, 1000, 1035, 1046, 15900, 2015, 1000, 1024, 1000, 1000, 1010, 1000, 1035, 1048, 2475, 22540, 2121, 1000, 1024, 1031, 1033, 1010, 1000, 1035, 14085, 13876, 1000, 1024, 4601, 1010, 1000, 1035, 16914, 2099, 1000, 1024, 1015, 1010, 1000, 1035, 2019, 2278, 1000, 1024, 1031, 1033, 1010, 1000, 1035, 10047, 1000, 1

We apply this function to all the sentences using the `map` method of our `dataset` object we created earlier. This applies the function on all the elements of all the splits in `dataset`, so our training, validation and testing data will be preprocessed in one single command.

In [125]:
pre_tokenizer_columns = set(dataset["train"].features)
encoded_dataset = dataset.map(preprocess_function, batched=True)
tokenizer_columns = list(set(encoded_dataset["train"].features) - pre_tokenizer_columns)
print("Columns added by tokenizer:", tokenizer_columns)

Loading cached processed dataset at /Users/danielgoldelman/.cache/huggingface/datasets/json/default-0890664f5098fa31/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-f55788b6127bd9c1.arrow
Loading cached processed dataset at /Users/danielgoldelman/.cache/huggingface/datasets/json/default-0890664f5098fa31/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-e3ca9005b3c1aac9.arrow
Loading cached processed dataset at /Users/danielgoldelman/.cache/huggingface/datasets/json/default-0890664f5098fa31/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-74ea2dc6c6428e57.arrow


Columns added by tokenizer: ['attention_mask', 'input_ids', 'token_type_ids']


In [126]:
encoded_dataset["train"].features

{'id': Value(dtype='int64', id=None),
 'data': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['True', 'False'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

We now need to convert the datasets to a `tf.data.Dataset` to convert our dataset into the required formats. 

In [128]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

validation_key = (
    "validation_mismatched"
    if task == "mnli-mm"
    else "validation_matched"
    if task == "mnli"
    else "validation"
)
tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
    columns=tokenizer_columns,
    label_cols=["labels"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)
tf_validation_dataset = encoded_dataset[validation_key].to_tf_dataset(
    columns=tokenizer_columns,
    label_cols=["labels"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [129]:
tf_train_dataset

<PrefetchDataset shapes: ({input_ids: (16, None), token_type_ids: (16, None), attention_mask: (16, None)}, (16,)), types: ({input_ids: tf.int64, token_type_ids: tf.int64, attention_mask: tf.int64}, tf.int64)>

In [130]:
from transformers import TFAutoModelForSequenceClassification
import tensorflow as tf

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
num_labels = 2
model = TFAutoModelForSequenceClassification.from_pretrained(
model_checkpoint, num_labels=num_labels, from_pt=True
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['fit_denses.4.weight', 'fit_denses.4.bias', 'fit_denses.1.weight', 'fit_denses.0.bias', 'fit_denses.3.bias', 'fit_denses.3.weight', 'fit_denses.1.bias', 'fit_denses.2.weight', 'fit_denses.0.weight', 'fit_denses.2.bias']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classif

The `create_optimizer` function creates an `AdamW` optimizer with weights and learning rate decay. This helps with training networks. 

In [131]:
from transformers import create_optimizer

num_epochs = 5
batches_per_epoch = len(encoded_dataset["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
    init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps
)
model.compile(optimizer=optimizer, loss=loss)

We load the `metric` function from earlier in the code to compute metrics from the predictions. 

We wrap the metric computation in a `KerasMetricCallback`, which will compute the metrric on the validation set each epoc, which helps the `Tensorboard` and `EarlyStopping` callbacks.

NOTE: We may want to switch to a `Accuracy` callback instead.

In [132]:
from transformers.keras_callbacks import KerasMetricCallback

metric_name = (
    "pearson"
    if task == "stsb"
    else "matthews_correlation"
    if task == "cola"
    else "accuracy"
)


def compute_metrics(eval_predictions):
    predictions, labels = eval_predictions
    if task != "stsb":
        predictions = np.argmax(predictions, axis=1)
    else:
        predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)


metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=tf_validation_dataset
)

We must change the environment to allow for posting to HuggingFace.

In [133]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

We now finetune the model by calling the `fit` method. We add the `PushToHubCallback` to place the model in the HuggingFace Hub.

In [136]:
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

model_name = model_checkpoint.split("/")[-1]
push_to_hub_model_id = f"{model_name}-finetuned-{task}-pp-hua-d1"

tensorboard_callback = TensorBoard(log_dir="./text_classification_model_save/logs")

early_stopping_callback = EarlyStopping(patience=1)

push_to_hub_callback = PushToHubCallback(
    output_dir="./text_classification_model_save",
    tokenizer=tokenizer,
    hub_model_id=push_to_hub_model_id,
) 

callbacks = [metric_callback, tensorboard_callback, push_to_hub_callback, early_stopping_callback]

model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=3,
    callbacks=callbacks,
)

/Users/danielgoldelman/Desktop/privacy-tech-lab/privacy-pioneer-machine-learning/ourHua_d1/text_classification_model_save is already a clone of https://huggingface.co/dgoldelman/TinyBERT_General_4L_312D-finetuned-cola-pp-hua-d1. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch 1/3
Epoch 2/3
Epoch 3/3


Upload file tf_model.h5:   0%|          | 32.0k/54.9M [00:00<?, ?B/s]
Upload file tf_model.h5:  99%|█████████▊| 54.1M/54.9M [00:44<00:00, 1.44MB/s]To https://huggingface.co/dgoldelman/TinyBERT_General_4L_312D-finetuned-cola-pp-hua-d1
   b4958fd..5acf7c4  main -> main

Upload file tf_model.h5: 100%|██████████| 54.9M/54.9M [00:45<00:00, 1.27MB/s]
Upload file logs/train/events.out.tfevents.1648825112.Daniels-Computer.local.65897.5.v2: 100%|██████████| 973k/973k [00:45<00:00, 21.3kB/s]

[A
Upload file logs/validation/events.out.tfevents.1648825125.Daniels-Computer.local.65897.6.v2: 100%|██████████| 503/503 [00:45<?, ?B/s]


<keras.callbacks.History at 0x7fe2048147c0>