<a href="https://colab.research.google.com/github/rjac-ml/Gender-Bias-Wiki/blob/add-colab-code/ModelTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!rm -r sample_data/

In [30]:
from psutil import virtual_memory
import os 

ram_gb = virtual_memory().total / 1e9
NCPU = os.cpu_count()

print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
print("Number  of CPU {}\n".format(NCPU))

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Your runtime has 54.8 gigabytes of available RAM

Number  of CPU 8

Tue May 24 23:39:50 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    40W / 250W |  12889MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------

In [None]:
# Copy the prepared dataset folder from Google Drive into the working directory.
# NOTE(review): this needs Drive mounted at /content/drive; no mount cell is visible
# in this part of the notebook — confirm one runs before this cell.
! cp -r /content/drive/MyDrive/pipeline/dataset ./

### Libraries

In [None]:
%%capture
!pip install datasets transformers spacy

In [19]:
import pandas as pd
import numpy as np
from datasets import load_dataset,load_from_disk,load_metric
from transformers import (
    pipeline
    ,AutoModelForSequenceClassification
    ,AutoTokenizer
    ,Trainer
    ,TrainingArguments
    ,DataCollatorWithPadding
)
import spacy
import re

In [20]:
# F1 scorer loaded through `datasets.load_metric`; compute_metrics (defined right
# after this line) calls it on every evaluation pass.
metric = load_metric("f1")

def compute_metrics(eval_pred):
    """Score one evaluation pass with macro-averaged F1.

    Args:
        eval_pred: a ``(logits, labels)`` pair, as passed in by ``Trainer``.
            The class axis of ``logits`` is the last one.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # The classifier is built with three labels. The F1 metric defaults to
    # average="binary", which is rejected for multi-class targets, so an explicit
    # averaging mode is required. "macro" weights the three classes equally.
    return metric.compute(predictions=predictions, references=labels, average="macro")

In [21]:
#nlp = spacy.load("en_core_web_sm")

### Data Modeling
Deep learning and machine learning algorithms are developed to find patterns / statistical 
distributions in the data. This means that if there is no bias in the language used in a text, the model will not be able to find a pattern (even after the text has been de-gendered). As the paper mentions: "*In an ideal world, we would expect little difference between texts describing men, women, and people with other gender identities, aside from the use of explicitly gendered words, like pronouns or names. A machine learning model, then, would be unable to pick up on statistical differences among gender labels (i.e., gender bias), because such differences would not exist*" https://arxiv.org/abs/2005.00614

### Data Sourcing

In [None]:
# Load the dataset folder copied from Drive. Despite the name `df`, the result is
# a DatasetDict with `train` and `test` splits (see the display in the next cell).
df = load_from_disk("dataset")

In [None]:
# Display the splits with their column names and row counts.
df

DatasetDict({
    train: Dataset({
        features: ['features', 'target'],
        num_rows: 4845668
    })
    test: Dataset({
        features: ['features', 'target'],
        num_rows: 2076715
    })
})

In [None]:
# Rename the raw columns to `text` and `label`.
# NOTE(review): `df` is rebound in place, so re-running this cell after the rename
# will presumably fail because `target`/`features` no longer exist — reload first.
df = df.rename_columns({"target":"label","features":"text"})

In [None]:
def _lowercase_text(batch):
    """Return the batch's ``text`` column with every string lower-cased."""
    return {"text": [text.lower() for text in batch["text"]]}


# batched=True passes whole column slices to the function instead of invoking it
# once per row (about 6.9M rows across both splits); the resulting text is the same.
df = df.map(_lowercase_text, batched=True, num_proc=NCPU)

         

#0:   0%|          | 0/605709 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/605709 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/605709 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/605709 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/605708 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/605708 [00:00<?, ?ex/s]

  

#6:   0%|          | 0/605708 [00:00<?, ?ex/s]

#7:   0%|          | 0/605708 [00:00<?, ?ex/s]

          

#0:   0%|          | 0/259590 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/259590 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/259590 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/259589 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/259589 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/259589 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/259589 [00:00<?, ?ex/s]

#7:   0%|          | 0/259589 [00:00<?, ?ex/s]

In [None]:
# Name of the pretrained checkpoint; the next cell loads the matching tokenizer from it.
model_dir = "distilbert-base-uncased"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_dir)


def tokenize_function(lines):
    """Run the tokenizer over the ``text`` field of ``lines`` with truncation enabled.

    Padding is not requested here; a ``DataCollatorWithPadding`` is created later
    in the notebook.
    """
    return tokenizer(lines["text"], truncation=True)

In [None]:
# Tokenize both splits in batches, using one worker process per CPU (NCPU).
tokenized_datasets = df.map(tokenize_function, batched=True,num_proc=NCPU)

         

#0:   0%|          | 0/606 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/606 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/606 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/606 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/606 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/606 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/606 [00:00<?, ?ba/s]

#7:   0%|          | 0/606 [00:00<?, ?ba/s]

          

#0:   0%|          | 0/260 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/260 [00:00<?, ?ba/s]

#2:   0%|          | 0/260 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/260 [00:00<?, ?ba/s]

   

#5:   0%|          | 0/260 [00:00<?, ?ba/s]

#4:   0%|          | 0/260 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/260 [00:00<?, ?ba/s]

#7:   0%|          | 0/260 [00:00<?, ?ba/s]

In [None]:
def _shift_label(batch):
    """Add 1 to every entry of the batch's ``label`` column."""
    return {"label": [label + 1 for label in batch["label"]]}


# NOTE: this cell is not idempotent. Each run raises every label by one more, so
# run it exactly once per freshly tokenized dataset. The classifier is built with
# num_labels=3 and therefore needs labels in 0..2.
# batched=True avoids one Python call per row; the resulting labels are the same.
tokenized_datasets = tokenized_datasets.map(_shift_label, batched=True, num_proc=NCPU)

         

#0:   0%|          | 0/605709 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/605709 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/605709 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/605709 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/605708 [00:00<?, ?ex/s]

  

#5:   0%|          | 0/605708 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/605708 [00:00<?, ?ex/s]

#7:   0%|          | 0/605708 [00:00<?, ?ex/s]

          

#0:   0%|          | 0/259590 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/259590 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/259590 [00:00<?, ?ex/s]

#3:   0%|          | 0/259589 [00:00<?, ?ex/s]

   

#4:   0%|          | 0/259589 [00:00<?, ?ex/s]

#5:   0%|          | 0/259589 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/259589 [00:00<?, ?ex/s]

#7:   0%|          | 0/259589 [00:00<?, ?ex/s]

In [None]:
#tokenized_datasets.save_to_disk("tokenized_dataset-MalezeroNeutraloneFemaletwo")

In [None]:
#!cp -r tokenized_dataset-MalezeroNeutraloneFemaletwo /content/drive/MyDrive/pipeline

In [None]:
# Restore the previously saved tokenized dataset from Google Drive into the working
# directory, so the tokenization cells above do not have to be re-run.
# NOTE(review): needs Drive mounted at /content/drive — confirm a mount cell runs first.
!cp -r /content/drive/MyDrive/pipeline/tokenized_dataset-MalezeroNeutraloneFemaletwo ./

In [31]:
# Read the tokenized dataset from the working directory, i.e. the same place the
# preceding `cp ... ./` cell writes to. A relative path keeps both cells in
# agreement when the notebook's working directory is not /content.
tokenized_datasets = load_from_disk("tokenized_dataset-MalezeroNeutraloneFemaletwo")

In [32]:
# NOTE(review): model_dir and tokenizer repeat earlier definitions — presumably so the
# notebook can be resumed from the saved tokenized dataset without re-running the
# tokenization cells; confirm.
model_dir = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
# Pretrained DistilBERT with a sequence-classification head sized for three labels.
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=3)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.19.2",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10

In [33]:
# Human-readable class names for the three label ids.
id2label = {0: "M", 1: "N", 2: "F"}
model.config.id2label = id2label
# Keep the inverse mapping in sync; otherwise the saved config holds the new names
# in id2label but leaves label2id unchanged.
model.config.label2id = {name: idx for idx, name in id2label.items()}

In [34]:
# Freeze the DistilBERT encoder: turn off gradients for every parameter under
# `model.distilbert`; parameters outside that submodule are left untouched.
for encoder_weight in model.distilbert.parameters():
    encoder_weight.requires_grad = False

In [35]:
# Per-device batch size, used for both training and evaluation in TrainingArguments.
batch_size = 16
# Number of full batches in the training split, i.e. optimizer steps per epoch
# (assuming a single device and no gradient accumulation).
logging_steps = len(tokenized_datasets["train"]) // batch_size
#model_name = model_dir.split("/")[-1]

In [36]:
# Training configuration: one epoch, evaluation at the end of the epoch, and at most
# five checkpoints kept in `gender_classification`.
training_args = TrainingArguments(
    output_dir="gender_classification",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # `logging_steps` holds the number of steps in one epoch. Multiplying it by 10
    # exceeded the length of this one-epoch run, so the training loss was never
    # reported. Log roughly ten times per epoch instead.
    logging_steps=max(1, logging_steps // 10),
    save_total_limit=5
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [37]:
# Collator that pads each batch with the tokenizer; tokenize_function itself only
# truncates and does not request padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [38]:
# Assemble the Trainer from the pieces built in the preceding cells.
trainer = Trainer(
    model=model,
    args=training_args,
    # Splits used for fitting and for the end-of-epoch evaluation.
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # Batch assembly and scoring.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning. The recorded log for this run reports 302855 optimization steps
# and checkpoints written under gender_classification/ at steps 500, 1000 and 1500.
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4845668
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 302855


Epoch,Training Loss,Validation Loss


Saving model checkpoint to gender_classification/checkpoint-500
Configuration saved in gender_classification/checkpoint-500/config.json
Model weights saved in gender_classification/checkpoint-500/pytorch_model.bin
tokenizer config file saved in gender_classification/checkpoint-500/tokenizer_config.json
Special tokens file saved in gender_classification/checkpoint-500/special_tokens_map.json
Saving model checkpoint to gender_classification/checkpoint-1000
Configuration saved in gender_classification/checkpoint-1000/config.json
Model weights saved in gender_classification/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in gender_classification/checkpoint-1000/tokenizer_config.json
Special tokens file saved in gender_classification/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to gender_classification/checkpoint-1500
Configuration saved in gender_classification/checkpoint-1500/config.json
Model weights saved in gender_classification/checkpoint-1500/pytorch_