In [None]:
!pip install datasets transformers torch >> /dev/null

In [None]:
# Mount Google Drive at /content/drive; the data file read below and the saved
# model at the end of the notebook both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import pandas as pd

def load_qa(path: str='sof_qa.json') -> pd.DataFrame:
    """Read a JSON file and return its contents as a DataFrame.

    Args:
        path: Location of the UTF-8 encoded JSON file to read.

    Returns:
        The DataFrame obtained by passing the parsed JSON to ``pd.DataFrame``.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)
    return pd.DataFrame(records)

In [None]:
# Load the Q&A dump from Drive, discard the `title` and `answers` columns, and
# rename `type` -> `label` and `question` -> `text` (the tokenization step reads
# the `text` column).
df = (
    load_qa('/content/drive/MyDrive/Colab Notebooks/sof_qa.json')
    .drop(['title', 'answers'], axis=1)
    .rename(columns={'type': 'label', 'question': 'text'})
)
df.head()

Unnamed: 0,label,text
0,0,\nI'm trying to write a script to check a webs...
1,0,\nHow to solve No module named 'selenium' in V...
2,0,\nI get an error while running this selenium s...
3,0,"\nI'm making a project with selenium, and the ..."
4,0,\nI am using python 3.6 on windows10. I create...


In [None]:
# Convert the pandas frame into a Hugging Face `datasets.Dataset` and display
# its summary (feature names and row count).
from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['label', 'text'],
    num_rows: 50
})

In [None]:
# Hold out 30% of the rows as the test split.
# The split is rebuilt from `df` rather than from `dataset`: this cell rebinds
# `dataset` to a DatasetDict, which has no `train_test_split`, so splitting
# `dataset` itself would fail when the cell is run a second time.
# The shuffle is seeded so the same rows land in train/test on every run.
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.3, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 35
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 15
    })
})

In [None]:
# Look at the first training example (a dict with `label` and `text`).
dataset["train"][0]

{'label': 1,
 'text': '\nI keep getting this error:\nselenium.common.exceptions.WebDriverException: Message: \'chromedriver.exe\' executable needs to be in PATH. Please see https://chromedriver.chromium.org/home\nMy Script: (MAC)\n\nWhat should I do? The Path is okay. I though there is something wrong with the format so I added ".exe" to file, but I am getting the same issue.\nNote: The browser and package version are the same (96)\n'}

In [None]:
# Load the tokenizer for the "distilbert-base-uncased" checkpoint, the same
# checkpoint name the classification model is loaded from later on.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Relies on the notebook-level ``tokenizer``; whatever the tokenizer returns
    is handed back unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize both splits; batched=True hands `preprocess_function` a batch of
# examples per call instead of a single example.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Collator that uses `tokenizer` to pad the examples of a batch; it is passed
# to the Trainer as `data_collator`.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load "distilbert-base-uncased" with a sequence-classification head that has
# 6 output labels.
# NOTE(review): num_labels=6 is hard-coded — confirm it matches the set of
# values in the `label` column of the full dataset.
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=6)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_

In [None]:
# Hyperparameters for fine-tuning; Trainer output is written under ./results.
# NOTE(review): no evaluation schedule is set here, so presumably the eval
# split given to the Trainer is never scored during training — confirm whether
# that is intended.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=50,
    weight_decay=0.01,
)

# Bundle the model, training arguments, tokenized train/test splits, tokenizer
# and padding collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 35
  Num Epochs = 50
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 450


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=450, training_loss=0.3359487575954861, metrics={'train_runtime': 41.0163, 'train_samples_per_second': 42.666, 'train_steps_per_second': 10.971, 'total_flos': 82445362220628.0, 'train_loss': 0.3359487575954861, 'epoch': 50.0})

In [None]:
# Sanity-check the fine-tuned model on a single question.
# `torch` is imported here because no earlier cell imports it; without this
# import the cell raises NameError on a freshly started kernel.
import torch
from transformers import TextClassificationPipeline

# Move the model to the CPU before handing it to the pipeline.
device = torch.device('cpu')
model.to(device)
# Put the model in evaluation mode so dropout is disabled for inference.
model.eval()
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)
text = "I'm making a project with selenium, and the following error is disturbing me. I'm using VSCode with Code Runner, and my OS is Ubuntu 20.04:"
pipe(text)

[{'label': 'LABEL_0', 'score': 0.9761324524879456}]

In [None]:
# Persist the fine-tuned model to Drive. The tokenizer files are written to the
# same location so the saved directory can be reloaded on its own with
# `from_pretrained`, without having to remember the base checkpoint name.
# NOTE: despite the ".pt" suffix, this path is a directory (config and weights
# are written inside it), not a single file.
tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/model.pt')
model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/model.pt')

Configuration saved in /content/drive/MyDrive/Colab Notebooks/model.pt/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/model.pt/pytorch_model.bin
