In [1]:
import pandas as pd

In [2]:
import os
import random
import numpy as np
import torch

# GPU selection has to be in the environment before CUDA is first queried:
# devices are enumerated in PCI bus order and only index 1 is exposed.
# NOTE(review): on a machine with a single GPU index 1 would not exist and
# `device` would end up as 'cpu' - confirm this matches the target host.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Seed the torch, stdlib and NumPy generators with the same value (0).
for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    seed_fn(0)

In [3]:
# Load the labelled sentences; later cells read the 'text' and 'label' columns.
csv_path = '../../data/100_sentiment_analysis_sentences.csv'
df = pd.read_csv(csv_path)

In [4]:
# Encode the string sentiment labels as integer class ids.
# The result is assigned back to the frame instead of calling
# `replace(..., inplace=True)` on the `df['label']` selection: that chained
# in-place form only edits a temporary under pandas Copy-on-Write and leaves
# `df` unchanged. Re-running this cell is harmless because values that are
# already integers are not touched by the mapping.
label_to_id = {'NEGATIVE': 0, 'NEUTRAL': 1, 'POSITIVE': 2}
# astype pins an integer dtype regardless of pandas version and fails right
# here if a label outside the mapping is present.
df['label'] = df['label'].replace(label_to_id).astype('int64')

In [5]:
from transformers import DistilBertForSequenceClassification,AutoTokenizer
# One checkpoint name feeds both the tokenizer and the classifier.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Classification head with three outputs, one per label id (0, 1, 2).
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
)
# Move the weights to the selected device; as the last expression this also
# displays the module structure.
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [9]:
#Dataset preparation
from sklearn.model_selection import train_test_split


# Sentences and their integer class ids as plain Python lists.
X = df["text"].tolist()
y = df["label"].tolist()
# random_state is fixed so the 80/20 split is identical every time this cell
# runs. Without it the split draws from NumPy's global generator and changes
# whenever the cell is re-run without re-executing the seeding cell first.
# The value 0 matches the seed used for the other generators in this notebook.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
# Pad each split to its longest sequence and truncate anything over 512 tokens.
X_train_tokenized_text = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized_text = tokenizer(X_val, padding=True, truncation=True, max_length=512)


'\ntrain_data = Dataset(X_train_tokenized_text, y_train)\ntrain_loader = DataLoader(train_data,batch_size=16, shuffle=True)\nval_data = Dataset(X_val_tokenized_text, y_val)\nval_loader = DataLoader(val_data,batch_size=16, shuffle=True)\n'

In [10]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: mapping from field name (must include "input_ids") to a
            per-example sequence of values, as produced by the tokenizer.
        labels: optional per-example labels (list, array or tensor). When
            omitted or empty, items carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus "labels" if set)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of truthiness: `if self.labels:`
        # raises for multi-element arrays and tensors (ambiguous truth value).
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])

# Wrap each tokenized split together with its labels for the Trainer.
train_dataset, val_dataset = (
    Dataset(X_train_tokenized_text, y_train),
    Dataset(X_val_tokenized_text, y_val),
)


In [11]:
# Show which fields the tokenizer produced for the training split.
X_train_tokenized_text.keys()

dict_keys(['input_ids', 'attention_mask'])

In [17]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
def compute_metrics(pred_eval):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred_eval: a pair ``(predictions, labels)``; ``predictions`` holds one
            row of class scores per example and ``labels`` the true class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    pred, labels = pred_eval
    # Highest-scoring class per example.
    pred = np.argmax(pred, axis=1)

    # zero_division=0 scores a class with an empty denominator (e.g. one that
    # is never predicted) as 0 explicitly instead of relying on the default,
    # which yields the same 0 but emits a warning on every evaluation.
    weighted = {"average": "weighted", "zero_division": 0}

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, **weighted)
    precision = precision_score(y_true=labels, y_pred=pred, **weighted)
    f1 = f1_score(y_true=labels, y_pred=pred, **weighted)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [18]:
from transformers import TrainingArguments, Trainer
# Training configuration: a single epoch with batches of 16, checkpoints
# written under "output".
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    # The string "none" disables every logging integration. Passing Python
    # None does not: it falls back to the default, which reports to all
    # installed integrations.
    report_to="none",
)
# The Trainer evaluates on val_dataset and scores it with compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [19]:
# Switch the Weights & Biases integration off through its environment flag.
# NOTE(review): the log printed when the Trainer was built reports this
# variable as deprecated in favour of `report_to`, and this cell runs after
# the Trainer already exists - confirm it still has any effect.
os.environ.update(WANDB_DISABLED="true")

In [20]:

# Fine-tune the classifier on train_dataset with the settings in `args`;
# the returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 80
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5


  0%|          | 0/5 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 19.1797, 'train_samples_per_second': 4.171, 'train_steps_per_second': 0.261, 'train_loss': 1.026472282409668, 'epoch': 1.0}


TrainOutput(global_step=5, training_loss=1.026472282409668, metrics={'train_runtime': 19.1797, 'train_samples_per_second': 4.171, 'train_steps_per_second': 0.261, 'train_loss': 1.026472282409668, 'epoch': 1.0})

In [21]:
# Score the fine-tuned model on val_dataset; the displayed dict holds the loss
# and the compute_metrics values under "eval_"-prefixed keys.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 20
  Batch size = 8


  0%|          | 0/3 [00:00<?, ?it/s]

<class 'transformers.trainer_utils.EvalPrediction'>


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.0770832300186157,
 'eval_accuracy': 0.4,
 'eval_precision': 0.24000000000000005,
 'eval_recall': 0.4,
 'eval_f1': 0.3,
 'eval_runtime': 0.8825,
 'eval_samples_per_second': 22.662,
 'eval_steps_per_second': 3.399,
 'epoch': 1.0}

In [22]:
# Persist the fine-tuned weights and config; the inference cell loads the
# model back from this same directory.
trainer.save_model(output_dir="../models/embedding_hugging_face")

Saving model checkpoint to ../models/embedding_hugging_face
Configuration saved in ../models/embedding_hugging_face\config.json
Model weights saved in ../models/embedding_hugging_face\pytorch_model.bin


In [31]:
#Inference Code
from transformers import DistilBertForSequenceClassification,AutoTokenizer
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
saved_model = DistilBertForSequenceClassification.from_pretrained("../models/embedding_hugging_face")
# The inputs below are moved to `device`, so the weights must live there too;
# otherwise the forward pass fails with a device mismatch when CUDA is used.
saved_model.to(device)
# Inference mode for dropout layers.
saved_model.eval()

text = "Tesla produced cars"
inputs = tokenizer(text, padding=True, truncation=True, max_length=512,return_tensors='pt').to(device)
# No gradients are needed for prediction; skipping them avoids building the
# autograd graph.
with torch.no_grad():
    outputs = saved_model(**inputs)
print(outputs)
print(outputs.logits)
# argmax over the class dimension (not the flattened tensor), so the index is
# a class id; .item() is valid because a single sentence is scored.
predicted_class_id = outputs.logits.argmax(dim=-1).item()
print(predicted_class_id)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\metes/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.19.2",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at C:\Users\metes/.cache\huggingface\transformers\0e1bbfda7f6

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0370, -0.0343, -0.0549]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[-0.0370, -0.0343, -0.0549]], grad_fn=<AddmmBackward0>)
1
