In [None]:

import torch
import pandas as pd

from transformers import TrainingArguments, Trainer
from torch import cuda
from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
from transformers import pipeline, DistilBertForSequenceClassification, DistilBertTokenizerFast


In [2]:

device = 'cuda' if cuda.is_available() else 'cpu'


In [3]:

model_path = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizerFast.from_pretrained( model_path  )

model = DistilBertForSequenceClassification.from_pretrained(
                     model_path, id2label={0: "NEG", 1: "POS"},
                     label2id={"NEG": 0, "POS": 1}
)


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

In [4]:

imdb_train = load_dataset('imdb', split="train")

imdb_test = load_dataset('imdb', split="test[:6250]+test[-6250:]")

imdb_val = load_dataset('imdb', split='test[6250:12500]+test[-12500:-6250]')

print(  imdb_train.shape  )
print(  imdb_test.shape   )
print(  imdb_val.shape    )


Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to C:/Users/user1/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to C:/Users/user1/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


Found cached dataset imdb (C:/Users/user1/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (C:/Users/user1/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


(25000, 2)
(12500, 2)
(12500, 2)


In [5]:


enc_train = imdb_train.map(lambda e: tokenizer(e['text'], padding=True, truncation=True), batched=True,
                  batch_size=1000  )

enc_test = imdb_test.map(lambda e: tokenizer(e['text'], padding=True, truncation=True), batched=True, 
                  batch_size=1000)

enc_val = imdb_val.map(lambda e: tokenizer(e['text'], padding=True, truncation=True), batched=True, 
                  batch_size=1000)




Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

In [6]:
print(  pd.DataFrame(enc_train)   )

                                                    text  label  \
0      I rented I AM CURIOUS-YELLOW from my video sto...      0   
1      "I Am Curious: Yellow" is a risible and preten...      0   
2      If only to avoid making this type of film in t...      0   
3      This film was probably inspired by Godard's Ma...      0   
4      Oh, brother...after hearing about this ridicul...      0   
...                                                  ...    ...   
24995  A hit at the time but now better categorised a...      1   
24996  I love this movie like no other. Another time ...      1   
24997  This film and it's sequel Barry Mckenzie holds...      1   
24998  'The Adventures Of Barry McKenzie' started lif...      1   
24999  The story centers around Barry McKenzie who mu...      1   

                                               input_ids  \
0      [101, 1045, 12524, 1045, 2572, 8025, 1011, 375...   
1      [101, 1000, 1045, 2572, 8025, 1024, 3756, 1000...   
2      [101, 20

In [9]:

training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir='./MyIMDBModel', 
    do_train=True,
    do_eval=True,
    #  The number of epochs, defaults to 3.0 
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=16,
    # Number of steps used for a linear warmup
    warmup_steps=100,                
    weight_decay=0.01,
    logging_strategy='steps', 
    # TensorBoard log directory               
    logging_dir='./logs',            
    logging_steps=50,
    # other options : no, steps
    evaluation_strategy="steps",
    fp16 = cuda.is_available(),
    save_strategy="epoch"
    #load_best_model_at_end=True
)


In [10]:

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds)
    acc = accuracy_score(labels, preds)
    return {
        'Accuracy': acc,
        'f1': f1
    }


In [13]:

trainer = Trainer(
    # the pre-trained model that will be fine-tuned 
    model=model,
     # training arguments that we defined above                        
    args=training_args,
    # training and validation dataset                 
    train_dataset=enc_train,         
    eval_dataset=enc_val,            
    compute_metrics= compute_metrics
)


In [14]:

## results = trainer.train()


In [15]:

# saving the best fine-tuned model & tokenizer
model_save_path = "MyBestIMDBModel"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)


Saving model checkpoint to MyBestIMDBModel
Configuration saved in MyBestIMDBModel\config.json
Model weights saved in MyBestIMDBModel\pytorch_model.bin
tokenizer config file saved in MyBestIMDBModel\tokenizer_config.json
Special tokens file saved in MyBestIMDBModel\special_tokens_map.json


('MyBestIMDBModel\\tokenizer_config.json',
 'MyBestIMDBModel\\special_tokens_map.json',
 'MyBestIMDBModel\\vocab.txt',
 'MyBestIMDBModel\\added_tokens.json',
 'MyBestIMDBModel\\tokenizer.json')

In [None]:

def get_prediction(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=250, return_tensors="pt").to(device)
    outputs = model(inputs["input_ids"].to(device),inputs["attention_mask"].to(device))
    probs = outputs[0].softmax(1)
    return probs, probs.argmax()


In [None]:


model.to(device)
text = "I didn't like the movie since it bored me "
res = get_prediction(text)[1].item()
print(res)



In [None]:

model = DistilBertForSequenceClassification.from_pretrained("MyBestIMDBModel")
tokenizer= DistilBertTokenizerFast.from_pretrained("MyBestIMDBModel")
nlp= pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)



In [None]:
r1 = nlp("the movie was very impressive")
r2 = nlp("the script of the picture was very poor")



In [None]:
print(r1)
print(r2)