In [None]:
from fastai.text.all import *

## Data

In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Fetch the IMDB sample dataset through fastai and list what the resulting directory contains.
path = untar_data(URLs.IMDB_SAMPLE)
path.ls()

(#2) [Path('/home/renato/.fastai/data/imdb_sample/texts.csv'),Path('/home/renato/.fastai/data/imdb_sample/models')]

In [None]:
# Load the reviews table; the preview shows `label`, `text` and `is_valid` columns.
df = pd.read_csv(path/'texts.csv')
# NOTE: sample() is called without a seed, so the three rows shown differ between runs.
df.sample(3)

Unnamed: 0,label,text,is_valid
5,negative,"From the start, you know how this movie will end. It's so full of clichés your typical NRA member will not even like this movie. I give it 2 out of 10, only because of the acting of William Benton. I can't believe people voted 6+ for this movie. It's so biased towards a 'certain point of view' (once a thief...). People aren't born bad. Neither are they born good. They are born with a clean slate. It's society, parents and education what makes them who they are. And if they take the wrong turn, somewhere down the line, it certainly isn't going to be the American penal system that gets them ...",False
320,positive,If you ever see a stand up comedy movie this is the one. You will laugh nonstop if you have any sense of humor at all. This is a once in a lifetime performance from a once in a lifetime performer. This is a stand up standard.,False
68,negative,"The story is seen before, but that does'n matter if you can figure out to make a proper storyboard. It is clear that the director haven't spent his work on the storyboard. Alongside this, the cameraman spent far too much time leaning angles that do not match the message of the movie. The funniest is, however, if you take a look at the movie's website, you can read that it was on purpose that the director has chosen to make the film with bad camera angles. Because it remind us about hunting. But I have never heard of hunting with poor camera angles ;-) It will have 1 stars because the story...",False


In [None]:
# Convert the DataFrame into a Hugging Face Dataset: the target column is renamed
# to `labels` and then cast to a ClassLabel feature.
ds = (
    Dataset
    .from_pandas(df)
    .rename_columns({'label': 'labels'})
    .class_encode_column('labels')
)
ds

Casting to class labels: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 419262.69 examples/s]


Dataset({
    features: ['labels', 'text', 'is_valid'],
    num_rows: 1000
})

In [None]:
# Checkpoint identifier, used below to load both the tokenizer and the classifier.
model_name = 'microsoft/deberta-v3-small'

In [None]:
# Load the tokenizer that belongs to the `model_name` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [None]:
# First 50 characters of the first review, used as a short tokenization example.
sample_text = ds[0]['text'][:50]
sample_text

"Un-bleeping-believable! Meg Ryan doesn't even look"

In [None]:
# Show the subword pieces the tokenizer produces for the sample snippet.
tokenizer.tokenize(sample_text)

['▁Un',
 '-',
 'ble',
 'e',
 'ping',
 '-',
 'bel',
 'ievable',
 '!',
 '▁Meg',
 '▁Ryan',
 '▁doesn',
 "'",
 't',
 '▁even',
 '▁look']

In [None]:
# Tokenize every review in batches (with batched=True the lambda receives a batch of rows).
# NOTE: no max_length is given, so -- as the warnings emitted by this call state --
# the requested padding and truncation are not applied.
ds_tokenized = ds.map(
    lambda batch: tokenizer(batch['text'], padding='max_length', truncation=True),
    batched=True,
)

Map:   0%|                                                                                                                                                | 0/1000 [00:00<?, ? examples/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 6989.69 examples/s]


In [None]:
# Tokenization added the `input_ids`, `token_type_ids` and `attention_mask` columns.
ds_tokenized

Dataset({
    features: ['labels', 'text', 'is_valid', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [None]:
# Compare one review's raw text with the token ids produced for it.
row = ds_tokenized[0]
row['text'], row['input_ids']

("Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",
 [1,
  3405,
  271,
  10339,
  473,
  11166,
  271,
  13804,
  106105,
  300,
  18677,
  3805,
  702,
  280,
  297,
  402,
  468,
  342,
  2956,
  605,
  297,
  34028,
  934,
  267,
  291,
  261,
  319,
  3406,
  682,
  351,
  10658,
  342,
  9564,
  13350,
  608,
  3754,
  115934,
  260,
  6490,
  264,
  770,
  373,
  284,
  262,
  4834,
  277,
  291,
  1560,
  260,
  2904,
  4362,
  50557,
  294,
  339,
  747,
  265,
  5997,
  1400,
  303,
  315,
  1206,
  331,
  277,
  302,
  1876,
  30067,
  260,
  260,
  260,
  11275,
  42918,
  300,
  300,
  300,
  2872,
  291,
  284,
  3655,
 

In [None]:
# Distribution of review lengths, measured in characters (not tokens).
df.text.str.len().describe()

count    1000.000000
mean     1406.072000
std      1046.166506
min       195.000000
25%       723.000000
50%      1040.500000
75%      1742.250000
max      7382.000000
Name: text, dtype: float64

In [None]:
def get_ds(df, max_length=200):
    """Turn a reviews DataFrame into a tokenized Hugging Face `Dataset`.

    Args:
        df: DataFrame with a `label` column (the target) and a `text` column.
        max_length: Number of tokens every example is padded or truncated to.

    Returns:
        A `Dataset` in which `label` has been renamed to `labels` and
        class-encoded, extended with the columns produced by the tokenizer.
    """
    def tokenize(batch):
        # With batched=True, `batch['text']` holds the texts of a whole batch.
        return tokenizer(batch['text'], max_length=max_length, padding='max_length', truncation=True)

    return (
        Dataset
        # preserve_index=False: a filtered frame carries a non-range index that
        # would otherwise be stored as a stray `__index_level_0__` column.
        .from_pandas(df, preserve_index=False)
        .rename_columns({'label': 'labels'})
        .class_encode_column('labels')
        .map(tokenize, batched=True)
    )

In [None]:
# Build the splits from the `is_valid` flag; the validation rows are stored
# under the 'test' key.
split_filters = {'train': 'not is_valid', 'test': 'is_valid'}
ds = DatasetDict({
    split: get_ds(df.query(expr))
    for split, expr in split_filters.items()
})
ds

Casting to class labels: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 800/800 [00:00<00:00, 224624.66 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 800/800 [00:00<00:00, 8897.22 examples/s]
Casting to class labels: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 167003.94 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 8283.82 examples/s]


DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'is_valid', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['labels', 'text', 'is_valid', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 200
    })
})

## Model

In [None]:
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Training hyperparameters, consumed by TrainingArguments below.
bs = 64  # per-device batch size, used for both training and evaluation
epochs = 6  # number of training epochs
lr = 4e-5  # value passed as `learning_rate`

In [None]:
# Trainer configuration: cosine learning-rate schedule with a 10% warmup ratio,
# fp16 enabled, weight decay, and an evaluation pass after every epoch.
args = TrainingArguments(
    output_dir='outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy='epoch',
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    weight_decay=0.01,
)

In [None]:
# Accuracy metric object from the `evaluate` library, reused by every call below.
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class of each
    example is the index of its largest logit along the last axis.

    Returns whatever `accuracy.compute` returns for those predictions.
    """
    scores, targets = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return accuracy.compute(references=targets, predictions=predicted)

In [None]:
# Load the pretrained checkpoint with a 2-label sequence-classification head
# (the dataset's classes are 'negative' and 'positive').
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Display the module tree of the loaded model.
model

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=T

In [None]:
# Wire the model, arguments, data and metric function together; the 'test'
# split serves as the evaluation set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; validation loss and accuracy are reported once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.680451,0.84
2,No log,0.380912,0.83
3,No log,0.321949,0.86
4,No log,0.352493,0.875
5,No log,0.365141,0.875
6,No log,0.372881,0.875


TrainOutput(global_step=78, training_loss=0.33954180203951323, metrics={'train_runtime': 38.5556, 'train_samples_per_second': 124.495, 'train_steps_per_second': 2.023, 'total_flos': 248385219840000.0, 'train_loss': 0.33954180203951323, 'epoch': 6.0})

In [None]:
# Inspect the ClassLabel feature to see which integer corresponds to which class name.
ds['test'].features['labels']

ClassLabel(names=['negative', 'positive'], id=None)

In [None]:
# Predicted class for each test example: the column index of the largest logit.
test_output = trainer.predict(ds['test'])
preds = test_output.predictions.argmax(axis=1)
preds

array([1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1])

In [None]:
# Side-by-side view of each test review, its true label and the predicted label
# (0 = negative, 1 = positive, following the order of the ClassLabel names).
pd.DataFrame({
    'text': ds['test']['text'],
    'labels': ds['test']['labels'],
    'preds': preds
})

Unnamed: 0,text,labels,preds
0,"This very funny British comedy shows what might happen if a section of London, in this case Pimlico, were to declare itself independent from the rest of the UK and its laws, taxes & post-war restrictions. Merry mayhem is what would happen.<br /><br />The explosion of a wartime bomb leads to the discovery of ancient documents which show that Pimlico was ceded to the Duchy of Burgundy centuries ago, a small historical footnote long since forgotten. To the new Burgundians, however, this is an unexpected opportunity to live as they please, free from any interference from Whitehall.<br /><br />...",1,1
1,"I saw this movie once as a kid on the late-late show and fell in love with it.<br /><br />It took 30+ years, but I recently did find it on DVD - it wasn't cheap, either - in a catalog that specialized in war movies. We watched it last night for the first time. The audio was good, however it was grainy and had the trailers between reels. Even so, it was better than I remembered it. I was also impressed at how true it was to the play.<br /><br />The catalog is around here someplace. If you're sincere in finding it, fire me a missive and I'll see if I can get you the info. cartwrightbride@yah...",1,1
2,"This is, in my opinion, a very good film, especially for Michael Jackson lovers. It contains a message on drugs, stunning special effects, and an awesome music video.<br /><br />The main film is centered around the song and music video 'Smooth Criminal.' Unlike the four-minute music video, it is normal speed and, in my opinion, much easier to watch.<br /><br />The plot is rather weird, however. Michael Jackson plays a magical 'gangster' that, when he sees a shooting star, he transforms into a piece of machinery. Throughout the film, he transforms into a race car, a giant robot, and a space...",1,1
3,"In Iran, women are not permitted to attend men's sporting events, apparently to ""protect"" them from all the cursing and foul language they might hear emanating from the male fans (so since men can't restrain or behave themselves, women are forced to suffer. Go figure.). ""Offside"" tells the tale of a half dozen or so young women who, dressed like men, attempt to sneak into the high-stakes match between Iran and Bahrain that, in 2005, qualified Iran to go to the World Cup (the movie was actually filmed in large part during that game).<br /><br />""Offside"" is a slice-of-life comedy that will ...",1,1
4,"""In April 1946, the University of Chicago agreed to operate Argonne National Laboratory, with an association of Midwestern universities offering to sponsor the research. Argonne thereby became the first ""national"" laboratory. It did not, however, remain at its original location in the Argonne forest. In 1947, it moved farther west from the ""Windy City"" to a new site on Illinois farmland. When Alvin Weinberg visited Argonne's director, Walter Zinn, in 1947, he asked him what kind of reactor was to be built at the new site. When Zinn described a heavy-water reactor operating at one-tenth the...",1,0
...,...,...,...
195,"There are many different versions of this one floating around, so make sure you can locate one of the unrated copies, otherwise some gore and one scene of nudity might be missing. Some versions also omit most of the opening sequence and other bits here and there. The cut I saw has the on-screen title WITCHCRAFT: EVIL ENCOUNTERS and was released by Shriek Show, who maintain the original US release title WITCHERY for the DVD release. It's a nice-looking print and seems to have all of the footage, but has some cropping/aspect ratio issues. In Italy, it was released as LA CASA 4 (WITCHCRAFT). ...",0,1
196,"Once upon a time Hollywood produced live-action, G-rated movies without foul language, immorality, and gore-splattered violence. These movies neither insulted your intelligence no manipulated your emotions. The heroes differed little from the crowd. They shared the same feelings and bore the same burdens. Since the 1970s, the film industry has pretty much written off G-rated movies for adults. Basically, modern mature audiences demand large doses of embellished realism for their cinematic diet, laced heavily with vile profanity, mattress-thumping sex, and knuckle-bruising fisticuffs. These...",1,0
197,"Wenders was great with Million $ Hotel.I don't know how he came up with this film! The idea of giving the situation after spt11 and the view of American Society is hopeful,that makes it 2 out of ten.But this is not a movie.Is that the best someone can do with a great idea(the west-east clash).There are important things going on in middle east and it is just issued on the screen of a MAC* with the fingers of an Amerian girl who is actually at the level of stupidity(because she is just ignorant about the facts).The characters are not well shaped.And the most important thing is the idea that ...",0,0
198,"Although a film with Bruce Willis is always worth watching, you better skip this one. I watched this one on television, so I didn't have to plunk down cash for it. Lucky me.<br /><br />The plot develops slowly, very slowly. Although the first 30 minutes or so are quite believable, it gets more and more unbelievable towards the end. It is highly questionable, if a seasoned soldier like Lt. Waters would disobey direct orders. And even if he would, if the rest of his platoon would. They know he puts them in direct danger, and they know they will certainly die if they follow him, but what the ...",0,1
