In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/contradictory-my-dear-watson/sample_submission.csv
/kaggle/input/contradictory-my-dear-watson/train.csv
/kaggle/input/contradictory-my-dear-watson/test.csv


In [2]:
import torch
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=3).to(device)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

In [4]:
train_df=pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
train_df=train_df.sample(frac=1.0)
val_df=train_df.iloc[:2000, :]
train_df=train_df.iloc[2000:, :]
train_df['input_text']=train_df['premise']+' [SEP] '+train_df['hypothesis']
val_df['input_text']=val_df['premise']+' [SEP] '+val_df['hypothesis']


In [5]:
from datasets import Dataset
train_ds=Dataset.from_pandas(train_df)
val_ds=Dataset.from_pandas(val_df)

In [6]:
train_ds

Dataset({
    features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label', 'input_text', '__index_level_0__'],
    num_rows: 10120
})

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

In [8]:
def preprocess_function(examples):
    return tokenizer(examples["input_text"], truncation=True)

In [9]:
train_ds_encoded = train_ds.map(preprocess_function, batched=True)
val_ds_encoded = val_ds.map(preprocess_function, batched=True)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
    labels=pred.label_ids
    preds=pred.predictions.argmax(-1)
    f1=f1_score(labels, preds, average='weighted')
    ac=accuracy_score(labels, preds)
    return {"accuracy":ac, "f1":f1}

In [12]:
os.environ["WANDB_DISABLED"] = "true"

In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds_encoded,
    eval_dataset=val_ds_encoded,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the training set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: language, hypothesis, lang_abv, input_text, id, premise, __index_level_0__. If language, hypothesis, lang_abv, input_text, id, premise, __index_level_0__ are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10120
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3165


Step,Training Loss
500,1.0448
1000,0.7937
1500,0.6389
2000,0.5059
2500,0.395
3000,0.2918


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_toke

TrainOutput(global_step=3165, training_loss=0.5953111030855827, metrics={'train_runtime': 677.902, 'train_samples_per_second': 74.642, 'train_steps_per_second': 4.669, 'total_flos': 2261221378837344.0, 'train_loss': 0.5953111030855827, 'epoch': 5.0})

In [14]:
val_output=trainer.predict(val_ds_encoded)

The following columns in the test set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: language, hypothesis, lang_abv, input_text, id, premise, __index_level_0__. If language, hypothesis, lang_abv, input_text, id, premise, __index_level_0__ are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


In [15]:
val_output.metrics

{'test_loss': 0.9649223685264587,
 'test_accuracy': 0.724,
 'test_f1': 0.7243088309402833,
 'test_runtime': 5.8803,
 'test_samples_per_second': 340.121,
 'test_steps_per_second': 21.258}

In [16]:
test_df=pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
test_df['input_text']=test_df['premise']+' [SEP] '+test_df['hypothesis']
test_ds=Dataset.from_pandas(test_df)
test_ds_encoded=test_ds.map(preprocess_function, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

In [17]:
test_output=trainer.predict(test_ds_encoded)
outputs=test_output.predictions.argmax(-1)
sub_df=pd.DataFrame({"id": test_df["id"], "prediction": outputs})
sub_df.set_index("id", inplace=True)
sub_df.to_csv("submission.csv")

The following columns in the test set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: language, hypothesis, lang_abv, input_text, id, premise. If language, hypothesis, lang_abv, input_text, id, premise are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 5195
  Batch size = 16


In [18]:
sub_df

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
c6d58c3f69,2
cefcc82292,2
e98005252c,0
58518c10ba,1
c32b0d16df,0
...,...
5f90dd59b0,1
f357a04e86,2
1f0ea92118,0
0407b48afb,0
