In [1]:
import numpy as np 
import pandas as pd 
import os 
from transformers import *
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler 
import time
from datetime import datetime 
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm
import torch
import torch.nn as nn 
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight

INF = 999999999999999



In [2]:
train = pd.read_csv("train.csv") 
train.dropna(inplace=True)
train.head()

Unnamed: 0,ID,prompt,answer,AI
0,690.0,What is the future of AI?,The future of Artificial Intelligence (AI) is ...,1.0
1,304.0,What is your biggest challenge in your career?,My biggest challenge in my career is balancing...,0.0
2,63.0,What is the tallest mountain in the world?,The tallest mountain in the world is Mount Eve...,1.0
3,894.0,What are the best 5 super cars?,McLaren 720S.\r\nFerrari 296 GTB.\r\nAudi R8.\...,0.0
4,796.0,What is deep learning?,a type of machine learning based on artificial...,0.0


In [3]:
test = pd.read_csv("test.csv") 
test.head()

Unnamed: 0,ID,prompt,answer
0,710,what's the best fast food,"As I mentioned earlier, ""best"" fast food is su..."
1,487,What is JSX in React?,JSX is a syntax extension for JavaScript that ...
2,136,What is the name of the reaction where a subst...,The reaction where a substance combines with o...
3,44,What is your opinion on the current state of t...,The current state of the environment is a caus...
4,627,What is deep learning?,Deep learning is a subfield of machine learnin...


In [4]:
train.shape, test.shape

((746, 4), (249, 3))

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large") 

class MeanPooling(nn.Module): 
    def __init__(self): 
        super(MeanPooling, self).__init__() 
    def forward(self, last_hidden_state, attention_masks): 
        input_mask_expanded = attention_masks.unsqueeze(-1).expand(last_hidden_state.size()).float() 
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1) 
        sum_mask = input_mask_expanded.sum(1) 
        sum_mask = torch.clamp(sum_mask, min=1e-9) 
        mean_embeddings = sum_embeddings / sum_mask 
        return mean_embeddings 
    
class MultiSampleDropout(nn.Module): 
    def __init__(self, max_dropout_rate, num_samples, classifier):
        super(MultiSampleDropout, self).__init__()
        self.dropout = nn.Dropout
        self.classifier = classifier
        self.max_dropout_rate = max_dropout_rate
        self.num_samples = num_samples
    def forward(self, out):
        return torch.mean(torch.stack([self.classifier(self.dropout(p=rate)(out)) for _, rate in enumerate(np.linspace(0, self.max_dropout_rate, self.num_samples))], dim=0), dim=0)

class ChatGPTDetector(nn.Module): 
    def __init__(self, num_classes=2): 
        super(ChatGPTDetector, self).__init__() 
        self.num_classes = num_classes 
        self.config = AutoConfig.from_pretrained("microsoft/deberta-v3-large")
        self.lm = AutoModel.from_pretrained("microsoft/deberta-v3-large") 
        self.mean_pooler = MeanPooling() 
        self.fc = nn.Linear(self.config.hidden_size, self.num_classes) 
        self._init_weights(self.fc) 
        self.multi_dropout = MultiSampleDropout(0.2, 8, self.fc) 
    def _init_weights(self, module):
        if isinstance(module, nn.Linear): 
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 
            if module.bias is not None: 
                module.bias.data.zero_() 
    def forward(self, input_ids, attn_masks):
        x = self.lm(input_ids, attn_masks)[0]
        x = self.mean_pooler(x, attn_masks) 
        x = self.multi_dropout(x)
        return x 

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--deberta-v3-large/snapshots/7dca0f282d1f46ecd957a64a1c6ae23dc83d7ccb/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

l

In [6]:
input_ids, attn_masks = [], [] 

prompts = train["prompt"].values 
answers = train["answer"].values 
labels = train["AI"].values

for i in tqdm(range(len(prompts)), position=0, leave=True): 
    encoded_input = tokenizer(str(prompts[i]), str(answers[i]), max_length=512, truncation=True, padding="max_length") 
    input_ids.append(encoded_input["input_ids"])
    attn_masks.append(encoded_input["attention_mask"]) 

  0%|          | 0/746 [00:00<?, ?it/s]

In [7]:
input_ids = torch.tensor(input_ids, dtype=int) 
attn_masks = torch.tensor(attn_masks, dtype=int) 
labels = torch.tensor(labels, dtype=int) 

input_ids.shape, attn_masks.shape, labels.shape

(torch.Size([746, 512]), torch.Size([746, 512]), torch.Size([746]))

In [8]:
def flat_accuracy(preds, labels): 
    pred_flat = np.argmax(preds, axis=1).flatten() 
    labels_flat = labels.flatten() 
    return np.sum(pred_flat == labels_flat) / len(labels_flat) 

In [13]:
kf = StratifiedKFold(n_splits=5) 

for idx, (train_idx, val_idx) in enumerate(kf.split(input_ids, labels)): 
    print(f"======== KFOLD {idx+1} ========")
    train_input_ids, val_input_ids = input_ids[train_idx], input_ids[val_idx]
    train_attn_masks, val_attn_masks = attn_masks[train_idx], attn_masks[val_idx] 
    train_labels, val_labels = labels[train_idx], labels[val_idx] 
    
    batch_size = 16
    train_data = TensorDataset(train_input_ids, train_attn_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) 
    
    val_data = TensorDataset(val_input_ids, val_attn_masks, val_labels) 
    val_sampler = SequentialSampler(val_data) 
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size) 
    
    class_weights = compute_class_weight(class_weight="balanced", classes=torch.unique(train_labels).numpy(), y=train_labels.numpy())
    class_weights = torch.tensor(class_weights).float().to(device) 
    loss_func = nn.CrossEntropyLoss(weight=class_weights) 
    #loss_func = nn.CrossEntropyLoss() 
    
    best_val_accuracy = 0 
    
    model = ChatGPTDetector() 
    checkpoint = torch.load("DeBERTaLarge_Wiki_1.pt")
    print(model.load_state_dict(checkpoint)) 
    model.cuda()
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8) 
    epochs = 5
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = int(0.05*total_steps), 
                                                num_training_steps = total_steps) 
    model.zero_grad()
    for epoch_i in tqdm(range(0, epochs), desc="Epochs", position=0, leave=True, total=epochs):
        train_loss, train_accuracy = 0, 0   
        model.train() 
        with tqdm(train_dataloader, unit="batch") as tepoch: 
            for step, batch in enumerate(tepoch): 
                batch = tuple(t.to(device) for t in batch) 
                b_input_ids, b_attn_masks, b_labels = batch 
                outputs = model(b_input_ids, b_attn_masks) 
                loss = loss_func(outputs, b_labels) 
                train_loss += loss.item() 
                train_accuracy += flat_accuracy(outputs.detach().cpu().numpy(), b_labels.detach().cpu().numpy())
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step() 
                model.zero_grad() 
                tepoch.set_postfix(loss=train_loss/(step+1), accuracy=train_accuracy/(step+1)) 
                time.sleep(0.1) 
        avg_train_loss = train_loss / len(train_dataloader) 
        avg_train_accuracy = train_accuracy / len(train_dataloader) 
        
        val_loss, val_accuracy = 0, 0 
        model.eval() 
        for step, batch in tqdm(enumerate(val_dataloader), position=0, leave=True, total=len(val_dataloader)): 
            batch = tuple(t.to(device) for t in batch) 
            b_input_ids, b_attn_masks, b_labels = batch 
            with torch.no_grad(): 
                outputs = model(b_input_ids, b_attn_masks) 
                loss = loss_func(outputs, b_labels)
                val_loss += loss.item() 
                val_accuracy += flat_accuracy(outputs.detach().cpu().numpy(), b_labels.detach().cpu().numpy())
        avg_val_loss = val_loss / len(val_dataloader) 
        avg_val_accuracy = val_accuracy / len(val_dataloader) 
        
        print(f"avg train loss : {avg_train_loss} | avg train accuracy : {avg_train_accuracy} | avg val loss : {avg_val_loss} | avg val accuracy : {avg_val_accuracy}")
        
        if avg_val_accuracy > best_val_accuracy: 
            best_val_accuracy = avg_val_accuracy 
            torch.save(model.state_dict(), f"transfer_DeBERTaLarge_Fold_{idx+1}.pt")
            
    print("Done!") 
    print(f"Best validation accuracy : {best_val_accuracy}")
    os.rename(f"transfer_DeBERTaLarge_Fold_{idx+1}.pt", f"transfer_DeBERTaLarge_Fold_{idx+1}_acc_{best_val_accuracy}.pt")
    
    

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--deberta-v3-large/snapshots/7dca0f282d1f46ecd957a64a1c6ae23dc83d7ccb/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

l



Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of D

<All keys matched successfully>




Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 1.4797831758072502 | avg train accuracy : 0.53125 | avg val loss : 0.4182191491127014 | avg val accuracy : 0.8020833333333334


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.4140907814236064 | avg train accuracy : 0.8536184210526315 | avg val loss : 0.33579170890152454 | avg val accuracy : 0.88125


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.21731393382345376 | avg train accuracy : 0.9243421052631579 | avg val loss : 0.3574410293251276 | avg val accuracy : 0.9


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.1804944777334305 | avg train accuracy : 0.9506578947368421 | avg val loss : 0.4891473824158311 | avg val accuracy : 0.8833333333333334


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.123834575912425 | avg train accuracy : 0.9555921052631579 | avg val loss : 0.3498948962427676 | avg val accuracy : 0.90625


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--deberta-v3-large/snapshots/7dca0f282d1f46ecd957a64a1c6ae23dc83d7ccb/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}



Done!
Best validation accuracy : 0.90625


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--deberta-v3-large/snapshots/7dca0f282d1f46ecd957a64a1c6ae23dc83d7ccb/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

l

<All keys matched successfully>


Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 1.6375119960621785 | avg train accuracy : 0.5273026315789474 | avg val loss : 0.6473301157355309 | avg val accuracy : 0.55625


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.3859990512167937 | avg train accuracy : 0.85 | avg val loss : 0.2344512689858675 | avg val accuracy : 0.8925000000000001


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.2651603483153801 | avg train accuracy : 0.9055921052631579 | avg val loss : 0.18905581086874007 | avg val accuracy : 0.9375


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.21166304207259887 | avg train accuracy : 0.9338815789473683 | avg val loss : 0.2038605316542089 | avg val accuracy : 0.93125


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--deberta-v3-large/snapshots/7dca0f282d1f46ecd957a64a1c6ae23dc83d7ccb/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

l

avg train loss : 0.15307803427506433 | avg train accuracy : 0.9470394736842105 | avg val loss : 0.19429106367751955 | avg val accuracy : 0.9375
Done!
Best validation accuracy : 0.9375


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of D

<All keys matched successfully>


Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 1.5383120273288928 | avg train accuracy : 0.5108552631578948 | avg val loss : 0.6676142781972885 | avg val accuracy : 0.675


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.5637751102055374 | avg train accuracy : 0.7598684210526315 | avg val loss : 0.582910456508398 | avg val accuracy : 0.75


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.3654701007824195 | avg train accuracy : 0.8700657894736842 | avg val loss : 0.43169052749872205 | avg val accuracy : 0.8125


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.22300413211709574 | avg train accuracy : 0.9292763157894737 | avg val loss : 0.38971952088177203 | avg val accuracy : 0.8675


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--deberta-v3-large/snapshots/7dca0f282d1f46ecd957a64a1c6ae23dc83d7ccb/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

l

avg train loss : 0.167440377077774 | avg train accuracy : 0.9490131578947368 | avg val loss : 0.440139639377594 | avg val accuracy : 0.8375
Done!
Best validation accuracy : 0.8675


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of D

<All keys matched successfully>


Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 1.4124055975361873 | avg train accuracy : 0.5440789473684211 | avg val loss : 0.5049015581607819 | avg val accuracy : 0.80375


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.3583509663217946 | avg train accuracy : 0.8848684210526315 | avg val loss : 0.3649066660553217 | avg val accuracy : 0.8800000000000001


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.2767760315419812 | avg train accuracy : 0.8963815789473685 | avg val loss : 0.3314353108406067 | avg val accuracy : 0.8425


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.20762516866977276 | avg train accuracy : 0.9292763157894737 | avg val loss : 0.2856067972257733 | avg val accuracy : 0.8925000000000001


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.13597936075376837 | avg train accuracy : 0.9552631578947368 | avg val loss : 0.3226275475695729 | avg val accuracy : 0.8862500000000001
Done!
Best validation accuracy : 0.8925000000000001


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--deberta-v3-large/snapshots/7dca0f282d1f46ecd957a64a1c6ae23dc83d7ccb/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

l

<All keys matched successfully>


Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 1.651587035310896 | avg train accuracy : 0.5572368421052631 | avg val loss : 0.7275122635066509 | avg val accuracy : 0.74


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.40762051459597914 | avg train accuracy : 0.8680921052631578 | avg val loss : 0.6166742794215679 | avg val accuracy : 0.86


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.23980659937584087 | avg train accuracy : 0.9141447368421052 | avg val loss : 0.7911284461617469 | avg val accuracy : 0.82875


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.1418242471194581 | avg train accuracy : 0.9490131578947368 | avg val loss : 0.896781021822244 | avg val accuracy : 0.86625


  0%|          | 0/38 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

avg train loss : 0.08540501748211682 | avg train accuracy : 0.9671052631578947 | avg val loss : 0.8383014023303985 | avg val accuracy : 0.85375
Done!
Best validation accuracy : 0.86625


In [14]:
print("done!")

done!


# Inference

In [17]:
test_input_ids, test_attn_masks = [], [] 

test_prompts = test["prompt"].values 
test_answers = test["answer"].values 

for i in tqdm(range(len(test_prompts))): 
    encoded_input = tokenizer(str(test_prompts[i]), str(test_answers[i]), max_length=512, truncation=True, padding="max_length")
    test_input_ids.append(encoded_input["input_ids"])
    test_attn_masks.append(encoded_input["attention_mask"]) 

  0%|          | 0/249 [00:00<?, ?it/s]

In [18]:
test_input_ids = torch.tensor(test_input_ids, dtype=int) 
test_attn_masks = torch.tensor(test_attn_masks, dtype=int) 

test_input_ids.shape, test_attn_masks.shape

(torch.Size([249, 512]), torch.Size([249, 512]))

In [15]:
model1 = ChatGPTDetector() 
chkpt1 = torch.load("transfer_DeBERTaLarge_Fold_1_acc_0.90625.pt")
model1.load_state_dict(chkpt1) 
model1.to(device)
model1.eval() 

model2 = ChatGPTDetector() 
chkpt2 = torch.load("transfer_DeBERTaLarge_Fold_2_acc_0.9375.pt")
model2.load_state_dict(chkpt2) 
model2.to(device)
model2.eval() 

model3 = ChatGPTDetector() 
chkpt3 = torch.load("transfer_DeBERTaLarge_Fold_3_acc_0.8675.pt") 
model3.load_state_dict(chkpt3) 
model3.to(device)
model3.eval() 

model4 = ChatGPTDetector() 
chkpt4 = torch.load("transfer_DeBERTaLarge_Fold_4_acc_0.8925000000000001.pt")
model4.load_state_dict(chkpt4) 
model4.to(device)
model4.eval() 

model5 = ChatGPTDetector() 
chkpt5 = torch.load("transfer_DeBERTaLarge_Fold_5_acc_0.86625.pt")
model5.load_state_dict(chkpt5) 
model5.to(device)
model5.eval() 

print() 
print("done!") 

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--deberta-v3-large/snapshots/7dca0f282d1f46ecd957a64a1c6ae23dc83d7ccb/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

l


done!


In [19]:
batch_size = 16 
test_data = TensorDataset(test_input_ids, test_attn_masks) 
test_sampler = SequentialSampler(test_data) 
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size) 

predictions = [] 
predicted_logits = [] 

for step, batch in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
    batch = (t.to(device) for t in batch) 
    b_input_ids, b_attn_masks = batch 
    with torch.no_grad():
        output1 = model1(b_input_ids, b_attn_masks) 
        output1 = nn.Softmax(dim=1)(output1) 
        
        output2 = model2(b_input_ids, b_attn_masks) 
        output2 = nn.Softmax(dim=1)(output2) 
        
        output3 = model3(b_input_ids, b_attn_masks) 
        output3 = nn.Softmax(dim=1)(output3) 
        
        output4 = model4(b_input_ids, b_attn_masks) 
        output4 = nn.Softmax(dim=1)(output4) 
        
        output5 = model5(b_input_ids, b_attn_masks) 
        output5 = nn.Softmax(dim=1)(output5) 
        
                
        avg_logits = (output1 + output2 + output3 + output4 + output5) / 5.0 
        
        predicted_logits.extend(avg_logits) 
        
        classes = torch.argmax(avg_logits, dim=1) 
        classes = classes.detach().cpu().numpy().tolist() 
        predictions.extend(classes) 
    

  0%|          | 0/16 [00:00<?, ?it/s]

In [20]:
submission = pd.read_csv("sample_submission.csv") 

submission["Category"] = predictions

submission.to_csv("DeBERTa_Wiki.csv", index=False) # 91.959% on leaderboard. 

submission 

Unnamed: 0,Id,Category
0,710,1
1,487,0
2,136,1
3,44,1
4,627,0
...,...,...
244,702,1
245,500,0
246,818,0
247,584,0
