In [1]:
!pip install accelerate



In [2]:
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold,KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from accelerate import Accelerator
from transformers import (AutoModel,AutoTokenizer,AdamW,
                          AutoModelForSequenceClassification,
                          AutoConfig,get_cosine_schedule_with_warmup)

from colorama import Fore, Back, Style
r_ = Fore.RED
b_ = Fore.BLUE
c_ = Fore.CYAN
g_ = Fore.GREEN
y_ = Fore.YELLOW
m_ = Fore.MAGENTA
sr_ = Style.RESET_ALL

In [3]:
config = {
    'lr': 1.1e-5,
    'wd':1e-2,
    'batch_size':256,
    'num_workers':4,
    'max_len':6,
    'epochs':10,
    'nfolds':5,
    'seed':1000,
    'model_path':'roberta-base'
}

for i in range(config['nfolds']):
    os.makedirs(f'model{i}',exist_ok=True)

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONASSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(seed=config['seed'])

## Making Predictions

In [4]:
# test_data = train_data.query("Fold == 0")
# test_data = train_data
test_data = pd.read_csv('../input/google-ads-rur-data/collagen_mask_data.csv')
test_data['keyword2'] = 'collangen face mask'
# keyword1 = test_data['keyword1'].to_list()
# test_data['keyword1'] = test_data['keyword2']
# test_data['keyword2'] = keyword1

In [5]:
test_data.head()

Unnamed: 0,keyword1,keyword2
0,collagen mask,collangen face mask
1,collagen face mask,collangen face mask
2,collagen lips mask,collangen face mask
3,collagen lip plumping mask,collangen face mask
4,crystal collagen gold powder eye mask,collangen face mask


In [6]:
class GadsTestDataset(Dataset):
    def __init__(self,df,tokenizer):
        self.keywords1 = df['keyword1'].to_numpy()
        self.keywords2 = df['keyword2'].to_numpy()
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.keywords1[idx] + " : " + self.keywords2[idx],
                                return_tensors='pt',
                                max_length=config['max_len'],
                                padding='max_length',truncation=True)
        return encode
    
    def __len__(self):
        return len(self.keywords1)

In [7]:
def get_prediction(df,path,model_path,device='cuda'):        
    model = AutoModelForSequenceClassification.from_pretrained(config['model_path'],num_labels = 2)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    predictions = list()
    
    for f in range(config['nfolds']):
        model.load_state_dict(torch.load(path.format(f),map_location=device))
        model.to(device)
        model.eval()

        test_ds = GadsTestDataset(df,tokenizer)
        test_dl = DataLoader(test_ds,
                            batch_size = config["batch_size"],
                            shuffle=False,
                            drop_last=False,
                            num_workers = 4,
                            pin_memory=True)

        with torch.no_grad():
            pred = list()
            for i, (inputs) in tqdm(enumerate(test_dl)):
                inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
                outputs = model(**inputs)
                outputs = outputs['logits'].cpu().detach().numpy().tolist()
                pred.extend(outputs)
            predictions.append(pred)
            
    torch.cuda.empty_cache()
    final_predictions = np.mean(predictions,axis=0)
    return np.mean(predictions,axis=0)

In [8]:
model_paths = [
    ['../input/rur-roberta-models-classification/model{0}/model{0}.bin',config['model_path']],
]

In [9]:
predictions = np.zeros((test_data.shape[0],2))
for path,model_path in model_paths:
    predictions += get_prediction(test_data,path,model_path)/len(model_paths)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [10]:
predictions = np.argmax(predictions,axis=1)

In [11]:
test_data['predictions'] = predictions

In [12]:
test_data.to_csv('collagen_mask_prediction.csv',index=False)

In [13]:
# accuracy_score(test_data['target'],test_data['predictions'])

In [14]:
relevant_data = test_data.query('predictions==1')
relevant_data.to_csv('collagen_mask_relevant.csv',index=False)

In [15]:
test_data['predictions'].value_counts()

1    485
0     20
Name: predictions, dtype: int64

In [16]:
test_data.head(50)

Unnamed: 0,keyword1,keyword2,predictions
0,collagen mask,collangen face mask,1
1,collagen face mask,collangen face mask,1
2,collagen lips mask,collangen face mask,1
3,collagen lip plumping mask,collangen face mask,1
4,crystal collagen gold powder eye mask,collangen face mask,1
5,collagen eye mask,collangen face mask,1
6,gold collagen lip mask,collangen face mask,1
7,collagen sheet mask,collangen face mask,1
8,mario badescu super collagen mask,collangen face mask,1
9,diy collagen mask,collangen face mask,1


In [17]:
relevant_data.head(50)

Unnamed: 0,keyword1,keyword2,predictions
0,collagen mask,collangen face mask,1
1,collagen face mask,collangen face mask,1
2,collagen lips mask,collangen face mask,1
3,collagen lip plumping mask,collangen face mask,1
4,crystal collagen gold powder eye mask,collangen face mask,1
5,collagen eye mask,collangen face mask,1
6,gold collagen lip mask,collangen face mask,1
7,collagen sheet mask,collangen face mask,1
8,mario badescu super collagen mask,collangen face mask,1
9,diy collagen mask,collangen face mask,1
