In [None]:
import numpy as np
import pandas as pd

import os
import gc
import joblib
import warnings
from pathlib import Path
from tqdm.auto import tqdm
from functools import partial
from scipy.stats import rankdata

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, logging

import sys 
sys.path.append("../input/jigsaw22models-infer/")

from train import *
from inference import *
from text import clean_text as text_preprocess_bert, clean_text_letters as text_preprocess_tfidf

In [None]:
# TTA (test-time augmentation): long comments are scored chunk by chunk and the chunk scores are blended
def predict_single_string(model, tokens):
    """Score one already-tokenized sequence with ``model``.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask)`` with two
            ``torch.long`` tensors of shape ``(1, len(tokens))`` placed on ``DEVICE``.
        tokens: Flat sequence of token ids (special tokens already included).

    Returns:
        Element ``[0][0]`` of the model output after conversion to a NumPy array.
    """
    input_ids = torch.tensor(np.array(tokens).reshape(1, -1), dtype=torch.long).to(DEVICE)
    # The sequence is never padded, so the attention mask is all ones.
    attention_mask = torch.ones_like(input_ids)
    # Inference only: disabling autograd avoids building a graph for every call.
    with torch.no_grad():
        output = model(input_ids, attention_mask)
    return output.detach().cpu().numpy()[0][0]

def string_chunks_prediction(model, tokens, chunk_size, sos_id, eos_id):
    """Score a long token sequence window by window.

    The sequence is cut into consecutive, non-overlapping windows of
    ``chunk_size - 2`` tokens; each window is wrapped in ``sos_id`` / ``eos_id``
    and scored with ``predict_single_string``.

    Args:
        model: Forwarded unchanged to ``predict_single_string``.
        tokens: Token ids without special tokens.
        chunk_size: Maximum window length *including* the two special tokens.
        sos_id: Token id prepended to every window.
        eos_id: Token id appended to every window.

    Returns:
        List with one prediction per window, in reading order (empty when
        ``tokens`` is empty).

    Raises:
        ValueError: If ``chunk_size`` leaves no room for content tokens
            (``chunk_size <= 2``); the window would never advance.
    """
    payload = chunk_size - 2  # room left once sos/eos are added
    if payload <= 0:
        raise ValueError(
            f"chunk_size must be greater than 2 to fit sos/eos plus content, got {chunk_size}"
        )
    return [
        predict_single_string(model, [sos_id] + list(tokens[start:start + payload]) + [eos_id])
        for start in range(0, len(tokens), payload)
    ]

def tta(model, loader, config):
    """Rank every text yielded by ``loader`` using one checkpoint.

    Each text is tokenized without special tokens. Texts that fit into
    ``config['seqlen']`` (special tokens included) get a single prediction; longer
    texts are split into windows and scored per window. The per-text score is a
    weighted mean of its window predictions with weights
    ``round(exp(-0.5 * k), 2)`` for window ``k``, so early windows dominate and
    windows past roughly the eleventh get zero weight.

    Args:
        model: Unused; kept so existing callers keep working. The network is built
            from ``config`` via ``make_model``.
        loader: Iterable of batches of raw strings; must support ``len``.
        config: Mapping with at least ``'weights_path'``, ``'seqlen'`` and
            ``'tokenizer'``; it is also passed to ``make_model``.

    Returns:
        ``scipy.stats.rankdata`` of the per-text scores, in loader order.

    Note:
        Assumes ``make_model`` returns the network already placed on ``DEVICE`` —
        TODO confirm; inputs are moved to ``DEVICE`` by ``predict_single_string``.
    """
    print(str(config['weights_path']))

    net = make_model(config)
    net.load_state_dict(torch.load(config['weights_path'], map_location=DEVICE))
    net.eval()

    s = config['seqlen']
    tokenizer = config['tokenizer']
    preds = []
    # Inference only: no autograd graph is needed for any forward pass below.
    with torch.no_grad():
        for strings in tqdm(loader, total=len(loader), desc="tta"):
            batch_tokens = tokenizer(strings, padding=False, add_special_tokens=False)
            for tokens in batch_tokens['input_ids']:
                if len(tokens) <= s - 2:
                    # Fits in one window once cls/eos are added.
                    tokens = [tokenizer.cls_token_id] + tokens + [tokenizer.eos_token_id]
                    p = [predict_single_string(net, tokens)]
                else:
                    p = string_chunks_prediction(
                        net, tokens, s, tokenizer.cls_token_id, tokenizer.eos_token_id
                    )
                preds.append(p)

    # weighting
    # Size the weight table from the data: a fixed 100-entry table made texts with
    # more than 100 windows fail with a shape mismatch. Values for the first 100
    # windows are unchanged.
    longest = max((len(p) for p in preds), default=0)
    weights = np.round(np.exp(-0.5 * np.arange(longest)), 2)
    preds_wgt = np.zeros(len(preds), dtype=float)
    for i, p in enumerate(preds):
        w = weights[:len(p)]
        preds_wgt[i] = np.asarray(p, dtype=float) @ (w / w.sum())

    del net
    gc.collect()
    if torch.cuda.is_available():
        # Hand cached GPU blocks back before the next checkpoint is loaded.
        torch.cuda.empty_cache()
    return rankdata(preds_wgt)

In [None]:
# Silence the transformers logger and Python warnings so cell output stays readable.
logging.set_verbosity_error()
warnings.filterwarnings("ignore")

# Process-level switches read by CUDA and the tokenizers library.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid — confirm it is wanted for inference.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': "1",
    'TOKENIZERS_PARALLELISM': "false",
})

# Show up to 200 characters per cell when DataFrames are displayed.
pd.set_option("display.max_colwidth", 200)

In [None]:
# Device for all tensors and checkpoints: the first GPU when CUDA is available, otherwise the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Directory holding the pickled TF-IDF ridge models (<name>.pkl).
MODELS_DIR_RIDGE = Path("../input/jigsaw22models-infer/linear")
# Directory holding one sub-folder per RoBERTa checkpoint (<name>/model.bin).
WEIGHT_DIR_BERT = Path("../input/jigsaw22models-infer/roberta")
# Directory holding the pretrained backbone / tokenizer files.
MODELS_DIR_BERT = Path("../input/nlpmodels/")

In [None]:
# Load the comments to score. `text_tfidf` starts as a copy of the raw text and is
# cleaned for the TF-IDF models; `text` is then cleaned separately for the BERT models.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df['text_tfidf'] = df['text'].copy()
df = text_preprocess_tfidf(df, 'text_tfidf')
df['text'] = list(map(text_preprocess_bert, df['text']))

# The dataset reads the "text" column; batches keep the file order (no shuffling).
inference_dataset = InferenceDataset(df)
loader = DataLoader(
    inference_dataset,
    batch_size=32,
    num_workers=1,
    shuffle=False,
    pin_memory=False,
    collate_fn=None,
)

In [None]:
# roberta
!ls {WEIGHT_DIR_BERT}

folds = 1
model = "roberta_toxicity_classifier"

seqlen = [128, 128, 256, 512, 128, 512]
models = ["clf-128-v2", "clf-128", "clf-256", "clf-512", "oe-128", "oe-512"]

preds_roberta = np.zeros((len(inference_dataset), len(models)), dtype=np.float64)
for i, (s, name) in enumerate(zip(seqlen, models)):
    config = make_inference_config(model, MODELS_DIR_BERT, WEIGHT_DIR_BERT / name / "model.bin", s, folds, DEVICE)
    config['tokenizer'] = AutoTokenizer.from_pretrained(MODELS_DIR_BERT / model)
    preds_roberta[:, i] = tta(model, loader, config)
    # preds_roberta[:, i] = prediction(inference_dataset, config).squeeze()

In [None]:
# tfidf regression

!ls {MODELS_DIR_RIDGE}

models = ["bias", "clf", "oe", "pub-ruddit", "td2"]
preds_ridge = np.zeros((len(inference_dataset), len(models)), dtype=np.float64)

for i, model in enumerate(models):
    path = MODELS_DIR_RIDGE / f"{model}.pkl"
    print(path)
    ridge = joblib.load(path)
    preds_ridge[:, i] = rankdata(ridge.predict(df.text_tfidf))
    del ridge
    gc.collect()

In [None]:
# ensemble
# One blend weight per prediction column, in concatenation order: ridge columns first, roberta columns after.
# Earlier weight vectors, kept for reference:
#   [0.544, 0.55 , 0.539, 0.455, 0.506, 0.929, 0.467, 0.384, 0.936, 0.257, 0.016]
#   [0.509, 0.354, 0.699, 0.031, 0.927]   (presumably for the ridge-only variant: preds = preds_ridge)
w = np.array([0.813, 0.512, 0.945, 0.05, 0.896, 0.574, 0.999, 0.928, 0.898, 0.062, 0.508])

preds = np.hstack((preds_ridge, preds_roberta))
# Weighted sum of the per-model ranks, converted to a strict ordering (ties get distinct ranks).
df['score'] = rankdata(preds.dot(w), method="ordinal")

In [None]:
# Use the sample submission for row order and columns, replacing its placeholder score.
submission = pd.read_csv("../input/jigsaw-toxic-severity-rating/sample_submission.csv")
expected_rows = len(submission)
submission = submission.drop(columns='score')
# validate= fails loudly on duplicated comment ids instead of silently multiplying rows.
submission = pd.merge(submission, df[['comment_id', 'score']], on='comment_id', validate="one_to_one")
# The merge is an inner join, so any comment without a score would silently disappear.
assert len(submission) == expected_rows, (
    f"submission has {len(submission)} rows, expected {expected_rows}: some comment_ids were not scored"
)
submission.to_csv("submission.csv", index=False)

In [None]:
# Preview the first five rows of the submission frame.
submission.head()

In [None]:
# Sanity check: the 11 highest- and 11 lowest-scored comments; columns are selected by position (1 and 3).
df.sort_values('score', ascending=False).iloc[[*range(11), *range(-11, 0)], [1, 3]]