In [1]:
import random
import glob
import os
import sys
import gc
import json
import math
import numpy as np
import pandas as pd
import lightgbm as lgb
import torch
import scipy
from scipy.stats import rankdata
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable, Any, Iterable, Set

In [2]:
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
class Conf(NamedTuple):
    """Notebook-wide configuration: device, input paths and per-model settings.

    Defaults are evaluated top to bottom inside the class body, which is why
    later path fields can interpolate earlier ones (``input_dir``,
    ``resource_dir``, ``pretrained_dir``) in their f-strings. Reordering the
    fields would therefore break those defaults.
    """
    # Torch device used for transformer inference: CUDA when available, else CPU.
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Root under which all attached inputs are looked up.
    input_dir: str = '/kaggle/input/'
    # Competition data directory (comments_to_score.csv is read from here).
    comp_dir: str = f"{input_dir}jigsaw-toxic-severity-rating/"
    temp_dir: str = '/kaggle/temp/'
    working_dir: str = '/kaggle/working/'
    # Project resources: its `src` folder is appended to sys.path, and the
    # model/data files below live under it.
    resource_dir: str = f'{input_dir}jtsr-lib/kaggle-jigsaw-toxic-severity-rating-1.0/'
    # Root directory of the locally stored pretrained transformer checkpoints.
    pretrained_dir: str = f"{input_dir}pretrained/pretrained/"
    # LightGBM model file loaded with lgb.Booster for the final prediction.
    lgb_model: str = f'{resource_dir}models/lgb/20220207_175155/model.txt'
    # Detoxify settings, forwarded to mylib.detoxify_labels. The dict keys are
    # used as column-name prefixes for the resulting features.
    dtfy_batch_size: int = 256
    dtfy_model_max_length: int = 512
    dtfy_models: Dict[str, str] = {
        "dto_": f"{pretrained_dir}unitaryai/detoxify/toxic_original-c1212f89.ckpt",
        "dtu_": f"{pretrained_dir}unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
        "dtm_": f"{pretrained_dir}unitaryai/detoxify/multilingual_debiased-0b549669.ckpt"
    }
    # Per-prefix config directory passed as `config_dir` alongside the checkpoint.
    dtfy_configs: Dict[str, str] = {
        "dto_": f"{pretrained_dir}bert-base-uncased",
        "dtu_": f"{pretrained_dir}roberta-base",
        "dtm_": f"{pretrained_dir}xlm-roberta-base"
    }
    # TweetEval settings: feature column name -> model directory.
    tweeteval_model_max_length: int = 512
    tweeteval_batch_size: int = 128
    tweeteval_models: Dict[str, str] = {
        "te_roberta_off": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-offensive",
        "te_roberta_emo_anger": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-emotion",
        "te_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-sentiment",
        "te_roberta_iro": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-irony",
        "te_xlm_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-xlm-roberta-base-sentiment",
    }
    # Index of the softmax output column that is stored as the feature for each
    # TweetEval model (must share keys with `tweeteval_models`).
    tweeteval_label_index: Dict[str, int] = {
        "te_roberta_off": 1,
        "te_roberta_emo_anger": 0,
        "te_roberta_snt_neg": 0,
        "te_roberta_iro": 1,
        "te_xlm_roberta_snt_neg": 0,
    }
    # HateBERT settings: feature column name -> model directory. The tokenizer
    # is loaded from the "hb_hatebert_off" entry only.
    hatebert_model_max_length: int = 512
    hatebert_batch_size: int = 128
    hatebert_models: Dict[str, str] = {
        "hb_bert_off": f"{pretrained_dir}hatebert/bert-offenseval",
        "hb_bert_abu" : f"{pretrained_dir}hatebert/bert-abuseval",
        "hb_hatebert_off": f"{pretrained_dir}hatebert/hatebert-offenseval",
        "hb_hatebert_abu" : f"{pretrained_dir}hatebert/hatebert-abuseval",
    }
    # Sentence-embedding features are only computed and used when em_enable is True.
    em_enable: bool = False
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2"
    }
    # JSON file providing the "term" and "idf" lists for the TF-IDF features.
    vocab_file: str = f"{resource_dir}data/vocab.json"
    # Tab-separated lexicon read as term / valence / arousal / dominance.
    vad_file: str = f"{resource_dir}data/vad.tsv"


# Instantiate the configuration and, on CUDA machines, report per-GPU memory.
conf = Conf()
if conf.device.type == 'cuda':
    bytes_per_gib = 1024**3
    n_gpus = torch.cuda.device_count()
    for i in range(n_gpus):
        allocated_gib = round(torch.cuda.memory_allocated(i)/bytes_per_gib,1)
        reserved_gib = round(torch.cuda.memory_reserved(i)/bytes_per_gib,1)
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', allocated_gib, 'GB')
        print('Mem Cached:   ', reserved_gib, 'GB')

device=0, Tesla P100-PCIE-16GB
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [3]:
# Runtime setup. Statement order matters: the sys.path entries must be added
# before the imports at the bottom of this cell can resolve.
# Tokenizers setting; presumably silences the fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): `use_inf_as_na` is deprecated in recent pandas releases —
# confirm against the pandas version pinned for this notebook.
pd.set_option("use_inf_as_na", True)
# Display options: effectively remove truncation of columns, rows and cell width.
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
# Registers DataFrame.progress_apply, which the feature cells below rely on.
tqdm.pandas()
# Make the packages stored under the input directory importable.
sys.path.append(f"{conf.input_dir}pyphen/Pyphen-0.11.0")
sys.path.append(f"{conf.input_dir}textstat/textstat-0.7.2")
sys.path.append(f"{conf.input_dir}sentence-transformers/sentence-transformers-2.1.0")
sys.path.append(f'{conf.input_dir}d/ruhong/sgcharts-ml/src')
sys.path.append(f'{conf.resource_dir}src')
# These imports depend on the sys.path entries added above.
import textstat
from sentence_transformers import SentenceTransformer
import scml
from scml import nlp as snlp
import mylib

In [4]:
# Load the comments to be scored; every feature below is added to this frame
# as a new column, and the submission is built from it at the end.
df = pd.read_csv(f"{conf.comp_dir}comments_to_score.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7537 entries, 0 to 7536
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   comment_id  7537 non-null   int64 
 1   text        7537 non-null   object
dtypes: int64(1), object(1)
memory usage: 117.9+ KB


# Preprocess text

In [5]:
def pre1(row) -> str:
    """First preprocessing stage: ``mylib.pre1`` applied to the raw ``text`` field."""
    raw_text = row["text"]
    return mylib.pre1(raw_text)


def pre2(row) -> str:
    """Second preprocessing stage: ``mylib.pre2`` applied to the ``text1`` field."""
    stage1_text = row["text1"]
    return mylib.pre2(stage1_text)


def pre3(row) -> str:
    """Third preprocessing stage: ``mylib.pre3`` applied to the ``text2`` field."""
    stage2_text = row["text2"]
    return mylib.pre3(stage2_text)


# Run the three preprocessing stages in order; each stage reads the column
# written by the previous one (text -> text1 -> text2 -> text3).
for col, stage_fn in (("text1", pre1), ("text2", pre2), ("text3", pre3)):
    print(col)
    df[col] = df.progress_apply(stage_fn, axis=1)

text1


100%|██████████| 7537/7537 [00:50<00:00, 148.20it/s]


text2


100%|██████████| 7537/7537 [17:05<00:00,  7.35it/s]


text3


100%|██████████| 7537/7537 [02:02<00:00, 61.62it/s]


# Character level features

In [6]:
%%time
# Character length of the stage-1 text.
# Stored as int32 rather than int16: casting to int16 silently wraps around for
# any text longer than 32767 characters, which would turn a very long comment
# into a negative length. Values within the int16 range are unchanged.
col = "length"
df[col] = df["text1"].str.len().astype(np.int32)

CPU times: user 8.82 ms, sys: 4 µs, total: 8.82 ms
Wall time: 8.57 ms


In [7]:
def digit_frac(row) -> float:
    """``mylib.digit_frac`` evaluated on the ``text1`` field of ``row``."""
    text = row["text1"]
    return mylib.digit_frac(text)


def letter_frac(row) -> float:
    """``mylib.letter_frac`` evaluated on the ``text1`` field of ``row``."""
    text = row["text1"]
    return mylib.letter_frac(text)


def space_frac(row) -> float:
    """``mylib.space_frac`` evaluated on the ``text1`` field of ``row``."""
    text = row["text1"]
    return mylib.space_frac(text)


def punc_frac(row) -> float:
    """``mylib.punc_frac`` evaluated on the ``text1`` field of ``row``."""
    text = row["text1"]
    return mylib.punc_frac(text)


def upper_frac(row) -> float:
    """``mylib.upper_frac`` evaluated on the ``text1`` field of ``row``."""
    text = row["text1"]
    return mylib.upper_frac(text)


def repeat_char_frac(row) -> float:
    """``mylib.repeat_char_frac`` evaluated on the ``text1`` field of ``row``."""
    text = row["text1"]
    return mylib.repeat_char_frac(text)


def repeat_substring_frac(row) -> float:
    """``mylib.repeat_substring_frac`` evaluated on the ``text1`` field of ``row``."""
    text = row["text1"]
    return mylib.repeat_substring_frac(text)


# Feature column name -> row function. Every column is named after its
# function, and the insertion order is the order in which they are computed.
char_fns: Dict[str, Callable] = {
    fn.__name__: fn
    for fn in (
        digit_frac,
        letter_frac,
        space_frac,
        punc_frac,
        upper_frac,
        repeat_char_frac,
        repeat_substring_frac,
    )
}

In [8]:
# Compute each character-level feature row by row and store it as float32.
for col, fn in char_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(np.float32)

digit_frac


100%|██████████| 7537/7537 [00:00<00:00, 22677.92it/s]


letter_frac


100%|██████████| 7537/7537 [00:00<00:00, 21549.82it/s]


space_frac


100%|██████████| 7537/7537 [00:00<00:00, 19978.75it/s]


punc_frac


100%|██████████| 7537/7537 [00:00<00:00, 16582.98it/s]


upper_frac


100%|██████████| 7537/7537 [00:00<00:00, 21147.67it/s]


repeat_char_frac


100%|██████████| 7537/7537 [00:00<00:00, 8016.03it/s]


repeat_substring_frac


100%|██████████| 7537/7537 [03:52<00:00, 32.44it/s]


# Textstat features

In [9]:
def syllable_count(row) -> int:
    """``textstat.syllable_count`` of the ``text1`` field."""
    text = row["text1"]
    return textstat.syllable_count(text)


def lexicon_count(row) -> int:
    """``textstat.lexicon_count`` of the ``text1`` field."""
    text = row["text1"]
    return textstat.lexicon_count(text)


def sentence_count(row) -> int:
    """``textstat.sentence_count`` of the ``text1`` field."""
    text = row["text1"]
    return textstat.sentence_count(text)


def syllables_per_word(row) -> float:
    """Ratio ``syllable_count / (lexicon_count + 1)`` from precomputed columns.

    The +1 in the denominator presumably guards against division by zero.
    """
    denominator = row["lexicon_count"] + 1
    return row["syllable_count"] / denominator


def syllables_per_sent(row) -> float:
    """Ratio ``syllable_count / (sentence_count + 1)`` from precomputed columns."""
    denominator = row["sentence_count"] + 1
    return row["syllable_count"] / denominator


def words_per_sent(row) -> float:
    """Ratio ``lexicon_count / (sentence_count + 1)`` from precomputed columns."""
    denominator = row["sentence_count"] + 1
    return row["lexicon_count"] / denominator


def flesch_reading_ease(row) -> float:
    """``textstat.flesch_reading_ease`` of the ``text1`` field."""
    text = row["text1"]
    return textstat.flesch_reading_ease(text)


def flesch_kincaid_grade(row) -> float:
    """``textstat.flesch_kincaid_grade`` of the ``text1`` field."""
    text = row["text1"]
    return textstat.flesch_kincaid_grade(text)


def gunning_fog(row) -> float:
    """``textstat.gunning_fog`` of the ``text1`` field."""
    text = row["text1"]
    return textstat.gunning_fog(text)


def smog_index(row) -> float:
    """``textstat.smog_index`` of the ``text1`` field."""
    text = row["text1"]
    return textstat.smog_index(text)


def automated_readability_index(row) -> float:
    """``textstat.automated_readability_index`` of the ``text1`` field."""
    text = row["text1"]
    return textstat.automated_readability_index(text)


def coleman_liau_index(row) -> float:
    """``textstat.coleman_liau_index`` of the ``text1`` field."""
    text = row["text1"]
    return textstat.coleman_liau_index(text)


def linsear_write_formula(row) -> float:
    """``textstat.linsear_write_formula`` of the ``text1`` field."""
    text = row["text1"]
    return textstat.linsear_write_formula(text)


def dale_chall_readability_score(row) -> float:
    """``textstat.dale_chall_readability_score`` of the ``text1`` field."""
    text = row["text1"]
    return textstat.dale_chall_readability_score(text)


# (column name, row function, dtype) triples. The raw counts must be computed
# first because the three ratio features read them from the row. Every column
# is named after the function that produces it.
preprocess_fns: List[Tuple[str, Callable, Any]] = [
    (fn.__name__, fn, np.int32)
    for fn in (syllable_count, lexicon_count, sentence_count)
]
textstat_fns: List[Tuple[str, Callable, Any]] = [
    (fn.__name__, fn, np.float32)
    for fn in (
        syllables_per_word,
        syllables_per_sent,
        words_per_sent,
        flesch_reading_ease,
        flesch_kincaid_grade,
        gunning_fog,
        smog_index,
        automated_readability_index,
        coleman_liau_index,
        linsear_write_formula,
        dale_chall_readability_score,
    )
]

In [10]:
# Raw counts are computed first because the ratio features in textstat_fns
# read them from the row; list concatenation keeps that order.
for col, fn, dtype in preprocess_fns + textstat_fns:
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(dtype)

syllable_count


100%|██████████| 7537/7537 [00:02<00:00, 2827.60it/s]


lexicon_count


100%|██████████| 7537/7537 [00:00<00:00, 32594.85it/s]


sentence_count


100%|██████████| 7537/7537 [00:00<00:00, 15260.61it/s]


syllables_per_word


100%|██████████| 7537/7537 [00:00<00:00, 53531.82it/s]


syllables_per_sent


100%|██████████| 7537/7537 [00:00<00:00, 53683.73it/s]


words_per_sent


100%|██████████| 7537/7537 [00:00<00:00, 53114.49it/s]


flesch_reading_ease


100%|██████████| 7537/7537 [00:01<00:00, 4677.21it/s]


flesch_kincaid_grade


100%|██████████| 7537/7537 [00:01<00:00, 4797.26it/s]


gunning_fog


100%|██████████| 7537/7537 [00:01<00:00, 3990.03it/s]


smog_index


100%|██████████| 7537/7537 [00:01<00:00, 5198.77it/s]


automated_readability_index


100%|██████████| 7537/7537 [00:00<00:00, 11193.01it/s]


coleman_liau_index


100%|██████████| 7537/7537 [00:00<00:00, 9645.07it/s]


linsear_write_formula


100%|██████████| 7537/7537 [00:01<00:00, 4990.61it/s]


dale_chall_readability_score


100%|██████████| 7537/7537 [00:01<00:00, 4112.35it/s]


# VAD lexicon
- Only the Valence and Arousal scores are used as features; the Dominance column is loaded but ignored.

In [11]:
# Read the tab-separated lexicon, replacing its header row with our own column
# names, then store the three score columns as float16.
score_cols = ["valence", "arousal", "dominance"]
vad_df = pd.read_csv(
    conf.vad_file,
    sep="\t",
    header=0,
    names=["term"] + score_cols,
    engine="c",
    low_memory=False,
)
vad_df = vad_df.astype({score_col: np.float16 for score_col in score_cols})
vad_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19970 entries, 0 to 19969
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   term       19969 non-null  object 
 1   valence    19970 non-null  float16
 2   arousal    19970 non-null  float16
 3   dominance  19970 non-null  float16
dtypes: float16(3), object(1)
memory usage: 273.2+ KB


In [12]:
# Keep only terms with valence below 0.25 and terms with arousal above 0.75,
# keyed by the stripped, lower-cased term.
valence: Dict[str, float] = {}
arousal: Dict[str, float] = {}
for entry in tqdm(vad_df.itertuples()):
    term = str(entry.term).strip().lower()
    if entry.valence < 0.25:
        valence[term] = entry.valence
    if entry.arousal > 0.75:
        arousal[term] = entry.arousal
print(f"len(valence)={len(valence)}, len(arousal)={len(arousal)}")

19970it [00:00, 333441.82it/s]

len(valence)=3011, len(arousal)=1944





In [13]:
# Per-comment min / max / mean of the valence and arousal scores of the
# whitespace tokens of `text3` found in the lexicon dictionaries.
# A comment with no matching token gets -1 for all three statistics.
vad_fs = ["valence_min", "valence_max", "valence_mean", "arousal_min", "arousal_max", "arousal_mean"]
rows = []
for text in tqdm(df["text3"]):
    tokens = text.split()
    vs = [valence[tok] for tok in tokens if tok in valence] or [-1]
    ars = [arousal[tok] for tok in tokens if tok in arousal] or [-1]
    rows.append([min(vs), max(vs), np.mean(vs), min(ars), max(ars), np.mean(ars)])
df[vad_fs] = rows
df[vad_fs] = df[vad_fs].astype(np.float32)
# The lexicon structures are not needed after this point.
del rows, valence, arousal, vad_df
gc.collect()

100%|██████████| 7537/7537 [00:00<00:00, 19520.71it/s]


0

# TF-IDF features

In [14]:
# Load the fixed TF-IDF vocabulary and its IDF weights.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale.
with open(conf.vocab_file, encoding="utf-8") as f:
    vocab_payload = json.load(f)
vocabulary = vocab_payload["term"]
idf = vocab_payload["idf"]
# Validate before reporting: the vectorizer needs one IDF weight per term.
assert len(vocabulary) == len(idf), "vocabulary and idf must have the same length"
# Show only a preview; dumping every term bloats the notebook output.
print(f"len(vocab)={len(vocabulary)}\n{vocabulary[:20]} ...")

len(vocab)=4098
['abandon', 'abandonment', 'abash', 'abduct', 'abduction', 'aberrant', 'aberration', 'abhor', 'abhorrence', 'abhorrent', 'ablaze', 'abnormal', 'abnormality', 'abominable', 'abominate', 'abomination', 'abort', 'abortion', 'abortive', 'abrupt', 'abruptly', 'abscess', 'absence', 'absent', 'absentee', 'absurd', 'absurdity', 'abuse', 'abusive', 'accelerant', 'accelerate', 'acceleration', 'accelerator', 'accident', 'accidental', 'acclaim', 'accurse', 'accusation', 'accusative', 'accuse', 'accuser', 'ache', 'achievement', 'achy', 'acidity', 'acne', 'acrobat', 'action', 'action figure', 'activation', 'addict', 'addicted', 'addiction', 'adrenalin', 'adrenaline', 'adulterate', 'adulterer', 'adultery', 'adventure', 'adventurer', 'adventurous', 'adversary', 'adversity', 'aerobic', 'afire', 'afraid', 'african', 'african american', 'age', 'aged', 'agglomeration', 'aggravate', 'aggravating', 'aggravation', 'aggresive', 'aggression', 'aggressive', 'aggressively', 'aggressiveness', 'agg

In [15]:
%%time
# TF-IDF features over the fixed vocabulary. The vectorizer is not fitted here:
# the IDF weights loaded from the vocabulary file are assigned directly.
vec = TfidfVectorizer(vocabulary=vocabulary, ngram_range=(1, 3), analyzer="word")
vec.idf_ = idf
x = vec.transform(df["text3"])
print(f"x.shape={x.shape}\n{x[0]}")
# Column names: "ti_" + the term with spaces replaced by underscores.
ti_features = ["_".join(["ti"] + t.split()) for t in vocabulary]
# Build all TF-IDF columns as one frame and attach them with a single concat.
# Assigning thousands of columns through `df[cols] = ...` inserts them one by
# one and triggers pandas' fragmentation PerformanceWarning.
# `toarray()` yields a plain ndarray (`todense()` returns np.matrix).
ti_df = pd.DataFrame(x.toarray(), columns=ti_features, index=df.index)
# Dropping existing ti_ columns first keeps the cell safe to re-run.
df = pd.concat([df.drop(columns=ti_features, errors="ignore"), ti_df], axis=1)
del ti_df

x.shape=(7537, 4098)
  (0, 4076)	0.31831910955399473
  (0, 3023)	0.7022081476704467
  (0, 1028)	0.3398354030545193
  (0, 17)	0.5385987009534691


  self[col] = igetitem(value, i)


CPU times: user 3.65 s, sys: 235 ms, total: 3.89 s
Wall time: 3.89 s


# TweetEval labels

In [16]:
# TweetEval features: for each model, run batched inference over all `text2`
# sentences and keep the softmax probability of one configured label.
sentences = list(df["text2"])
for col, model_dir in conf.tweeteval_models.items():
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        model_max_length=conf.tweeteval_model_max_length
    )
    x = tokenizer(sentences, truncation=True, padding="max_length")
    # shuffle=False keeps the predictions aligned with the rows of df.
    batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.tweeteval_batch_size, shuffle=False)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect per-batch logits and concatenate once at the end; calling
    # torch.cat inside the loop re-copies everything accumulated so far.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            # Build a new dict rather than rewriting the one being iterated.
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**batch)
            batch_logits.append(outputs.logits.detach().cpu())
    probs = torch.nn.functional.softmax(torch.cat(batch_logits, 0), dim=1)
    print(f"{col} {probs.size()}\nlogits[:10]={probs[:10]}")
    # Convert to NumPy explicitly instead of handing pandas a torch tensor.
    df[col] = probs[:, conf.tweeteval_label_index[col]].numpy().astype(np.float32)
    # Release the model before the next one is loaded.
    del tokenizer, model
    gc.collect()

100%|██████████| 59/59 [02:10<00:00,  2.22s/it]


te_roberta_off torch.Size([7537, 2])
logits[:10]=tensor([[0.8552, 0.1448],
        [0.5652, 0.4348],
        [0.6350, 0.3650],
        [0.7850, 0.2150],
        [0.2349, 0.7651],
        [0.8033, 0.1967],
        [0.4922, 0.5078],
        [0.5990, 0.4010],
        [0.8934, 0.1066],
        [0.4272, 0.5728]])


100%|██████████| 59/59 [02:10<00:00,  2.21s/it]


te_roberta_emo_anger torch.Size([7537, 4])
logits[:10]=tensor([[0.8563, 0.0076, 0.0692, 0.0669],
        [0.8189, 0.0080, 0.0457, 0.1274],
        [0.1097, 0.6517, 0.1256, 0.1130],
        [0.2242, 0.0778, 0.4488, 0.2491],
        [0.9620, 0.0060, 0.0176, 0.0144],
        [0.9665, 0.0059, 0.0126, 0.0149],
        [0.9655, 0.0071, 0.0146, 0.0128],
        [0.9705, 0.0118, 0.0096, 0.0082],
        [0.4742, 0.0528, 0.0511, 0.4218],
        [0.1118, 0.1862, 0.5323, 0.1697]])


100%|██████████| 59/59 [02:10<00:00,  2.21s/it]


te_roberta_snt_neg torch.Size([7537, 3])
logits[:10]=tensor([[0.5242, 0.4327, 0.0430],
        [0.7244, 0.2631, 0.0124],
        [0.5209, 0.4081, 0.0710],
        [0.4763, 0.4429, 0.0808],
        [0.9311, 0.0628, 0.0062],
        [0.7649, 0.2173, 0.0178],
        [0.6707, 0.2919, 0.0375],
        [0.6546, 0.3091, 0.0364],
        [0.5482, 0.4088, 0.0431],
        [0.1799, 0.6844, 0.1357]])


100%|██████████| 59/59 [02:10<00:00,  2.21s/it]


te_roberta_iro torch.Size([7537, 2])
logits[:10]=tensor([[0.9067, 0.0933],
        [0.8761, 0.1239],
        [0.9444, 0.0556],
        [0.9441, 0.0559],
        [0.9086, 0.0914],
        [0.9012, 0.0988],
        [0.8880, 0.1120],
        [0.0324, 0.9676],
        [0.9544, 0.0456],
        [0.9358, 0.0642]])


100%|██████████| 59/59 [02:10<00:00,  2.21s/it]


te_xlm_roberta_snt_neg torch.Size([7537, 3])
logits[:10]=tensor([[0.5317, 0.3431, 0.1252],
        [0.8481, 0.1225, 0.0293],
        [0.6220, 0.2778, 0.1002],
        [0.4569, 0.3656, 0.1775],
        [0.9133, 0.0721, 0.0147],
        [0.7865, 0.1806, 0.0329],
        [0.6098, 0.2676, 0.1226],
        [0.7394, 0.2131, 0.0475],
        [0.3477, 0.5753, 0.0770],
        [0.2641, 0.6160, 0.1199]])


# HateBert labels

In [17]:
# All HateBERT models use the same tokenizer, so it is loaded once from the
# "hb_hatebert_off" model directory.
hatebert_tokenizer_dir = conf.hatebert_models["hb_hatebert_off"]
tokenizer = AutoTokenizer.from_pretrained(
    hatebert_tokenizer_dir,
    model_max_length=conf.hatebert_model_max_length,
)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='/kaggle/input/pretrained/pretrained/hatebert/hatebert-offenseval', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']


In [18]:
%%time
# Tokenize all sentences once with the shared HateBERT tokenizer; the encoded
# batch `x` is reused by every HateBERT model in the next cell.
x = tokenizer(sentences, truncation=True, padding="max_length")
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=7537
CPU times: user 2.61 s, sys: 5.99 ms, total: 2.62 s
Wall time: 2.62 s


In [19]:
# HateBERT features: every model scores the same pre-tokenized batches, and
# softmax column 1 is stored as the feature (presumably the positive class —
# confirm against the model cards).
# shuffle=False keeps the predictions aligned with the rows of df.
batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.hatebert_batch_size, shuffle=False)
for col, model_dir in conf.hatebert_models.items():
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect per-batch logits and concatenate once at the end; calling
    # torch.cat inside the loop re-copies everything accumulated so far.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            # Build a new dict rather than rewriting the one being iterated.
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**batch)
            batch_logits.append(outputs.logits.detach().cpu())
    probs = torch.nn.functional.softmax(torch.cat(batch_logits, 0), dim=1)
    print(f"{col} {probs.size()}\nlogits[:10]={probs[:10]}")
    # Convert to NumPy explicitly instead of handing pandas a torch tensor.
    df[col] = probs[:, 1].numpy().astype(np.float32)
    # Release the model as the TweetEval cell does; otherwise the last HateBERT
    # model stays referenced (and on the device) while the next models load.
    del model
    gc.collect()

100%|██████████| 59/59 [02:10<00:00,  2.22s/it]


hb_bert_off torch.Size([7537, 2])
logits[:10]=tensor([[0.9358, 0.0642],
        [0.9747, 0.0253],
        [0.8336, 0.1664],
        [0.8787, 0.1213],
        [0.0822, 0.9178],
        [0.6007, 0.3993],
        [0.0797, 0.9203],
        [0.1229, 0.8771],
        [0.9645, 0.0355],
        [0.7761, 0.2239]])


100%|██████████| 59/59 [02:10<00:00,  2.22s/it]


hb_bert_abu torch.Size([7537, 2])
logits[:10]=tensor([[0.9828, 0.0172],
        [0.9922, 0.0078],
        [0.9708, 0.0292],
        [0.9884, 0.0116],
        [0.0778, 0.9222],
        [0.9731, 0.0269],
        [0.1342, 0.8658],
        [0.4784, 0.5216],
        [0.9937, 0.0063],
        [0.9810, 0.0190]])


100%|██████████| 59/59 [02:10<00:00,  2.22s/it]


hb_hatebert_off torch.Size([7537, 2])
logits[:10]=tensor([[0.9728, 0.0272],
        [0.8874, 0.1126],
        [0.7843, 0.2157],
        [0.8808, 0.1192],
        [0.1017, 0.8983],
        [0.9402, 0.0598],
        [0.2147, 0.7853],
        [0.4136, 0.5864],
        [0.9730, 0.0270],
        [0.7613, 0.2387]])


100%|██████████| 59/59 [02:10<00:00,  2.22s/it]

hb_hatebert_abu torch.Size([7537, 2])
logits[:10]=tensor([[0.9812, 0.0188],
        [0.9792, 0.0208],
        [0.9427, 0.0573],
        [0.9693, 0.0307],
        [0.1908, 0.8092],
        [0.9762, 0.0238],
        [0.2722, 0.7278],
        [0.9531, 0.0469],
        [0.9907, 0.0093],
        [0.9031, 0.0969]])





# Detoxify labels

In [20]:
# Detoxify features: each checkpoint returns a mapping of label name to
# per-sentence values; every label becomes a float32 column named
# "<prefix><label>", and the column names are recorded in dtfy_fs.
gc.collect()
dtfy_fs = []
for prefix, checkpoint in tqdm(conf.dtfy_models.items()):
    label_scores = mylib.detoxify_labels(
        sentences,
        checkpoint=checkpoint,
        config_dir=conf.dtfy_configs[prefix],
        model_max_length=conf.dtfy_model_max_length,
        device=conf.device,
        batch_size=conf.dtfy_batch_size,
    )
    for label, scores in label_scores.items():
        col = prefix + label
        df[col] = scores
        df[col] = df[col].astype(np.float32)
        dtfy_fs.append(col)
    gc.collect()

  
100%|██████████| 3/3 [07:29<00:00, 149.94s/it]


In [21]:
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [22]:
# Optional sentence embeddings; skipped entirely unless conf.em_enable is set.
if conf.em_enable:
    em_model_dir = conf.em_models["paraphrase-MiniLM-L6-v2"]
    model = SentenceTransformer(em_model_dir, device=conf.device)
    model.max_seq_length = conf.em_max_seq_length
    em = model.encode(
        sentences=sentences,
        batch_size=conf.em_batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    print(f"em.shape={em.shape}")

In [23]:
%%time
# When embeddings are enabled, store each embedding dimension as a float32
# column named zz0000, zz0001, ...
if conf.em_enable:
    em_cols = [f"zz{i:04d}" for i in range(em.shape[1])]
    df[em_cols] = em
    df[em_cols] = df[em_cols].astype(np.float32)
# The sentence list is not used after this cell, whether or not embeddings ran.
del sentences
gc.collect()

CPU times: user 354 ms, sys: 2 ms, total: 356 ms
Wall time: 354 ms


63

# Inference

In [24]:
%%time
# Load the trained LightGBM model and assemble the feature matrix.
model = lgb.Booster(model_file=conf.lgb_model)
feature_groups = [
    ["length"],
    list(char_fns.keys()),
    [name for name, _, _ in textstat_fns],
    dtfy_fs,
    list(conf.hatebert_models.keys()),
    list(conf.tweeteval_models.keys()),
    vad_fs,
    ti_features,
]
if conf.em_enable:
    feature_groups.append(em_cols)
# Columns are put in sorted name order before being converted to an array.
features = sorted(name for group in feature_groups for name in group)
print(f"{len(features)} features\n{features}")
x_test = df[features].to_numpy()

4152 features
['arousal_max', 'arousal_mean', 'arousal_min', 'automated_readability_index', 'coleman_liau_index', 'dale_chall_readability_score', 'digit_frac', 'dtm_identity_attack', 'dtm_insult', 'dtm_obscene', 'dtm_severe_toxicity', 'dtm_sexual_explicit', 'dtm_threat', 'dtm_toxicity', 'dto_identity_attack', 'dto_insult', 'dto_obscene', 'dto_severe_toxicity', 'dto_threat', 'dto_toxicity', 'dtu_identity_attack', 'dtu_insult', 'dtu_obscene', 'dtu_severe_toxicity', 'dtu_sexual_explicit', 'dtu_threat', 'dtu_toxicity', 'flesch_kincaid_grade', 'flesch_reading_ease', 'gunning_fog', 'hb_bert_abu', 'hb_bert_off', 'hb_hatebert_abu', 'hb_hatebert_off', 'length', 'letter_frac', 'linsear_write_formula', 'punc_frac', 'repeat_char_frac', 'repeat_substring_frac', 'smog_index', 'space_frac', 'syllables_per_sent', 'syllables_per_word', 'te_roberta_emo_anger', 'te_roberta_iro', 'te_roberta_off', 'te_roberta_snt_neg', 'te_xlm_roberta_snt_neg', 'ti_abandon', 'ti_abandonment', 'ti_abash', 'ti_abduct', 'ti_

In [25]:
%%time
# Raw LightGBM predictions, one per comment, in the row order of x_test.
df["score"] = model.predict(x_test)

CPU times: user 2.31 s, sys: 50 ms, total: 2.36 s
Wall time: 2.36 s


# Submission

In [26]:
# Replace the raw scores by their ordinal ranks (method="ordinal" gives every
# row a distinct rank) and write the two submission columns.
cols = ["comment_id", "score"]
score_ranks = rankdata(df["score"], method="ordinal")
df["score"] = score_ranks
sub = df[cols]
sub.to_csv("submission.csv", index=False)
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7537 entries, 0 to 7536
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   comment_id  7537 non-null   int64
 1   score       7537 non-null   int64
dtypes: int64(2)
memory usage: 117.9 KB


In [27]:
sub.head()

Unnamed: 0,comment_id,score
0,114890,247
1,732895,1037
2,1139051,2465
3,1434512,820
4,2084821,5184


# Debug

In [28]:
#!pip list