In [1]:
import random
import glob
import os
import sys
import gc
import json
import math
import numpy as np
import pandas as pd
from scipy.stats import rankdata
import torch
import transformers
import lightgbm as lgb
from typing import Iterable, Dict, Set, List, NamedTuple
from tqdm import tqdm

In [2]:
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
class Conf(NamedTuple):
    device: torch.device
    input_dir: str = '/kaggle/input/'
    comp_dir: str = f"{input_dir}jigsaw-toxic-severity-rating/"
    temp_dir: str = '/kaggle/temp/'
    working_dir: str = '/kaggle/working/'
    resource_dir: str = f'{input_dir}jtsr-lib/kaggle-jigsaw-toxic-severity-rating-1.0/'
    pretrained_dir: str = f"{input_dir}pretrained/pretrained/"
    model_file: str = f'{resource_dir}models/lgb/20211218_124641/model.txt'
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2"
    }
    dtfy_batch_size: int = 256
    dtfy_model_max_length: int = 512
    dtfy_original_model: str = f"{pretrained_dir}unitaryai/detoxify/toxic_original-c1212f89.ckpt"
    dtfy_unbiased_model: str = f"{pretrained_dir}unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt"
    dtfy_multilingual_model: str = f"{pretrained_dir}unitaryai/detoxify/multilingual_debiased-0b549669.ckpt"
    char_fs: List[str] = ["length", "digit_frac", "letter_frac", "space_frac", "punc_frac", "upper_frac"]
    textstat_fs: List[str] = ["syllable_count", "lexicon_count", "sentence_count", "flesch_reading_ease", "flesch_kincaid_grade", 
                              "gunning_fog", "smog_index", "automated_readability_index", "coleman_liau_index", 
                              "linsear_write_formula", "dale_chall_readability_score"]



device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')
    for i in range(torch.cuda.device_count()):
        print(f"{i}: {torch.cuda.get_device_name(i)}")
        print('Memory Allocated:\t', round(torch.cuda.memory_allocated(i)/1024**3,1), 'GB')
        print('Memory Cached:\t\t', round(torch.cuda.memory_reserved(i)/1024**3,1), 'GB')
conf = Conf(device=device)

0: Tesla P100-PCIE-16GB
Memory Allocated:	 0.0 GB
Memory Cached:		 0.0 GB


In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
sys.path.append(f"{conf.input_dir}pyphen/Pyphen-0.11.0")
sys.path.append(f"{conf.input_dir}textstat/textstat-0.7.2")
sys.path.append(f"{conf.input_dir}sentence-transformers/sentence-transformers-2.1.0")
sys.path.append(f'{conf.input_dir}d/ruhong/sgcharts-ml/src')
sys.path.append(f'{conf.resource_dir}src')
import textstat
from sentence_transformers import SentenceTransformer
import scml
from scml import nlp as snlp
import mylib

In [4]:
df = pd.read_csv(f"{conf.comp_dir}comments_to_score.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7537 entries, 0 to 7536
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   comment_id  7537 non-null   int64 
 1   text        7537 non-null   object
dtypes: int64(1), object(1)
memory usage: 117.9+ KB


# Preprocess text

In [5]:
def preprocess(row) -> str:
    return mylib.preprocess(row["text"])


col = "text"
df[col] = df.progress_apply(preprocess, axis=1)

100%|██████████| 7537/7537 [00:09<00:00, 761.63it/s]


# Character level features

In [6]:
%%time
col = "length"
df[col] = df["text"].str.len()
df[col] = df[col].astype(np.int16)

CPU times: user 8.54 ms, sys: 41 µs, total: 8.58 ms
Wall time: 8.05 ms


In [7]:
def digit_frac(row) -> float:
    return mylib.digit_frac(row["text"])


def letter_frac(row) -> float:
    return mylib.letter_frac(row["text"])


def space_frac(row) -> float:
    return mylib.space_frac(row["text"])


def punc_frac(row) -> float:
    return mylib.punc_frac(row["text"])


def upper_frac(row) -> float:
    return mylib.upper_frac(row["text"])

In [8]:
col = "digit_frac"
df[col] = df.progress_apply(digit_frac, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:00<00:00, 21683.68it/s]


In [9]:
col = "letter_frac"
df[col] = df.progress_apply(letter_frac, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:00<00:00, 20769.04it/s]


In [10]:
col = "space_frac"
df[col] = df.progress_apply(space_frac, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:00<00:00, 21806.53it/s]


In [11]:
col = "punc_frac"
df[col] = df.progress_apply(punc_frac, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:00<00:00, 18172.21it/s]


In [12]:
col = "upper_frac"
df[col] = df.progress_apply(upper_frac, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:00<00:00, 11146.59it/s]


# Textstat features

In [13]:
def syllable_count(row) -> int:
    return textstat.syllable_count(row["text"])


def lexicon_count(row) -> int:
    return textstat.lexicon_count(row["text"])


def sentence_count(row) -> int:
    return textstat.sentence_count(row["text"])


def flesch_reading_ease(row) -> float:
    return textstat.flesch_reading_ease(row["text"])


def flesch_kincaid_grade(row) -> float:
    return textstat.flesch_kincaid_grade(row["text"])


def gunning_fog(row) -> float:
    return textstat.gunning_fog(row["text"])


def smog_index(row) -> float:
    return textstat.smog_index(row["text"])


def automated_readability_index(row) -> float:
    return textstat.automated_readability_index(row["text"])


def coleman_liau_index(row) -> float:
    return textstat.coleman_liau_index(row["text"])


def linsear_write_formula(row) -> float:
    return textstat.linsear_write_formula(row["text"])


def dale_chall_readability_score(row) -> float:
    return textstat.dale_chall_readability_score(row["text"])

In [14]:
col = "flesch_reading_ease"
df[col] = df.progress_apply(flesch_reading_ease, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:03<00:00, 2015.45it/s]


In [15]:
col = "flesch_kincaid_grade"
df[col] = df.progress_apply(flesch_kincaid_grade, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:01<00:00, 4522.54it/s]


In [16]:
col = "syllable_count"
df[col] = df.progress_apply(syllable_count, axis=1)
df[col] = df[col].astype(np.int16)

100%|██████████| 7537/7537 [00:01<00:00, 7030.36it/s]


In [17]:
col = "lexicon_count"
df[col] = df.progress_apply(lexicon_count, axis=1)
df[col] = df[col].astype(np.int16)

100%|██████████| 7537/7537 [00:00<00:00, 31961.29it/s]


In [18]:
col = "sentence_count"
df[col] = df.progress_apply(sentence_count, axis=1)
df[col] = df[col].astype(np.int16)

100%|██████████| 7537/7537 [00:00<00:00, 14984.51it/s]


In [19]:
col = "gunning_fog"
df[col] = df.progress_apply(gunning_fog, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:02<00:00, 3671.46it/s]


In [20]:
col = "smog_index"
df[col] = df.progress_apply(smog_index, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:01<00:00, 4901.45it/s]


In [21]:
col = "automated_readability_index"
df[col] = df.progress_apply(automated_readability_index, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:00<00:00, 11279.25it/s]


In [22]:
col = "coleman_liau_index"
df[col] = df.progress_apply(coleman_liau_index, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:00<00:00, 9594.18it/s]


In [23]:
col = "linsear_write_formula"
df[col] = df.progress_apply(linsear_write_formula, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:01<00:00, 5833.10it/s]


In [24]:
col = "dale_chall_readability_score"
df[col] = df.progress_apply(dale_chall_readability_score, axis=1)
df[col] = df[col].astype(np.float32)

100%|██████████| 7537/7537 [00:01<00:00, 3949.65it/s]


# Detoxify labels

In [25]:
sentences = list(df["text"])
dtfy_fs = []

In [26]:
%%time
prefix = "dto_"
res = mylib.detoxify_labels(
    sentences,
    checkpoint=conf.dtfy_original_model,
    config_dir=f"{conf.pretrained_dir}bert-base-uncased",
    model_max_length=conf.dtfy_model_max_length,
    device=conf.device,
    batch_size=conf.dtfy_batch_size
)
for k, v in res.items():
    col = prefix + k
    df[col] = v
    df[col] = df[col].astype(np.float32)
    dtfy_fs.append(col)
gc.collect()

CPU times: user 2min 30s, sys: 1.59 s, total: 2min 32s
Wall time: 2min 41s


4

In [27]:
%%time
prefix = "dtu_"
res = mylib.detoxify_labels(
    sentences,
    checkpoint=conf.dtfy_unbiased_model,
    config_dir=f"{conf.pretrained_dir}roberta-base",
    model_max_length=conf.dtfy_model_max_length,
    device=conf.device,
    batch_size=conf.dtfy_batch_size
)
for k, v in res.items():
    col = prefix + k
    df[col] = v
    df[col] = df[col].astype(np.float32)
    dtfy_fs.append(col)
gc.collect()

CPU times: user 2min 18s, sys: 739 ms, total: 2min 19s
Wall time: 2min 25s


4

In [28]:
%%time
prefix = "dtm_"
res = mylib.detoxify_labels(
    sentences,
    checkpoint=conf.dtfy_multilingual_model,
    config_dir=f"{conf.pretrained_dir}xlm-roberta-base",
    model_max_length=conf.dtfy_model_max_length,
    device=conf.device,
    batch_size=conf.dtfy_batch_size
)
for k, v in res.items():
    col = prefix + k
    df[col] = v
    df[col] = df[col].astype(np.float32)
    dtfy_fs.append(col)
gc.collect()

CPU times: user 2min 16s, sys: 1.36 s, total: 2min 17s
Wall time: 2min 28s


0

# Embeddings

In [29]:
model = SentenceTransformer(conf.em_models["paraphrase-MiniLM-L6-v2"], device=conf.device)
model.max_seq_length = conf.em_max_seq_length
em = model.encode(sentences=sentences, batch_size=conf.em_batch_size, show_progress_bar=True, convert_to_numpy=True)
print(f"em.shape={em.shape}")

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

em.shape=(7537, 384)


In [30]:
%%time
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
df[em_cols] = em
df[em_cols] = df[em_cols].astype(np.float32)
del sentences, model
gc.collect()

  self[col] = igetitem(value, i)


CPU times: user 431 ms, sys: 2.97 ms, total: 434 ms
Wall time: 436 ms


26

# Inference

In [31]:
%%time
model = lgb.Booster(model_file=conf.model_file)
features = conf.char_fs + conf.textstat_fs + dtfy_fs + em_cols
features.sort()
print(f"{len(features)} features\n{features}")
x_test = df[features].to_numpy()

421 features
['automated_readability_index', 'coleman_liau_index', 'dale_chall_readability_score', 'digit_frac', 'dtm_identity_attack', 'dtm_insult', 'dtm_obscene', 'dtm_severe_toxicity', 'dtm_sexual_explicit', 'dtm_threat', 'dtm_toxicity', 'dto_identity_attack', 'dto_insult', 'dto_obscene', 'dto_severe_toxicity', 'dto_threat', 'dto_toxicity', 'dtu_identity_attack', 'dtu_insult', 'dtu_obscene', 'dtu_severe_toxicity', 'dtu_sexual_explicit', 'dtu_threat', 'dtu_toxicity', 'flesch_kincaid_grade', 'flesch_reading_ease', 'gunning_fog', 'length', 'letter_frac', 'lexicon_count', 'linsear_write_formula', 'punc_frac', 'sentence_count', 'smog_index', 'space_frac', 'syllable_count', 'upper_frac', 'zz0000', 'zz0001', 'zz0002', 'zz0003', 'zz0004', 'zz0005', 'zz0006', 'zz0007', 'zz0008', 'zz0009', 'zz0010', 'zz0011', 'zz0012', 'zz0013', 'zz0014', 'zz0015', 'zz0016', 'zz0017', 'zz0018', 'zz0019', 'zz0020', 'zz0021', 'zz0022', 'zz0023', 'zz0024', 'zz0025', 'zz0026', 'zz0027', 'zz0028', 'zz0029', 'zz003

In [32]:
%%time
df["score"] = model.predict(x_test)

CPU times: user 228 ms, sys: 0 ns, total: 228 ms
Wall time: 234 ms


# Submission

In [33]:
df["score"] = rankdata(df["score"], method="ordinal")
cols = ["comment_id", "score"]
sub = df[cols]
sub.to_csv("submission.csv", index=False)
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7537 entries, 0 to 7536
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   comment_id  7537 non-null   int64
 1   score       7537 non-null   int64
dtypes: int64(2)
memory usage: 117.9 KB


In [34]:
sub.head()

Unnamed: 0,comment_id,score
0,114890,1008
1,732895,1726
2,1139051,2656
3,1434512,1385
4,2084821,548


# Debug

In [35]:
#!pip list